In [1]:
import os,sys,string,re
import pandas as pd
import numpy as np
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
import matplotlib.pyplot as plt

to_space= '''&,()+'"/'''
trans=string.maketrans(to_space, ' '*len(to_space))
ps = PorterStemmer()

os.chdir('/data/arpit.goel/31_PricePrediction/03.Submissions')
pd.set_option('display.float_format', lambda x: '%.1f' % x)

df_raw_train=pd.read_csv('../input/train.tsv',delimiter='\t')
df_raw_test=pd.read_csv('../input/test.tsv',delimiter='\t')
df_raw_train['log_price']=np.clip(np.log(1+df_raw_train['price']),1,5)

ins=df_raw_train.sample(frac=0.8,random_state=200)
oos=df_raw_train.drop(ins.index)
oot=df_raw_test

In [2]:
def clean_category(x):
    if len(x)==0:
        return ''
    x=re.sub(chr(195),'e',x)
    x=re.sub(chr(169),'',x)
    x=x.translate(trans)
    x=re.sub(' +',' ',x)
    x=x.lower()
    x=' '.join(list(set(map(ps.stem,x.split(' ')))))
    x=re.sub('(\\b[A-Za-z] \\b|\\b [A-Za-z]\\b)', '',x)
    return x

clean_brand=lambda x: re.sub('[^0-9a-z]','',x.lower())
cat_lookup=ins.groupby('category_name')['log_price'].apply(lambda x: {'log_price':x.sum(),'count':x.size}).unstack()
cat_lookup['category_name']=cat_lookup.index
cat_lookup['category0']=cat_lookup['category_name'].map(lambda x: str(x).split('/')[0]).map(clean_category)
cat_lookup['category1']=cat_lookup['category_name'].map(lambda x: str(x).split('/')[1] if str(x).count('/')>=1 else 'na').map(clean_category)
cat_lookup['category2']=cat_lookup['category_name'].map(lambda x: str(x).split('/')[2] if str(x).count('/')>=2 else 'na').map(clean_category)    

def clean_cat_brand(df):
    df['category0']=df['category_name'].astype(np.str).map(cat_lookup['category0']).fillna('nan')
    df['category1']=df['category_name'].astype(np.str).map(cat_lookup['category1']).fillna('nan')
    df['category2']=df['category_name'].astype(np.str).map(cat_lookup['category2']).fillna('nan')    
    df['brand']=df['brand_name'].fillna('nan').astype(np.str).map(clean_brand)

for data in [ins,oos,oot]:
    clean_cat_brand(data)

In [3]:
def get_group_lookup(df,cols,label,threshold):
    interaction_num=len(cols)
    df_group=df.groupby(cols)['log_price'].apply(lambda x: {'count':x.size,'sum_log_price':x.sum()}).unstack().reset_index()
    df_group['new_grp']=np.where(df_group['count']>threshold,df_group.index,-999)
    df_index_group=df_group.groupby('new_grp')[['count','sum_log_price']].sum()
    df_index_group['freq']=df_index_group['count']/df_index_group['count'].sum()
    df_index_group['avg_log_price']=df_index_group['sum_log_price']/df_index_group['count']
    df_group['sig%d_%d_freq'%(len(cols),label)]=df_group['new_grp'].map(df_index_group['freq'])
    df_group['sig%d_%d_price'%(len(cols),label)]=df_group['new_grp'].map(df_index_group['avg_log_price'])
    df_group=df_group[df_group['new_grp']>=0]
    output_sigs=[x for x in df_group.columns if 'sig' in x]
    missing_vals=[0,df['log_price'].mean()] if -999 not in df_index_group.index else list(df_index_group.ix[-999,['freq','avg_log_price']])
    missing_vals=pd.Series(missing_vals,index=output_sigs)
    return df_group[cols+output_sigs],missing_vals
  
#get_group_lookup(ins,['category0'],3,500)
interaction_columns=['category0','category1','category2','brand']
from itertools import combinations
brand_cat_space=[list(x) for i in range(len(interaction_columns)) for x in combinations(interaction_columns,i+1)]

group_lookups={}
for i,x in enumerate(brand_cat_space):
    group_lookups[tuple(x)]=get_group_lookup(ins,x,i,50)

In [13]:
def get_cat_brand_vars(df):
    df_out=df.copy()
    for i,x in group_lookups.items():
        df_out=pd.merge(df_out,x[0],on=list(i),how='outer')
        df_out=df_out.fillna(x[1])
    for i in range(len(interaction_columns)):
        for signal in ['freq','price']:
            signals=[x for x in df_out.columns if 'sig%d'%(i+1) in x and signal in x]
            df_out['sig%d_max_%s'%(i+1,signal)]=df_out[signals].max(axis=1)
            df_out['sig%d_min_%s'%(i+1,signal)]=df_out[signals].min(axis=1)
            df_out['sig%d_avg_%s'%(i+1,signal)]=df_out[signals].mean(axis=1)
    df_out['cost_in_description']=df_out['item_description'].astype(np.str).map(lambda x: 1 if '[rm]' in x else 0)
    return df_out

ins=get_cat_brand_vars(ins)
oos=get_cat_brand_vars(oos)
oot=get_cat_brand_vars(oot)

In [60]:
def get_free_text_vars(df,var,label):
    df['ft_0_%s'%label]=df[var].astype(np.str).str.len()                                                             # Num characters
    df['ft_1_%s'%label]=df[var].astype(np.str).map(lambda x: re.sub('[%s]'%string.printable,'',x)).str.len()         # Num non printable chars
    df['ft_2_%s'%label]=df[var].astype(np.str).map(lambda x: re.sub('[^#$%&*:;<=>?@\^_`|~]','',x)).str.len()         # Num Special Characters
    df['ft_3_%s'%label]=df[var].astype(np.str).map(lambda x: re.sub('[^0-9]','',x)).str.len()                        # Num numerals
    df['ft_4_%s'%label]=df[var].astype(np.str).map(lambda x: re.sub('[^a-z]','',x)).str.len()                        # Num lower case
    df['ft_5_%s'%label]=df[var].astype(np.str).map(lambda x: re.sub('[^A-Z]','',x)).str.len()                        # Num upper case
    df['ft_6_%s'%label]=df[var].astype(np.str).map(lambda x: re.sub('[^ ]','',x)).str.len()                          # Num spaces
    df['%s_clean'%label]=df[var].astype(np.str).str.lower().map(lambda x: re.sub('[^ a-z]',' ',x))                 # Clean Text
    
for data in [ins,oos,oot]:
    for var,label in [('name','name'),('item_description','desc')]:
        get_free_text_vars(data,var,label)


KeyboardInterrupt: 

In [14]:
ins.columns

Index([u'train_id', u'name', u'item_condition_id', u'category_name',
       u'brand_name', u'price', u'shipping', u'item_description', u'log_price',
       u'category0', u'category1', u'category2', u'brand', u'ft_0_name',
       u'ft_1_name', u'ft_2_name', u'ft_3_name', u'ft_4_name', u'ft_5_name',
       u'ft_6_name', u'name_clean', u'ft_0_desc', u'ft_1_desc', u'ft_2_desc',
       u'ft_3_desc', u'ft_4_desc', u'ft_5_desc', u'ft_6_desc', u'desc_clean',
       u'sig3_10_freq', u'sig3_10_price', u'sig3_11_freq', u'sig3_11_price',
       u'sig2_5_freq', u'sig2_5_price', u'sig1_0_freq', u'sig1_0_price',
       u'sig4_14_freq', u'sig4_14_price', u'sig2_9_freq', u'sig2_9_price',
       u'sig1_2_freq', u'sig1_2_price', u'sig3_13_freq', u'sig3_13_price',
       u'sig3_12_freq', u'sig3_12_price', u'sig1_3_freq', u'sig1_3_price',
       u'sig1_1_freq', u'sig1_1_price', u'sig2_8_freq', u'sig2_8_price',
       u'sig2_6_freq', u'sig2_6_price', u'sig2_4_freq', u'sig2_4_price',
       u'sig2_7_freq', u

In [17]:
sigs=[x for x in ins.columns if x[0:3] in ['sig','ft_']]+['shipping','item_condition_id','cost_in_description']
sigs1=['sig2_avg_price','sig2_max_price','shipping','sig2_min_price','item_condition_id','sig3_max_price',\
      'sig1_min_price','sig1_3_price','sig3_min_price','sig3_avg_price','sig2_9_price','sig1_avg_price',\
      'sig1_avg_freq','sig2_7_price','sig1_max_price','sig2_6_price','sig2_avg_freq','sig3_13_price','sig1_0_price']
from sklearn.metrics import r2_score

from sklearn.ensemble import RandomForestRegressor
rfr=RandomForestRegressor(n_estimators=5,max_depth=20,min_samples_leaf=50,n_jobs=-1,oob_score=True,verbose=1)
rfr.fit(ins[sigs],ins['log_price'])
print r2_score(ins['log_price'],rfr.predict(ins[sigs]))
print r2_score(oos['log_price'],rfr.predict(oos[sigs]))
pd.set_option('display.float_format', lambda x: '%.3f' % x)

cols=pd.Series(rfr.feature_importances_,index=sigs).sort_values(ascending=False).cumsum()
#cols=cols[cols<0.95].index.tolist()
print cols

[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   30.1s finished
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed:    0.3s remaining:    0.4s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:    0.6s finished


0.525630638273
0.485406513809
sig3_avg_price        0.482
sig4_avg_price        0.559
shipping              0.613
sig4_max_price        0.657
sig4_min_price        0.698
item_condition_id     0.739
sig3_12_price         0.760
ft_3_desc             0.782
sig3_min_price        0.798
ft_5_desc             0.810
ft_0_desc             0.821
sig2_min_price        0.832
sig1_3_price          0.842
ft_0_name             0.852
ft_3_name             0.860
ft_4_desc             0.868
sig4_14_price         0.875
ft_5_name             0.881
ft_4_name             0.887
sig2_9_price          0.892
ft_6_desc             0.897
sig1_avg_freq         0.903
sig3_avg_freq         0.908
sig1_max_price        0.912
sig1_avg_price        0.917
sig3_13_price         0.921
sig1_min_price        0.926
sig2_avg_freq         0.930
ft_6_name             0.934
cost_in_description   0.937
                       ... 
sig1_1_price          0.967
sig2_9_freq           0.969
sig2_5_freq           0.971
sig2_7_price      

[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:    0.1s finished


In [85]:
rfr=RandomForestRegressor(n_estimators=5,max_depth=25,min_samples_leaf=250,n_jobs=-1,oob_score=True)
rfr.fit(ins_interaction[cols],ins_interaction['log_price'])
print r2_score(ins_interaction['log_price'],rfr.predict(ins_interaction[cols]))
print r2_score(oos_interaction['log_price'],rfr.predict(oos_interaction[cols]))
pd.set_option('display.float_format', lambda x: '%.3f' % x)

pd.Series(rfr.feature_importances_,index=cols).sort_values(ascending=False).cumsum()


0.442111316663
0.433145192401


sig2_avg_price         0.561
shipping               0.645
sig2_min_price         0.699
item_condition_id      0.749
sig2_max_price         0.791
sig3_max_price         0.820
chars_in_description   0.846
sig1_3_price           0.867
sig1_min_price         0.884
sig3_min_price         0.898
sig2_9_price           0.910
sig2_7_price           0.922
sig3_13_price          0.934
chars_in_name          0.944
sig3_avg_price         0.954
sig1_avg_price         0.964
sig2_avg_freq          0.972
sig1_avg_freq          0.981
sig1_max_price         0.989
sig2_6_price           0.995
words_in_description   1.000
dtype: float64

In [86]:
sigs=[x for x in ins_interaction.columns if 'sig' in x]+['shipping','item_condition_id','words_in_description',\
        'chars_in_description','words_in_name','chars_in_name','cost_in_description']
from sklearn.linear_model import Lasso
pd.set_option('display.float_format', lambda x: '%.4f' % x)

performance=[]
for alpha in [0,0.001,0.003,0.01,0.03,0.1,0.3,1]:
    reg=Lasso(alpha=alpha)
    reg.fit(ins_interaction[sigs],ins_interaction['log_price'])
    performance.append([alpha,reg.score(ins_interaction[sigs],ins_interaction['log_price']), reg.score(oos_interaction[sigs],oos_interaction['log_price'])])
    
pd.DataFrame(performance,columns=['alpha','train','test'])

  positive)


Unnamed: 0,alpha,train,test
0,0.0,0.3702,0.3705
1,0.001,0.3674,0.3677
2,0.003,0.3641,0.3645
3,0.01,0.3586,0.359
4,0.03,0.3257,0.3262
5,0.1,0.1453,0.1455
6,0.3,0.0007,0.0006
7,1.0,0.0007,0.0006


In [59]:
sigs=[x for x in ins.columns if x[0:3] in ['sig','ft_']]+['shipping','item_condition_id','cost_in_description']
from sklearn.metrics import r2_score,mean_squared_error

from sklearn.ensemble import RandomForestRegressor
rfr=RandomForestRegressor(n_estimators=5,max_depth=15,min_samples_leaf=250,n_jobs=-1,oob_score=True,verbose=1)
rfr.fit(ins[sigs],ins['log_price'])

print (mean_squared_error(ins['log_price'],rfr.predict(ins[sigs])))
print (mean_squared_error(oos['log_price'],rfr.predict(oos[sigs])))


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   24.8s finished
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed:    0.2s remaining:    0.3s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:    0.2s finished


0.275489629663
0.281164015559


[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:    0.1s finished


In [70]:
from collections import Counter
all_words=pd.concat([ins['name_clean'],ins['desc_clean']]).values
all_words=[y for x in all_words for y in x.split()]
all_words=pd.Series(Counter(all_words))
all_words=all_words[all_words>100]

def stem_word(x):
    try:
        return ps.stem(x)
    except:
        return x

stemmed_words=all_words.reset_index()
stemmed_words.columns=['word','count']
stemmed_words['stem']=stemmed_words['word'].map(stem_word)
stemmed_words.index=stemmed_words['word']
vocabulary=stemmed_words.groupby('stem')['count'].sum()
vocabulary=vocabulary[vocabulary>=100].index.tolist()
keywords_raw=set(stemmed_words['word'])
keywords_lookup=stemmed_words['stem'].to_dict()

In [73]:
def clean_text(df):
    df['free_text']=df['name_clean']+' '+df['desc_clean']
    df['free_text']=df['free_text'].map(lambda x: ' '.join([keywords_lookup[y] for y in x.split(' ') if y in keywords_raw]))
    
for data in [ins,oos,oot]:
    clean_text(data)

In [81]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(vocabulary=vocabulary)
ft_ins = vectorizer.fit_transform(ins['free_text'])
ft_oos = vectorizer.transform(oos['free_text'])

X_ins=np.vstack(ins[sigs],ft_ins)
X_oos=np.vstack(oos[sigs],ft_oos)



TypeError: vstack() takes exactly 1 argument (2 given)

In [96]:
from scipy.sparse import csr_matrix, hstack
X_ins=hstack((csr_matrix(ins[sigs]),np.clip(ft_ins,0,5)))
X_oos=hstack((csr_matrix(oos[sigs]),np.clip(ft_oos,0,5)))
print (ft_ins.shape)
print (ins[sigs].shape)

(1186028, 8002)
(1186028, 71)


In [None]:
X_ins.shape


In [98]:
from sklearn.linear_model import Ridge
model = Ridge(alpha=0.01,solver="sag")
model.fit(X_ins[:10000],ins['log_price'][:10000])

print (mean_squared_error(ins['log_price'],model.predict(X_ins)))
print (mean_squared_error(oos['log_price'],model.predict(X_oos)))


KeyboardInterrupt: 