In [4]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import seaborn as sns
from sklearn import preprocessing
from nltk.corpus import stopwords 
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import gc
from sklearn.model_selection import train_test_split
import xgboost as xgb
from scipy.sparse import hstack, csr_matrix



In [5]:
## reading train and test data 
traindata = pd.read_csv('train.csv', index_col = "item_id", parse_dates = ["activation_date"])
traindata_index = traindata.index
testdata = pd.read_csv('test.csv', index_col = "item_id", parse_dates = ["activation_date"])
testdata_index = testdata.index

In [6]:
train_output = traindata.deal_probability.copy().clip(0.0, 1.0)

In [7]:
total_data = pd.concat([traindata,testdata],axis=0)

In [8]:
del traindata,testdata
gc.collect()

14

In [10]:
## setting dates 
total_data["Weekday"] = total_data['activation_date'].dt.weekday
total_data["Weekend_no"] = total_data['activation_date'].dt.week
total_data["day"] = total_data['activation_date'].dt.day

In [11]:
## encoding 1 if image present 0 if not present 
total_data['image'].fillna(0, inplace=True)
total_data['image_status'] = np.where(total_data['image']==0, 0, 1)



In [12]:
## dropping date and image column 
total_data.drop(["activation_date","image"],axis=1,inplace=True)

In [30]:
total_data.head()

Unnamed: 0_level_0,category_name,city,deal_probability,description,image_top_1,item_seq_number,param_1,param_2,param_3,parent_category_name,price,region,title,user_id,user_type,Weekday,Weekend_no,day,image_status
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
b912c3c6a6ad,Товары для детей и игрушки,Екатеринбург,0.12789,"Кокон для сна малыша,пользовались меньше месяц...",1008.0,2,Постельные принадлежности,,,Личные вещи,400.0,Свердловская область,Кокоби(кокон для сна),e00f8ff2eaf9,Private,1,13,28,1
2dac0150717d,Мебель и интерьер,Самара,0.0,"Стойка для одежды, под вешалки. С бутика.",692.0,19,Другое,,,Для дома и дачи,3000.0,Самарская область,Стойка для Одежды,39aeb48f0017,Private,6,12,26,1
ba83aefab5dc,Аудио и видео,Ростов-на-Дону,0.43177,"В хорошем состоянии, домашний кинотеатр с blu ...",3032.0,9,"Видео, DVD и Blu-ray плееры",,,Бытовая электроника,4000.0,Ростовская область,Philips bluray,91e2f88dd6e3,Private,0,12,20,1
02996f1dd2ea,Товары для детей и игрушки,Набережные Челны,0.80323,Продам кресло от0-25кг,796.0,286,Автомобильные кресла,,,Личные вещи,2200.0,Татарстан,Автокресло,bf5cccea572d,Company,5,12,25,1
7c90be56d2ab,Автомобили,Волгоград,0.20797,Все вопросы по телефону.,2264.0,3,С пробегом,ВАЗ (LADA),2110.0,Транспорт,40000.0,Волгоградская область,"ВАЗ 2110, 2003",ef50846afc0b,Private,3,11,16,1


In [13]:
## assigning categorical variables to most of the data columns except description and title 
categorical = ["user_id","region","city","parent_category_name","category_name","user_type",
               "image_top_1","param_1","param_2","param_3"]
from sklearn import preprocessing
xgbencoder = preprocessing.LabelEncoder()
for col in categorical:
    total_data[col] = xgbencoder.fit_transform(total_data[col].astype(str))

In [32]:
total_data.head()

Unnamed: 0_level_0,category_name,city,deal_probability,description,image_top_1,item_seq_number,param_1,param_2,param_3,parent_category_name,price,region,title,user_id,user_type,Weekday,Weekend_no,day,image_status
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
b912c3c6a6ad,42,462,0.12789,"Кокон для сна малыша,пользовались меньше месяц...",12,2,249,112,1217,4,400.0,19,Кокоби(кокон для сна),884270,1,1,13,28,1
2dac0150717d,22,1314,0.0,"Стойка для одежды, под вешалки. С бутика.",2722,19,122,112,1217,2,3000.0,17,Стойка для Одежды,227908,1,6,12,26,1
ba83aefab5dc,2,1290,0.43177,"В хорошем состоянии, домашний кинотеатр с blu ...",2259,9,84,112,1217,0,4000.0,16,Philips bluray,576261,1,0,12,20,1
02996f1dd2ea,42,950,0.80323,Продам кресло от0-25кг,2837,286,38,112,1217,4,2200.0,21,Автокресло,755087,0,5,12,25,1
7c90be56d2ab,0,318,0.20797,Все вопросы по телефону.,1407,3,278,124,46,6,40000.0,4,"ВАЗ 2110, 2003",944363,1,3,11,16,1


In [14]:
total_data['no_of_words_title'] = total_data['title'].apply(lambda x : len(str(x).split()))

In [15]:
total_data['no_of_words_description'] = total_data['description'].apply(lambda x : len(str(x).split()))

In [42]:
total_data.head()

Unnamed: 0_level_0,category_name,city,deal_probability,description,image_top_1,item_seq_number,param_1,param_2,param_3,parent_category_name,...,region,title,user_id,user_type,Weekday,Weekend_no,day,image_status,no_of_words_description,no_of_words_title
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
b912c3c6a6ad,42,462,0.12789,"Кокон для сна малыша,пользовались меньше месяц...",12,2,249,112,1217,4,...,19,Кокоби(кокон для сна),884270,1,1,13,28,1,7,3
2dac0150717d,22,1314,0.0,"Стойка для одежды, под вешалки. С бутика.",2722,19,122,112,1217,2,...,17,Стойка для Одежды,227908,1,6,12,26,1,7,3
ba83aefab5dc,2,1290,0.43177,"В хорошем состоянии, домашний кинотеатр с blu ...",2259,9,84,112,1217,0,...,16,Philips bluray,576261,1,0,12,20,1,17,2
02996f1dd2ea,42,950,0.80323,Продам кресло от0-25кг,2837,286,38,112,1217,4,...,21,Автокресло,755087,0,5,12,25,1,3,1
7c90be56d2ab,0,318,0.20797,Все вопросы по телефону.,1407,3,278,124,46,6,...,4,"ВАЗ 2110, 2003",944363,1,3,11,16,1,4,3


In [16]:
total_data['no_of_chars_title'] = total_data['title'].apply(lambda x : len(str(x)))

In [17]:
total_data['no_of_chars_description'] = total_data['description'].apply(lambda x : len(str(x)))

In [21]:
total_data.head(5)

Unnamed: 0_level_0,category_name,city,deal_probability,description,image_top_1,item_seq_number,param_1,param_2,param_3,parent_category_name,...,user_id,user_type,Weekday,Weekend_no,day,image_status,no_of_words_title,no_of_words_description,no_of_chars_title,no_of_chars_description
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
b912c3c6a6ad,42,462,0.12789,"Кокон для сна малыша,пользовались меньше месяц...",12,2,249,112,1217,4,...,884270,1,1,13,28,1,3,7,38,108
2dac0150717d,22,1314,0.0,"Стойка для одежды, под вешалки. С бутика.",2722,19,122,112,1217,2,...,227908,1,6,12,26,1,3,7,32,73
ba83aefab5dc,2,1290,0.43177,"В хорошем состоянии, домашний кинотеатр с blu ...",2259,9,84,112,1217,0,...,576261,1,0,12,20,1,2,17,14,168
02996f1dd2ea,42,950,0.80323,Продам кресло от0-25кг,2837,286,38,112,1217,4,...,755087,0,5,12,25,1,1,3,20,38
7c90be56d2ab,0,318,0.20797,Все вопросы по телефону.,1407,3,278,124,46,6,...,944363,1,3,11,16,1,3,4,17,44


In [22]:
total_data.columns

Index([u'category_name', u'city', u'deal_probability', u'description',
       u'image_top_1', u'item_seq_number', u'param_1', u'param_2', u'param_3',
       u'parent_category_name', u'price', u'region', u'title', u'user_id',
       u'user_type', u'Weekday', u'Weekend_no', u'day', u'image_status',
       u'no_of_words_title', u'no_of_words_description', u'no_of_chars_title',
       u'no_of_chars_description'],
      dtype='object')

In [None]:
# error wont come in sequence 
notext = total_data[['category_name','city','image_top_1','item_seq_number','param_1','param_2','param_3','parent_category_name','price','region','title','user_id','user_type','Weekday','Weekend_no','day','image_status','no_of_words_description','no_of_words_title','no_of_chars_title','no_of_chars_description','deal_probability']]        

In [24]:
notext['price'].fillna(0, inplace=True)


In [25]:
#columnname = notext.columns 
#for i in columnname:
#    if 
#    print notext[i].isnull().values.any()

In [26]:
X = notext.loc[traindata_index,:]

In [27]:
## saving train data which dosent have any text processing like hashing or TIFDR 
X.to_csv('./processeddata/traindata_no_text_processing.csv',header=True)

In [28]:
y = notext.loc[testdata_index,:]
y.head()

Unnamed: 0_level_0,category_name,city,image_top_1,item_seq_number,param_1,param_2,param_3,parent_category_name,price,region,...,user_type,Weekday,Weekend_no,day,image_status,no_of_words_description,no_of_words_title,no_of_chars_title,no_of_chars_description,deal_probability
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6544e41a8817,10,318,1137,66,110,198,74,4,0.0,4,...,1,1,16,18,1,2,2,29,23,
65b9484d670f,5,993,3063,4,119,112,1217,8,3000.0,19,...,1,6,15,16,0,10,2,31,154,
8bab230b2ecd,2,151,2178,15,318,112,1217,0,15000.0,12,...,1,0,16,17,1,21,1,3,233,
8e348601fefc,4,1319,3063,70,108,131,1217,2,4500.0,18,...,1,0,16,17,0,12,3,24,155,
8bd2fe400b89,42,243,6,15,102,112,1217,4,4900.0,14,...,1,5,15,15,1,10,2,32,106,


In [29]:
y = y.iloc[:,0:y.shape[1]-1]
y.head()

Unnamed: 0_level_0,category_name,city,image_top_1,item_seq_number,param_1,param_2,param_3,parent_category_name,price,region,...,user_id,user_type,Weekday,Weekend_no,day,image_status,no_of_words_description,no_of_words_title,no_of_chars_title,no_of_chars_description
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6544e41a8817,10,318,1137,66,110,198,74,4,0.0,4,...,867817,1,1,16,18,1,2,2,29,23
65b9484d670f,5,993,3063,4,119,112,1217,8,3000.0,19,...,181851,1,6,15,16,0,10,2,31,154
8bab230b2ecd,2,151,2178,15,318,112,1217,0,15000.0,12,...,45341,1,0,16,17,1,21,1,3,233
8e348601fefc,4,1319,3063,70,108,131,1217,2,4500.0,18,...,375979,1,0,16,17,0,12,3,24,155
8bd2fe400b89,42,243,6,15,102,112,1217,4,4900.0,14,...,141497,1,5,15,15,1,10,2,32,106


In [30]:
y.to_csv('./processeddata/testdata_no_text_processing.csv',header=True)

In [33]:
del X,y,notext
gc.collect()

120

In [34]:
## now making features using Term Frequency Inverse Document Frequency Stage 
print("Term Frequency Inverse Document Frequency Stage")

Term Frequency Inverse Document Frequency Stage


In [35]:
## for text features 
tfeatures = ["description", "title"]

for cols in tfeatures:
    total_data[cols] = total_data[cols].astype(str) 
    total_data[cols] = total_data[cols].astype(str).fillna('missing') # FILL NA



In [36]:
russian_stop = set(stopwords.words('russian'))



In [37]:
## first on title 
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words=russian_stop,
                                   use_idf=False, 
                                   norm='l1',
                                   analyzer =  'word',
                                   sublinear_tf =  True,
                                    dtype =  np.float32,
                                    smooth_idf  = False,
                                    ngram_range=(1, 2),
                                    max_features=5000,
)

In [38]:
%%time
col_title = tfidf_vectorizer.fit(total_data['title'])

CPU times: user 50.2 s, sys: 2.19 s, total: 52.4 s
Wall time: 51.4 s


In [39]:
print "Number of stopwords chosen in title is",len(col_title.stop_words)

Number of stopwords chosen in title is 151


In [40]:
%%time 
col_title_values = tfidf_vectorizer.transform(total_data['title'])

CPU times: user 41.8 s, sys: 1.61 s, total: 43.5 s
Wall time: 42.5 s


In [41]:
#print col_title_values

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer2 = TfidfVectorizer(stop_words=russian_stop,
                                   use_idf=False, 
                                   norm='l1',
                                   analyzer =  'word',
                                   sublinear_tf =  True,
                                    dtype =  np.float32,
                                    smooth_idf  = False,
                                    ngram_range=(1, 2),
                                    max_features=5000,
)

In [44]:
%%time
col_description = tfidf_vectorizer2.fit(total_data['description'])

CPU times: user 5min 20s, sys: 2min 6s, total: 7min 27s
Wall time: 9min 34s


In [45]:
%%time 
col_description_values = tfidf_vectorizer2.transform(total_data['description'])

CPU times: user 3min 1s, sys: 25.5 s, total: 3min 27s
Wall time: 5min 36s


In [46]:
total_data.drop(tfeatures, axis=1,inplace=True)

In [47]:
# https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else: df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df




In [48]:
totaldata = reduce_mem_usage(total_data)


Memory usage of dataframe is 337.00 MB
Memory usage after optimization is: 88.00 MB
Decreased by 73.0%


In [58]:
totaldata.drop(['deal_probability'], axis = 1, inplace = True )

In [59]:
totaldata.head()

Unnamed: 0_level_0,category_name,city,image_top_1,item_seq_number,param_1,param_2,param_3,parent_category_name,price,region,user_id,user_type,Weekday,Weekend_no,day,image_status,no_of_words_title,no_of_words_description,no_of_chars_title,no_of_chars_description
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
b912c3c6a6ad,42,462,12,2,249,112,1217,4,400.0,19,884270,1,1,13,28,1,3,7,38,108
2dac0150717d,22,1314,2722,19,122,112,1217,2,3000.0,17,227908,1,6,12,26,1,3,7,32,73
ba83aefab5dc,2,1290,2259,9,84,112,1217,0,4000.0,16,576261,1,0,12,20,1,2,17,14,168
02996f1dd2ea,42,950,2837,286,38,112,1217,4,2200.0,21,755087,0,5,12,25,1,1,3,20,38
7c90be56d2ab,0,318,1407,3,278,124,46,6,40000.0,4,944363,1,3,11,16,1,3,4,17,44


In [60]:
%%time 
alltrain = hstack([csr_matrix(totaldata.loc[traindata_index,:].values),col_title_values[0:traindata_index.shape[0]],col_description_values[0:traindata_index.shape[0]]]) # Sparse Matrix


CPU times: user 4.74 s, sys: 6.42 s, total: 11.2 s
Wall time: 33.7 s


In [62]:
%%time
alltest = hstack([csr_matrix(totaldata.loc[testdata_index,:].values),col_title_values[0:testdata_index.shape[0]],col_description_values[0:testdata_index.shape[0]]]) # Sparse Matrix



CPU times: user 1.69 s, sys: 978 ms, total: 2.67 s
Wall time: 3.7 s


In [69]:
%%time
import pickle
file = open('./processeddata/perfect_train_5000.pkl','wb')
pickle.dump(alltrain,file )
file.close()
# to read them back 
# alltrain = pickle.load( open( "perfecttrain.pkl", "rb" ) )

CPU times: user 1min 14s, sys: 17 s, total: 1min 31s
Wall time: 4min 8s


In [70]:
%%time
import pickle
file = open('./processeddata/perfect_test_5000.pkl','wb')
pickle.dump(alltest, file)
file.close()



CPU times: user 24.9 s, sys: 2.88 s, total: 27.8 s
Wall time: 1min 9s


In [72]:
%%time
file = open('./processeddata/perfect_train_output_5000.pkl','wb')
pickle.dump(train_output,file )
file.close()



CPU times: user 20.3 s, sys: 2.86 s, total: 23.2 s
Wall time: 1min 44s


In [64]:
X_train, X_valid, y_train, y_valid = train_test_split(alltrain, train_output, test_size=0.10, random_state=5)



In [65]:
%%time
import xgboost as xgb
xgdmat=xgb.DMatrix(X_train,y_train)
our_params={'eta':0.1,'seed':0,'subsample':0.8,'colsample_bytree':0.8,'objective':'reg:linear','max_depth':5,'min_child_weight':1}
final_gb=xgb.train(our_params,xgdmat)




CPU times: user 1min 48s, sys: 3.7 s, total: 1min 52s
Wall time: 3min 9s


In [66]:
%%time
tesdmat=xgb.DMatrix(X_train)
y_pred=final_gb.predict(tesdmat)
print(y_pred)

[ 0.29787976  0.22403917  0.2686896  ...,  0.25952417  0.21479341
  0.22403917]
CPU times: user 1.93 s, sys: 1.61 s, total: 3.54 s
Wall time: 4.65 s


In [68]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_train,y_pred)

0.24489483427173372