In [241]:
#sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder
import pandas as pd

import numpy as np

#File system 
import os

#Suppress Warnings
import warnings
warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Read the training set and report its dimensions.
dtrain = pd.read_csv('train.csv')
print("Train Data Shape:", dtrain.shape)

# Keep a handle on the original string IDs, then replace the column with the
# ID minus its first character, parsed as an integer.
train_product_id = dtrain['Product_ID']
dtrain['Product_ID'] = train_product_id.str.slice(1).astype(int)

dtrain.head()


Train Data Shape: (550068, 12)


Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,69042,F,0-17,10,A,2,0,3,,,8370
1,1000001,248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,87842,F,0-17,10,A,2,0,12,,,1422
3,1000001,85442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,285442,M,55+,16,C,4+,0,8,,,7969


In [242]:
# Read the test set and report its dimensions.
dtest = pd.read_csv('test.csv')
print("Test Data Shape:", dtest.shape)

# Keep a handle on the original string IDs (they are written back before the
# final export), then replace the column with the ID minus its first
# character, parsed as an integer.
test_product_id = dtest['Product_ID']
dtest['Product_ID'] = test_product_id.str.slice(1).astype(int)

dtest.head()


Test Data Shape: (233599, 11)


Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3
0,1000004,128942,M,46-50,7,B,2,1,1,11.0,
1,1000009,113442,M,26-35,17,C,0,0,3,5.0,
2,1000010,288442,F,36-45,1,B,4+,1,5,14.0,
3,1000010,145342,F,36-45,1,B,4+,1,4,9.0,
4,1000011,53842,F,26-35,1,C,1,0,4,5.0,12.0


In [243]:
# Function to Calculate Missing Values
def missing_values_table(df):
    """Summarise the missing values of ``df`` column by column.

    Prints a two-line summary (total number of columns and how many of them
    contain at least one missing value) and returns the details.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` that has at least one missing value,
        indexed by column name, with the columns ``"Missing Values"`` (count)
        and ``"Percentage of Total Values"`` (share of rows, rounded to one
        decimal). Rows are sorted by percentage, highest first.
    """
    n_rows = len(df)

    # Count the nulls once and derive the percentage from that count.
    missing_count = df.isnull().sum()
    if n_rows:
        missing_pct = 100 * missing_count / n_rows
    else:
        # A frame without rows has nothing missing; avoid the 0/0 -> NaN
        # result that would otherwise slip through the filter below.
        missing_pct = missing_count.astype(float)

    table = pd.concat(
        [missing_count, missing_pct],
        axis=1,
        keys=["Missing Values", "Percentage of Total Values"],
    )

    # Keep only columns that actually have missing values, worst first.
    table = (
        table[table["Missing Values"] != 0]
        .sort_values("Percentage of Total Values", ascending=False)
        .round(1)
    )

    print(
        "Data has {} columns.\nColumns with missing values: {}".format(
            df.shape[1], table.shape[0]
        )
    )

    return table

    
    

In [244]:
# Missing-value summary for the training set; only columns that contain
# at least one null are listed, ordered by share of missing rows.
missing_values = missing_values_table(dtrain)
# Display at most the first 20 rows of that summary.
missing_values.head(20)

Data has 12columns. 
Missing value in columns are2


Unnamed: 0,Missing Values,Percentage of Total Values
Product_Category_3,383247,69.7
Product_Category_2,173638,31.6


In [245]:
# Missing-value summary for the test set (overwrites the `missing_values`
# variable produced for the training set).
missing_values = missing_values_table(dtest)
# Display at most the first 20 rows of that summary.
missing_values.head(20)

Data has 11columns. 
Missing value in columns are2


Unnamed: 0,Missing Values,Percentage of Total Values
Product_Category_3,162562,69.6
Product_Category_2,72344,31.0


In [246]:
# Replace every missing value with 0 in both frames, modifying them in place.
for frame in (dtrain, dtest):
    frame.fillna(0, inplace=True)

In [247]:
# Integer-encode low-cardinality text columns, reusing a single encoder object.
le = LabelEncoder()
le_count = 0

for col in dtrain.columns:
    # Only object-dtype columns are candidates for encoding.
    if dtrain[col].dtype != 'object':
        continue
    # Leave columns with more than 8 distinct training values untouched.
    if len(dtrain[col].unique()) > 8:
        continue

    # Learn the classes from the training column and encode it, then apply
    # the same mapping to the matching test column.
    dtrain[col] = le.fit_transform(dtrain[col])
    dtest[col] = le.transform(dtest[col])
    le_count += 1

print('%d columns were label encoded.' % le_count)

4 columns were label encoded.


## Feature Generation 

In [248]:
# Mean Purchase per key value, cast to int, computed from the training data only.
cat1_purchase_mean = dtrain.groupby('Product_Category_1')['Purchase'].mean().astype(int)
cat2_purchase_mean = dtrain.groupby('Product_Category_2')['Purchase'].mean().astype(int)
cat3_purchase_mean = dtrain.groupby('Product_Category_3')['Purchase'].mean().astype(int)
product_purchase_mean = dtrain.groupby('Product_ID')['Purchase'].mean().astype(int)
user_purchase_mean = dtrain.groupby('User_ID')['Purchase'].mean().astype(int)

# Attach each mean to the rows by value lookup. Series.map does this in one
# vectorised pass and yields NaN for a key that is absent from the lookup
# (indexing the lookup Series with a list of labels raises KeyError on
# pandas >= 1.0 when any label is missing).
dtrain["Product_Category_1_mean"] = dtrain['Product_Category_1'].map(cat1_purchase_mean)
dtrain["Product_Category_2_mean"] = dtrain['Product_Category_2'].map(cat2_purchase_mean)
dtrain["Product_Category_3_mean"] = dtrain['Product_Category_3'].map(cat3_purchase_mean)
dtrain["Product_ID_mean"] = dtrain['Product_ID'].map(product_purchase_mean)
dtrain["User_ID_mean"] = dtrain['User_ID'].map(user_purchase_mean)

# Same lookups for the test set, always using the training means.
# Product_ID_mean is deliberately not added here: a later cell builds it for
# dtest with a fallback to the Product_Category_1 mean.
dtest["Product_Category_1_mean"] = dtest['Product_Category_1'].map(cat1_purchase_mean)
dtest["Product_Category_2_mean"] = dtest['Product_Category_2'].map(cat2_purchase_mean)
dtest["Product_Category_3_mean"] = dtest['Product_Category_3'].map(cat3_purchase_mean)
dtest["User_ID_mean"] = dtest['User_ID'].map(user_purchase_mean)


dtest.head()


Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Product_Category_1_mean,Product_Category_2_mean,Product_Category_3_mean,User_ID_mean
0,1000004,128942,1,4,7,1,2,1,1,11.0,0.0,13606,8940,8221,14747
1,1000009,113442,1,2,17,2,0,0,3,5.0,0.0,10096,9027,8221,10243
2,1000010,288442,0,3,1,1,4,1,5,14.0,0.0,6240,7105,8221,9728
3,1000010,145342,0,3,1,1,4,1,4,9.0,0.0,2329,7277,8221,9728
4,1000011,53842,0,2,1,2,1,0,4,5.0,12.0,2329,9027,8715,7957


In [249]:
# Mean Purchase per (User_ID, Product_Category_2) pair in the training data, cast to int.
user_cat2_purchase_mean = dtrain.groupby(['User_ID','Product_Category_2'])['Purchase'].mean().astype(int)

# Lookup keyed by the (user_id, cat_id) tuple. A tuple cannot collide the way
# two concatenated ID strings can, and it matches regardless of whether the
# category is stored as int or float. Series.items() is used because
# Series.iteritems() was removed in pandas 2.0.
user_cat2_map = dict(user_cat2_purchase_mean.items())


def get_user_cat2_purchase_mean(user_id,cat_id):
    """Return the training mean Purchase for a user within a Product_Category_2 value.

    Falls back to the user's overall training mean (``user_purchase_mean``)
    when the (user_id, cat_id) pair does not occur in the training data.
    Raises ``KeyError`` if the user is absent from ``user_purchase_mean``.
    """
    pair = (user_id, cat_id)
    if pair in user_cat2_map:
        return user_cat2_map[pair]
    return user_purchase_mean[user_id]

dtrain["User_Cat2_Mean"] = [
    get_user_cat2_purchase_mean(user_id, cat_id)
    for user_id, cat_id in zip(dtrain["User_ID"], dtrain["Product_Category_2"])
]
dtest["User_Cat2_Mean"] = [
    get_user_cat2_purchase_mean(user_id, cat_id)
    for user_id, cat_id in zip(dtest["User_ID"], dtest["Product_Category_2"])
]


In [250]:
# Mean Purchase per (User_ID, Product_Category_1) pair in the training data, cast to int.
user_cat1_purchase_mean = dtrain.groupby(['User_ID','Product_Category_1'])['Purchase'].mean().astype(int)

# Lookup keyed by the (user_id, cat_id) tuple, which is unambiguous where two
# concatenated ID strings are not. Series.items() is used because
# Series.iteritems() was removed in pandas 2.0.
user_cat_map = dict(user_cat1_purchase_mean.items())


def get_user_cat_purchase_mean(user_id,cat_id):
    """Return the training mean Purchase for a user within a Product_Category_1 value.

    Falls back to the user's overall training mean (``user_purchase_mean``)
    when the (user_id, cat_id) pair does not occur in the training data.
    Raises ``KeyError`` if the user is absent from ``user_purchase_mean``.
    """
    pair = (user_id, cat_id)
    if pair in user_cat_map:
        return user_cat_map[pair]
    return user_purchase_mean[user_id]

dtrain["User_Cat_Mean"] = [
    get_user_cat_purchase_mean(user_id, cat_id)
    for user_id, cat_id in zip(dtrain["User_ID"], dtrain["Product_Category_1"])
]
dtest["User_Cat_Mean"] = [
    get_user_cat_purchase_mean(user_id, cat_id)
    for user_id, cat_id in zip(dtest["User_ID"], dtest["Product_Category_1"])
]



In [251]:
# Mean Purchase per (User_ID, Product_Category_3) pair in the training data, cast to int.
# The names in this cell carry a "cat3" tag so they no longer overwrite the
# Product_Category_1 lookup and helper defined in the previous cell.
user_cat3_purchase_mean = dtrain.groupby(['User_ID','Product_Category_3'])['Purchase'].mean().astype(int)

# Lookup keyed by the (user_id, cat_id) tuple, which is unambiguous where two
# concatenated ID strings are not. Series.items() is used because
# Series.iteritems() was removed in pandas 2.0.
user_cat3_map = dict(user_cat3_purchase_mean.items())


def get_user_cat3_purchase_mean(user_id,cat_id):
    """Return the training mean Purchase for a user within a Product_Category_3 value.

    Falls back to the user's overall training mean (``user_purchase_mean``)
    when the (user_id, cat_id) pair does not occur in the training data.
    Raises ``KeyError`` if the user is absent from ``user_purchase_mean``.
    """
    pair = (user_id, cat_id)
    if pair in user_cat3_map:
        return user_cat3_map[pair]
    return user_purchase_mean[user_id]

dtrain["User_Cat3_Mean"] = [
    get_user_cat3_purchase_mean(user_id, cat_id)
    for user_id, cat_id in zip(dtrain["User_ID"], dtrain["Product_Category_3"])
]
dtest["User_Cat3_Mean"] = [
    get_user_cat3_purchase_mean(user_id, cat_id)
    for user_id, cat_id in zip(dtest["User_ID"], dtest["Product_Category_3"])
]


In [253]:
# Mean Purchase per Product_ID in the training data, cast to int.
product_purchase_mean = dtrain.groupby('Product_ID')['Purchase'].mean().astype(int)

# Lookup keyed by the product ID rendered as a string. Series.items() is used
# because Series.iteritems() was removed in pandas 2.0.
product_purchase_map = {
    str(product_id): mean for product_id, mean in product_purchase_mean.items()
}


def get_product_purchase_mean(product_id,cat_id):
    """Return the training mean Purchase for a product.

    Falls back to the training mean of the given Product_Category_1 value
    (``cat1_purchase_mean``) when the product does not occur in the training
    data. Raises ``KeyError`` if that category is absent from
    ``cat1_purchase_mean`` as well.
    """
    product_key = str(product_id)
    if product_key in product_purchase_map:
        return product_purchase_map[product_key]
    return cat1_purchase_mean[cat_id]


# Only the test set is filled here; the training set got Product_ID_mean
# directly from product_purchase_mean in an earlier cell.
dtest["Product_ID_mean"] = [
    get_product_purchase_mean(product_id, cat_id)
    for product_id, cat_id in zip(dtest["Product_ID"], dtest["Product_Category_1"])
]


In [254]:

# NOTE: plain assignment - the *_onehot names refer to the very same DataFrame
# objects as dtrain / dtest (no copy is made and no one-hot encoding is
# applied in this cell).
dtrain_onehot = dtrain
dtest_onehot = dtest
# Print the (rows, columns) of each frame as a sanity check.
print(dtrain_onehot.shape)
print(dtest_onehot.shape)


(550068, 20)
(233599, 19)


In [255]:
# Check the engineered test frame for nulls; an empty table means none were found.
missing_values = missing_values_table(dtest_onehot)
# Display at most the first 20 rows of that summary.
missing_values.head(20)

Data has 19columns. 
Missing value in columns are0


Unnamed: 0,Missing Values,Percentage of Total Values


In [256]:
# Correlation of every column with the target, then ordered ascending so the
# most negative values come first and the most positive last.
correlations = dtrain_onehot.corr()['Purchase']
correlations = correlations.sort_values()

# Last 15 entries = strongest positive, first 15 = strongest negative.
print('Most Positive Correlations:\n', correlations.tail(15))
print('\nMost Negative Correlations:\n', correlations.head(15))

Most Positive Correlations:
 Age                        0.015839
Occupation                 0.020833
Product_Category_2         0.052288
Gender                     0.060346
City_Category              0.061914
Product_Category_3         0.288501
User_ID_mean               0.322415
Product_Category_3_mean    0.339881
Product_Category_2_mean    0.396059
User_Cat3_Mean             0.522063
User_Cat2_Mean             0.585910
Product_Category_1_mean    0.797843
Product_ID_mean            0.849659
User_Cat_Mean              0.856554
Purchase                   1.000000
Name: Purchase, dtype: float64

Most Negative Correlations:
 Product_Category_1           -0.343703
Product_ID                   -0.103961
Marital_Status               -0.000463
User_ID                       0.004716
Stay_In_Current_City_Years    0.005422
Age                           0.015839
Occupation                    0.020833
Product_Category_2            0.052288
Gender                        0.060346
City_Category      

In [257]:
train_labels = dtrain_onehot['Purchase']
# The target is set aside above because the alignment below removes it from the feature frame.

# Align the training and testing data, keep only columns present in both dataframes
# NOTE(review): this rebinds dtrain_onehot to a frame without 'Purchase' (the
# test data has no such column), so re-running this cell on its own raises
# KeyError on the first line; re-run the earlier cells first.
dtrain_onehot, dtest_onehot = dtrain_onehot.align(dtest_onehot, join='inner', axis=1)

# Report the shapes of the aligned feature frames (the target is NOT added back; it lives in train_labels).
print('Training Features shape:', dtrain_onehot.shape)
print('Testing Features shape:', dtest_onehot.shape)

Training Features shape: (550068, 19)
Testing Features shape: (233599, 19)


In [220]:
from math import sqrt

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Hold out 30% of the training rows for a quick validation score.
# NOTE(review): the *_mean / *_Mean features were computed from Purchase over
# all of dtrain before this split, so held-out rows contributed to their own
# features; the score below is likely optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    dtrain_onehot, train_labels, test_size=0.3, random_state=0
)

# fit() returns the estimator itself, so construction and fitting can be chained.
regressor = RandomForestRegressor(n_estimators=20, random_state=0).fit(X_train, y_train)
y_pred = regressor.predict(X_test)

# Root-mean-squared error on the held-out rows.
rms = sqrt(mean_squared_error(y_test, y_pred))
rms

2265.4589862799876

In [258]:
from sklearn.ensemble import RandomForestRegressor
# Final model: same settings as the validation run, fitted on all aligned
# training rows. This rebinds `regressor`.
regressor = RandomForestRegressor(n_estimators = 20, random_state = 0)
regressor.fit(dtrain_onehot, train_labels)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=20, n_jobs=1, oob_score=False, random_state=0,
           verbose=0, warm_start=False)

In [259]:
# Predict Purchase for the test rows (rebinds `y_pred`, which the validation cell also assigns).
y_pred = regressor.predict(dtest_onehot)

In [260]:
# Attach the predictions to the test frame.
dtest_onehot['Purchase'] = y_pred
# Put back the original string product IDs saved when the test file was loaded
# (the column was converted to integers for modelling).
dtest_onehot['Product_ID'] = test_product_id
# Write only the three listed columns, without the index.
# NOTE(review): the file name looks like a typo for "blackfriday.csv"; left
# unchanged in case something downstream expects this exact name - confirm.
dtest_onehot.to_csv('blcakfriday.csv', columns=['User_ID', 'Product_ID', 'Purchase'], index=False)