## Cleaning and preparing data for model training

In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


### Problem Statement

A retail company “ABC Private Limited” wants to understand the customer purchase behaviour (specifically, purchase amount) against various products of different categories. They have shared purchase summary of various customers for selected high volume products from last month. The data set also contains customer demographics (age, gender, marital status, city_type, stay_in_current_city), product details (product_id and product category) and Total purchase_amount from last month.

Now, they want to build a model to predict the purchase amount of customer against various products which will help them to create personalized offer for customers against different products.


In [35]:
# import train data
df_train = pd.read_csv('train.csv')
df_train.head()


Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [36]:
df_train.shape

(550068, 12)

In [37]:
# import test data
df_test = pd.read_csv('test.csv')
df_test.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3
0,1000004,P00128942,M,46-50,7,B,2,1,1,11.0,
1,1000009,P00113442,M,26-35,17,C,0,0,3,5.0,
2,1000010,P00288442,F,36-45,1,B,4+,1,5,14.0,
3,1000010,P00145342,F,36-45,1,B,4+,1,4,9.0,
4,1000011,P00053842,F,26-35,1,C,1,0,4,5.0,12.0


In [38]:
df_test.shape

(233599, 11)

In [39]:
df = pd.merge(df_train, df_test, how='left')
df.head(20)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969
5,1000003,P00193542,M,26-35,15,A,3,0,1,2.0,,15227
6,1000004,P00184942,M,46-50,7,B,2,1,1,8.0,17.0,19215
7,1000004,P00346142,M,46-50,7,B,2,1,1,15.0,,15854
8,1000004,P0097242,M,46-50,7,B,2,1,1,16.0,,15686
9,1000005,P00274942,M,26-35,20,A,1,1,8,,,7871


In [40]:
# User_ID is useless so we are dropping it

df.drop(['User_ID'], axis=1, inplace=True)

In [41]:
df.head(20)

Unnamed: 0,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,P00069042,F,0-17,10,A,2,0,3,,,8370
1,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,P00087842,F,0-17,10,A,2,0,12,,,1422
3,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,P00285442,M,55+,16,C,4+,0,8,,,7969
5,P00193542,M,26-35,15,A,3,0,1,2.0,,15227
6,P00184942,M,46-50,7,B,2,1,1,8.0,17.0,19215
7,P00346142,M,46-50,7,B,2,1,1,15.0,,15854
8,P0097242,M,46-50,7,B,2,1,1,16.0,,15686
9,P00274942,M,26-35,20,A,1,1,8,,,7871


In [42]:
df.isnull().sum()

Product_ID                         0
Gender                             0
Age                                0
Occupation                         0
City_Category                      0
Stay_In_Current_City_Years         0
Marital_Status                     0
Product_Category_1                 0
Product_Category_2            173638
Product_Category_3            383247
Purchase                           0
dtype: int64

In [43]:
# Handling Gender feature (categorical)

df['Gender'] = df['Gender'].map({'F':0, 'M':1})

In [44]:
df.head()

Unnamed: 0,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,P00069042,0,0-17,10,A,2,0,3,,,8370
1,P00248942,0,0-17,10,A,2,0,1,6.0,14.0,15200
2,P00087842,0,0-17,10,A,2,0,12,,,1422
3,P00085442,0,0-17,10,A,2,0,12,14.0,,1057
4,P00285442,1,55+,16,C,4+,0,8,,,7969


In [45]:
# Handling Age feature (categorical)
df['Age'].unique()


array(['0-17', '55+', '26-35', '46-50', '51-55', '36-45', '18-25'],
      dtype=object)

In [46]:
# pd.get_dummies(df['Age'], drop_first=True)
df['Age'] = df['Age'].map({'0-17': 1, '18-25':2,	'26-35':3,	'36-45':4,	'46-50':5,	'51-55':6,	'55+':7})

In [47]:
pd.get_dummies(df['Age'], drop_first=False)


Unnamed: 0,1,2,3,4,5,6,7
0,True,False,False,False,False,False,False
1,True,False,False,False,False,False,False
2,True,False,False,False,False,False,False
3,True,False,False,False,False,False,False
4,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...
550063,False,False,False,False,False,True,False
550064,False,False,True,False,False,False,False
550065,False,False,True,False,False,False,False
550066,False,False,False,False,False,False,True


In [48]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd



# Initialize and fit LabelEncoder
le = LabelEncoder()
df['Age']= le.fit_transform(df['Age'])

ModuleNotFoundError: No module named 'sklearn'