In [None]:
%pip install -r requirements.txt

In [None]:
# dataset link: https://www.kaggle.com/sdolezel/black-friday?select=train.csv

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
import matplotlib
%matplotlib inline

### Problem Statement
A retail company, “ABC Private Limited”, wants to understand customer purchase behaviour (specifically, the purchase amount) against various products of different categories. They have shared a purchase summary of various customers for selected high-volume products from last month. The data set also contains customer demographics (age, gender, marital status, city_type, stay_in_current_city), product details (product_id and product category) and the total purchase_amount from last month.

Now, they want to build a model to predict the purchase amount of a customer against various products, which will help them create personalized offers for customers against different products.

In [None]:
# Load the labelled training split; expects train.csv in the working directory.
df_train = pd.read_csv('train.csv')
df_train.head()

In [None]:
# Load the test split; expects test.csv in the working directory.
df_test = pd.read_csv('test.csv')
df_test.head()

In [None]:
# Stack the train rows on top of the test rows so that every cleaning step
# below is applied to both splits at once; the index is renumbered from 0.
df = pd.concat([df_train, df_test], axis=0).reset_index(drop=True)
df.head(5)

In [None]:
# Column names, dtypes and non-null counts of the combined frame
df.info()

In [None]:
# Summary statistics of the numeric columns
df.describe()

In [None]:
# Total number of elements in the frame (rows x columns)
df.size

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Count the missing values in each column
df.isna().sum()

In [None]:
# Drop the Product_ID and User_ID columns as they would not be useful for our analysis.
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell safe to re-run: a second run is a no-op rather than a KeyError.
df = df.drop(columns=['Product_ID', 'User_ID'], errors='ignore')


In [None]:
# Preview the frame to confirm the identifier columns are gone
df.head()

In [None]:
# Encode the categorical Gender column as integers: F -> 0, M -> 1.
# Series.map turns every value that is missing from the mapping into NaN, so
# re-running this cell on the already-encoded column would silently wipe it out.
# Only encode while the column still holds the raw (non-numeric) labels.
if not pd.api.types.is_numeric_dtype(df['Gender']):
    df['Gender'] = df['Gender'].map({'F': 0, 'M': 1})
df.head()

In [None]:
# List the distinct Age labels before encoding them
df['Age'].unique()

In [None]:
# Encode the categorical Age brackets as ordinal integers (youngest = 1 ... oldest = 7).
# Series.map turns every value that is missing from the mapping into NaN, so
# re-running this cell on the already-encoded column would silently wipe it out.
# Only encode while the column still holds the raw (non-numeric) labels.
if not pd.api.types.is_numeric_dtype(df['Age']):
    df['Age'] = df['Age'].map({'0-17': 1, '18-25': 2, '26-35': 3, '36-45': 4, '46-50': 5, '51-55': 6, '55+': 7})
df.head()

In [None]:
# One-hot encode City_Category rather than mapping A/B/C to the ordinal codes 0/1/2.
# With drop_first=True the first level is dropped, so n categories are
# represented by n-1 indicator columns.
city_category = df['City_Category']
df_city = pd.get_dummies(city_category, drop_first=True)
df_city.head()

In [None]:
# Swap the raw City_Category column for its dummy columns.
# The guard keeps the cell idempotent: the original concat-then-drop would, on a
# second run, append duplicate dummy columns before failing on the missing
# City_Category column.
if 'City_Category' in df.columns:
    df = pd.concat([df.drop(columns=['City_Category']), df_city], axis=1)
df.head()

In [None]:
# Re-count the missing values per column now that the encoding steps are done
df.isna().sum()

### Observation
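1. Purchase has null values because the combined frame also contains the test rows, which have no Purchase value.
2. We should focus on Product_Category_2 and Product_Category_3, whose missing values still need to be handled.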

In [None]:
# Focus on the Product_Category_2 and Product_Category_3 columns;
# start by listing the distinct Product_Category_2 values.
df['Product_Category_2'].unique()

In [None]:
# Frequency of each Product_Category_2 value, ordered by category value rather than by count
df['Product_Category_2'].value_counts().sort_index()

In [None]:
# Missing values will be replaced with the mode; inspect the most frequent value(s) first
df['Product_Category_2'].mode()

In [None]:
# Impute the gaps in Product_Category_2 with the column's most frequent value
pc2_mode = df['Product_Category_2'].mode()[0]
df['Product_Category_2'] = df['Product_Category_2'].fillna(pc2_mode)
df['Product_Category_2'].unique()

In [None]:
# Number of missing Product_Category_2 values left after imputation
df['Product_Category_2'].isna().sum()

In [None]:
# Do the same for Product_Category_3: inspect its most frequent value(s) before imputing
df['Product_Category_3'].mode()

In [None]:
# Impute the gaps in Product_Category_3 with the column's most frequent value,
# then count how many missing values remain
pc3_mode = df['Product_Category_3'].mode()[0]
df['Product_Category_3'] = df['Product_Category_3'].fillna(pc3_mode)
df['Product_Category_3'].isna().sum()

In [None]:
# Now work on Stay_In_Current_City_Years: inspect the raw labels before cleaning them
df['Stay_In_Current_City_Years'].unique()

In [None]:
# Remove the '+' character from the Stay_In_Current_City_Years labels.
# regex=False forces a literal match: '+' is a regex quantifier, and the default
# of `regex` differs between pandas versions (True before 2.0, False from 2.0 on).
df['Stay_In_Current_City_Years'] = df['Stay_In_Current_City_Years'].str.replace('+', '', regex=False)
df['Stay_In_Current_City_Years'].unique()

In [None]:
# Re-check the column dtypes after the clean-up steps so far
df.info()

### Observation
1. Stay_In_Current_City_Years is still of dtype object (its values are strings), so we need to convert it to int.

In [None]:
# Cast the cleaned Stay_In_Current_City_Years labels from strings to integers
stay_years = df['Stay_In_Current_City_Years']
df['Stay_In_Current_City_Years'] = stay_years.astype(int)
df.info()

In [None]:
# Turn the boolean city indicator columns B and C into 0/1 values
df = df.assign(
    B=df['B'].map({False: 0, True: 1}),
    C=df['C'].map({False: 0, True: 1}),
)
df.head()

In [None]:
# Make sure both city indicator columns are stored with an integer dtype
df = df.astype({'B': int, 'C': int})
df.info()

In [None]:
# Bar chart of Purchase for each Age bracket, with one bar per Gender code
plt.figure(figsize=(10, 6))
sns.barplot(
    data=df,
    x='Age',
    y='Purchase',
    hue='Gender',
)

### Observation
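1. Females purchase more than Males. (NOTE(review): Gender is encoded as 0 = Female and 1 = Male, so the hue-1 bars are the male customers — re-check which hue has the taller bars in the plot above, as this conclusion may be inverted.)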

In [None]:
# Bar chart of Purchase for each Occupation code, with one bar per Gender code
plt.figure(figsize=(12, 8))
sns.barplot(
    data=df,
    x='Occupation',
    y='Purchase',
    hue='Gender',
)

In [None]:
# Bar chart of Purchase for each Product_Category_1 value, with one bar per Gender code
plt.figure(figsize=(12, 8))
sns.barplot(
    data=df,
    x='Product_Category_1',
    y='Purchase',
    hue='Gender',
)

In [None]:
# Bar chart of Purchase for each Product_Category_2 value, with one bar per Gender code
plt.figure(figsize=(12, 8))
sns.barplot(
    data=df,
    x='Product_Category_2',
    y='Purchase',
    hue='Gender',
)

In [None]:
# Bar chart of Purchase for each Product_Category_3 value, with one bar per Gender code
plt.figure(figsize=(12, 8))
sns.barplot(
    data=df,
    x='Product_Category_3',
    y='Purchase',
    hue='Gender',
)

In [None]:
# Separate the combined frame back into its test and train rows.
# Test rows are the ones with a null Purchase value.
# .copy() gives each split its own data, so later edits to df_train / df_test
# neither depend on df nor raise SettingWithCopyWarning.
is_test_row = df['Purchase'].isnull()
df_test = df[is_test_row].copy()
df_train = df[~is_test_row].copy()

In [None]:
# Show the (rows, columns) of the train split followed by the test split
print(*(split.shape for split in (df_train, df_test)))

In [None]:
# Preview the rows that have a Purchase value (training data)
df_train.head()

In [None]:
# Preview the rows whose Purchase is null (test data)
df_test.head()

In [None]:
# Hold out 20% of the labelled rows for evaluation; the fixed seed makes the split repeatable
from sklearn.model_selection import train_test_split

y = df_train['Purchase']
X = df_train.drop(columns=['Purchase'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Shapes of the feature matrices: (train, hold-out)
tuple(features.shape for features in (X_train, X_test))

In [None]:
# Shapes of the target vectors: (train, hold-out)
tuple(target.shape for target in (y_train, y_test))

In [None]:
# Feature scaling: fit the scaler on the training features only, then apply
# the same transformation to both the training and the hold-out features.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)
X_train.shape, X_test.shape