## Black Friday Dataset EDA And Feature Engineering

### Cleaning and preparing the data for model training


In [None]:
## dataset link: https://www.kaggle.com/sdolezel/black-friday?select=train.csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline

# Problem Statement

A retail company “ABC Private Limited” wants to understand customer purchase behaviour (specifically, the purchase amount) against various products of different categories. They have shared a purchase summary of various customers for selected high-volume products from last month.
The data set also contains customer demographics (age, gender, marital status, city_type, stay_in_current_city), product details (product_id and product category) and Total purchase_amount from last month.

Now, they want to build a model to predict the purchase amount of a customer against various products, which will help them create personalized offers for customers against different products.


In [None]:
# Read the training CSV into a DataFrame and preview its first rows
train_csv_path = 'black_friday_dataset/dataset/train.csv'
df_train = pd.read_csv(train_csv_path)
df_train.head()

In [None]:
# Read the test CSV into a DataFrame and preview its first rows
test_csv_path = 'black_friday_dataset/dataset/test.csv'
df_test = pd.read_csv(test_csv_path)
df_test.head()

In [None]:
# Merge both train and test data into a single frame (train rows first).
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so
# pd.concat is used instead. ignore_index=True gives the stacked frame a fresh
# 0..n-1 index rather than repeating each file's own row labels, which would
# otherwise leave duplicate index values in df.
df = pd.concat([df_train, df_test], ignore_index=True)
df.head()

In [None]:
# Basic
df.info()

In [None]:
df.describe()

In [None]:
df.drop(['User_ID'], axis=1, inplace=True)

In [None]:
df.head()

In [None]:
# One-hot view of Gender, shown for comparison only. It is deliberately NOT
# written back to df['Gender']: overwriting the column here would replace the
# 'F'/'M' strings, and the .map({'F': 0, 'M': 1}) in the following cell would
# then match nothing and turn the whole Gender column into NaN.
pd.get_dummies(df['Gender'], drop_first=True).head()

In [None]:
# Handling categorical feature Gender: encode 'F' as 0 and 'M' as 1.
# Any value that is not a key of the mapping becomes NaN.
gender_codes = {'F': 0, 'M': 1}
df['Gender'] = df['Gender'].map(gender_codes)
df.head()

In [None]:
# Handle categorical feature Age
df['Age'].unique()

In [None]:
# pd.get_dummies(df['Age'],drop_first=True)
df['Age'] = df['Age'].map(
    {'0-17': 1, '18-25': 2, '26-35': 3, '36-45': 4, '46-50': 5, '51-55': 6, '55+': 7})

In [None]:
# second technqiue
from sklearn import preprocessing

# label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()

# Encode labels in column 'species'.
df['Age'] = label_encoder.fit_transform(df['Age'])

df['Age'].unique()

In [None]:
df.head()

In [None]:
# fixing categorical City_categort
df_city = pd.get_dummies(df['City_Category'], drop_first=True)

In [None]:
df_city.head()

In [None]:
df = pd.concat([df, df_city], axis=1)
df.head()

In [None]:
# drop City Category Feature
df.drop('City_Category', axis=1, inplace=True)

In [None]:
df.head()

In [None]:
# Missing Values
df.isnull().sum()

In [None]:
# Focus on replacing missing values
df['Product_Category_2'].unique()

In [None]:
df['Product_Category_2'].value_counts()

In [None]:
df['Product_Category_2'].mode()[0]

In [None]:
# Replace the missing values with mode
df['Product_Category_2'] = df['Product_Category_2'].fillna(
    df['Product_Category_2'].mode()[0])

In [None]:
df['Product_Category_2'].isnull().sum()

In [None]:
# Product_category 3 replace missing values
df['Product_Category_3'].unique()

In [None]:
df['Product_Category_3'].value_counts()

In [None]:
# Replace the missing values with mode
df['Product_Category_3'] = df['Product_Category_3'].fillna(
    df['Product_Category_3'].mode()[0])

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df['Stay_In_Current_City_Years'].unique()

In [None]:
df['Stay_In_Current_City_Years'] = df['Stay_In_Current_City_Years'].astype(
    'str')
df['Stay_In_Current_City_Years'] = df['Stay_In_Current_City_Years'].str.replace(
    '+', '', regex=False)

In [None]:
df.head()

In [None]:
df.info()

In [None]:
# convert object into integers
df['Stay_In_Current_City_Years'] = df['Stay_In_Current_City_Years'].astype(int)
df.info()

In [None]:
df['B'] = df['B'].astype(int)
df['C'] = df['C'].astype(int)

In [None]:
df.info(memory_usage='deep')

In [None]:
# Visualisation Age vs Purchased

# Plot only the rows that actually have a Purchase value.
# df['Purchase'] is intentionally NOT filled with zeros: the cells further down
# use df['Purchase'].isnull() to tell test rows from train rows, so filling the
# column here would leave df_test empty and push every test row into the
# training data with a fake target of 0. Zeros would also pull the bar means
# (and the minimum shown below) towards 0.
purchase_known = df[df['Purchase'].notnull()]

# Create a bar plot of the Age and Purchase columns, with the Gender column used to color the bars.
# sns.barplot returns a matplotlib Axes, so an explicit Figure/Axes pair is
# created and the Axes is passed in.
fig, ax = plt.subplots()
sns.barplot(x='Age', y='Purchase', hue='Gender', data=purchase_known, ax=ax)

# Get the minimum value of the Purchase column (cast to int for a clean label)
min_purchase = int(purchase_known['Purchase'].min())

# Add a text annotation to the plot showing the minimum value
ax.annotate(f"Minimum Purchase: {min_purchase}",
            (0, min_purchase), xycoords='data')

# Show the plot. plt.show() is used because an Axes object has no .show()
# method, so calling .show() on the barplot return value raises AttributeError.
plt.show()

## Men's purchase amounts are higher than women's


In [None]:
# Visualization of Purchase with occupation
sns.barplot(df, x='Occupation', y='Purchase', hue='Gender')

In [None]:
sns.barplot('Product_Category_1', 'Purchase', hue='Gender', data=df)

In [None]:
sns.barplot('Product_Category_2', 'Purchase', hue='Gender', data=df)

In [None]:
sns.barplot('Product_Category_3', 'Purchase', hue='Gender', data=df)

In [None]:
df.head()

In [None]:
# Feature Scaling
df_test = df[df['Purchase'].isnull()]

In [None]:
df_train = df[~df['Purchase'].isnull()]

In [None]:
X = df_train.drop('Purchase', axis=1)

In [None]:
X.head()

In [None]:
X.shape

In [None]:
y = df_train['Purchase']

In [None]:
y.shape

In [None]:
y

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

In [None]:
X_train.drop('Product_ID', axis=1, inplace=True)
X_test.drop('Product_ID', axis=1, inplace=True)

In [None]:
# feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# train your model