# Import Modules

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')

In [None]:
# Load the Black Friday sales data. The path is relative, so the CSV must sit
# in the notebook's working directory.
df = pd.read_csv('Black Friday Sales.csv')

In [None]:
# Descriptive analysis

In [None]:
# Preview the first five rows to see the columns and some sample values.
df.head()

In [None]:
# Column dtypes and non-null counts
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max)
df.describe()

In [None]:
# Number of distinct values in each column; NaN is counted as a value of its own.
df.nunique(dropna=False)

In [None]:
# Missing-value count per column
df.isna().sum()

# Exploratory Data Analysis

In [None]:
# Distribution of purchase amounts.
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# gives the same histogram-plus-curve view.
plt.figure(figsize=(10,6))
sns.histplot(df['Purchase'], bins=25, kde=True, stat='density');

In [None]:
# Pass the column as x= explicitly: depending on the seaborn version, a bare
# positional argument is bound to `x` or to `data`, which changes the plot.
sns.boxplot(x=df['Purchase']);

In [None]:
# We can see outliers in Purchase column

In [None]:
# dist of numeric variablres
sns.countplot(df.Gender);

In [None]:
sns.countplot(df.Age);

In [None]:
sns.countplot(df.Marital_Status);

In [None]:
plt.figure(figsize=(10,6))
sns.countplot(df.Occupation);

In [None]:
plt.figure(figsize=(10,6))
sns.countplot(df.Product_Category_1);

In [None]:
plt.figure(figsize=(10,6))
sns.countplot(df.Product_Category_2);

In [None]:
plt.figure(figsize=(10,6))
sns.countplot(df.Product_Category_3);

In [None]:
plt.figure(figsize=(10,6))
sns.countplot(df.City_Category);

In [None]:
plt.figure(figsize=(10,6))
sns.countplot(df.Stay_In_Current_City_Years);

In [None]:
# Bivariate analysis
occupation_plot = df.pivot_table(index='Occupation', values='Purchase', aggfunc=np.mean)
occupation_plot.plot(kind='bar', figsize=(10, 6))
plt.xlabel('Occupation')
plt.ylabel("Purchase")
plt.title("Occupation and Purchase Analysis")
plt.xticks(rotation=0)
plt.show()

In [None]:
age_plot = df.pivot_table(index='Age', values='Purchase', aggfunc=np.mean)
age_plot.plot(kind='bar', figsize=(13, 7))
plt.xlabel('Age')
plt.ylabel("Purchase")
plt.title("Age and Purchase Analysis")
plt.xticks(rotation=0)
plt.show()

In [None]:
gender_plot = df.pivot_table(index='Gender', values='Purchase', aggfunc=np.mean)
gender_plot.plot(kind='bar', figsize=(13, 7))
plt.xlabel('Gender')
plt.ylabel("Purchase")
plt.title("Gender and Purchase Analysis")
plt.xticks(rotation=0)
plt.show()

In [None]:
# Look at the frame again before preprocessing; no cell above has modified df.
df.head()

# Preprocessing the Dataset

In [None]:
# Missing values per column, checked again before cleaning.
df.isna().sum()

In [None]:
# DataFrame has no .unique() method (only Series does), so the original call
# raised AttributeError. nunique() gives the number of distinct values per column.
df.nunique()

In [None]:
# Remove outliers using IQR technique

In [None]:
# Drop rows whose Purchase lies outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
cols = ['Purchase']

Q1 = df[cols].quantile(0.25)
Q3 = df[cols].quantile(0.75)
IQR = Q3 - Q1

lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
is_outlier = ((df[cols] < lower_fence) | (df[cols] > upper_fence)).any(axis=1)

# .copy() makes the result an independent frame, so the column assignments in
# later cells do not write into a slice of the original (SettingWithCopyWarning).
df = df[~is_outlier].copy()

In [None]:
# Re-draw the boxplot after outlier removal. x= is explicit because the binding
# of a bare positional argument differs between seaborn versions.
sns.boxplot(x=df['Purchase']);

In [None]:
# Fill missing Product_Category_2 values and drop Product_Category_3

In [None]:
# Fill missing Product_Category_2 entries with the column mean, then cast to int32
# (the cast truncates the fractional part of the mean).
# NOTE(review): Product_Category_2 looks like a category code, so a mean may not
# correspond to a real category. Consider the mode or a sentinel value; confirm intent.
df['Product_Category_2'] = df['Product_Category_2'].fillna(df['Product_Category_2'].mean()).astype("int32")

# errors='ignore' keeps the cell re-runnable: once the column is gone, a second
# run is a no-op instead of a KeyError.
df = df.drop(columns=['Product_Category_3'], errors='ignore')

# Correlation Matrix

In [None]:
# Correlate the numeric columns only. pandas >= 2.0 no longer drops non-numeric
# columns silently in DataFrame.corr() and raises if any are present.
corr = df.select_dtypes(include='number').corr()
plt.figure(figsize=(13,6))
sns.heatmap(corr, annot=True);

In [None]:
# Disabled alternative: build one-hot columns per categorical feature and concatenate them.
# NOTE(review): these lines reference `train`, which is not defined anywhere in the
# visible notebook, so they cannot be re-enabled as written.
# df_Gender = pd.get_dummies(train['Gender'])
# df_Age = pd.get_dummies(train['Age'])
# df_City_Category = pd.get_dummies(train['City_Category'])
# df_Stay_In_Current_City_Years = pd.get_dummies(train['Stay_In_Current_City_Years'])

# data_final= pd.concat([train, df_Gender, df_Age, df_City_Category, df_Stay_In_Current_City_Years], axis=1)

# data_final.head()

In [None]:
from sklearn.preprocessing import LabelEncoder
# NOTE(review): LE is only referenced by the commented-out cell that follows;
# the active encoding step uses pd.get_dummies instead.
LE= LabelEncoder()

In [None]:
# Disabled alternative: label-encode the categorical columns in place.
# df['Gender'] = LE.fit_transform(df['Gender'])
# df['Age'] = LE.fit_transform(df['Age'])
# df['City_Category'] = LE.fit_transform(df['City_Category'])
# df['Stay_In_Current_City_Years'] = LE.fit_transform(df['Stay_In_Current_City_Years'])

In [None]:
# One-hot encode the categorical features: each listed column is replaced by
# one indicator column per distinct value.
categorical_columns = [
    'Gender',
    'Age',
    'City_Category',
    'Stay_In_Current_City_Years',
    'Occupation',
]
df = pd.get_dummies(df, columns=categorical_columns)

In [None]:
# Inspect the frame after one-hot encoding.
df.head()

# Input Split

In [None]:
# Same preview as the previous cell (df has not changed since), shown before building X and y.
df.head()

In [None]:
# Target is Purchase; features are every remaining column except User_ID and Product_ID.
target_column = 'Purchase'
excluded_columns = ['User_ID', 'Product_ID', target_column]

y = df[target_column]
X = df.drop(columns=excluded_columns)
X.info()

In [None]:
# Preview the feature matrix.
X.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [None]:
# Shapes of the full data followed by each train/test piece.
for dataset in (X, y, X_train, y_train, X_test, y_test):
    print(dataset.shape)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training split only, then apply the
# same transform to both splits so no test-set statistics leak into training.
ss = StandardScaler().fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)

In [None]:
# Confirm that scaling left every shape unchanged.
for scaled_or_raw in (X, y, X_train, y_train, X_test, y_test):
    print(scaled_or_raw.shape)

In [None]:
# Training Model selection 

In [None]:
from sklearn.linear_model import LinearRegression
LR=LinearRegression()

In [None]:
LR.fit(X_train,y_train)

In [None]:
y_pred=LR.predict(X_test)

In [None]:
y_pred

In [None]:
print("training score =", LR.score(X_train, y_train))
print("Testing score =", LR.score(X_test, y_test))

In [None]:
from sklearn.metrics import mean_absolute_error,mean_squared_error

In [None]:
# Error metrics for the linear-regression predictions (y_pred) on the test split.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
# RMSE is computed as sqrt(MSE): the `squared=False` flag of mean_squared_error is
# deprecated in scikit-learn 1.4 and removed in 1.6, whereas this form works on every version.
rmse = np.sqrt(mse)

print("MAE:",mae)
print("MSE:",mse)
print("RMSE:",rmse)

In [None]:
from sklearn.ensemble import RandomForestRegressor
rf=RandomForestRegressor()

In [None]:
rf.fit(X_train,y_train)

In [None]:
print("training score =", rf.score(X_train, y_train))
print("Testing score =", rf.score(X_test, y_test))

In [None]:
from sklearn.metrics import r2_score
# NOTE(review): y_pred was produced by LR.predict(X_test) and is never reassigned,
# so this is the linear model's test R^2, not the random forest's, even though
# the cell sits directly after the forest's scores.
r2_score(y_test,y_pred)

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

In [None]:
# Summary for the linear-regression model (y_pred holds its test-set predictions).

# R^2 on the train set
print('Train score:',LR.score(X_train,y_train))

# R^2 on the test set
print('Test score:',LR.score(X_test,y_test))

# r2_score is the coefficient of determination, not a classification accuracy,
# so the label names the metric instead of calling it "accuracy".
print('Test R^2 (r2_score):',r2_score(y_test,y_pred))

# mean squared error of the test predictions
print('Mean Squared Error:',mean_squared_error(y_test,y_pred))