In [1]:
#Importing important libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Read the Social Network Ads CSV (path is relative to the working directory) into a DataFrame
df = pd.read_csv(filepath_or_buffer='Social_Network_Ads.csv')

In [3]:
# Display the loaded DataFrame (the rendered output is truncated in the middle by pandas)
df


Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [4]:
# List the column labels of the loaded DataFrame
df.columns

Index(['User ID', 'Gender', 'Age', 'EstimatedSalary', 'Purchased'], dtype='object')

In [5]:
#Exploratory Data Analysis

In [6]:
# Count the missing entries in each column
df.isna().sum()

User ID            0
Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

In [7]:
# Print a structural summary of the dataset: index range, per-column non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


In [8]:
# Descriptive summary statistics of the dataset (pandas' default describe() output)
df.describe()

AttributeError: 'DataFrame' object has no attribute 'describecribe'

In [None]:
# Pairwise correlation between the numeric columns.
# The frame also contains the text column 'Gender', which recent pandas versions
# refuse to correlate (ValueError), so restrict to numeric dtypes first.
df.select_dtypes(include='number').corr()

In [None]:
# Plot the correlation matrix of the numeric columns as an annotated heatmap.
# Non-numeric columns (e.g. the text column 'Gender') are dropped first because
# recent pandas versions raise when asked to correlate them.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)
plt.show()

In [None]:
# Age histogram split by purchase outcome, with a KDE overlay and the mean age marked in red
mean_age = df['Age'].mean()
ax = sns.histplot(data=df, x='Age', hue='Purchased', kde=True)
ax.axvline(x=mean_age, color='red')
ax.set_title('Age Distribution')
plt.show()

In [None]:
# Bin edges and labels for the 'Age_Category' column derived from the age data:
# 5-year bands with edges 15, 20, ..., 60 and labels such as '15-20', '20-25', ...
age_bins = list(range(15, 65, 5))
labels = [f"{low}-{high}" for low, high in zip(age_bins, age_bins[1:])]

In [None]:
# Bucket each age into its band. right=False makes every interval closed on the
# left and open on the right, so an age equal to a bin edge falls in the band that
# starts at that edge.
# NOTE(review): the last band is therefore [55, 60); an age of exactly 60 would get
# NaN here - confirm that df['Age'].max() is below 60.
df['Age_Category'] = pd.cut(
    x=df['Age'],
    bins=age_bins,
    right=False,
    labels=labels,
)

In [None]:
# Spot-check three random rows; the fixed random_state makes the same rows
# appear on every Restart & Run All.
df.sample(3, random_state=0)

In [None]:
# Total number of purchases within each age band.
# 'Age_Category' is categorical, so observed=False is passed explicitly: it keeps
# bands with no rows in the result and avoids the pandas FutureWarning about the
# changing default for categorical groupers.
df.groupby('Age_Category', observed=False).agg(Total_Purchased_Count=('Purchased','sum')).reset_index()

In [None]:
# Total number of purchases for each (age band, gender) combination.
# observed=False is passed explicitly so that every age band is kept for the
# categorical grouper and pandas does not warn about the changing default.
df.groupby(['Age_Category','Gender'], observed=False).agg(Total_Purchased_Count=('Purchased', 'sum')).reset_index()

In [None]:
#Splitting the data into training and test sets


In [None]:
# List the available column labels again
df.columns

In [None]:
# Features: age and estimated salary. Target: the Purchased column, kept as a one-column frame.
x = df.loc[:, ['Age', 'EstimatedSalary']]
y = df.loc[:, ['Purchased']]

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Feature scaling: create the standardising scaler used for the feature columns
sc = StandardScaler()

In [None]:
# Fit the scaler on the training split only, then apply those same training
# statistics to the test split. Re-fitting on the test data would scale it with
# its own mean/std, leaking test information and putting the two splits on
# different scales.
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [None]:
# Feature scaling returned x_train as a plain array, which is awkward to plot,
# so wrap the scaled training features back into a labelled DataFrame.
x_train_df = pd.DataFrame(data=x_train, columns=['Age', 'EstimatedSalary'])

In [None]:
# Decision-tree classifier using the Gini criterion; random_state is fixed for repeatable fits
dt_model = DecisionTreeClassifier(random_state=20, criterion='gini')

In [None]:
# Train the decision tree on the scaled training features and their labels
dt_model.fit(X=x_train, y=y_train)

In [None]:
# Predict the Purchased label for every row of the held-out test features
pred = dt_model.predict(X=x_test)

In [None]:
#plotting confusion matrix
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
# Confusion matrix comparing the true test labels with the model's predictions
cm = confusion_matrix(y_true=y_test, y_pred=pred)

In [None]:
# Display the raw confusion-matrix values
cm

In [None]:
# Draw the confusion matrix as an annotated heatmap.
# sklearn's confusion_matrix puts the true labels on the rows and the predicted
# labels on the columns; the heatmap draws rows along the y-axis and columns
# along the x-axis, so x is "predicted" and y is "actual".
sns.set(font_scale=1.5)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted Values')
plt.ylabel('Actual Values')
plt.show()

In [None]:
# Plot the decision tree that was already trained on the training split.
# Do not call fit() here: fitting on (x_test, y_test) would replace the trained
# model with one learned from the test data.
from sklearn import tree
tree.plot_tree(dt_model)
plt.show()

In [None]:
# Larger, colour-filled rendering of the fitted tree that shows each node's Gini impurity.
# impurity/filled must be real booleans: the strings 'False'/'True' are both truthy
# (so 'False' never switched the impurity off) and are rejected by scikit-learn
# versions that validate parameter types.
from sklearn.tree import plot_tree
plt.figure(figsize=(40,20))
plot_tree(dt_model, feature_names=['Age','Estimated_Salary'], impurity=True, filled=True, fontsize=28)
plt.show()