In [None]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np # linear algebra
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the dataset into a DataFrame. The path is relative, so the CSV file
# must be reachable from the notebook's current working directory.
data = pd.read_csv("social_ads.csv")

In [None]:
# Print a concise summary of the frame: columns, dtypes and non-null counts.
data.info()

In [None]:
# Show how many distinct values each column holds.
display(data.nunique())

In [None]:
# Preview the first five rows (five is the default row count of head()).
data.head()

In [None]:
# Descriptive statistics as produced by describe() with its default settings.
data.describe()

In [None]:
# Count the missing values in every column.
data.isna().sum()

The data is clean (the check above shows no missing values), so we can go ahead with the visualisation.

# Data Visualisation

In [None]:
# Bar chart with one bar per distinct age, showing how many rows have that age
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=data, x='Age', color='palevioletred', ax=ax)
ax.set_title('Age Distribution')
ax.set_xlabel('Age')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Histogram (20 bins) of the estimated salary with a kernel density overlay
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    data['EstimatedSalary'],
    bins=20,
    kde=True,
    color='mediumslateblue',
    ax=ax,
)
ax.set_title('Estimated Salary')
ax.set_xlabel('Estimated Salary')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Summary statistics of the Estimated Salary column:
# mean, median, mode and standard deviation.
estimated_salary = data['EstimatedSalary']
mean_salary = estimated_salary.mean()
median_salary = estimated_salary.median()
# mode() returns every most-frequent value; keep the first one it reports
mode_salary = estimated_salary.mode().iloc[0]
std_deviation_salary = estimated_salary.std()

In [None]:
# Displaying the mean, median, mode and standard deviation of Estimated Salary.
# All four labels follow the same "Estimated Salary <statistic>:" pattern; the
# median and mode labels previously repeated the statistic name twice.
print("Estimated Salary Mean:", mean_salary)
print("Estimated Salary Median:", median_salary)
print("Estimated Salary Mode:", mode_salary)
print("Estimated Salary Standard Deviation:", std_deviation_salary)

In [None]:
# Box plot comparing the age distribution of each Purchased group
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=data, x='Purchased', y='Age', palette='Pastel1', ax=ax)
ax.set_title('Age vs. Purchased Boxplot')
ax.set_xlabel('Purchased')
ax.set_ylabel('Age')
plt.show()

In [None]:
# Box plot comparing the estimated salary distribution of each Purchased group
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(data=data, x='Purchased', y='EstimatedSalary', palette='pastel', ax=ax)
ax.set_title('Boxplot of Estimated Salary vs. Purchased')
ax.set_xlabel('Purchased')
ax.set_ylabel('Estimated Salary')
plt.show()

In [None]:
# Scatter plot of age against estimated salary, coloured by the Purchased label
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=data,
    x='Age',
    y='EstimatedSalary',
    hue='Purchased',
    palette='pastel',
    ax=ax,
)
ax.set_title(' Age vs. Estimated Salary')
ax.set_xlabel('Age')
ax.set_ylabel('Estimated Salary')
plt.show()

In [None]:
# Pairwise relationships between the variables of the frame, coloured by the
# Purchased label. pairplot creates its own figure, so the title is attached
# through pyplot's current figure; y=1.02 places it just above the top edge
# of the figure.
sns.pairplot(data, hue='Purchased', palette='pastel')
plt.suptitle('Pairplot of all numerical variables colored by Purchased', y=1.02)
plt.show()

# Making Predictions

In [None]:
# Display the DataFrame before it is split into features and target.
data

In [None]:
# Feature matrix: every column except the Purchased target
x = data.drop(columns='Purchased')

In [None]:
# Display the feature frame (all columns except Purchased).
x

In [None]:
# Target vector: the Purchased column that the models learn to predict.
y = data['Purchased']

In [None]:
# Display the target Series.
y

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,fbeta_score, r2_score

In [None]:
# Hold out 20% of the rows as the test set; the fixed seed makes the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Standardise the features. The scaler learns its parameters from the training
# split only and the same transformation is then applied to both splits, so no
# statistics of the test set are used during fitting.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Random forest of 100 trees with a fixed seed, fitted on the training split
rf_classifier = RandomForestClassifier(random_state=42, n_estimators=100).fit(
    X_train, y_train
)
# Predicted labels for the held-out test rows
y_pred = rf_classifier.predict(X_test)

In [None]:
# Share of test rows for which the random forest predicted the correct label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

In [None]:
# Text report of the per-class metrics for the random forest predictions
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

In [None]:
# Creating and training the Logistic Regression model.
# Logistic regression is sensitive to the scale of its inputs, so the model is
# fitted and evaluated on the standardised features (X_train_scaled /
# X_test_scaled) produced by the StandardScaler cell instead of the raw ones.
logreg = LogisticRegression(random_state=16)
# Fit the model on the standardised training data
logreg.fit(X_train_scaled, y_train)
# Predict with the test data transformed by the same scaler
y_pred_logic = logreg.predict(X_test_scaled)

In [None]:
# Text report of the per-class metrics for the logistic regression predictions.
print(classification_report(y_test, y_pred_logic))

In [None]:
# Creating and training the Decision Tree Classifier model.
# A fixed random_state makes the fitted tree (and therefore the reported
# scores) reproducible between runs.
clf = DecisionTreeClassifier(random_state=42)
# Train the decision tree classifier (fit() trains the estimator in place)
clf.fit(X_train, y_train)
# Predict the response for the test dataset
y_pred_tree = clf.predict(X_test)

In [None]:
# Model accuracy: how often is the decision tree correct?
# Score the decision tree predictions (y_pred_tree); y_pred holds the random
# forest predictions and would just repeat the random forest accuracy.
print("Accuracy:",metrics.accuracy_score(y_test, y_pred_tree))

In [None]:
# Decision tree limited to depth 3 and split on the entropy criterion.
# A fixed random_state makes the fitted tree reproducible between runs.
clf = DecisionTreeClassifier(criterion="entropy", max_depth=3, random_state=42)

# Train the decision tree classifier (fit() trains the estimator in place)
clf.fit(X_train, y_train)

# Predict the response for the test dataset
y_pred_tree = clf.predict(X_test)

# Model accuracy: how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred_tree))

In [None]:
# Text report of the per-class metrics for the decision tree predictions
report = classification_report(y_true=y_test, y_pred=y_pred_tree)
print(report)

In [None]:
# Import the svm module
from sklearn import svm

# Create a support vector classifier with a linear kernel
clf = svm.SVC(kernel='linear')

# Support vector machines are not scale invariant, so the model is trained on
# the standardised features produced by the StandardScaler cell rather than on
# the raw ones.
clf.fit(X_train_scaled, y_train)

# Predict the response for the test dataset, transformed by the same scaler
y_pred_svm = clf.predict(X_test_scaled)

In [None]:
# scikit-learn metrics module used to score the SVM predictions
from sklearn import metrics

# Accuracy:  share of all test rows that were classified correctly.
# Precision: of the rows predicted positive, the share that really is positive.
# Recall:    of the rows that really are positive, the share predicted positive.
svm_scores = {
    "Accuracy:": metrics.accuracy_score(y_test, y_pred_svm),
    "Precision:": metrics.precision_score(y_test, y_pred_svm),
    "Recall:": metrics.recall_score(y_test, y_pred_svm),
}
for score_label, score_value in svm_scores.items():
    print(score_label, score_value)
