In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder


In [33]:

# Read the Social Network Ads CSV from its absolute path under /content.
df = pd.read_csv('/content/Social_Network_Ads.csv')

# Replace the 'Gender' labels with integer codes. The encoder object is
# created unconditionally; the column is only rewritten when it exists.
le = LabelEncoder()
if 'Gender' in df.columns:
    df = df.assign(Gender=le.fit_transform(df['Gender']))

# Preview the first rows of the encoded frame.
display(df.head())

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,1,19,19000,0
1,15810944,1,35,20000,0
2,15668575,0,26,43000,0
3,15603246,0,27,57000,0
4,15804002,1,19,76000,0


In [34]:
# Remove the 'User ID' column when present and report which path was taken.
if 'User ID' not in df.columns:
    print("Column 'User ID' not found in the DataFrame.")
else:
    df = df.drop('User ID', axis=1)
    print("Column 'User ID' dropped successfully.")
    display(df.head())

Column 'User ID' dropped successfully.


Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,1,19,19000,0
1,1,35,20000,0
2,0,26,43000,0
3,0,27,57000,0
4,1,19,76000,0


In [35]:
from sklearn.model_selection import train_test_split

# Separate the 'Purchased' target (y) from the remaining feature columns (X).
y = df['Purchased']
X = df.drop(columns=['Purchased'])

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Report the dimensions of each partition.
for split_name, split_data in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{split_name} shape: {split_data.shape}")

X_train shape: (280, 3)
X_test shape: (120, 3)
y_train shape: (280,)
y_test shape: (120,)


In [36]:
from sklearn.preprocessing import StandardScaler

# Columns that bypass scaling. Only columns that are actually present are
# listed, which keeps this cell consistent with the `errors='ignore'` drop
# below: a feature frame without 'Gender' no longer raises a KeyError when
# the features are recombined at the end of the cell.
passthrough_cols = [col for col in ['Gender'] if col in X_train.columns]

# Identify numerical columns to scale (everything int64/float64 except the
# passthrough columns)
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.drop(passthrough_cols, errors='ignore')

# Initialize StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training split only and transform the training data
X_train_scaled_numerical = scaler.fit_transform(X_train[numerical_cols])

# Transform the test split with the scaler fitted on X_train (no refit on test data)
X_test_scaled_numerical = scaler.transform(X_test[numerical_cols])

# Wrap the scaled arrays in DataFrames that reuse the original row index,
# so the column-wise concat below lines rows up with the unscaled columns
X_train_scaled_numerical_df = pd.DataFrame(X_train_scaled_numerical, columns=numerical_cols, index=X_train.index)
X_test_scaled_numerical_df = pd.DataFrame(X_test_scaled_numerical, columns=numerical_cols, index=X_test.index)

# Recombine the unscaled passthrough columns (placed first) with the scaled
# numerical features; with no passthrough columns this is just the scaled frame
X_train_scaled_df = pd.concat([X_train[passthrough_cols], X_train_scaled_numerical_df], axis=1)
X_test_scaled_df = pd.concat([X_test[passthrough_cols], X_test_scaled_numerical_df], axis=1)

print("X_train_scaled (first 5 rows with 'Gender' unscaled):")
display(X_train_scaled_df.head())

print("\nX_test_scaled (first 5 rows with 'Gender' unscaled):")
display(X_test_scaled_df.head())

X_train_scaled (first 5 rows with 'Gender' unscaled):


Unnamed: 0,Gender,Age,EstimatedSalary
157,1,-0.842522,0.130156
109,0,0.041758,0.277702
17,1,0.72953,-1.315791
347,0,1.613809,1.103957
24,1,0.827784,-1.404318



X_test_scaled (first 5 rows with 'Gender' unscaled):


Unnamed: 0,Gender,Age,EstimatedSalary
209,0,0.827784,-1.433827
280,0,2.105076,0.513775
33,0,-0.940775,-0.784626
210,0,1.02429,0.749848
93,0,-0.842522,-1.256772


In [37]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Build a logistic-regression classifier (seeded) and fit it on the scaled
# training features in a single expression; fit() hands back the estimator.
log_reg_model = LogisticRegression(random_state=42).fit(X_train_scaled_df, y_train)

# Predict class labels for the scaled hold-out features.
y_pred = log_reg_model.predict(X_test_scaled_df)

# Summarise hold-out performance: accuracy, confusion matrix, per-class report.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nconfusion matrix\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8583333333333333

confusion matrix
 [[71  2]
 [15 32]]

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.97      0.89        73
           1       0.94      0.68      0.79        47

    accuracy                           0.86       120
   macro avg       0.88      0.83      0.84       120
weighted avg       0.87      0.86      0.85       120



In [38]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Build a k-nearest-neighbours classifier that votes over the 7 nearest
# training points and fit it on the scaled training features.
knn_model = KNeighborsClassifier(n_neighbors=7).fit(X_train_scaled_df, y_train)

# Predict class labels for the scaled hold-out features.
y_pred_knn = knn_model.predict(X_test_scaled_df)

# Summarise hold-out performance: accuracy, confusion matrix, per-class report.
print("KNN Accuracy:", accuracy_score(y_test, y_pred_knn))
print("\nKNN Confusion Matrix:\n", confusion_matrix(y_test, y_pred_knn))
print("\nKNN Classification Report:\n", classification_report(y_test, y_pred_knn))

KNN Accuracy: 0.925

KNN Confusion Matrix:
 [[68  5]
 [ 4 43]]

KNN Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.93      0.94        73
           1       0.90      0.91      0.91        47

    accuracy                           0.93       120
   macro avg       0.92      0.92      0.92       120
weighted avg       0.93      0.93      0.93       120



In [39]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

def _print_macro_metrics(heading, predictions):
    """Print accuracy and macro-averaged precision, recall and F1 of `predictions` against `y_test`."""
    print(heading)
    print(f"Accuracy: {accuracy_score(y_test, predictions)}")
    print(f"Precision (macro avg): {precision_score(y_test, predictions, average='macro')}")
    print(f"Recall (macro avg): {recall_score(y_test, predictions, average='macro')}")
    print(f"F1 Score (macro avg): {f1_score(y_test, predictions, average='macro')}")


# Same four metrics for both classifiers; the second heading starts with a
# newline so the two groups are separated by a blank line.
_print_macro_metrics("--- Logistic Regression Metrics ---", y_pred)
_print_macro_metrics("\n--- K-Nearest Neighbors (KNN) Metrics ---", y_pred_knn)

--- Logistic Regression Metrics ---
Accuracy: 0.8583333333333333
Precision (macro avg): 0.8833789329685362
Recall (macro avg): 0.8267269017779073
F1 Score (macro avg): 0.8416026088982064

--- K-Nearest Neighbors (KNN) Metrics ---
Accuracy: 0.925
Precision (macro avg): 0.9201388888888888
Recall (macro avg): 0.9232002331681726
F1 Score (macro avg): 0.9215970961887477


In [41]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

# Build a support-vector classifier with an RBF kernel (seeded) and fit it on
# the scaled training features.
# NOTE: despite the `_linear` suffix, the kernel configured here is 'rbf'; the
# variable names are kept as-is because the results-table cell reads
# `y_pred_svm_linear`.
svm_linear_model = SVC(kernel='rbf', random_state=42).fit(X_train_scaled_df, y_train)

# Predict class labels for the scaled hold-out features.
y_pred_svm_linear = svm_linear_model.predict(X_test_scaled_df)

# Report accuracy plus macro-averaged precision, recall and F1 on the hold-out set.
print(f"Accuracy: {accuracy_score(y_test, y_pred_svm_linear)}")
print(f"Precision (macro avg): {precision_score(y_test, y_pred_svm_linear, average='macro')}")
print(f"Recall (macro avg): {recall_score(y_test, y_pred_svm_linear, average='macro')}")
print(f"F1 Score (macro avg): {f1_score(y_test, y_pred_svm_linear, average='macro')}")

Accuracy: 0.9333333333333333
Precision (macro avg): 0.9278528312733544
Recall (macro avg): 0.933838531040513
F1 Score (macro avg): 0.9305555555555556


In [42]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def _macro_scores(predictions):
    """Return (accuracy, precision, recall, F1) of `predictions` against `y_test`.

    Precision, recall and F1 are macro-averaged over the classes.
    """
    return (
        accuracy_score(y_test, predictions),
        precision_score(y_test, predictions, average='macro'),
        recall_score(y_test, predictions, average='macro'),
        f1_score(y_test, predictions, average='macro'),
    )


# Logistic Regression (uses y_pred from the most recent logistic-regression run)
log_reg_accuracy, log_reg_precision, log_reg_recall, log_reg_f1 = _macro_scores(y_pred)

# K-Nearest Neighbors (KNN)
knn_accuracy, knn_precision, knn_recall, knn_f1 = _macro_scores(y_pred_knn)

# RBF SVM (its predictions are stored under the name y_pred_svm_linear)
svm_rbf_accuracy, svm_rbf_precision, svm_rbf_recall, svm_rbf_f1 = _macro_scores(y_pred_svm_linear)

# One row per model, one column per metric.
results_df = pd.DataFrame(
    [
        ('Logistic Regression', log_reg_accuracy, log_reg_precision, log_reg_recall, log_reg_f1),
        ('K-Nearest Neighbors (KNN)', knn_accuracy, knn_precision, knn_recall, knn_f1),
        ('RBF SVM', svm_rbf_accuracy, svm_rbf_precision, svm_rbf_recall, svm_rbf_f1),
    ],
    columns=['Model', 'Test Accuracy', 'Test Precision', 'Test Recall', 'Test F1-Score'],
)

display(results_df)

Unnamed: 0,Model,Test Accuracy,Test Precision,Test Recall,Test F1-Score
0,Logistic Regression,0.858333,0.883379,0.826727,0.841603
1,K-Nearest Neighbors (KNN),0.925,0.920139,0.9232,0.921597
2,RBF SVM,0.933333,0.927853,0.933839,0.930556


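The RBF SVM is the best of the three models: it achieves the highest macro-averaged recall on the test set (0.934, versus 0.923 for KNN and 0.827 for logistic regression), and it also has the highest test accuracy, precision, and F1-score.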