In [7]:
import pandas as pd

# Load the dataset
df = pd.read_csv('Social_Network_Ads_Synthetic.csv')

# Display the first few rows of the dataset
print(df.head())


    User ID  Gender  Age  EstimatedSalary  Purchased
0  75682867    Male   26            47591          0
1  66755036    Male   55           108583          0
2  66882282    Male   39           115318          1
3  31081788  Female   19            24833          1
4  23315092  Female   41            22896          1


## Data Preprocessing
2.1: Check for Missing Values
It's important to ensure there are no missing values in the dataset. We can check for them using isnull().

In [8]:
# Check for missing values
print(df.isnull().sum())


User ID            0
Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64


## Encode Categorical Data
We need to convert the Gender column (which contains text) into numerical values because machine learning models can only understand numbers.

We'll use Label Encoding to convert Male to 0 and Female to 1.

In [9]:
from sklearn.preprocessing import LabelEncoder

# Initialize the label encoder
labelencoder = LabelEncoder()

# Encode 'Gender' column
df['Gender'] = labelencoder.fit_transform(df['Gender'])

# Display the updated dataset
print(df.head())


    User ID  Gender  Age  EstimatedSalary  Purchased
0  75682867       1   26            47591          0
1  66755036       1   55           108583          0
2  66882282       1   39           115318          1
3  31081788       0   19            24833          1
4  23315092       0   41            22896          1


## Feature Scaling
Since the dataset includes numerical features like Age and EstimatedSalary, it's important to scale these values so they are on a similar scale. This is often done using Standardization or Normalization. Here, we will use Standardization using StandardScaler.

In [10]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Scale the 'Age' and 'EstimatedSalary' columns
df[['Age', 'EstimatedSalary']] = scaler.fit_transform(df[['Age', 'EstimatedSalary']])

# Display the updated dataset
print(df.head())


    User ID  Gender       Age  EstimatedSalary  Purchased
0  75682867       1 -1.033356        -0.599924          0
1  66755036       1  1.366719         1.402438          0
2  66882282       1  0.042539         1.623548          1
3  31081788       0 -1.612685        -1.347067          1
4  23315092       0  0.208062        -1.410659          1


## Split the Dataset
We'll split the dataset into:

Features (X) — the input variables: Gender, Age, EstimatedSalary

Target (y) — the output variable: Purchased

Then we split the data into a training set and a test set using train_test_split.

In [11]:
from sklearn.model_selection import train_test_split

# Define features and target
X = df[['Gender', 'Age', 'EstimatedSalary']]  # Features
y = df['Purchased']                           # Target

# Split into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print shapes to confirm
print("Training Features Shape:", X_train.shape)
print("Test Features Shape:", X_test.shape)
print("Training Labels Shape:", y_train.shape)
print("Test Labels Shape:", y_test.shape)


Training Features Shape: (800, 3)
Test Features Shape: (200, 3)
Training Labels Shape: (800,)
Test Labels Shape: (200,)


## Train the Logistic Regression Model
We'll use Logistic Regression from sklearn.linear_model to classify whether a user will purchase the product based on their gender, age, and estimated salary.



In [13]:
from sklearn.linear_model import LogisticRegression

# Initialize the model
classifier = LogisticRegression()

# Train the model
classifier.fit(X_train, y_train)

# Predict on test set
y_pred = classifier.predict(X_test)


## Evaluate Model with Confusion Matrix & Metrics
We will:

Generate the confusion matrix

Calculate:

True Positives (TP)

True Negatives (TN)

False Positives (FP)

False Negatives (FN)

Compute:

Accuracy

Error Rate

Precision

Recall



In [14]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

# Generate confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Extract TP, TN, FP, FN
TN, FP, FN, TP = cm.ravel()

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
error_rate = 1 - accuracy
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Display results
print("Confusion Matrix:")
print(cm)
print(f"\nTrue Positives (TP): {TP}")
print(f"True Negatives (TN): {TN}")
print(f"False Positives (FP): {FP}")
print(f"False Negatives (FN): {FN}")
print(f"\nAccuracy: {accuracy:.2f}")
print(f"Error Rate: {error_rate:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")


Confusion Matrix:
[[62 34]
 [67 37]]

True Positives (TP): 37
True Negatives (TN): 62
False Positives (FP): 34
False Negatives (FN): 67

Accuracy: 0.49
Error Rate: 0.51
Precision: 0.52
Recall: 0.36
