The notebook can be viewed here:
https://colab.research.google.com/drive/1o5wnbv30yTb80u5qNP9wlMKwjBX8ZS6I?usp=sharing

In [67]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [68]:
# Load the Social Network Ads example data directly from the ml-prep GitHub repository.
DATA_URL = "https://raw.githubusercontent.com/awinml/ml-prep/main/week2/examples/Social_Network_Ads.csv"
df = pd.read_csv(DATA_URL)

# Data Sanity Checks

In [69]:
# Peek at the first rows to confirm the file parsed into the expected columns.
df.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [70]:
# Count missing values per column.
df.isna().sum()

User ID            0
Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

In [71]:
# Count fully duplicated rows (a row only counts if every column, including "User ID", matches).
df.duplicated().sum()

0

In [72]:
# (rows, columns) of the raw frame.
df.shape

(400, 5)

In [73]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased
count,400.0,400.0,400.0,400.0
mean,15691540.0,37.655,69742.5,0.3575
std,71658.32,10.482877,34096.960282,0.479864
min,15566690.0,18.0,15000.0,0.0
25%,15626760.0,29.75,43000.0,0.0
50%,15694340.0,37.0,70000.0,0.0
75%,15750360.0,46.0,88000.0,1.0
max,15815240.0,60.0,150000.0,1.0


In [74]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


# Splitting the data correctly to ensure there is no leakage

In [75]:
# Separate the target from the features (everything except "Purchased" and
# "User ID"), then hold out 20% of the rows as a test set. Stratifying on y
# keeps the class ratio approximately the same in both parts.
y = df["Purchased"]
X = df.drop(columns=["Purchased", "User ID"])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=101,
)


In [76]:
# Training features; the original row labels are kept, so they appear shuffled.
X_train

Unnamed: 0,Gender,Age,EstimatedSalary
298,Male,45,79000
13,Male,32,18000
301,Male,48,74000
53,Female,35,23000
61,Male,25,87000
...,...,...,...
196,Female,30,79000
193,Male,19,70000
121,Male,37,72000
74,Male,32,18000


In [77]:
# Training labels; they carry the same row labels as X_train.
y_train

298    0
13     0
301    1
53     0
61     0
      ..
196    0
193    0
121    0
74     0
294    0
Name: Purchased, Length: 320, dtype: int64

# Why stratify?

In [78]:
# Split the data into training and testing sets.
# This repeats the earlier split with the same arguments and the same
# random_state, so that the cells below can inspect the class balance
# that `stratify=y` produces.
X = df.drop(["Purchased", "User ID"], axis=1)
y = df["Purchased"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=101, stratify=y
)

In [79]:
# Class counts in the full data set.
y.value_counts()

0    257
1    143
Name: Purchased, dtype: int64

In [80]:
# Class counts in the training split.
y_train.value_counts()

0    206
1    114
Name: Purchased, dtype: int64

In [81]:
# Class counts in the test split.
y_test.value_counts()

0    51
1    29
Name: Purchased, dtype: int64

# Data Preprocessing

In [82]:
# Standalone transformers, used in the next cells to demonstrate each
# preprocessing step on a single column.
scaler = StandardScaler()
# handle_unknown="ignore": per the scikit-learn docs, a category not seen
# during fit is encoded as all zeros at transform time instead of raising.
categorical_encoder = OneHotEncoder(handle_unknown="ignore")

In [83]:
# Learn the Gender categories from the training rows only and one-hot encode them.
# The column is reshaped to 2-D (n_samples, 1) before being handed to the encoder.
gender_column = X_train["Gender"].to_numpy().reshape(-1, 1)
X_train_Gender_transformed = categorical_encoder.fit_transform(gender_column)

In [84]:
# Before transformation: the first ten raw Gender labels.
X_train.Gender.values[:10]

array(['Male', 'Male', 'Male', 'Female', 'Male', 'Male', 'Male', 'Male',
       'Female', 'Female'], dtype=object)

In [85]:
# After transformation: the same ten rows, converted to a dense array for display.
# As the output shows, 'Female' maps to [1, 0] and 'Male' maps to [0, 1].
X_train_Gender_transformed.toarray()[:10]

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.]])

In [86]:
# Standardise EstimatedSalary using statistics learned from the training rows only.
# The column is reshaped to 2-D (n_samples, 1) before being handed to the scaler.
salary_column = X_train["EstimatedSalary"].to_numpy().reshape(-1, 1)
X_train_EstimatedSalary_transformed = scaler.fit_transform(salary_column)

In [87]:
# Before scaling: the first ten raw salaries.
X_train.EstimatedSalary.values[:10]

array([ 79000,  18000,  74000,  23000,  87000, 144000, 106000,  73000,
        71000,  76000])

In [88]:
# After scaling: the same ten salaries, standardised by the fitted StandardScaler.
X_train_EstimatedSalary_transformed[:10]

array([[ 0.27464724],
       [-1.50173253],
       [ 0.12904234],
       [-1.35612763],
       [ 0.50761508],
       [ 2.16751093],
       [ 1.0609137 ],
       [ 0.09992136],
       [ 0.0416794 ],
       [ 0.1872843 ]])

# Using a scikit-learn Pipeline to do the preprocessing

In [91]:
# Define preprocessing steps.
# Fresh, unfitted transformers are created here; the ColumnTransformer sends
# each column group to its own transformer and concatenates the results.
scaler = StandardScaler()
categorical_encoder = OneHotEncoder(handle_unknown="ignore")
scale_cols = ["Age", "EstimatedSalary"]
cat_cols = ["Gender"]
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_encoder, cat_cols),
        ("scale", scaler, scale_cols),
    ]
)


# Define the pipeline.
# Bundling preprocessing with the model means the transformers are fitted only
# on the data passed to `fit` (the training split), so nothing leaks from the
# test set.
# The forest is seeded (same seed as the train/test split) so that a fresh
# "Restart & Run All" reproduces the same model and the same scores.
# To try Naive Bayes instead, replace the classifier step with
# ("classifier", GaussianNB()).
clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=150, n_jobs=-1, random_state=101)),
])
clf.fit(X_train, y_train)

In [92]:
# Make predictions on the held-out test set and evaluate the model.
y_pred = clf.predict(X_test)
acc = round(accuracy_score(y_test, y_pred), 3)
# Format with an explicit precision: a bare `acc*100` can expose binary
# floating-point noise (e.g. 0.57*100 -> 56.99999999999999).
print(f"Accuracy score: {acc * 100:.1f}%")
print("Classification Report: \n", classification_report(y_test, y_pred))

Accuracy score: 88.8%
Classification Report: 
               precision    recall  f1-score   support

           0       0.92      0.90      0.91        51
           1       0.83      0.86      0.85        29

    accuracy                           0.89        80
   macro avg       0.88      0.88      0.88        80
weighted avg       0.89      0.89      0.89        80

