In [117]:
import pandas as pd
import numpy as np


In [118]:
df = pd.read_csv("car_data.csv")
df.sample(5)

Unnamed: 0,User ID,Gender,Age,AnnualSalary,Purchased
504,490,Male,40,59000,0
825,842,Male,30,80000,0
786,484,Male,37,126500,1
851,62,Male,25,20500,0
836,732,Female,31,118000,1


In [119]:
#Some info about our attributes and its datatype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   User ID       1000 non-null   int64 
 1   Gender        1000 non-null   object
 2   Age           1000 non-null   int64 
 3   AnnualSalary  1000 non-null   int64 
 4   Purchased     1000 non-null   int64 
dtypes: int64(4), object(1)
memory usage: 39.2+ KB


In [123]:
# preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PowerTransformer
from sklearn import set_config
set_config(display='diagram')
transformer = ColumnTransformer(transformers=[('Encoder',OneHotEncoder(drop='first'),['Gender']),('Yeo-Johnson',PowerTransformer(),['Age','AnnualSalary'])])

In [124]:
# dataset splitting
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(df.drop(['Purchased'],axis=1),df['Purchased'],random_state=42,stratify=df['Purchased'],shuffle=True)

In [125]:
X_train=transformer.fit_transform(X_train)

In [126]:
X_test=transformer.transform(X_test)

In [128]:
# OverSampling of Minority Class using SMOTE
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
X_train, y_train = sm.fit_resample(X_train, y_train)
X_test,y_test = sm.fit_resample(X_test,y_test)

In [130]:
#logistic regression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

param_grid={'C':[0.001,0.01,0.1,1,10,100], 'max_iter':[50,75,100,200,300,400,500,700]}
log=RandomizedSearchCV(LogisticRegression(solver='lbfgs'),param_grid,cv=5)
log.fit(X_train,y_train)
y_pred_log=log.predict(X_test)
confusion_log=confusion_matrix(y_test,log.predict(X_test))
print(classification_report(y_test,y_pred_log))


              precision    recall  f1-score   support

           0       0.84      0.83      0.84       150
           1       0.83      0.85      0.84       150

    accuracy                           0.84       300
   macro avg       0.84      0.84      0.84       300
weighted avg       0.84      0.84      0.84       300



In [131]:
# save the model
import pickle
pickle.dump(log, open('model.pickle', 'wb'))
pickle.dump(transformer, open('preprocessor.pickle', 'wb'))

In [136]:
# prediction on random data by the saved model
from pydantic import BaseModel
class Data(BaseModel):
    Gender: list
    Age: list
    AnnualSalary: list

data_input =  {'Gender' : ['Male','Female'], 'Age' : [50,40], 'AnnualSalary' : [62000,30000]}

input = Data(**data_input).model_dump()

to_predict = pd.DataFrame(input)

clf = pickle.load(open('model.pickle', 'rb'))
pre_processor = pickle.load(open('preprocessor.pickle', 'rb'))

clf.predict(pre_processor.transform(to_predict))

array([1, 0])