In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.ensemble import RandomForestClassifier

In [3]:
df = pd.read_csv('cus.csv')

In [4]:
df.isnull().sum()

age                   0
gender                0
location              0
source                0
time_spent            0
pages_visited         0
device                0
previous_purchases    0
campaign_engaged      0
converted             0
dtype: int64

In [5]:
df.value_counts()

age  gender  location  source        time_spent  pages_visited  device   previous_purchases  campaign_engaged  converted
69   Male    Urban     Referral      26.50       7              Mobile   5                   0                 1            1
18   Female  Rural     Ad            2.46        7              Mobile   5                   0                 0            1
                                     24.36       15             Desktop  4                   0                 1            1
                       Organic       20.65       10             Mobile   3                   0                 1            1
             Suburban  Organic       7.34        14             Desktop  2                   0                 1            1
                                                                                                                           ..
     Male    Suburban  Ad            3.44        3              Desktop  0                   1                 0           

In [6]:
df['converted'].value_counts(normalize=True)*100

converted
1    81.3
0    18.7
Name: proportion, dtype: float64

In [7]:
df.dtypes

age                     int64
gender                 object
location               object
source                 object
time_spent            float64
pages_visited           int64
device                 object
previous_purchases      int64
campaign_engaged        int64
converted               int64
dtype: object

In [8]:
df = pd.get_dummies(df,columns=['gender','location','source','device'], drop_first=True,)

In [9]:
x = df.drop(['converted'],axis=1)
y = df['converted']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [10]:
model = RandomForestClassifier(random_state=42)
model.fit(x_train,y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [11]:
pred = model.predict(x_test)

In [12]:
x_test.value_counts()

age  time_spent  pages_visited  previous_purchases  campaign_engaged  gender_Male  location_Suburban  location_Urban  source_Organic  source_Referral  source_Social Media  device_Mobile  device_Tablet
18   2.14        10             1                   1                 True         True               False           True            False            False                True           False            1
     7.34        14             2                   0                 False        True               False           True            False            False                False          False            1
     12.15       11             5                   0                 True         True               False           False           False            False                True           False            1
     12.37       8              1                   0                 True         True               False           False           True             False                False    

In [13]:
pred

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1])

In [14]:
print(accuracy_score(y_test,pred))


0.87


In [15]:
print(classification_report(y_test,pred))


              precision    recall  f1-score   support

           0       0.64      0.24      0.35        29
           1       0.88      0.98      0.93       171

    accuracy                           0.87       200
   macro avg       0.76      0.61      0.64       200
weighted avg       0.85      0.87      0.84       200



In [16]:
print(confusion_matrix(y_test,pred))

[[  7  22]
 [  4 167]]


In [17]:
df.tail

<bound method NDFrame.tail of      age  time_spent  pages_visited  previous_purchases  campaign_engaged  \
0     56       22.54              6                   1                 0   
1     69       22.40             12                   3                 0   
2     46       16.12              1                   5                 1   
3     32        2.99             13                   4                 0   
4     60       11.76              7                   3                 0   
..   ...         ...            ...                 ...               ...   
995   60        7.32              3                   0                 0   
996   64       20.19              4                   2                 0   
997   62       19.38             15                   2                 0   
998   35       18.21             14                   3                 1   
999   55        1.47              3                   1                 0   

     converted  gender_Male  location_Suburba

In [18]:
df.tail()

Unnamed: 0,age,time_spent,pages_visited,previous_purchases,campaign_engaged,converted,gender_Male,location_Suburban,location_Urban,source_Organic,source_Referral,source_Social Media,device_Mobile,device_Tablet
995,60,7.32,3,0,0,0,False,False,True,True,False,False,False,False
996,64,20.19,4,2,0,0,True,True,False,False,False,True,True,False
997,62,19.38,15,2,0,1,False,True,False,False,True,False,False,False
998,35,18.21,14,3,1,1,True,True,False,False,False,False,False,False
999,55,1.47,3,1,0,1,True,False,False,True,False,False,False,False


In [19]:
import pickle

In [20]:
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [21]:
with open('model_columns.pkl', 'wb') as f:
    pickle.dump(x.columns.tolist(), f)