In [5]:
import pandas as pd

# Read the raw ad-click dataset from the working directory and preview
# its first five rows (5 is the default for head()).
data = pd.read_csv(filepath_or_buffer='Ad Click Data.csv')
data.head(n=5)

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad
0,68.95,35,61833.9,256.09,Cloned 5thgeneration orchestration,Wrightburgh,0.0,Tunisia,3/27/2016 0:53,0
1,80.23,31,68441.85,193.77,Monitored national standardization,West Jodi,1.0,Nauru,4/4/2016 1:39,0
2,69.47,26,59785.94,236.5,Organic bottom-line service-desk,Davidton,0.0,San Marino,3/13/2016 20:35,0
3,74.15,29,54806.18,245.89,Triple-buffered reciprocal time-frame,West Terrifurt,1.0,Italy,1/10/2016 2:31,0
4,68.37,35,73889.99,225.58,Robust logistical utilization,South Manuel,0.0,Iceland,6/3/2016 3:36,0


In [6]:
# Count the missing entries in each column of the raw frame.
# isna() is pandas' alias for isnull(); summing the boolean mask yields per-column counts.
print(data.isna().sum())

Daily Time Spent on Site    13
Age                          0
Area Income                 13
Daily Internet Usage        11
Ad Topic Line                0
City                         1
Male                         3
Country                      9
Timestamp                    0
Clicked on Ad                0
dtype: int64


In [7]:
# Impute each missing value with the most frequent value of its column.
# data.mode() returns the per-column modes; .iloc[0] keeps the first row of
# that result, i.e. one fill value per column.
# NOTE(review): the mode is a natural fill for categorical columns (City, Country,
# Male), but for continuous columns such as 'Area Income' a median/mean fill may be
# more representative -- confirm that mode imputation is intended for those.
column_modes = data.mode().iloc[0]
data_mode_fill = data.fillna(value=column_modes)

In [9]:
# Re-count missing entries on the imputed frame; every column should now report 0.
print(data_mode_fill.isna().sum())

Daily Time Spent on Site    0
Age                         0
Area Income                 0
Daily Internet Usage        0
Ad Topic Line               0
City                        0
Male                        0
Country                     0
Timestamp                   0
Clicked on Ad               0
dtype: int64


In [10]:
# List the column labels of the imputed frame; the feature/target selection
# in the following cells picks from these columns.
print(data_mode_fill.columns)

Index(['Daily Time Spent on Site', 'Age', 'Area Income',
       'Daily Internet Usage', 'Ad Topic Line', 'City', 'Male', 'Country',
       'Timestamp', 'Clicked on Ad'],
      dtype='object')


In [11]:
# Build the feature matrix from the numeric predictors.
# Columns are picked by name instead of by position (iloc[:, 0:7] followed by a
# drop), so a change in the CSV's column order raises a KeyError rather than
# silently feeding different columns to the model. 'Ad Topic Line', 'City',
# 'Country', 'Timestamp' and the target 'Clicked on Ad' are left out.
feature_columns = [
    'Daily Time Spent on Site',
    'Age',
    'Area Income',
    'Daily Internet Usage',
    'Male',
]
x = data_mode_fill[feature_columns]
x

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Male
0,68.95,35,61833.90,256.09,0.0
1,80.23,31,68441.85,193.77,1.0
2,69.47,26,59785.94,236.50,0.0
3,74.15,29,54806.18,245.89,1.0
4,68.37,35,73889.99,225.58,0.0
...,...,...,...,...,...
995,72.97,30,71384.57,208.58,1.0
996,51.30,45,67782.17,134.42,1.0
997,62.26,51,42415.72,120.37,1.0
998,55.55,19,41920.79,187.95,0.0


In [12]:
# Target vector: the 'Clicked on Ad' column (0/1 labels).
# Selected by name rather than by position 9, and taken from the same cleaned
# frame that x is built from, so features and labels always share one lineage
# and row index. 'Clicked on Ad' had no missing values in the null check, so the
# imputation step left it unchanged and the values match the raw column.
y = data_mode_fill['Clicked on Ad']
y

0      0
1      0
2      0
3      0
4      0
      ..
995    1
996    1
997    1
998    0
999    1
Name: Clicked on Ad, Length: 1000, dtype: int64

In [14]:
# Hold out 30% of the rows for testing; random_state pins the shuffle so the
# split is reproducible across runs.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=4,
)

# Report the shape of each partition, in the order:
# train features, test features, train labels, test labels.
for part in (x_train, x_test, y_train, y_test):
    print(part.shape)

(700, 5)
(300, 5)
(700,)
(300,)


In [16]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier. C is the inverse regularisation strength,
# so C=0.01 applies a strong penalty; random_state is fixed for reproducibility.
# NOTE(review): the features are passed in unscaled, so the penalty weighs
# columns with very different magnitudes (e.g. 'Area Income' vs 'Male')
# unevenly -- consider standardising the features first; confirm intent.
Lr = LogisticRegression(C=0.01, random_state=0)

# fit() returns the fitted estimator, so training and hard-label prediction
# on the held-out rows can be chained.
y_pred = Lr.fit(x_train, y_train).predict(x_test)
print(y_pred)


[0 0 0 1 1 0 0 1 0 1 0 0 0 1 1 1 0 1 0 1 1 0 1 1 1 0 0 1 0 0 0 1 1 1 0 1 1
 0 1 0 0 1 0 1 0 1 0 0 0 0 1 1 0 1 0 1 1 0 0 1 1 1 1 0 1 0 0 0 0 1 0 0 0 0
 0 1 1 1 0 1 0 1 0 0 0 0 1 0 1 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0
 1 0 0 1 0 1 1 0 1 0 1 1 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 1 1 1 1 0 0 0 1 0 0
 1 0 0 1 0 0 1 0 0 1 0 0 0 0 1 1 1 1 1 0 1 1 0 0 1 0 1 0 0 0 1 1 0 0 0 1 0
 0 1 0 1 0 1 1 1 1 0 1 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 1 1 1 1 1 0
 0 1 0 0 0 1 1 0 0 0 0 1 0 1 0 0 1 1 0 0 0 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1
 1 0 1 0 1 0 0 1 0 0 0 1 1 1 0 1 0 1 0 1 0 0 0 0 0 0 1 0 0 1 0 1 1 1 0 0 1
 1 0 0 0]


In [17]:
# Class-membership probabilities for the held-out rows from the fitted model:
# one row per test sample, one column per class (column order follows Lr.classes_).
y_pred_proba = Lr.predict_proba(x_test)
print(y_pred_proba)

[[8.76228349e-01 1.23771651e-01]
 [9.08776762e-01 9.12232378e-02]
 [7.02886756e-01 2.97113244e-01]
 [3.04339735e-01 6.95660265e-01]
 [2.86745057e-02 9.71325494e-01]
 [8.08905438e-01 1.91094562e-01]
 [9.80008496e-01 1.99915039e-02]
 [1.72601389e-02 9.82739861e-01]
 [8.17015738e-01 1.82984262e-01]
 [4.75758787e-03 9.95242412e-01]
 [5.06546790e-01 4.93453210e-01]
 [5.77038714e-01 4.22961286e-01]
 [9.69819002e-01 3.01809976e-02]
 [2.84840070e-02 9.71515993e-01]
 [1.78226866e-03 9.98217731e-01]
 [4.96117081e-03 9.95038829e-01]
 [9.78705343e-01 2.12946569e-02]
 [4.02315416e-03 9.95976846e-01]
 [9.82439713e-01 1.75602869e-02]
 [9.18848811e-03 9.90811512e-01]
 [4.31908464e-02 9.56809154e-01]
 [8.74595835e-01 1.25404165e-01]
 [3.90767735e-01 6.09232265e-01]
 [4.95771324e-01 5.04228676e-01]
 [1.21996931e-01 8.78003069e-01]
 [9.32673209e-01 6.73267906e-02]
 [9.65529222e-01 3.44707778e-02]
 [5.01416391e-02 9.49858361e-01]
 [7.43140664e-01 2.56859336e-01]
 [9.84940307e-01 1.50596926e-02]
 [9.612719

In [20]:
from sklearn.metrics import accuracy_score, f1_score

# Score the hard predictions against the held-out labels.
# Printed in this order: F1 score first, then accuracy.
for metric in (f1_score, accuracy_score):
    print(metric(y_test, y_pred))

0.8661971830985915
0.8733333333333333
