In [221]:
import pandas as pd
import numpy as np
import seaborn as sns

In [206]:
# Read the training split from disk, then print its first five rows and its row count.
train = pd.read_csv('./datasets/train.csv')
print(train.head(n=5))
print("Length: ", train.shape[0])

  UniqueId  SocialMedia  SocialMediaTime  News  NewsTime  Blogs  BlogsTime  \
0   000x41          3.0           87.833   0.0       0.0   27.0    798.333   
1   000x4c         10.0         1005.667   0.0       0.0   36.0   2111.342   
2   000x65          4.0           61.000   0.0       0.0   19.0    607.000   
3   000xc6          0.0            0.000   0.0       0.0   17.0    840.233   
4   000xc7          3.0           94.000   2.0     125.0   55.0   1970.845   

   BounceRate  ExitRate  PageValue TrafficKind       OS        Browser Month  \
0       0.000     0.013     22.916      search  windows  Google-Chrome   Feb   
1       0.004     0.014     11.439      direct  windows          Other   Feb   
2       0.000     0.027     17.536    referral  android         Safari   Feb   
3       0.000     0.002    109.176      direct  windows  Google-Chrome   Mar   
4       0.000     0.002     96.255      direct  windows        Firefox   Mar   

               Region VisitorKind  DayImportance  

In [207]:
# Discard every row that has at least one missing value, then show how many rows remain.
train = train.dropna(axis=0, how='any')
train.shape[0]

2821

In [208]:
# Per-column count of missing values (isnull is the pandas alias of isna).
train.isnull().sum()

UniqueId           0
SocialMedia        0
SocialMediaTime    0
News               0
NewsTime           0
Blogs              0
BlogsTime          0
BounceRate         0
ExitRate           0
PageValue          0
TrafficKind        0
OS                 0
Browser            0
Month              0
Region             0
VisitorKind        0
DayImportance      0
IsWeekend          0
IsSale             0
dtype: int64

In [209]:
# Remove the UniqueId column from the training frame.
train = train.drop(columns=['UniqueId'])

In [210]:
# Summary statistics for the numeric columns (quartiles spelled out; these are the defaults).
# NOTE(review): the recorded output shows a SocialMedia minimum of -10 - confirm whether
# negative values are valid for that column.
train.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,SocialMedia,SocialMediaTime,News,NewsTime,Blogs,BlogsTime,BounceRate,ExitRate,PageValue,DayImportance,IsWeekend
count,2821.0,2821.0,2821.0,2821.0,2821.0,2821.0,2821.0,2821.0,2821.0,2821.0,2821.0
mean,2.716767,94.886645,0.605814,42.624563,37.519674,1434.944442,0.015788,0.034371,14.179981,0.044452,0.2609
std,3.537975,185.961218,1.329649,144.879136,50.000845,2018.829784,0.039431,0.041039,27.910909,0.172575,0.439204
min,-10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,10.0,285.5,0.0,0.012,0.0,0.0,0.0
50%,1.0,26.5,0.0,0.0,22.0,794.713,0.001,0.021,0.0,0.0,0.0
75%,4.0,116.0,1.0,0.0,44.0,1758.25,0.012,0.038,17.93,0.0,1.0
max,27.0,2720.5,12.0,1767.667,584.0,27009.859,0.2,0.2,360.953,1.0,1.0


In [211]:
class LogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    Parameters
    ----------
    learning_rate : float
        Step size applied to each gradient update.
    n_iters : int
        Number of gradient steps performed by ``fit``.

    After ``fit`` the instance exposes ``weights`` (one coefficient per
    feature), ``bias`` (intercept) and ``losses`` (the loss recorded before
    each update of the most recent ``fit`` call).
    """

    def __init__(self, learning_rate=0.001, n_iters=1000):
        self.lr = learning_rate
        self.n_iters = n_iters
        self.weights = None
        self.bias = None
        self.losses = []

    def _sigmoid(self, x):
        """Return the logistic function 1 / (1 + exp(-x)) element-wise.

        Only ``exp(-|x|)`` is evaluated, which lies in (0, 1], so large
        negative inputs cannot overflow. For ``x >= 0`` this is the textbook
        expression; for ``x < 0`` the algebraically equal form
        ``exp(x) / (1 + exp(x))`` is used.
        """
        x = np.asarray(x, dtype=float)
        decay = np.exp(-np.abs(x))
        return np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))

    def compute_loss(self, y_true: np.ndarray, y_pred: np.ndarray):
        """Return the mean binary cross-entropy between labels and probabilities."""
        # A small offset keeps log() finite when a probability is exactly 0 or 1.
        epsilon = 1e-9
        y1 = y_true * np.log(y_pred + epsilon)
        y2 = (1 - y_true) * np.log(1 - y_pred + epsilon)
        return -np.mean(y1 + y2)

    def feed_forward(self, X):
        """Return the predicted probability of class 1 for each row of ``X``."""
        z = np.dot(X, self.weights) + self.bias
        return self._sigmoid(z)

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Fit weights and bias to ``X`` (n_samples, n_features) and labels ``y``.

        Each call starts from zero weights and an empty loss history.

        Raises
        ------
        ValueError
            If ``y`` does not hold exactly one label per row of ``X``.
        """
        X = np.asarray(X, dtype=float)
        # Flatten y so that a column vector cannot broadcast ``A - y`` into a matrix.
        y = np.asarray(y, dtype=float).ravel()
        n_samples, n_features = X.shape
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X has {n_samples} samples but y has {y.shape[0]} labels"
            )

        self.weights = np.zeros(n_features)
        self.bias = 0.0
        # Parameters restart from zero, so the loss history must restart as well.
        self.losses = []

        for _ in range(self.n_iters):
            A = self.feed_forward(X)
            self.losses.append(self.compute_loss(y, A))
            dz = A - y

            dw = (1 / n_samples) * np.dot(X.T, dz)
            db = (1 / n_samples) * np.sum(dz)

            self.weights -= self.lr * dw
            self.bias -= self.lr * db

    def predict(self, X: np.ndarray):
        """Return an integer array of 0/1 labels: 1 where the probability exceeds 0.5."""
        threshold = .5
        probabilities = self.feed_forward(X)
        return (probabilities > threshold).astype(int)

In [212]:
def standard_scaler(X: np.ndarray, mean=None, std=None):
    """Standardise each column of ``X`` to zero mean and unit variance.

    Parameters
    ----------
    X : np.ndarray
        Data with samples along axis 0.
    mean, std : array-like, optional
        Per-column statistics to apply. When omitted they are computed from
        ``X`` itself. Pass statistics computed on another array to put ``X``
        on that array's scale.

    Returns
    -------
    np.ndarray
        ``(X - mean) / std``. Columns whose standard deviation is zero are
        divided by 1 instead, so a constant column yields finite values
        rather than NaN.
    """
    if mean is None:
        mean = np.mean(X, axis=0)
    if std is None:
        std = np.std(X, axis=0)

    # A constant column has std == 0; dividing by it would produce 0/0 = NaN.
    safe_std = np.where(np.asarray(std) == 0, 1.0, std)

    return (X - mean) / safe_std

In [213]:
# Extract the two predictor columns from the training frame as NumPy arrays.
X1_train, X2_train = (
    np.asarray(train[column].tolist()) for column in ('BlogsTime', 'PageValue')
)

# Put the predictors side by side (one row per sample), then standardise each column.
X_train = np.column_stack([X1_train, X2_train])
X_train_scaled = standard_scaler(X_train)

# Target labels come from the IsSale column.
y_train = np.asarray(train['IsSale'].tolist())

In [214]:
# Train the classifier on the standardised features; the constructor arguments
# below are the class defaults, written out explicitly.
model = LogisticRegression(learning_rate=0.001, n_iters=1000)
model.fit(X=X_train_scaled, y=y_train)

In [215]:
# Read the test split and count the missing values in each column.
test = pd.read_csv('./datasets/test.csv')
test.isnull().sum()

UniqueId            0
SocialMedia         2
SocialMediaTime     6
News                8
NewsTime            2
Blogs               4
BlogsTime           3
BounceRate         11
ExitRate            7
PageValue           3
TrafficKind         6
OS                  6
Browser             4
Month               5
Region              6
VisitorKind         4
DayImportance       5
IsWeekend           7
dtype: int64

In [216]:
# Number of rows in the test split.
test.shape[0]

1000

In [217]:
# Discard every test row that has at least one missing value, then show how many remain.
# NOTE(review): the recorded outputs go from 1000 to 973 rows, so the dropped rows get
# no prediction - confirm the submission does not need a row for every test record.
test = test.dropna(axis=0, how='any')
test.shape[0]

973

In [218]:
# Build the test design matrix from the same two columns used for training.
X1_test = np.asarray(test['BlogsTime'].values.tolist())
X2_test = np.asarray(test['PageValue'].values.tolist())

X_test = np.column_stack((X1_test, X2_test))
# Standardise with the TRAINING mean/std rather than the test set's own statistics:
# the model's weights were fitted on features measured on the training scale, so the
# test features must be mapped onto that same scale before predicting.
X_test_scaled = (X_test - np.mean(X_train, axis=0)) / np.std(X_train, axis=0)

In [219]:
# Class labels (0 or 1) predicted for every row of the scaled test matrix.
predictions = model.predict(X=X_test_scaled)

In [225]:
# Write the submission to submission.csv
# Work on a copy: a plain `submission = test` is only an alias, so adding the
# prediction column would also add it to `test`.
submission = test.copy()
# Map the 0/1 class labels to the lowercase strings written to the file.
submission['isSale'] = np.where(predictions == 1, "true", "false")
# NOTE(review): to_csv also writes the DataFrame index as an unnamed first column;
# pass index=False if the expected submission format has no such column - confirm.
submission.to_csv("submission.csv")
print("Successfully written the predictions to submission.csv")

Unnamed: 0,UniqueId,SocialMedia,SocialMediaTime,News,NewsTime,Blogs,BlogsTime,BounceRate,ExitRate,PageValue,TrafficKind,OS,Browser,Month,Region,VisitorKind,DayImportance,IsWeekend,isSale
0,0x1de1,5.0,277.800,0.0,0.0,25.0,884.500,0.000,0.004,7.481,direct,windows,Other,Jul,Mid-Atlantic,New,0.0,1.0,false
1,0x21eb,7.0,190.438,0.0,0.0,60.0,1967.780,0.004,0.015,2.627,referral,mac-ios,Google-Chrome,Dec,Mountain,Returning,0.0,1.0,false
2,0x2f6a,0.0,0.000,0.0,0.0,5.0,586.000,0.000,0.060,0.000,referral,windows,Firefox,Nov,Pacific,Returning,0.0,1.0,false
3,0x25f0,2.0,88.000,10.0,719.0,54.0,1986.553,0.005,0.020,25.827,direct,android,Safari,Dec,Mid-Atlantic,Returning,0.0,0.0,true
4,0x2be0,12.0,1276.264,0.0,0.0,118.0,3215.173,0.000,0.005,22.458,search,mac-ios,Google-Chrome,Nov,Mountain,Returning,0.0,0.0,true
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0x123c,1.0,54.000,0.0,0.0,31.0,2543.500,0.000,0.007,0.000,direct,windows,Google-Chrome,May,South Atlantic,Returning,0.0,0.0,false
996,00x5e8,6.0,408.200,0.0,0.0,33.0,878.624,0.000,0.020,19.365,direct,windows,Google-Chrome,Mar,New England,Returning,0.0,0.0,false
997,0x19b7,6.0,189.800,0.0,0.0,9.0,208.833,0.000,0.015,70.316,direct,windows,Google-Chrome,Nov,East North Central,New,0.0,0.0,true
998,0x17b2,14.0,1220.914,4.0,1005.4,280.0,18171.795,0.017,0.039,5.698,search,windows,Google-Chrome,Jul,New England,Returning,0.0,0.0,true
