In [9]:
import csv 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.naive_bayes import GaussianNB

In [10]:
# Read the dataset from CSV; the trailing expression displays (rows, columns).
df = pd.read_csv(filepath_or_buffer='data/LI_dataset.csv')
df.shape

(108, 13)

In [11]:
# Target vector: the 'label' column copied into a NumPy array.
y = np.array(df.loc[:, 'label'])
y.shape

(108,)

In [12]:
# Feature matrix: every column except the target 'label'.
# These are the independent attributes; 'label' is the dependent one.
# `columns=` replaces the positional axis argument (`drop([...], 1)`), which
# newer pandas releases no longer accept.
X = np.array(df.drop(columns=['label']))
X.shape

(108, 12)

In [13]:
# Hold out a test set. random_state pins the shuffle so the split is
# repeatable; the split proportion is left at scikit-learn's default.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Report how many rows ended up on each side of the split.
n_train, n_test = X_train.shape[0], X_test.shape[0]
print("Training instances : ", n_train, "\nTesting instances : ", n_test)

Training instances :  81 
Testing instances :  27


In [16]:
# Fit a Gaussian Naive Bayes classifier on the training split.
# fit() returns the estimator itself, so the chained call binds the fitted
# model; the trailing expression displays its parameters.
gnb = GaussianNB().fit(X_train, y_train)
gnb

GaussianNB(priors=None, var_smoothing=1e-09)

In [18]:
# Predict the held-out rows, then show the fraction classified correctly.
y_pred = gnb.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8888888888888888

In [19]:
# Confusion matrix on the test split (rows: true labels, columns: predictions).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[19,  3],
       [ 0,  5]], dtype=int64)

### Oversample the fake-user class by duplicating its rows

In [20]:
# Partition the rows by class: rows with label == 1 go to the fake-user
# frame, every other row goes to the genuine-user frame.
mask = df['label'].eq(1)
fake_user_data = df.loc[mask]
genuine_user_data = df.loc[~mask]

In [21]:
# Oversample the fake-user rows to four times their original count
# (the same result as appending the frame to itself twice).
# Built from df[mask] rather than from fake_user_data itself so that re-running
# this cell does not keep doubling the frame, and with pd.concat because
# DataFrame.append is no longer available in current pandas releases.
fake_user_data = pd.concat([df[mask]] * 4)

In [22]:
# Sanity check: (rows, columns) of the fake-user frame after duplication.
fake_user_data.shape

(40, 13)

In [23]:
# Recombine the genuine rows with the oversampled fake rows, stacked row-wise.
# `sort` must be the boolean False: the string 'False' is truthy, so it asked
# for the opposite of what was intended (and non-boolean values for `sort`
# may be rejected outright by recent pandas versions).
dataset = pd.concat([genuine_user_data, fake_user_data], axis=0, sort=False)
dataset.shape

(138, 13)

In [24]:
# True if any cell of the combined frame is missing, False otherwise.
dataset.isna().values.any()

False

In [25]:
# Target vector rebuilt from the oversampled dataset.
y = np.array(dataset.loc[:, 'label'])
y.shape

(138,)

In [26]:
# Feature matrix rebuilt from the oversampled dataset: every column except
# the target 'label'. These are the independent attributes; 'label' is the
# dependent one.
# `columns=` replaces the positional axis argument (`drop([...], 1)`), which
# newer pandas releases no longer accept.
X = np.array(dataset.drop(columns=['label']))
X.shape

(138, 12)

In [27]:
# Re-split the oversampled data with the same fixed seed as the first split.
# NOTE(review): the fake-user rows were duplicated before this split, so
# copies of the same row can land in both the training and the test set,
# which may inflate the scores below. Consider splitting first and
# oversampling only the training portion.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=1)

In [28]:
# Refit a fresh Gaussian Naive Bayes classifier on the oversampled training
# split. fit() returns the estimator itself, so the chained call binds the
# fitted model; the trailing expression displays its parameters.
gnb = GaussianNB().fit(X_train, y_train)
gnb

GaussianNB(priors=None, var_smoothing=1e-09)

In [30]:
# Predict the held-out rows of the oversampled split, then show the fraction
# classified correctly.
y_pred = gnb.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8

In [31]:
# Confusion matrix for the oversampled split
# (rows: true labels, columns: predictions).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[19,  7],
       [ 0,  9]], dtype=int64)

In [32]:
# Number of held-out rows the scores above were computed on.
y_test.shape

(35,)