In [1]:
import csv 
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split 
from sklearn.metrics import confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import pylab as pl
from sklearn import svm

In [2]:
# Load the dataset from a path relative to the notebook's working directory;
# the bare `df.shape` below displays its (rows, columns).
df = pd.read_csv('data/LI_dataset.csv')
df.shape

(108, 13)

In [3]:
# Target attribute: the 'label' column of df copied into a NumPy array
y = np.array(df['label'])
y.shape

(108,)

In [4]:
# Independent attributes (features): every column of df except the 'label' target.
# `columns=` is used instead of a positional axis argument, which pandas deprecated
# and later removed (axis is keyword-only in pandas >= 2.0).
X = np.array(df.drop(columns=['label']))
X.shape

(108, 12)

In [5]:
# Hold out a test set using train_test_split's default test fraction;
# random_state is fixed so the same rows are selected on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(
    "Training instances : ", len(X_train),
    "\nTesting instances : ", len(X_test),
)

Training instances :  81 
Testing instances :  27


In [7]:
# Support-vector classifier with fixed hyper-parameters; 'rbf' is SVC's default
# kernel (see the repr displayed below), spelled out here for the reader.
clf = svm.SVC(kernel='rbf', C=100.0, gamma=0.001)
clf.fit(X_train, y_train)

SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [8]:
# Predict on the held-out rows; the bare last expression displays the accuracy.
y_pred = clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9259259259259259

In [9]:
# Confusion matrix on the held-out set. By scikit-learn's convention rows are the
# true labels and columns the predicted labels.
confusion_matrix(y_test, y_pred)

array([[22,  0],
       [ 2,  3]], dtype=int64)

### Oversample the fake-user class (label == 1)

In [10]:
# Boolean row selector: True where label == 1 (the rows treated as fake users).
mask = df['label'].eq(1)
# Partition df into the two classes using the selector and its negation.
fake_user_data = df.loc[mask]
genuine_user_data = df.loc[~mask]

In [11]:
# Oversample the fake-user rows (label == 1) to 4x their original count, which is
# what the two successive self-appends produced.
# Built directly from `df[mask]` with pd.concat so that
#   (a) it works on pandas >= 2.0, where DataFrame.append was removed, and
#   (b) re-running this cell does not keep doubling the frame.
# NOTE(review): the duplicates are created before train_test_split, so copies of
# the same row can land in both the train and test sets and inflate the test
# metrics -- consider splitting first and oversampling only the training part.
fake_user_data = pd.concat([df[mask]] * 4)

In [12]:
# Sanity check: the row count should be 4x the number of label == 1 rows in df.
fake_user_data.shape

(40, 13)

In [13]:
# Recombine the genuine rows with the oversampled fake rows (row-wise).
# `sort` must be a real boolean: the string 'False' is truthy, and recent pandas
# releases reject non-boolean values for this keyword.
dataset = pd.concat([genuine_user_data, fake_user_data], axis=0, sort=False)
dataset.shape

(138, 13)

In [14]:
# True if any cell of the combined frame is missing (NaN/None).
dataset.isna().to_numpy().any()

False

In [15]:
# Target attribute rebuilt from the combined (oversampled) frame;
# this overwrites the earlier `y` taken from df.
y = np.array(dataset['label'])
y.shape

(138,)

In [16]:
# Independent attributes (features): every column of dataset except the 'label' target.
# `columns=` is used instead of a positional axis argument, which pandas deprecated
# and later removed (axis is keyword-only in pandas >= 2.0).
X = np.array(dataset.drop(columns=['label']))
X.shape

(138, 12)

In [17]:
# Re-split the oversampled data with train_test_split's default test fraction;
# random_state is fixed so the same rows are selected on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(
    "Training instances : ", len(X_train),
    "\nTesting instances : ", len(X_test),
)

Training instances :  103 
Testing instances :  35


In [19]:
# Retrain the same SVC configuration on the oversampled training split.
# (The original `clf = clf = ...` was a duplicated-assignment typo.)
clf = svm.SVC(gamma=0.001, C=100.)
clf.fit(X_train, y_train)

SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [21]:
# Predict on the held-out rows; the bare last expression displays the accuracy.
y_pred = clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9142857142857143

In [22]:
# Confusion matrix for the model trained on the oversampled data. By scikit-learn's
# convention rows are the true labels and columns the predicted labels.
confusion_matrix(y_test, y_pred)

array([[23,  3],
       [ 0,  9]], dtype=int64)

In [23]:
# Number of held-out test labels (matches the "Testing instances" count printed above).
y_test.shape

(35,)