In [1]:
import csv 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn import tree

In [2]:
df = pd.read_csv('data/LI_dataset.csv') 
df.shape

(108, 13)

In [3]:
#Target attribute
y = np.array(df['label'])
y.shape

(108,)

In [4]:
#Dependent attributes
X = np.array(df.drop(['label'],1))
X.shape

(108, 12)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print("Training instances : ", X_train.shape[0], 
     "\nTesting instances : ", X_test.shape[0])

Training instances :  81 
Testing instances :  27


In [6]:
clf = tree.DecisionTreeClassifier(min_impurity_decrease=0.05)
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.05, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [7]:
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.8888888888888888

In [8]:
confusion_matrix(y_test, y_pred)

array([[21,  1],
       [ 2,  3]], dtype=int64)

### Extrapolate the dataset

In [9]:
mask = df['label'] == 1
fake_user_data = df[mask]
genuine_user_data = df[~mask]

In [10]:
fake_user_data = fake_user_data.append(fake_user_data)
fake_user_data = fake_user_data.append(fake_user_data)

In [11]:
fake_user_data.shape

(40, 13)

In [12]:
dataset = pd.concat([genuine_user_data, fake_user_data], axis = 0, sort = 'False')
dataset.shape

(138, 13)

In [13]:
dataset.isnull().values.any()

False

In [14]:
y = np.array(dataset['label'])
y.shape

(138,)

In [15]:
#Dependent attributes
X = np.array(dataset.drop(['label'],1))
X.shape

(138, 12)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [17]:
clf = tree.DecisionTreeClassifier(min_impurity_decrease=0.05)
clf.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.05, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [18]:
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.8285714285714286

In [19]:
confusion_matrix(y_test, y_pred)

array([[20,  6],
       [ 0,  9]], dtype=int64)

In [20]:
y_test.shape

(35,)