In [70]:
import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import LinearSVC

In [72]:
h = .02  # step size in the mesh

# Shared seed for every estimator that draws random numbers, so that a
# Restart & Run All reproduces the same scores.
SEED = 42

# Display names; entry k labels classifiers[k] below.
names = ["Nearest Neighbors", "Linear SVC", "Linear SVM", "RBF SVM", "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA"]

# Estimators to compare, in the same order as `names`.
classifiers = [
    KNeighborsClassifier(3),
    LinearSVC(random_state=SEED),
    SVC(kernel="linear", C=1),  # 0.025 was noted as an alternative value for C
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=1, random_state=SEED),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=3,
                           random_state=SEED),
    MLPClassifier(alpha=0.001, max_iter=1000, random_state=SEED),
    AdaBoostClassifier(random_state=SEED),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

# The two lists are paired by position; fail loudly if they drift apart.
assert len(names) == len(classifiers), "names and classifiers must be index-aligned"

In [73]:
# Lookup table from display name to the estimator at the same position.
classifiers_by_name = {
    label: classifiers[position]
    for position, label in enumerate(names)
}

In [74]:
# Directory (relative to the notebook) that holds train.csv and test.csv.
data_dir = "plagiarism_data"

In [75]:

def load_labeled_csv(csv_path):
    """Read a header-less CSV whose first column holds the labels.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.

    Returns
    -------
    tuple
        ``(data, labels, features)`` where ``data`` is the full frame,
        ``labels`` is its first column and ``features`` is every remaining
        column.
    """
    data = pd.read_csv(csv_path, header=None, names=None)
    # labels are in the first column
    return data, data.iloc[:, 0], data.iloc[:, 1:]


# read in training data, assuming it is stored locally
train_data, train_y, train_x = load_labeled_csv(os.path.join(data_dir, "train.csv"))

# read in test data, assuming it is stored locally
test_data, test_y, test_x = load_labeled_csv(os.path.join(data_dir, "test.csv"))

In [80]:
# Number of training examples (one label per row).
len(train_y)

70

In [77]:
# Counter initialised to 1; NOTE(review): not read by any cell visible here.
i = 1

# Wide, empty 27x9 inch canvas.
figure = plt.figure(figsize=(27, 9))

<Figure size 1944x648 with 0 Axes>

In [78]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero.

    Avoids the RuntimeWarning NumPy raises on a zero division, e.g. precision
    for a classifier that never predicts the positive class.
    """
    return numerator / denominator if denominator else float("nan")


# Fit every classifier on the training split and report its test metrics.
for name, clf in zip(names, classifiers):
    clf.fit(train_x, train_y)
    score = clf.score(test_x, test_y)
    print(name, score)

    # rounding and squeezing array
    test_preds = np.squeeze(np.round(clf.predict(test_x)))

    # calculate true positives, false positives, true negatives, false negatives
    # (relies on labels being encoded as 0/1, so `1 - y` flips the class)
    tp = np.logical_and(test_y, test_preds).sum()
    fp = np.logical_and(1 - test_y, test_preds).sum()
    tn = np.logical_and(1 - test_y, 1 - test_preds).sum()
    fn = np.logical_and(test_y, 1 - test_preds).sum()

    # calculate binary classification metrics, guarding empty denominators
    recall = _safe_ratio(tp, tp + fn)
    precision = _safe_ratio(tp, tp + fp)
    accuracy = _safe_ratio(tp + tn, tp + fp + tn + fn)

    # printing a table of metrics
    print(name)
    print(pd.crosstab(test_y, test_preds, rownames=['actual (row)'], colnames=['prediction (col)']))
    print("\n{:<11} {:.3f}".format('Recall:', recall))
    print("{:<11} {:.3f}".format('Precision:', precision))
    print("{:<11} {:.3f}".format('Accuracy:', accuracy))
    print()
        


Nearest Neighbors 0.92
Nearest Neighbors
prediction (col)  0   1
actual (row)           
0                 8   2
1                 0  15

Recall:     1.000
Precision:  0.882
Accuracy:   0.920

Linear SVC 1.0
Linear SVC
prediction (col)   0   1
actual (row)            
0                 10   0
1                  0  15

Recall:     1.000
Precision:  1.000
Accuracy:   1.000

Linear SVM 0.96
Linear SVM
prediction (col)   0   1
actual (row)            
0                 10   0
1                  1  14

Recall:     0.933
Precision:  1.000
Accuracy:   0.960

RBF SVM 0.96
RBF SVM
prediction (col)   0   1
actual (row)            
0                 10   0
1                  1  14

Recall:     0.933
Precision:  1.000
Accuracy:   0.960

Gaussian Process 1.0
Gaussian Process
prediction (col)   0   1
actual (row)            
0                 10   0
1                  0  15

Recall:     1.000
Precision:  1.000
Accuracy:   1.000

Decision Tree 0.96
Decision Tree
prediction (col)  0   1
actual (row)  

  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
