In [1]:
import pandas as pd
import numpy as np
import sklearn.ensemble
import seaborn as sns
# allow plots to appear within the note
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.font_manager

from scipy.stats import randint as sp_randint

from pprint import pprint
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import cross_val_score, RandomizedSearchCV, train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score
from sklearn.metrics import precision_recall_fscore_support as score



In [2]:
def __default_exclusion():
    return ['page_id', 'page', 'category', 'user', 'label','content_token_edit_count_avg', 'content_token_vs_stop_words']

def drop_columns_gm(columns):
    drop_list = __generalized_model_exclusion()
    drop_list.extend(columns)
    return drop_list    

def drop_columns_fm(columns):
    drop_list = __full_model_exclusion()
    drop_list.extend(columns)
    return drop_list

def __full_model_exclusion():
    return __default_exclusion()

def __generalized_model_exclusion():
    exclusion = __default_exclusion()
    exclusion.extend(['page_talk_edits',
                        'tenure',
                        'ns1_edit_dist',
                        'ns2_edit_dist',
                        'ns3_edit_dist',
                        'ns4_edit_dist',
                        'ns5_edit_dist',
                        'ns6_edit_dist',
                        'ns7_edit_dist',
                        'ns8_edit_dist',
                        'ns9_edit_dist',
                        'ns10_edit_dist',
                        'ns11_edit_dist',
                        'ns12_edit_dist',
                        'ns13_edit_dist',
                        'ns14_edit_dist',
                        'ns15_edit_dist',
                        'total_edited_pages'])

    
    return exclusion
    
def get_metrics(classifier, x, y, cv):
    results = cross_val_score(classifier, x, y, cv=cv)
    
    accuracy = results.mean()
    precision = cross_val_score(classifier, x, y, scoring='precision', cv=cv).mean()
    recall = cross_val_score(classifier, x, y, scoring='recall', cv=cv).mean()
    f1 = cross_val_score(classifier, x, y, scoring='f1', cv=cv).mean()
    roc_auc = cross_val_score(classifier, x, y, scoring='roc_auc', cv=cv).mean()

    print(results)
    print('Accuracy: %.3f%%' % accuracy)
    print('Precision: %.3f%%' % precision) 
    print('Recall: %.3f%%' % recall)
    print('F1: %.3f%%' % f1)
    print('ROC AUC: %.3f%%' % roc_auc)
    print('\n')
    return [accuracy, precision, recall, f1, roc_auc]

In [3]:
# loading data (training set)
df = pd.read_csv('data/new_train_data.csv', header=0)
print('Total experts: {}'.format(len(df[df.label == 1])))
print('Total non-experts: {}'.format(len(df[df.label == 0])))

Total experts: 506
Total non-experts: 514


In [4]:
df.drop(['edit_type_exists'], axis=1, inplace=True)

df.rename(columns={'a': 'edit_type_a', 
                   'b': 'edit_type_b', 
                   'c': 'edit_type_c', 
                   'd': 'edit_type_d', 
                   'e': 'edit_type_e', 
                   'f': 'edit_type_f', 
                   'g': 'edit_type_g', 
                   'h': 'edit_type_h', 
                   'i': 'edit_type_i', 
                   'j': 'edit_type_j', 
                   'k': 'edit_type_k', 
                   'l': 'edit_type_l', 
                   'm': 'edit_type_m'}, inplace=True)

In [5]:
edit_types = [col for col in df.columns if str(col).startswith('edit_type')]
print(edit_types)

['edit_type_a', 'edit_type_b', 'edit_type_c', 'edit_type_d', 'edit_type_e', 'edit_type_f', 'edit_type_g', 'edit_type_h', 'edit_type_i', 'edit_type_j', 'edit_type_k', 'edit_type_l', 'edit_type_m']


In [6]:
for edit_type in edit_types:
    df[edit_type].fillna(value=-1, inplace=True)

In [7]:
n_estimators = 160
random_state = 123
kfold = StratifiedKFold(n_splits=10, random_state=random_state)
columns = ['Algorithm', 'Accuracy', 'Precision', 'Recall', 'F1', 'ROC AUC']

X = df.drop(__generalized_model_exclusion(), axis=1)
y = df.label

svm = SVC(kernel='rbf', C=1.0, random_state=random_state)
knn = KNeighborsClassifier(n_neighbors=5)
rf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
gbc = GradientBoostingClassifier(n_estimators = n_estimators, random_state=random_state)
xgb = XGBClassifier(objective='binary:logistic', seed=random_state, n_estimators=n_estimators)

models = [svm, knn, rf, gbc, xgb]

rows = []
for model in models:
    model_name = type(model).__name__
    metrics = get_metrics(classifier=model, x=X, y=y, cv=kfold)
    
    row = [model_name]
    for metric in metrics:
        row.append(metric)
    rows.append(row)

model_df = pd.DataFrame(rows, columns=columns)
model_df.to_csv('data/new_algorithm_comparison_gm.csv', index=False)

[ 0.52427184  0.61165049  0.66990291  0.6407767   0.6372549   0.66666667
  0.66336634  0.68316832  0.61386139  0.6039604 ]
Accuracy: 0.631%
Precision: 0.588%
Recall: 0.866%
F1: 0.700%
ROC AUC: 0.650%


[ 0.51456311  0.60194175  0.51456311  0.61165049  0.56862745  0.60784314
  0.67326733  0.7029703   0.65346535  0.61386139]
Accuracy: 0.606%
Precision: 0.602%
Recall: 0.619%
F1: 0.609%
ROC AUC: 0.642%


[ 0.73786408  0.7184466   0.7184466   0.70873786  0.75490196  0.7254902
  0.72277228  0.81188119  0.78217822  0.72277228]
Accuracy: 0.740%
Precision: 0.734%
Recall: 0.757%
F1: 0.742%
ROC AUC: 0.832%


[ 0.75728155  0.66019417  0.72815534  0.78640777  0.76470588  0.79411765
  0.71287129  0.76237624  0.83168317  0.77227723]
Accuracy: 0.757%
Precision: 0.752%
Recall: 0.773%
F1: 0.759%
ROC AUC: 0.842%


[ 0.74757282  0.67961165  0.67961165  0.76699029  0.7745098   0.80392157
  0.74257426  0.75247525  0.8019802   0.76237624]
Accuracy: 0.751%
Precision: 0.747%
Recall: 0.763%
F1: 0.752%
ROC AUC: 

In [8]:
n_estimators = 160
random_state = 123
kfold = StratifiedKFold(n_splits=10, random_state=random_state)
columns = ['Algorithm', 'Accuracy', 'Precision', 'Recall', 'F1', 'ROC AUC']

X = df.drop(__full_model_exclusion(), axis=1)
y = df.label

svm = SVC(kernel='rbf', C=1.0, random_state=random_state)
knn = KNeighborsClassifier(n_neighbors=5)
rf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
gbc = GradientBoostingClassifier(n_estimators = n_estimators, random_state=random_state)
xgb = XGBClassifier(objective='binary:logistic', seed=random_state, n_estimators=n_estimators)

models = [svm, knn, rf, gbc, xgb]

rows = []
for model in models:
    model_name = type(model).__name__
    metrics = get_metrics(classifier=model, x=X, y=y, cv=kfold)
    
    row = [model_name]
    for metric in metrics:
        row.append(metric)
    rows.append(row)

model_df = pd.DataFrame(rows, columns=columns)
model_df.to_csv('data/new_algorithm_comparison_fm.csv', index=False)

[ 0.55339806  0.60194175  0.6407767   0.65048544  0.62745098  0.64705882
  0.66336634  0.69306931  0.63366337  0.66336634]
Accuracy: 0.637%
Precision: 0.582%
Recall: 0.968%
F1: 0.726%
ROC AUC: 0.684%


[ 0.58252427  0.62135922  0.49514563  0.6407767   0.59803922  0.62745098
  0.69306931  0.72277228  0.67326733  0.62376238]
Accuracy: 0.628%
Precision: 0.617%
Recall: 0.660%
F1: 0.637%
ROC AUC: 0.668%


[ 0.75728155  0.74757282  0.7961165   0.78640777  0.83333333  0.79411765
  0.73267327  0.84158416  0.87128713  0.83168317]
Accuracy: 0.799%
Precision: 0.776%
Recall: 0.846%
F1: 0.807%
ROC AUC: 0.891%


[ 0.78640777  0.76699029  0.72815534  0.7961165   0.82352941  0.78431373
  0.77227723  0.82178218  0.89108911  0.83168317]
Accuracy: 0.800%
Precision: 0.791%
Recall: 0.820%
F1: 0.803%
ROC AUC: 0.883%


[ 0.77669903  0.76699029  0.74757282  0.7961165   0.82352941  0.81372549
  0.79207921  0.82178218  0.87128713  0.76237624]
Accuracy: 0.797%
Precision: 0.781%
Recall: 0.826%
F1: 0.801%
ROC AUC: