In [1]:
import pandas as pd
import numpy as np
import sklearn.ensemble
import seaborn as sns
# allow plots to appear within the note
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.font_manager

from scipy.stats import randint as sp_randint

from pprint import pprint
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import cross_val_score, RandomizedSearchCV, train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score
from sklearn.metrics import precision_recall_fscore_support as score



In [2]:
def __default_exclusion():
    return ['page_id', 'page', 'category', 'user', 'label','content_token_edit_count_avg', 'content_token_vs_stop_words']

def drop_columns_gm(columns):
    drop_list = __generalized_model_exclusion()
    drop_list.extend(columns)
    return drop_list    

def drop_columns_fm(columns):
    drop_list = __full_model_exclusion()
    drop_list.extend(columns)
    return drop_list

def __full_model_exclusion():
    return __default_exclusion()

def __generalized_model_exclusion():
    exclusion = __default_exclusion()
    exclusion.extend(['page_talk_edits',
                        'tenure',
                        'ns1_edit_dist',
                        'ns2_edit_dist',
                        'ns3_edit_dist',
                        'ns4_edit_dist',
                        'ns5_edit_dist',
                        'ns6_edit_dist',
                        'ns7_edit_dist',
                        'ns8_edit_dist',
                        'ns9_edit_dist',
                        'ns10_edit_dist',
                        'ns11_edit_dist',
                        'ns12_edit_dist',
                        'ns13_edit_dist',
                        'ns14_edit_dist',
                        'ns15_edit_dist',
                        'total_edited_pages'])

    
    return exclusion
    
def get_metrics(classifier, x, y, cv):
    results = cross_val_score(classifier, x, y, cv=cv)
    
    accuracy = results.mean()
    precision = cross_val_score(classifier, x, y, scoring='precision', cv=cv).mean()
    recall = cross_val_score(classifier, x, y, scoring='recall', cv=cv).mean()
    f1 = cross_val_score(classifier, x, y, scoring='f1', cv=cv).mean()
    roc_auc = cross_val_score(classifier, x, y, scoring='roc_auc', cv=cv).mean()

    print(results)
    print('Accuracy: %.3f%%' % accuracy)
    print('Precision: %.3f%%' % precision) 
    print('Recall: %.3f%%' % recall)
    print('F1: %.3f%%' % f1)
    print('ROC AUC: %.3f%%' % roc_auc)
    print('\n')
    return [accuracy, precision, recall, f1, roc_auc]

In [3]:
# loading data (training set)
df = pd.read_csv('data/new_train_data.csv', header=0)
print('Total experts: {}'.format(len(df[df.label == 1])))
print('Total non-experts: {}'.format(len(df[df.label == 0])))

Total experts: 506
Total non-experts: 514


In [4]:
n_estimators = 160
random_state = 123
kfold = StratifiedKFold(n_splits=10, random_state=random_state)
model = XGBClassifier(objective='binary:logistic', seed=random_state, n_estimators=n_estimators)

In [5]:
df.drop(['edit_type_exists'], axis=1, inplace=True)
edit_types = [col for col in df.columns if str(col).startswith('edit_type')]
print(edit_types)
for edit_type in edit_types:
    df[edit_type].fillna(value=-1, inplace=True) 

['edit_type_a', 'edit_type_b', 'edit_type_c', 'edit_type_d', 'edit_type_e', 'edit_type_f', 'edit_type_g', 'edit_type_h', 'edit_type_i', 'edit_type_j', 'edit_type_k', 'edit_type_l', 'edit_type_m']


In [6]:
thresholds = np.arange(5)

columns = ['Page Edits Threshold', 'Total Examples', 'Accuracy', 'Precision', 'Recall', 'F1', 'ROC AUC']
rows = []
for threshold in thresholds:
    print('Processing threshold > {}'.format(threshold))
    tmp_df = df[df.page_edits > threshold]
    X = tmp_df.drop(__generalized_model_exclusion(), axis=1)
    y = tmp_df.label
    metrics = get_metrics(classifier=model, x=X, y=y, cv=kfold)
    
    row = ['page_edits > {}'.format(threshold), len(X)]
    for metric in metrics:
        row.append(metric)
    rows.append(row)

model_df = pd.DataFrame(rows, columns=columns)
model_df.head()

model_df.to_csv(r'data/new_page_edits_threshold_effect.csv', index=False)

Processing threshold > 0
[ 0.77669903  0.73786408  0.72815534  0.72815534  0.76470588  0.7254902
  0.71287129  0.69306931  0.79207921  0.74257426]
Accuracy: 0.740%
Precision: 0.735%
Recall: 0.751%
F1: 0.740%
ROC AUC: 0.833%


Processing threshold > 1
[ 0.66666667  0.71111111  0.77777778  0.71111111  0.73333333  0.77777778
  0.71111111  0.73333333  0.69767442  0.8372093 ]
Accuracy: 0.736%
Precision: 0.750%
Recall: 0.798%
F1: 0.770%
ROC AUC: 0.791%


Processing threshold > 2
[ 0.6         0.65517241  0.62068966  0.65517241  0.72413793  0.65517241
  0.65517241  0.85714286  0.67857143  0.85714286]
Accuracy: 0.696%
Precision: 0.740%
Recall: 0.781%
F1: 0.759%
ROC AUC: 0.781%


Processing threshold > 3
[ 0.66666667  0.61904762  0.61904762  0.76190476  0.8         0.8         0.8
  0.7         0.65        0.78947368]
Accuracy: 0.721%
Precision: 0.779%
Recall: 0.815%
F1: 0.791%
ROC AUC: 0.771%


Processing threshold > 4
[ 0.66666667  0.66666667  0.77777778  0.66666667  0.88235294  0.82352941
  