In [2]:
import lazypredict
import sys
import numpy as np
np.set_printoptions(threshold=sys.maxsize)

#Read data file

import pandas as pd

filepath = "balanced_dataset_MASTER.csv"
df = pd.read_csv(filepath)
features = df

# Labels are the values we want to predict
labels = np.array(df['protection_level'])

# 0 => unprotected
# 1 => autoconfirmed
# 2 => extendedconfirmed
# 3 => sysop
labels_encoded = []
for item in labels:
    if(item =="unprotected"):
        labels_encoded.append(0)
    elif(item == "autoconfirmed"):
        labels_encoded.append(1)
    elif(item == "extendedconfirmed"):
        labels_encoded.append(2)
    elif(item == "sysop"):
        labels_encoded.append(3)  
labels_encoded

# Remove the labels from the features
features = features.drop('protection_level', axis = 1)
features = features.drop('page_title', axis = 1)
features = features.drop('protection_expiry', axis = 1)
# features = features.drop('page_id', axis = 1)

# Replace NaN
features = features.replace('Fewer than 30 watchers',np.NaN)
features = features.replace('There may or may not be a watching user visiting recent edits',np.NaN)

#Convert cols to Float
features['page_length'] = features['page_length'].astype(float)
features['total_edits'] = features['total_edits'].astype(float)
features['number_page_watchers'] = features['number_page_watchers'].astype(float)
features['number_page_watchers_recent_edits'] = features['number_page_watchers_recent_edits'].astype(float)

# Saving feature names for later use
feature_list = list(features.columns)

# Convert to numpy array
features = np.array(features)

from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels_encoded, test_size =0.20, random_state = 53)

X_train = train_features
y_train = train_labels
X_test = test_features
y_test = test_labels



from lazypredict.Supervised import LazyClassifier
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)

print(models)

100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [00:07<00:00,  3.92it/s]

                               Accuracy  Balanced Accuracy ROC AUC  F1 Score  \
Model                                                                          
XGBClassifier                      1.00               1.00    None      1.00   
DecisionTreeClassifier             1.00               1.00    None      1.00   
BaggingClassifier                  1.00               1.00    None      1.00   
RandomForestClassifier             1.00               1.00    None      1.00   
LGBMClassifier                     1.00               1.00    None      1.00   
ExtraTreesClassifier               0.99               0.99    None      0.99   
LogisticRegression                 0.98               0.98    None      0.98   
LinearDiscriminantAnalysis         0.98               0.98    None      0.98   
SVC                                0.97               0.97    None      0.97   
KNeighborsClassifier               0.95               0.95    None      0.95   
SGDClassifier                      0.94 




In [3]:
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=5)

features = imputer.fit_transform(features)

from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels_encoded, test_size =0.20, random_state = 53)

X_train = train_features
y_train = train_labels
X_test = test_features
y_test = test_labels

# Random Forest Classifier implementation 

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rf = RandomForestClassifier(max_depth=13, random_state=0, criterion='gini', oob_score= True, n_jobs=4)
rf.fit(X_train, y_train)

#Predict
y_pred = rf.predict(X_test)
predictions = [round(value) for value in y_pred]


#Evaluate
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.4f%%" % (accuracy * 100.0))

Accuracy: 99.8952%


In [9]:
import pickle 

filename = 'rfmodel.sav'

pickle.dump(rf, open(filename, 'wb'))

# Helper methods

In [39]:
def fetch_pageviews(title):
    import pageviewapi
    retry_count = 0
    MAX_RETRIES = 10
#     try:
    page_views = pageviewapi.per_article('en.wikipedia', title, '20150701', '20210607', access='all-access', agent='all-agents', granularity='daily')
#     except ConectionResetError as e:
#         if (retry_count == MAX_RETRIES):
#             raise e
#         time.sleep(5)
#         retry_count +=1
#         page_views = pageviewapi.per_article('en.wikipedia', title, '20150701', '20210607', access='all-access', agent='all-agents', granularity='daily')
    
#     except ZeroOrDataNotLoadedException:
#         print("Got ZeroOrDataNotLoadedException error")
#         pass
    
    view_counter = 0
    for i in range (0, len(page_views['items'])):
        view_counter += page_views['items'][i]['views']
    
    return view_counter

In [41]:
title = "Donald Trump"
page_views = fetch_pageviews(title)
print('Title:', title, 'View Count:',page_views)

Title: Donald Trump View Count: 510613501


In [74]:
def fetch_details_from_info_page(title):
    import requests
    url = "https://en.wikipedia.org/w/index.php?action=info&title=" + title

    html_content = requests.get(url)
    df_list = pd.read_html(html_content.text) # this parses all the tables in webpages to a list
    
    #Get Features from all tables

    #Basic info table
    try:
        display_title = df_list[1][1][0]
    except IndexError:
        print("IndexError for Basic info table, so skipping")
        return
    print("Display Title = ", display_title)

    # Process Table 1 - Basic Information
    dict_table1 = df_list[1].to_dict()
    
    #Declare vars
    page_length = ""
    page_id = ""
    number_page_watchers = ""
    number_page_watchers_recent_edits = ""
    page_views_past_30days = ""
    number_of_redirects = ""
    page_views_past_30days = ""
    total_edits = ""
    recent_number_of_edits = ""
    number_distinct_authors = ""
    number_categories = ""

    for key, value in dict_table1[0].items():  
        if value == 'Page length (in bytes)':        
            page_length = dict_table1[1][key]
            print("Page Length = ", page_length)
            
        elif (value == 'Page ID'):
            page_id = dict_table1[1][key]
            print("Scrapped Page ID = ", page_id)
            
        elif value == 'Number of page watchers':
            number_page_watchers = dict_table1[1][key]
            print("Number of Page Watchers = ", number_page_watchers)
        
        elif value == 'Number of page watchers who visited recent edits':
            number_page_watchers_recent_edits = dict_table1[1][key]
            print("Number of Page Watchers with recent edits = ", number_page_watchers_recent_edits)
        
        elif value == 'Number of redirects to this page':
            number_of_redirects = dict_table1[1][key]
            print("Number of redirects = ", number_of_redirects)
        
        elif value == 'Page views in the past 30 days':
            page_views_past_30days = dict_table1[1][key]
            print("Page views past 30 days = ", page_views_past_30days)
        
    #Process Table 3 - Edit History
    try:
        dict_table3 = df_list[3].to_dict()
        for key, value in dict_table3[0].items():  
            if value == 'Total number of edits':        
                total_edits = dict_table3[1][key]
                print("Total Edits = ", total_edits)
                
            elif value == 'Recent number of edits (within past 30 days)':
                recent_number_of_edits = dict_table3[1][key]
                print("Recent number of edits = ", recent_number_of_edits)
                
            elif value == 'Recent number of distinct authors':
                number_distinct_authors = dict_table3[1][key]
                print("Distinct authors =", number_distinct_authors)
    except IndexError:
        print("Couldn't find the Edit History Table, so skipping...")
        pass

    #Page properties Table
    try:
        categories_string = df_list[4][0][0]
        print(categories_string)
        number_categories = ""
        if  categories_string.startswith("Hidden categories"):         
            #Get number of categories
            for c in categories_string:
                if c.isdigit():
                    number_categories = number_categories + c     
            
            print("Total number of categories = ", number_categories)
    except IndexError:
        print("Couldn't find the Page Properties Table, so skipping...")
        pass

    print("============================================== EOP ======================================")

    features_dict = {   'page_length': page_length, 
                        'page_id': page_id, 
                        'number_page_watchers': number_page_watchers, 
                        'number_page_watchers_recent_edits': number_page_watchers_recent_edits, 
                        'number_of_redirects' : number_of_redirects, 
                        'page_views_past_30days' :page_views_past_30days, 
                        'total_edits': total_edits, 
                        'recent_number_of_edits': recent_number_of_edits, 
                        'number_distinct_authors': number_distinct_authors, 
                        'number_categories': number_categories }

    return features_dict

In [46]:
features_dict = fetch_details_from_info_page(title)
print(features_dict)

Display Title =  Donald Trump
Page Length =  425815
Scrapped Page ID =  4848272
Number of Page Watchers =  3281
Number of Page Watchers with recent edits =  423
Number of redirects =  111
Page views past 30 days =  674919
Total Edits =  36669
Recent number of edits =  161
Distinct authors = 45
Hidden categories (46)
Total number of categories =  46
{'page_length': '425815', 'page_id': '4848272', 'number_page_watchers': '3281', 'number_page_watchers_recent_edits': '423', 'number_of_redirects': '111', 'page_views_past_30days': '674919', 'total_edits': '36669', 'recent_number_of_edits': '161', 'number_distinct_authors': '45', 'number_categories': '46'}


In [78]:
# MAP page_views and features_dict to np input array

def mapping_function(page_views, features_dict):

    features_of_test_sample = np.empty([12,])

    features_of_test_sample[0] = features_dict['page_id']
    features_of_test_sample[1] = page_views
    features_of_test_sample[2] = features_dict['page_length']
    features_of_test_sample[3] = features_dict['number_page_watchers']
    features_of_test_sample[4] = features_dict ['number_page_watchers_recent_edits']
    features_of_test_sample[5] = features_dict['number_of_redirects']
    features_of_test_sample[6] = features_dict['page_views_past_30days']
    features_of_test_sample[7] = features_dict['total_edits']
    features_of_test_sample[8] = features_dict['recent_number_of_edits']
    features_of_test_sample[9] = features_dict['number_distinct_authors']
    features_of_test_sample[10] = features_dict['number_categories']
    features_of_test_sample[11] = features_dict['page_id']
    
    wikipedia_url = "https://en.wikipedia.org/?curid=" + str(features_dict['page_id'])
    
    return features_of_test_sample, wikipedia_url

In [71]:
features_of_test_sample[6]

674919.0

In [48]:
feature_list

['page_id',
 'view_count',
 'page_length',
 'number_page_watchers',
 'number_page_watchers_recent_edits',
 'number_of_redirects',
 'page_views_past_30days',
 'total_edits',
 'recent_number_of_edits',
 'number_distinct_authors',
 'number_categories',
 'page_freshness']

In [53]:
X_test[0]

array([5.0359e+04, 2.7110e+04, 6.2000e+01, 2.8300e+02, 2.0380e+02,
       0.0000e+00, 1.4000e+02, 5.8000e+01, 0.0000e+00, 0.0000e+00,
       7.4000e+00, 2.3010e+03])

# Wrap the model into a function


In [77]:
def get_features(title):
    #Get pageview
    page_views = fetch_pageviews(title)
    print('Tilte:', title, 'View Count:',page_views)
    
    #Get features from info pages 
    features_dict = fetch_details_from_info_page(title)
    
    #MAP both to numpy array
    features_of_test_sample, wikipedia_url = mapping_function(page_views, features_dict)
    
    return features_of_test_sample, wikipedia_url

In [76]:
features_of_test_sample = get_features("Donald Trump")
features_of_test_sample

Tilte: Donald Trump View Count: 510613501
Display Title =  Donald Trump
Page Length =  425815
Scrapped Page ID =  4848272
Number of Page Watchers =  3281
Number of Page Watchers with recent edits =  423
Number of redirects =  111
Page views past 30 days =  674919
Total Edits =  36669
Recent number of edits =  161
Distinct authors = 45
Hidden categories (46)
Total number of categories =  46


array([4.84827200e+06, 5.10613501e+08, 4.25815000e+05, 3.28100000e+03,
       4.23000000e+02, 1.11000000e+02, 6.74919000e+05, 3.66690000e+04,
       1.61000000e+02, 4.50000000e+01, 4.60000000e+01, 4.84827200e+06])

In [84]:
def predict_protection_level(title):
    import pickle
    features_of_test_sample, wikipedia_url = get_features(title)
    print("Page URL: ", wikipedia_url)
    
    #Load the model
    filename = 'rfmodel.sav'
    loaded_model = pickle.load(open(filename, 'rb'))
    
    
    #predict    
    #print("Features 1st row:", X_test[0])
    y_pred = loaded_model.predict([features_of_test_sample])
    
    print("Predicted protection_level: ", y_pred[0])
    
    predicted_protection_level = y_pred    
    
    if(predicted_protection_level == 0):
        predicted_protection_level_str = "unprotected"
    elif(predicted_protection_level == 1):
        predicted_protection_level_str = "autoconfirmed"
    elif(predicted_protection_level == 2):
        predicted_protection_level_str = "extendedconfirmed"
    elif(predicted_protection_level == 3):
        predicted_protection_level_str = "sysop"
        
    print (predicted_protection_level_str)
    
    #Return the predicted value
    return predicted_protection_level_str

In [85]:
predicted_protection_level_str = predict_protection_level("Donald Trump")[0]
print("Protection level:", predicted_protection_level_str)

Tilte: Donald Trump View Count: 510613501
Display Title =  Donald Trump
Page Length =  425815
Scrapped Page ID =  4848272
Number of Page Watchers =  3281
Number of Page Watchers with recent edits =  423
Number of redirects =  111
Page views past 30 days =  674919
Total Edits =  36669
Recent number of edits =  161
Distinct authors = 45
Hidden categories (46)
Total number of categories =  46
Page URL:  https://en.wikipedia.org/?curid=4848272
Predicted protection_level:  2
extendedconfirmed
Protection level: e


In [87]:
def func(str):
    return str + " hello"

func("hi")

'hi hello'

# Apply KNNImputer

In [276]:
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=5)

features = imputer.fit_transform(features)

from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels_encoded, test_size =0.20, random_state = 53)

X_train = train_features
y_train = train_labels
X_test = test_features
y_test = test_labels

from lazypredict.Supervised import LazyClassifier
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)

print(models)



100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [00:11<00:00,  2.57it/s]

                               Accuracy  Balanced Accuracy ROC AUC  F1 Score  \
Model                                                                          
LGBMClassifier                     0.79               0.79    None      0.79   
XGBClassifier                      0.79               0.78    None      0.78   
RandomForestClassifier             0.78               0.78    None      0.78   
ExtraTreesClassifier               0.78               0.77    None      0.77   
BaggingClassifier                  0.76               0.76    None      0.76   
DecisionTreeClassifier             0.71               0.71    None      0.71   
NuSVC                              0.67               0.67    None      0.67   
LogisticRegression                 0.66               0.66    None      0.65   
CalibratedClassifierCV             0.65               0.65    None      0.64   
AdaBoostClassifier                 0.65               0.65    None      0.64   
SVC                                0.63 




In [248]:
# Random Forest Classifier implementation 

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rf = RandomForestClassifier(max_depth=13, random_state=0, criterion='gini', oob_score= True, n_jobs=4)
rf.fit(X_train, y_train)

#Predict
y_pred = rf.predict(X_test)
predictions = [round(value) for value in y_pred]


#Evaluate
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [52]:
type(features)

numpy.ndarray

In [53]:
features

array([[4.43372000e+06, 2.24270000e+04, 7.60000000e+01, 9.00000000e+00,
        4.00000000e+00, 8.01150000e+04, 8.64000000e+02, 1.00000000e+00,
        1.00000000e+00, 2.00000000e+01],
       [3.64853500e+06, 2.48970000e+04, 8.10000000e+01, 1.20000000e+01,
        0.00000000e+00, 5.50190000e+04, 9.62000000e+02, 8.00000000e+00,
        4.00000000e+00, 2.60000000e+01],
       [3.15518300e+06, 3.31190000e+04, 2.44000000e+02, 2.00000000e+01,
        1.40000000e+01, 4.63500000e+04, 9.46000000e+02, 7.00000000e+00,
        6.00000000e+00, 1.50000000e+01],
       [3.15119700e+06, 4.26500000e+03, 3.40000000e+01, 8.00000000e+00,
        0.00000000e+00, 2.63920000e+04, 6.00000000e+01, 0.00000000e+00,
        0.00000000e+00, 6.00000000e+00],
       [2.88526600e+06, 1.80810000e+04, 9.50000000e+01, 1.20000000e+01,
        0.00000000e+00, 1.07570000e+04, 1.93000000e+02, 0.00000000e+00,
        0.00000000e+00, 1.50000000e+01],
       [2.43098900e+06, 4.80000000e+01, 3.32000000e+02, 3.20000000e+01,
   

In [57]:
type(labels_encoded)

list

In [58]:
np.savetxt('features.out', features, delimiter=',') 

In [59]:
labels_np = np.array(labels_encoded)

In [60]:
np.savetxt('labels.out', labels_np, delimiter=',')

In [61]:
import tensorflow as tf

In [63]:
from tensorflow import keras

In [109]:
# define the model

from keras.models import Sequential
from keras.layers import Dense


model = Sequential()
model.add(Dense(32, input_dim=10, kernel_initializer='he_uniform', activation='relu'))

model.add(Dense(16, activation='relu'))

model.add(Dense(8, activation='relu'))
model.add(Dense(4, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [110]:
u_labels = list(set(labels))
one_hot_encoded = []
for item in labels:
    temp = [0] * len(u_labels)
    temp[u_labels.index(item)] = 1
    one_hot_encoded.append(temp)
one_hot_encoded = np.array(one_hot_encoded)

In [111]:
X_train, X_test, y_train, y_test = train_test_split(features, one_hot_encoded, test_size =0.20, random_state = 53)


# Apply minmax scaler

In [159]:
from sklearn.preprocessing import MinMaxScaler
scalermm = MinMaxScaler()

X_train_mm = scalermm.fit_transform(X_train)
X_test_mm = scalermm.fit_transform(X_test)

In [171]:
from lazypredict.Supervised import LazyClassifier
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train_mm, X_test_mm, y_train, y_test)

print(models)

100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [00:12<00:00,  2.23it/s]

                               Accuracy  Balanced Accuracy ROC AUC  F1 Score  \
Model                                                                          
ExtraTreesClassifier               0.74               0.74    None      0.74   
RandomForestClassifier             0.73               0.73    None      0.73   
XGBClassifier                      0.70               0.70    None      0.70   
LGBMClassifier                     0.68               0.69    None      0.68   
NuSVC                              0.66               0.66    None      0.65   
SVC                                0.64               0.64    None      0.63   
BaggingClassifier                  0.63               0.63    None      0.64   
LogisticRegression                 0.61               0.62    None      0.58   
AdaBoostClassifier                 0.61               0.60    None      0.60   
KNeighborsClassifier               0.59               0.59    None      0.59   
CalibratedClassifierCV             0.57 




# Outlier detection and removal

In [188]:
df

Unnamed: 0,page_id,page_title,view_count,protection_level,protection_expiry,page_length,number_page_watchers,number_page_watchers_recent_edits,number_of_redirects,page_views_past_30days,total_edits,recent_number_of_edits,number_distinct_authors,number_categories
0,340663,Paul Reiser,4433720.00,unprotected,,22427,76,9,4,80115,864.00,1.00,1.00,20.00
1,667049,Meg Tilly,3648535.00,unprotected,,24897,81,12,0,55019,962.00,8.00,4.00,26.00
2,2736939,Random number generation,3155183.00,unprotected,,33119,244,20,14,46350,946.00,7.00,6.00,15.00
3,39068821,Paige Howard,3151197.00,unprotected,,4265,34,8,0,26392,60.00,0.00,0.00,6.00
4,41414319,Sophia Amoruso,2885266.00,unprotected,,18081,Fewer than 30 watchers,,0,10757,193.00,0.00,0.00,15.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4762,3120522,Kamala Harris,65958868.00,extendedconfirmed,infinity,229220,754,194,59,260693,6563.00,14.00,9.00,26.00
4763,225891,Kamma (caste),1107024.00,extendedconfirmed,infinity,40783,107,5,6,17785,4432.00,0.00,0.00,11.00
4764,2340298,Kannur,938453.00,extendedconfirmed,2022-01-15T21:49:52Z,55720,91,9,12,9383,2438.00,1.00,1.00,21.00
4765,865032,Kannur district,410880.00,extendedconfirmed,2022-01-15T20:08:42Z,51713,44,7,7,5147,1783.00,12.00,2.00,15.00


In [189]:
from scipy import stats
df[(np.abs(stats.zscore(df)) < 3).all(axis=1)]

TypeError: can only concatenate str (not "float") to str

In [190]:
dropped_cols = ['page_id', 'page_title', 'protection_level', 'protection_expiry']
new_df = df.drop(dropped_cols, axis=1)

In [191]:
new_df

Unnamed: 0,view_count,page_length,number_page_watchers,number_page_watchers_recent_edits,number_of_redirects,page_views_past_30days,total_edits,recent_number_of_edits,number_distinct_authors,number_categories
0,4433720.00,22427,76,9,4,80115,864.00,1.00,1.00,20.00
1,3648535.00,24897,81,12,0,55019,962.00,8.00,4.00,26.00
2,3155183.00,33119,244,20,14,46350,946.00,7.00,6.00,15.00
3,3151197.00,4265,34,8,0,26392,60.00,0.00,0.00,6.00
4,2885266.00,18081,Fewer than 30 watchers,,0,10757,193.00,0.00,0.00,15.00
...,...,...,...,...,...,...,...,...,...,...
4762,65958868.00,229220,754,194,59,260693,6563.00,14.00,9.00,26.00
4763,1107024.00,40783,107,5,6,17785,4432.00,0.00,0.00,11.00
4764,938453.00,55720,91,9,12,9383,2438.00,1.00,1.00,21.00
4765,410880.00,51713,44,7,7,5147,1783.00,12.00,2.00,15.00


In [192]:
new_df = new_df.replace('Fewer than 30 watchers',np.NaN)

new_df = new_df.replace('There may or may not be a watching user visiting recent edits',np.NaN)

In [193]:
new_df

Unnamed: 0,view_count,page_length,number_page_watchers,number_page_watchers_recent_edits,number_of_redirects,page_views_past_30days,total_edits,recent_number_of_edits,number_distinct_authors,number_categories
0,4433720.00,22427,76,9,4,80115,864.00,1.00,1.00,20.00
1,3648535.00,24897,81,12,0,55019,962.00,8.00,4.00,26.00
2,3155183.00,33119,244,20,14,46350,946.00,7.00,6.00,15.00
3,3151197.00,4265,34,8,0,26392,60.00,0.00,0.00,6.00
4,2885266.00,18081,,,0,10757,193.00,0.00,0.00,15.00
...,...,...,...,...,...,...,...,...,...,...
4762,65958868.00,229220,754,194,59,260693,6563.00,14.00,9.00,26.00
4763,1107024.00,40783,107,5,6,17785,4432.00,0.00,0.00,11.00
4764,938453.00,55720,91,9,12,9383,2438.00,1.00,1.00,21.00
4765,410880.00,51713,44,7,7,5147,1783.00,12.00,2.00,15.00


In [194]:
new_df['page_length'] = new_df['page_length'].astype(float)
new_df['total_edits'] = new_df['total_edits'].astype(float)
new_df['number_page_watchers'] = new_df['number_page_watchers'].astype(float)
new_df['number_page_watchers_recent_edits'] = new_df['number_page_watchers_recent_edits'].astype(float)

In [196]:
new_df

Unnamed: 0,view_count,page_length,number_page_watchers,number_page_watchers_recent_edits,number_of_redirects,page_views_past_30days,total_edits,recent_number_of_edits,number_distinct_authors,number_categories
0,4433720.00,22427.00,76.00,9.00,4,80115,864.00,1.00,1.00,20.00
1,3648535.00,24897.00,81.00,12.00,0,55019,962.00,8.00,4.00,26.00
2,3155183.00,33119.00,244.00,20.00,14,46350,946.00,7.00,6.00,15.00
3,3151197.00,4265.00,34.00,8.00,0,26392,60.00,0.00,0.00,6.00
4,2885266.00,18081.00,,,0,10757,193.00,0.00,0.00,15.00
...,...,...,...,...,...,...,...,...,...,...
4762,65958868.00,229220.00,754.00,194.00,59,260693,6563.00,14.00,9.00,26.00
4763,1107024.00,40783.00,107.00,5.00,6,17785,4432.00,0.00,0.00,11.00
4764,938453.00,55720.00,91.00,9.00,12,9383,2438.00,1.00,1.00,21.00
4765,410880.00,51713.00,44.00,7.00,7,5147,1783.00,12.00,2.00,15.00


In [200]:
for item in new_df.iloc[0]:
    print(type(item))

<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>


In [209]:
new_df[(np.abs(stats.zscore(new_df)) < 0).all(axis=1)]

Unnamed: 0,view_count,page_length,number_page_watchers,number_page_watchers_recent_edits,number_of_redirects,page_views_past_30days,total_edits,recent_number_of_edits,number_distinct_authors,number_categories


In [205]:
new_df

Unnamed: 0,view_count,page_length,number_page_watchers,number_page_watchers_recent_edits,number_of_redirects,page_views_past_30days,total_edits,recent_number_of_edits,number_distinct_authors,number_categories
0,4433720.00,22427.00,76.00,9.00,4,80115,864.00,1.00,1.00,20.00
1,3648535.00,24897.00,81.00,12.00,0,55019,962.00,8.00,4.00,26.00
2,3155183.00,33119.00,244.00,20.00,14,46350,946.00,7.00,6.00,15.00
3,3151197.00,4265.00,34.00,8.00,0,26392,60.00,0.00,0.00,6.00
4,2885266.00,18081.00,,,0,10757,193.00,0.00,0.00,15.00
...,...,...,...,...,...,...,...,...,...,...
4762,65958868.00,229220.00,754.00,194.00,59,260693,6563.00,14.00,9.00,26.00
4763,1107024.00,40783.00,107.00,5.00,6,17785,4432.00,0.00,0.00,11.00
4764,938453.00,55720.00,91.00,9.00,12,9383,2438.00,1.00,1.00,21.00
4765,410880.00,51713.00,44.00,7.00,7,5147,1783.00,12.00,2.00,15.00


In [210]:
new_df[(np.abs(stats.zscore(new_df[0])) < 3)]

KeyError: 0

In [211]:
new_df

Unnamed: 0,view_count,page_length,number_page_watchers,number_page_watchers_recent_edits,number_of_redirects,page_views_past_30days,total_edits,recent_number_of_edits,number_distinct_authors,number_categories
0,4433720.00,22427.00,76.00,9.00,4,80115,864.00,1.00,1.00,20.00
1,3648535.00,24897.00,81.00,12.00,0,55019,962.00,8.00,4.00,26.00
2,3155183.00,33119.00,244.00,20.00,14,46350,946.00,7.00,6.00,15.00
3,3151197.00,4265.00,34.00,8.00,0,26392,60.00,0.00,0.00,6.00
4,2885266.00,18081.00,,,0,10757,193.00,0.00,0.00,15.00
...,...,...,...,...,...,...,...,...,...,...
4762,65958868.00,229220.00,754.00,194.00,59,260693,6563.00,14.00,9.00,26.00
4763,1107024.00,40783.00,107.00,5.00,6,17785,4432.00,0.00,0.00,11.00
4764,938453.00,55720.00,91.00,9.00,12,9383,2438.00,1.00,1.00,21.00
4765,410880.00,51713.00,44.00,7.00,7,5147,1783.00,12.00,2.00,15.00


In [212]:
new_df['view_count'].min()

1.0

In [213]:
new_df['view_count'].max()

510613501.0

In [214]:
new_df.head()

Unnamed: 0,view_count,page_length,number_page_watchers,number_page_watchers_recent_edits,number_of_redirects,page_views_past_30days,total_edits,recent_number_of_edits,number_distinct_authors,number_categories
0,4433720.0,22427.0,76.0,9.0,4,80115,864.0,1.0,1.0,20.0
1,3648535.0,24897.0,81.0,12.0,0,55019,962.0,8.0,4.0,26.0
2,3155183.0,33119.0,244.0,20.0,14,46350,946.0,7.0,6.0,15.0
3,3151197.0,4265.0,34.0,8.0,0,26392,60.0,0.0,0.0,6.0
4,2885266.0,18081.0,,,0,10757,193.0,0.0,0.0,15.0


In [216]:
for col in new_df.columns:
    print(col, "|", "MIN:", new_df[col].min(), "|", "MAX:", new_df[col].max())    

view_count | MIN: 1.0 | MAX: 510613501.0
page_length | MIN: 0.0 | MAX: 424753.0
number_page_watchers | MIN: 30.0 | MAX: 4756.0
number_page_watchers_recent_edits | MIN: 2.0 | MAX: 1001.0
number_of_redirects | MIN: 0 | MAX: 200
page_views_past_30days | MIN: 0 | MAX: 4130509
total_edits | MIN: 1.0 | MAX: 36581.0
recent_number_of_edits | MIN: 0.0 | MAX: 554.0
number_distinct_authors | MIN: 0.0 | MAX: 116.0
number_categories | MIN: 2.0 | MAX: 89.0


In [219]:
#Create train test split
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels_encoded, test_size =0.20, random_state = 53)



# Isolation Forest - outlier removal trial

In [256]:
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=5)

features = imputer.fit_transform(features)

from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels_encoded, test_size =0.20, random_state = 53)

X_train = train_features
y_train = train_labels
X_test = test_features
y_test = test_labels

from lazypredict.Supervised import LazyClassifier
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)

print(models)

100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [00:10<00:00,  2.71it/s]

                               Accuracy  Balanced Accuracy ROC AUC  F1 Score  \
Model                                                                          
LGBMClassifier                     0.79               0.79    None      0.79   
XGBClassifier                      0.79               0.78    None      0.78   
RandomForestClassifier             0.78               0.78    None      0.78   
ExtraTreesClassifier               0.78               0.77    None      0.77   
BaggingClassifier                  0.76               0.76    None      0.76   
DecisionTreeClassifier             0.71               0.71    None      0.71   
NuSVC                              0.67               0.67    None      0.67   
LogisticRegression                 0.66               0.66    None      0.65   
CalibratedClassifierCV             0.65               0.65    None      0.64   
AdaBoostClassifier                 0.65               0.65    None      0.64   
SVC                                0.63 




In [257]:
from sklearn.ensemble import IsolationForest
# identify outliers in the training dataset
iso = IsolationForest(contamination=0.1)
yhat = iso.fit_predict(X_train)

In [259]:
mask = yhat != -1
y_train = np.array(y_train)
X_train, y_train = X_train[mask, :], y_train[mask]

In [235]:
X_train.shape

(3431, 10)

In [236]:
y_train.shape

(3431,)

In [260]:
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)

print(models)

100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [00:09<00:00,  2.91it/s]

                               Accuracy  Balanced Accuracy ROC AUC  F1 Score  \
Model                                                                          
RandomForestClassifier             0.78               0.77    None      0.77   
ExtraTreesClassifier               0.78               0.77    None      0.77   
LGBMClassifier                     0.77               0.77    None      0.77   
XGBClassifier                      0.77               0.77    None      0.77   
BaggingClassifier                  0.77               0.77    None      0.77   
DecisionTreeClassifier             0.69               0.69    None      0.69   
AdaBoostClassifier                 0.68               0.68    None      0.68   
LogisticRegression                 0.68               0.68    None      0.67   
NuSVC                              0.67               0.67    None      0.67   
CalibratedClassifierCV             0.66               0.66    None      0.65   
SVC                                0.66 




In [261]:
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=5)

features = imputer.fit_transform(features)

from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels_encoded, test_size =0.20, random_state = 53)

X_train = train_features
y_train = train_labels
X_test = test_features
y_test = test_labels

from lazypredict.Supervised import LazyClassifier
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)

print(models)

100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [00:11<00:00,  2.60it/s]

                               Accuracy  Balanced Accuracy ROC AUC  F1 Score  \
Model                                                                          
LGBMClassifier                     0.79               0.79    None      0.79   
XGBClassifier                      0.79               0.78    None      0.78   
RandomForestClassifier             0.78               0.78    None      0.78   
ExtraTreesClassifier               0.78               0.77    None      0.77   
BaggingClassifier                  0.76               0.76    None      0.76   
DecisionTreeClassifier             0.71               0.71    None      0.71   
NuSVC                              0.67               0.67    None      0.67   
LogisticRegression                 0.66               0.66    None      0.65   
CalibratedClassifierCV             0.65               0.65    None      0.64   
AdaBoostClassifier                 0.65               0.65    None      0.64   
SVC                                0.63 




In [239]:
# Get outliers
mask = (yhat == -1)
X_train_outliers, y_train_outliers = X_train[mask, :], y_train_np_array[mask]

IndexError: boolean index did not match indexed array along dimension 0; dimension is 3431 but corresponding boolean dimension is 3813

In [243]:
type(yhat)

numpy.ndarray

In [244]:
from collections import Counter

Counter(yhat)

Counter({1: 3431, -1: 382})

In [None]:
from sklearn.ensemble import EllipticEnvelope
# identify outliers in the training dataset
ee = EllipticEnvelope(contamination=0.01)
yhat = ee.fit_predict(X_train)

In [262]:
X_train.shape

(3813, 11)

In [269]:
from lazypredict.Supervised import LazyClassifier
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)

100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [00:10<00:00,  2.64it/s]


In [267]:
y_test

[3,
 1,
 3,
 1,
 1,
 3,
 3,
 2,
 1,
 0,
 0,
 2,
 0,
 0,
 1,
 3,
 1,
 1,
 3,
 2,
 1,
 0,
 2,
 3,
 2,
 1,
 3,
 2,
 3,
 0,
 0,
 1,
 1,
 3,
 3,
 1,
 3,
 2,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 2,
 3,
 1,
 2,
 2,
 3,
 0,
 1,
 1,
 1,
 0,
 2,
 1,
 1,
 2,
 1,
 3,
 1,
 0,
 1,
 2,
 1,
 2,
 3,
 3,
 1,
 1,
 3,
 1,
 2,
 1,
 0,
 1,
 2,
 2,
 3,
 0,
 1,
 3,
 0,
 0,
 1,
 1,
 2,
 0,
 3,
 3,
 1,
 0,
 3,
 0,
 0,
 1,
 1,
 1,
 2,
 0,
 3,
 2,
 2,
 0,
 3,
 2,
 3,
 2,
 1,
 1,
 2,
 0,
 0,
 0,
 3,
 0,
 0,
 1,
 3,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 2,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 2,
 2,
 3,
 1,
 0,
 3,
 0,
 3,
 0,
 3,
 3,
 3,
 0,
 1,
 1,
 0,
 3,
 0,
 0,
 1,
 3,
 0,
 1,
 1,
 1,
 1,
 1,
 3,
 2,
 2,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 2,
 2,
 0,
 3,
 3,
 2,
 3,
 0,
 3,
 1,
 1,
 0,
 2,
 0,
 2,
 1,
 3,
 0,
 3,
 0,
 3,
 2,
 2,
 1,
 0,
 1,
 1,
 3,
 0,
 2,
 2,
 0,
 2,
 3,
 3,
 1,
 1,
 2,
 2,
 3,
 1,
 2,
 0,
 1,
 1,
 3,
 3,
 3,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 3,
 0,
 1,
 3,
 0,
 0,
 2,
 3,
 1,
 2,
 0,
 2,
 2,
 3,
 1,
 3,
 1,
 1,


In [270]:
# Random Forest Classifier implementation 

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rf = RandomForestClassifier(max_depth=13, random_state=0, criterion='gini', oob_score= True, n_jobs=4)
rf.fit(X_train, y_train)

#Predict
y_pred = rf.predict(X_test)
predictions = [round(value) for value in y_pred]


#Evaluate
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 79.25%


In [272]:
models

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LGBMClassifier,0.79,0.79,,0.79,0.47
XGBClassifier,0.79,0.78,,0.78,0.69
RandomForestClassifier,0.78,0.78,,0.78,0.98
ExtraTreesClassifier,0.78,0.77,,0.77,0.67
BaggingClassifier,0.76,0.76,,0.76,0.36
DecisionTreeClassifier,0.71,0.71,,0.71,0.07
NuSVC,0.67,0.67,,0.67,0.9
LogisticRegression,0.66,0.66,,0.65,0.22
CalibratedClassifierCV,0.65,0.65,,0.64,3.15
AdaBoostClassifier,0.65,0.65,,0.64,0.37


In [274]:
models

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
RandomForestClassifier,0.81,0.8,,0.8,0.87
LGBMClassifier,0.8,0.8,,0.8,0.62
XGBClassifier,0.8,0.8,,0.8,0.73
ExtraTreesClassifier,0.79,0.78,,0.78,0.62
BaggingClassifier,0.75,0.75,,0.75,0.27
DecisionTreeClassifier,0.71,0.71,,0.71,0.06
KNeighborsClassifier,0.7,0.7,,0.7,0.1
NuSVC,0.69,0.69,,0.69,0.81
LogisticRegression,0.67,0.67,,0.66,0.24
AdaBoostClassifier,0.67,0.67,,0.66,0.37


In [298]:
# Random Forest Classifier implementation 

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rf = RandomForestClassifier(max_depth=15, random_state=0, oob_score= True, n_jobs=4)
rf.fit(X_train, y_train)

#Predict
y_pred = rf.predict(X_test)
predictions = [round(value) for value in y_pred]


#Evaluate
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.4f%%" % (accuracy * 100.0))

Accuracy: 79.8742%


In [299]:
y_pred

array([3, 3, 3, 2, 2, 3, 3, 2, 1, 0, 0, 2, 0, 2, 1, 3, 1, 1, 3, 1, 0, 0,
       2, 3, 0, 3, 3, 2, 3, 0, 0, 1, 1, 3, 3, 1, 3, 0, 2, 1, 0, 1, 1, 1,
       1, 0, 1, 2, 3, 1, 1, 1, 3, 0, 1, 3, 1, 0, 2, 1, 1, 2, 3, 3, 1, 0,
       1, 2, 1, 2, 3, 3, 1, 1, 3, 1, 2, 1, 0, 2, 2, 2, 3, 0, 1, 3, 0, 2,
       1, 1, 2, 0, 3, 3, 1, 1, 3, 0, 0, 1, 3, 1, 1, 0, 3, 1, 2, 0, 3, 3,
       3, 1, 1, 1, 2, 0, 0, 0, 3, 0, 0, 2, 3, 0, 2, 0, 0, 1, 1, 0, 0, 1,
       2, 1, 0, 2, 2, 1, 1, 0, 2, 2, 3, 1, 2, 3, 0, 3, 0, 3, 3, 3, 0, 1,
       1, 2, 3, 0, 0, 1, 3, 2, 2, 1, 2, 1, 1, 3, 2, 2, 1, 2, 1, 0, 1, 1,
       2, 2, 2, 0, 3, 3, 0, 3, 0, 3, 1, 3, 0, 2, 0, 2, 1, 3, 0, 3, 1, 3,
       2, 2, 1, 0, 1, 0, 3, 0, 2, 2, 0, 2, 3, 3, 1, 1, 2, 0, 3, 1, 0, 0,
       1, 1, 3, 3, 3, 1, 2, 0, 0, 1, 3, 1, 3, 0, 1, 3, 0, 0, 2, 3, 1, 1,
       0, 2, 2, 3, 1, 3, 2, 0, 1, 3, 2, 1, 2, 2, 0, 0, 3, 1, 1, 3, 2, 2,
       1, 3, 1, 1, 1, 2, 1, 3, 3, 1, 2, 2, 1, 1, 3, 3, 3, 1, 1, 2, 0, 3,
       3, 3, 3, 0, 3, 2, 2, 2, 3, 3, 2, 1, 2, 2, 2,

In [303]:
for pred, gt in zip(y_pred, y_test):
    if(pred != gt):
        print("Mismatch occured at: ", y_test.index(pred))
        

AttributeError: 'numpy.ndarray' object has no attribute 'index'

In [308]:
list_mismatches = []
counter_mismatches = 0
for i in range(0, len(y_pred)):
    if(y_pred[i] != y_test[i]):
        print("Mismatch occured at: ", i)
        list_mismatches.append(i)
        counter_mismatches += 1
    

Mismatch occured at:  1
Mismatch occured at:  3
Mismatch occured at:  4
Mismatch occured at:  13
Mismatch occured at:  19
Mismatch occured at:  20
Mismatch occured at:  24
Mismatch occured at:  25
Mismatch occured at:  37
Mismatch occured at:  38
Mismatch occured at:  39
Mismatch occured at:  50
Mismatch occured at:  51
Mismatch occured at:  55
Mismatch occured at:  62
Mismatch occured at:  79
Mismatch occured at:  87
Mismatch occured at:  95
Mismatch occured at:  100
Mismatch occured at:  102
Mismatch occured at:  105
Mismatch occured at:  109
Mismatch occured at:  111
Mismatch occured at:  121
Mismatch occured at:  124
Mismatch occured at:  135
Mismatch occured at:  136
Mismatch occured at:  144
Mismatch occured at:  155
Mismatch occured at:  161
Mismatch occured at:  162
Mismatch occured at:  164
Mismatch occured at:  171
Mismatch occured at:  172
Mismatch occured at:  175
Mismatch occured at:  176
Mismatch occured at:  182
Mismatch occured at:  187
Mismatch occured at:  196
Mismatc

In [309]:
len(y_pred)

954

In [310]:
counter_mismatches

192

In [312]:
for item in list_mismatches:
    print(item)
    print(X_test[item])

1
[1.922933e+07 1.779500e+04 1.480000e+02 7.200000e+01 6.160000e+01
 0.000000e+00 0.000000e+00 3.000000e+01 0.000000e+00 0.000000e+00
 5.000000e+00]
3
[6.3415398e+07 3.3210000e+03 7.3000000e+01 6.3000000e+01 8.0000000e+00
 0.0000000e+00 2.0000000e+00 1.0000000e+00 0.0000000e+00 0.0000000e+00
 3.0000000e+00]
4
[2.8269389e+07 4.2530000e+04 2.9222000e+04 9.7600000e+01 4.4000000e+01
 2.0000000e+00 5.5700000e+02 3.3200000e+02 0.0000000e+00 0.0000000e+00
 9.0000000e+00]
13
[3.572726e+06 2.232700e+04 5.220000e+03 3.660000e+01 9.600000e+00
 3.000000e+00 1.220000e+02 9.000000e+01 0.000000e+00 0.000000e+00
 8.000000e+00]
19
[4.5358157e+07 2.8013900e+05 1.4236000e+04 1.0440000e+02 2.3400000e+01
 3.0000000e+00 1.0908000e+04 1.2800000e+02 0.0000000e+00 0.0000000e+00
 7.0000000e+00]
20
[3.4747225e+07 5.9570000e+03 1.0230000e+03 4.3800000e+01 1.0800000e+01
 1.0000000e+00 6.2000000e+01 4.0000000e+00 0.0000000e+00 0.0000000e+00
 4.0000000e+00]
24
[6.980783e+06 7.211400e+04 7.799000e+03 4.460000e+01 1.2

In [319]:
import lazypredict
import sys
import numpy as np
np.set_printoptions(threshold=sys.maxsize)

#Read data file

import pandas as pd

filepath = "balanced_dataset_MASTER.csv"
df = pd.read_csv(filepath)
features = df

# Labels are the values we want to predict
labels = np.array(df['protection_level'])

# 0 => unprotected
# 1 => autoconfirmed
# 2 => extendedconfirmed
# 3 => sysop
labels_encoded = []
for item in labels:
    if(item =="unprotected"):
        labels_encoded.append(0)
    elif(item == "autoconfirmed"):
        labels_encoded.append(1)
    elif(item == "extendedconfirmed"):
        labels_encoded.append(2)
    elif(item == "sysop"):
        labels_encoded.append(3)  
labels_encoded

# Remove the labels from the features
features = features.drop('protection_level', axis = 1)
features = features.drop('page_title', axis = 1)
features = features.drop('protection_expiry', axis = 1)
# features = features.drop('page_id', axis = 1)

# Replace NaN
features = features.replace('Fewer than 30 watchers',np.NaN)
features = features.replace('There may or may not be a watching user visiting recent edits',np.NaN)

#Convert cols to Float
features['page_length'] = features['page_length'].astype(float)
features['total_edits'] = features['total_edits'].astype(float)
features['number_page_watchers'] = features['number_page_watchers'].astype(float)
features['number_page_watchers_recent_edits'] = features['number_page_watchers_recent_edits'].astype(float)

# Saving feature names for later use
feature_list = list(features.columns)

# Convert to numpy array
features = np.array(features)

from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels_encoded, test_size =0.20, random_state = 53)

X_train = train_features
y_train = train_labels
X_test = test_features
y_test = test_labels



from lazypredict.Supervised import LazyClassifier

clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)

print(models)

100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [00:12<00:00,  2.38it/s]

                               Accuracy  Balanced Accuracy ROC AUC  F1 Score  \
Model                                                                          
RandomForestClassifier             0.81               0.80    None      0.80   
LGBMClassifier                     0.80               0.80    None      0.80   
XGBClassifier                      0.80               0.80    None      0.80   
ExtraTreesClassifier               0.79               0.78    None      0.78   
BaggingClassifier                  0.75               0.75    None      0.75   
DecisionTreeClassifier             0.71               0.71    None      0.71   
KNeighborsClassifier               0.70               0.70    None      0.70   
NuSVC                              0.69               0.69    None      0.69   
LogisticRegression                 0.67               0.67    None      0.66   
AdaBoostClassifier                 0.67               0.67    None      0.66   
SVC                                0.64 




In [376]:
#Apply imputer
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=5)

features = imputer.fit_transform(features)

from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels_encoded, test_size =0.20, random_state = 53)

X_train = train_features
y_train = train_labels
X_test = test_features
y_test = test_labels

# Random Forest Classifier implementation 

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rf = RandomForestClassifier(max_depth=13, random_state=0, criterion='gini', oob_score= True, n_jobs=4)
rf.fit(X_train, y_train)

#Predict
y_pred = rf.predict(X_test)
predictions = [round(value) for value in y_pred]


#Evaluate
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 79.25%


In [322]:
from sklearn.metrics import roc_auc_score


In [377]:
roc_auc_score(labels_encoded, rf.predict_proba(features), multi_class='ovr')

0.9910625172356262

In [380]:
r2_score(y_test, predictions)

0.6171872094783041

In [329]:
rf.predict_proba(features).shape

(4767, 4)

# K-fold cross validation

In [366]:
import pandas
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR
import numpy as np

X = features
y = labels_encoded

y = np.array(y)
y = y.reshape(4767,)


scores = []

counter =0

cv = KFold(n_splits=10, random_state=42, shuffle=False)
for train_index, test_index in cv.split(X):
    counter +=1
    if(counter==3):
        third_split_X_train = X[train_index]
        third_split_y_train = y[train_index]
        third_split_X_test = X[test_index]
        third_split_y_test = y[test_index]
    print("Train Index: ", train_index, "\n")
    print("Test Index: ", test_index)
    
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]

#     X_train, X_test, y_train, y_test = X[train_index], X[test_index], y[train_index], y[test_index]
    rf.fit(X_train, y_train)
    scores.append(rf.score(X_test, y_test)) 

Train Index:  [ 477  478  479  480  481  482  483  484  485  486  487  488  489  490
  491  492  493  494  495  496  497  498  499  500  501  502  503  504
  505  506  507  508  509  510  511  512  513  514  515  516  517  518
  519  520  521  522  523  524  525  526  527  528  529  530  531  532
  533  534  535  536  537  538  539  540  541  542  543  544  545  546
  547  548  549  550  551  552  553  554  555  556  557  558  559  560
  561  562  563  564  565  566  567  568  569  570  571  572  573  574
  575  576  577  578  579  580  581  582  583  584  585  586  587  588
  589  590  591  592  593  594  595  596  597  598  599  600  601  602
  603  604  605  606  607  608  609  610  611  612  613  614  615  616
  617  618  619  620  621  622  623  624  625  626  627  628  629  630
  631  632  633  634  635  636  637  638  639  640  641  642  643  644
  645  646  647  648  649  650  651  652  653  654  655  656  657  658
  659  660  661  662  663  664  665  666  667  668  669  670  6

Train Index:  [   0    1    2    3    4    5    6    7    8    9   10   11   12   13
   14   15   16   17   18   19   20   21   22   23   24   25   26   27
   28   29   30   31   32   33   34   35   36   37   38   39   40   41
   42   43   44   45   46   47   48   49   50   51   52   53   54   55
   56   57   58   59   60   61   62   63   64   65   66   67   68   69
   70   71   72   73   74   75   76   77   78   79   80   81   82   83
   84   85   86   87   88   89   90   91   92   93   94   95   96   97
   98   99  100  101  102  103  104  105  106  107  108  109  110  111
  112  113  114  115  116  117  118  119  120  121  122  123  124  125
  126  127  128  129  130  131  132  133  134  135  136  137  138  139
  140  141  142  143  144  145  146  147  148  149  150  151  152  153
  154  155  156  157  158  159  160  161  162  163  164  165  166  167
  168  169  170  171  172  173  174  175  176  177  178  179  180  181
  182  183  184  185  186  187  188  189  190  191  192  193  1

Train Index:  [   0    1    2    3    4    5    6    7    8    9   10   11   12   13
   14   15   16   17   18   19   20   21   22   23   24   25   26   27
   28   29   30   31   32   33   34   35   36   37   38   39   40   41
   42   43   44   45   46   47   48   49   50   51   52   53   54   55
   56   57   58   59   60   61   62   63   64   65   66   67   68   69
   70   71   72   73   74   75   76   77   78   79   80   81   82   83
   84   85   86   87   88   89   90   91   92   93   94   95   96   97
   98   99  100  101  102  103  104  105  106  107  108  109  110  111
  112  113  114  115  116  117  118  119  120  121  122  123  124  125
  126  127  128  129  130  131  132  133  134  135  136  137  138  139
  140  141  142  143  144  145  146  147  148  149  150  151  152  153
  154  155  156  157  158  159  160  161  162  163  164  165  166  167
  168  169  170  171  172  173  174  175  176  177  178  179  180  181
  182  183  184  185  186  187  188  189  190  191  192  193  1

Train Index:  [   0    1    2    3    4    5    6    7    8    9   10   11   12   13
   14   15   16   17   18   19   20   21   22   23   24   25   26   27
   28   29   30   31   32   33   34   35   36   37   38   39   40   41
   42   43   44   45   46   47   48   49   50   51   52   53   54   55
   56   57   58   59   60   61   62   63   64   65   66   67   68   69
   70   71   72   73   74   75   76   77   78   79   80   81   82   83
   84   85   86   87   88   89   90   91   92   93   94   95   96   97
   98   99  100  101  102  103  104  105  106  107  108  109  110  111
  112  113  114  115  116  117  118  119  120  121  122  123  124  125
  126  127  128  129  130  131  132  133  134  135  136  137  138  139
  140  141  142  143  144  145  146  147  148  149  150  151  152  153
  154  155  156  157  158  159  160  161  162  163  164  165  166  167
  168  169  170  171  172  173  174  175  176  177  178  179  180  181
  182  183  184  185  186  187  188  189  190  191  192  193  1

Train Index:  [   0    1    2    3    4    5    6    7    8    9   10   11   12   13
   14   15   16   17   18   19   20   21   22   23   24   25   26   27
   28   29   30   31   32   33   34   35   36   37   38   39   40   41
   42   43   44   45   46   47   48   49   50   51   52   53   54   55
   56   57   58   59   60   61   62   63   64   65   66   67   68   69
   70   71   72   73   74   75   76   77   78   79   80   81   82   83
   84   85   86   87   88   89   90   91   92   93   94   95   96   97
   98   99  100  101  102  103  104  105  106  107  108  109  110  111
  112  113  114  115  116  117  118  119  120  121  122  123  124  125
  126  127  128  129  130  131  132  133  134  135  136  137  138  139
  140  141  142  143  144  145  146  147  148  149  150  151  152  153
  154  155  156  157  158  159  160  161  162  163  164  165  166  167
  168  169  170  171  172  173  174  175  176  177  178  179  180  181
  182  183  184  185  186  187  188  189  190  191  192  193  1

Train Index:  [   0    1    2    3    4    5    6    7    8    9   10   11   12   13
   14   15   16   17   18   19   20   21   22   23   24   25   26   27
   28   29   30   31   32   33   34   35   36   37   38   39   40   41
   42   43   44   45   46   47   48   49   50   51   52   53   54   55
   56   57   58   59   60   61   62   63   64   65   66   67   68   69
   70   71   72   73   74   75   76   77   78   79   80   81   82   83
   84   85   86   87   88   89   90   91   92   93   94   95   96   97
   98   99  100  101  102  103  104  105  106  107  108  109  110  111
  112  113  114  115  116  117  118  119  120  121  122  123  124  125
  126  127  128  129  130  131  132  133  134  135  136  137  138  139
  140  141  142  143  144  145  146  147  148  149  150  151  152  153
  154  155  156  157  158  159  160  161  162  163  164  165  166  167
  168  169  170  171  172  173  174  175  176  177  178  179  180  181
  182  183  184  185  186  187  188  189  190  191  192  193  1

Train Index:  [   0    1    2    3    4    5    6    7    8    9   10   11   12   13
   14   15   16   17   18   19   20   21   22   23   24   25   26   27
   28   29   30   31   32   33   34   35   36   37   38   39   40   41
   42   43   44   45   46   47   48   49   50   51   52   53   54   55
   56   57   58   59   60   61   62   63   64   65   66   67   68   69
   70   71   72   73   74   75   76   77   78   79   80   81   82   83
   84   85   86   87   88   89   90   91   92   93   94   95   96   97
   98   99  100  101  102  103  104  105  106  107  108  109  110  111
  112  113  114  115  116  117  118  119  120  121  122  123  124  125
  126  127  128  129  130  131  132  133  134  135  136  137  138  139
  140  141  142  143  144  145  146  147  148  149  150  151  152  153
  154  155  156  157  158  159  160  161  162  163  164  165  166  167
  168  169  170  171  172  173  174  175  176  177  178  179  180  181
  182  183  184  185  186  187  188  189  190  191  192  193  1

Train Index:  [   0    1    2    3    4    5    6    7    8    9   10   11   12   13
   14   15   16   17   18   19   20   21   22   23   24   25   26   27
   28   29   30   31   32   33   34   35   36   37   38   39   40   41
   42   43   44   45   46   47   48   49   50   51   52   53   54   55
   56   57   58   59   60   61   62   63   64   65   66   67   68   69
   70   71   72   73   74   75   76   77   78   79   80   81   82   83
   84   85   86   87   88   89   90   91   92   93   94   95   96   97
   98   99  100  101  102  103  104  105  106  107  108  109  110  111
  112  113  114  115  116  117  118  119  120  121  122  123  124  125
  126  127  128  129  130  131  132  133  134  135  136  137  138  139
  140  141  142  143  144  145  146  147  148  149  150  151  152  153
  154  155  156  157  158  159  160  161  162  163  164  165  166  167
  168  169  170  171  172  173  174  175  176  177  178  179  180  181
  182  183  184  185  186  187  188  189  190  191  192  193  1

Train Index:  [   0    1    2    3    4    5    6    7    8    9   10   11   12   13
   14   15   16   17   18   19   20   21   22   23   24   25   26   27
   28   29   30   31   32   33   34   35   36   37   38   39   40   41
   42   43   44   45   46   47   48   49   50   51   52   53   54   55
   56   57   58   59   60   61   62   63   64   65   66   67   68   69
   70   71   72   73   74   75   76   77   78   79   80   81   82   83
   84   85   86   87   88   89   90   91   92   93   94   95   96   97
   98   99  100  101  102  103  104  105  106  107  108  109  110  111
  112  113  114  115  116  117  118  119  120  121  122  123  124  125
  126  127  128  129  130  131  132  133  134  135  136  137  138  139
  140  141  142  143  144  145  146  147  148  149  150  151  152  153
  154  155  156  157  158  159  160  161  162  163  164  165  166  167
  168  169  170  171  172  173  174  175  176  177  178  179  180  181
  182  183  184  185  186  187  188  189  190  191  192  193  1

Train Index:  [   0    1    2    3    4    5    6    7    8    9   10   11   12   13
   14   15   16   17   18   19   20   21   22   23   24   25   26   27
   28   29   30   31   32   33   34   35   36   37   38   39   40   41
   42   43   44   45   46   47   48   49   50   51   52   53   54   55
   56   57   58   59   60   61   62   63   64   65   66   67   68   69
   70   71   72   73   74   75   76   77   78   79   80   81   82   83
   84   85   86   87   88   89   90   91   92   93   94   95   96   97
   98   99  100  101  102  103  104  105  106  107  108  109  110  111
  112  113  114  115  116  117  118  119  120  121  122  123  124  125
  126  127  128  129  130  131  132  133  134  135  136  137  138  139
  140  141  142  143  144  145  146  147  148  149  150  151  152  153
  154  155  156  157  158  159  160  161  162  163  164  165  166  167
  168  169  170  171  172  173  174  175  176  177  178  179  180  181
  182  183  184  185  186  187  188  189  190  191  192  193  1

In [367]:
scores

[0.031446540880503145,
 0.5178197064989518,
 0.8616352201257862,
 0.9475890985324947,
 0.9245283018867925,
 0.5534591194968553,
 0.6058700209643606,
 0.592436974789916,
 0.39915966386554624,
 0.47058823529411764]

In [362]:
from sklearn.model_selection import cross_val_score
from numpy import mean
from numpy import std

cv = KFold(n_splits=10, random_state=1, shuffle=True)
# create model
# evaluate model
scores = cross_val_score(rf, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Accuracy: 0.784 (0.015)


In [365]:
mean(scores)

0.5904532882335325

In [369]:
len(third_split_X_train)

4290

In [370]:
len(third_split_X_test)

477

In [383]:
rf.fit(third_split_X_train, third_split_y_train)

#Predict
y_pred = rf.predict(third_split_X_test)
predictions = [round(value) for value in y_pred]


#Evaluate
accuracy = accuracy_score(third_split_y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))


Accuracy: 86.16%


In [386]:
from sklearn.metrics import r2_score
r2_score(third_split_y_test, predictions)

0.6890044052863435

In [384]:
len(predictions)

477

In [385]:
third_split_y_test.shape

(477,)