In [1]:
import lazypredict
import sys
import numpy as np
np.set_printoptions(threshold=sys.maxsize)

#Read data file

import pandas as pd

filepath = "balanced_dataset_MASTER.csv"
df = pd.read_csv(filepath)
features = df

# Labels are the values we want to predict
labels = np.array(df['protection_level'])

# 0 => unprotected
# 1 => autoconfirmed
# 2 => extendedconfirmed
# 3 => sysop
labels_encoded = []
for item in labels:
    if(item =="unprotected"):
        labels_encoded.append(0)
    elif(item == "autoconfirmed"):
        labels_encoded.append(1)
    elif(item == "extendedconfirmed"):
        labels_encoded.append(2)
    elif(item == "sysop"):
        labels_encoded.append(3)  
labels_encoded

# Remove the labels from the features
features = features.drop('protection_level', axis = 1)
features = features.drop('page_title', axis = 1)
features = features.drop('protection_expiry', axis = 1)
# features = features.drop('page_id', axis = 1)

# Replace NaN
features = features.replace('Fewer than 30 watchers',np.NaN)
features = features.replace('There may or may not be a watching user visiting recent edits',np.NaN)

#Convert cols to Float
features['page_length'] = features['page_length'].astype(float)
features['total_edits'] = features['total_edits'].astype(float)
features['number_page_watchers'] = features['number_page_watchers'].astype(float)
features['number_page_watchers_recent_edits'] = features['number_page_watchers_recent_edits'].astype(float)

# Saving feature names for later use
feature_list = list(features.columns)

# Convert to numpy array
features = np.array(features)

from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels_encoded, test_size =0.20, random_state = 53)

X_train = train_features
y_train = train_labels
X_test = test_features
y_test = test_labels



from lazypredict.Supervised import LazyClassifier
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)

print(models)

100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [00:12<00:00,  2.41it/s]

                               Accuracy  Balanced Accuracy ROC AUC  F1 Score  \
Model                                                                          
RandomForestClassifier             0.81               0.80    None      0.80   
LGBMClassifier                     0.80               0.80    None      0.80   
XGBClassifier                      0.80               0.80    None      0.80   
ExtraTreesClassifier               0.79               0.78    None      0.78   
BaggingClassifier                  0.75               0.75    None      0.75   
DecisionTreeClassifier             0.71               0.71    None      0.71   
KNeighborsClassifier               0.70               0.70    None      0.70   
NuSVC                              0.69               0.69    None      0.69   
LogisticRegression                 0.67               0.67    None      0.66   
AdaBoostClassifier                 0.67               0.67    None      0.66   
SVC                                0.64 




# Outlier removal - minimum covariant dependant

In [2]:
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=5)

features = imputer.fit_transform(features)

from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels_encoded, test_size =0.20, random_state = 53)

X_train = train_features
y_train = train_labels
X_test = test_features
y_test = test_labels

In [4]:
X_train.shape

(3813, 11)

In [6]:
from sklearn.covariance import EllipticEnvelope
# identify outliers in the training dataset
ee = EllipticEnvelope(contamination=0.01)
yhat = ee.fit_predict(X_train)

In [8]:
mask = yhat != -1
y_train = np.array(y_train)
X_train, y_train = X_train[mask, :], y_train[mask]

In [9]:
X_train.shape

(3774, 11)

In [10]:
# Random Forest Classifier implementation 

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rf = RandomForestClassifier(max_depth=13, random_state=0, criterion='gini', oob_score= True, n_jobs=4)
rf.fit(X_train, y_train)

#Predict
y_pred = rf.predict(X_test)
predictions = [round(value) for value in y_pred]


#Evaluate
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.4f%%" % (accuracy * 100.0))

Accuracy: 78.5115%
