# Import data and drop irrelevant columns

In [1]:
import lazypredict
import sys
import numpy as np
np.set_printoptions(threshold=sys.maxsize)

#Read data file
import pandas as pd

filepath = "trial_1200.csv"
df = pd.read_csv(filepath)
features = df

# Remove the labels from the features
features = features.drop('protection_level', axis = 1)
features = features.drop('page_title', axis = 1)
features = features.drop('protection_expiry', axis = 1)
features = features.drop('page_id', axis = 1)
features = features.drop('page_freshness', axis = 1)

features

Unnamed: 0,view_count,page_length,number_page_watchers,number_page_watchers_recent_edits,number_of_redirects,page_views_past_30days,total_edits,recent_number_of_edits,number_distinct_authors,number_categories
0,4433720.0,22427,76,9,4,80115,864.0,1.0,1.0,20.0
1,3648535.0,24897,81,12,0,55019,962.0,8.0,4.0,26.0
2,3155183.0,33119,244,20,14,46350,946.0,7.0,6.0,15.0
3,3151197.0,4265,34,8,0,26392,60.0,0.0,0.0,6.0
4,2885266.0,18081,Fewer than 30 watchers,,0,10757,193.0,0.0,0.0,15.0
...,...,...,...,...,...,...,...,...,...,...
4762,65958868.0,229220,754,194,59,260693,6563.0,14.0,9.0,26.0
4763,1107024.0,40783,107,5,6,17785,4432.0,0.0,0.0,11.0
4764,938453.0,55720,91,9,12,9383,2438.0,1.0,1.0,21.0
4765,410880.0,51713,44,7,7,5147,1783.0,12.0,2.0,15.0


In [2]:
# Labels are the values we want to predict
labels = np.array(df['protection_level'])

# 0 => unprotected
# 1 => autoconfirmed
# 2 => extendedconfirmed
# 3 => sysop
labels_encoded = []
for item in labels:
    if(item =="unprotected"):
        labels_encoded.append(0)
    elif(item == "autoconfirmed"):
        labels_encoded.append(1)
    elif(item == "extendedconfirmed"):
        labels_encoded.append(2)
    elif(item == "sysop"):
        labels_encoded.append(3)  

#np.nan convers
features = features.replace('Fewer than 30 watchers',np.NaN)
features = features.replace('There may or may not be a watching user visiting recent edits',np.NaN)

features['page_length'] = features['page_length'].astype(float)
features['total_edits'] = features['total_edits'].astype(float)
features['number_page_watchers'] = features['number_page_watchers'].astype(float)
features['number_page_watchers_recent_edits'] = features['number_page_watchers_recent_edits'].astype(float)

# Saving feature names for later use
feature_list = list(features.columns)

features

Unnamed: 0,view_count,page_length,number_page_watchers,number_page_watchers_recent_edits,number_of_redirects,page_views_past_30days,total_edits,recent_number_of_edits,number_distinct_authors,number_categories
0,4433720.0,22427.0,76.0,9.0,4,80115,864.0,1.0,1.0,20.0
1,3648535.0,24897.0,81.0,12.0,0,55019,962.0,8.0,4.0,26.0
2,3155183.0,33119.0,244.0,20.0,14,46350,946.0,7.0,6.0,15.0
3,3151197.0,4265.0,34.0,8.0,0,26392,60.0,0.0,0.0,6.0
4,2885266.0,18081.0,,,0,10757,193.0,0.0,0.0,15.0
...,...,...,...,...,...,...,...,...,...,...
4762,65958868.0,229220.0,754.0,194.0,59,260693,6563.0,14.0,9.0,26.0
4763,1107024.0,40783.0,107.0,5.0,6,17785,4432.0,0.0,0.0,11.0
4764,938453.0,55720.0,91.0,9.0,12,9383,2438.0,1.0,1.0,21.0
4765,410880.0,51713.0,44.0,7.0,7,5147,1783.0,12.0,2.0,15.0


In [3]:
#Apply KNN Imputer for missing values
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=2)
features = imputer.fit_transform(features)


from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels_encoded, test_size =0.20, random_state = 53)

X_train = train_features
y_train = train_labels
X_test = test_features
y_test = test_labels


from lazypredict.Supervised import LazyClassifier
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)

print(models)

100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [00:12<00:00,  2.30it/s]

                               Accuracy  Balanced Accuracy ROC AUC  F1 Score  \
Model                                                                          
RandomForestClassifier             0.76               0.76    None      0.76   
LGBMClassifier                     0.75               0.75    None      0.75   
XGBClassifier                      0.74               0.74    None      0.74   
ExtraTreesClassifier               0.73               0.72    None      0.72   
BaggingClassifier                  0.73               0.72    None      0.72   
AdaBoostClassifier                 0.69               0.69    None      0.68   
DecisionTreeClassifier             0.66               0.65    None      0.66   
SVC                                0.65               0.65    None      0.64   
NuSVC                              0.65               0.64    None      0.64   
KNeighborsClassifier               0.65               0.64    None      0.64   
CalibratedClassifierCV             0.64 




# Best Model

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score, f1_score
import lightgbm as lgb

#Instantiate model
#model = lgb.LGBMClassifier()
model = RandomForestClassifier()

# fit model no training data
model.fit(X_train, y_train)

# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

print("Best model:", str(model))

# evaluate predictions
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.4f%%" % (accuracy * 100.0))

balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
print("Balanced Accuracy: %.4f%%" % (balanced_accuracy * 100.0))

f1_score = f1_score(y_test, y_pred, average='macro')
print("F1 Score : %.4f%%" % (f1_score * 100.0))

Best model: RandomForestClassifier()
Accuracy: 75.1572%
Balanced Accuracy: 74.7905%
F1 Score : 74.4747%
