In [1]:
import sys

import lazypredict
import numpy as np
import pandas as pd

# Print numpy arrays in full: the summarization threshold is raised to sys.maxsize
np.set_printoptions(threshold=sys.maxsize)

# Read data file; `df` and `features` both refer to the freshly loaded frame
filepath = "dataset_298_duplicates.csv"
features = df = pd.read_csv(filepath)

features

Unnamed: 0,page_id,page_title,view_count,protection_level,protection_expiry,page_length,number_page_watchers,number_page_watchers_recent_edits,number_of_redirects,page_views_past_30days,total_edits,recent_number_of_edits,number_distinct_authors,number_categories
0,340663,Paul Reiser,4433720,unprotected,,22427,76,9,4,80115,864.0,1.0,1.0,20.0
1,667049,Meg Tilly,3648535,unprotected,,24897,81,12,0,55019,962.0,8.0,4.0,26.0
2,2736939,Random number generation,3155183,unprotected,,33119,244,20,14,46350,946.0,7.0,6.0,15.0
3,39068821,Paige Howard,3151197,unprotected,,4265,34,8,0,26392,60.0,0.0,0.0,6.0
4,41414319,Sophia Amoruso,2885266,unprotected,,18081,Fewer than 30 watchers,,0,10757,193.0,0.0,0.0,15.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,64673036,1991 Iraqi rocket attacks on Israel,16270,extendedconfirmed,infinity,8783,Fewer than 30 watchers,,0,2693,49.0,2.0,2.0,5.0
996,11020994,1992 Israeli legislative election,72697,extendedconfirmed,infinity,12933,Fewer than 30 watchers,,2,812,126.0,5.0,1.0,3.0
997,433761,1992 attack on Israeli embassy in Buenos Aires,122245,extendedconfirmed,infinity,11054,96,6,9,981,323.0,0.0,0.0,15.0
998,693391,1994 London Israeli Embassy bombing,33299,extendedconfirmed,infinity,13569,77,5,12,440,204.0,2.0,2.0,7.0


In [2]:
# Labels are the values we want to predict
labels = np.array(df['protection_level'])

# Integer code assigned to each protection level
PROTECTION_LEVEL_CODES = {
    "unprotected": 0,
    "autoconfirmed": 1,
    "extendedconfirmed": 2,
    "sysop": 3,
}

# Fail loudly on an unexpected label: silently skipping it would leave
# labels_encoded shorter than (and misaligned with) the feature matrix.
unknown_labels = sorted({str(item) for item in labels if item not in PROTECTION_LEVEL_CODES})
if unknown_labels:
    raise ValueError(f"Unexpected protection_level value(s): {unknown_labels}")
labels_encoded = [PROTECTION_LEVEL_CODES[item] for item in labels]

# Build the feature frame from the raw `df` (not from `features`) so this cell
# can be re-run safely; drop the label column and the columns not used as features.
features = df.drop(columns=['protection_level', 'page_title', 'protection_expiry', 'page_id'])

# Replace the textual placeholders in the watcher columns with NaN.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
features = features.replace('Fewer than 30 watchers', np.nan)
features = features.replace('There may or may not be a watching user visiting recent edits', np.nan)

# Cast these columns to float (the watcher columns hold strings until the
# placeholders above are removed)
float_columns = [
    'page_length',
    'total_edits',
    'number_page_watchers',
    'number_page_watchers_recent_edits',
]
features = features.astype({column: float for column in float_columns})

# Saving feature names for later use
feature_list = list(features.columns)

# Convert to numpy array
features = np.array(features)

features

array([[4.4337200e+06, 2.2427000e+04, 7.6000000e+01, 9.0000000e+00,
        4.0000000e+00, 8.0115000e+04, 8.6400000e+02, 1.0000000e+00,
        1.0000000e+00, 2.0000000e+01],
       [3.6485350e+06, 2.4897000e+04, 8.1000000e+01, 1.2000000e+01,
        0.0000000e+00, 5.5019000e+04, 9.6200000e+02, 8.0000000e+00,
        4.0000000e+00, 2.6000000e+01],
       [3.1551830e+06, 3.3119000e+04, 2.4400000e+02, 2.0000000e+01,
        1.4000000e+01, 4.6350000e+04, 9.4600000e+02, 7.0000000e+00,
        6.0000000e+00, 1.5000000e+01],
       [3.1511970e+06, 4.2650000e+03, 3.4000000e+01, 8.0000000e+00,
        0.0000000e+00, 2.6392000e+04, 6.0000000e+01, 0.0000000e+00,
        0.0000000e+00, 6.0000000e+00],
       [2.8852660e+06, 1.8081000e+04,           nan,           nan,
        0.0000000e+00, 1.0757000e+04, 1.9300000e+02, 0.0000000e+00,
        0.0000000e+00, 1.5000000e+01],
       [2.4309890e+06, 4.8000000e+01, 3.3200000e+02, 3.2000000e+01,
        0.0000000e+00, 6.3310000e+03, 1.6080000e+03, 0.00

In [3]:
from sklearn.model_selection import train_test_split
from lazypredict.Supervised import LazyClassifier

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels_encoded,
    test_size=0.20,
    random_state=53,
)

# Descriptive aliases for the same four splits
train_features, test_features = X_train, X_test
train_labels, test_labels = y_train, y_test

# Fit the LazyClassifier and keep both objects it returns
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

print(models)

100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [00:03<00:00,  8.51it/s]

                               Accuracy  Balanced Accuracy ROC AUC  F1 Score  \
Model                                                                          
XGBClassifier                      0.95               0.96    None      0.95   
LGBMClassifier                     0.94               0.95    None      0.94   
BaggingClassifier                  0.93               0.93    None      0.93   
RandomForestClassifier             0.92               0.92    None      0.92   
DecisionTreeClassifier             0.90               0.90    None      0.89   
ExtraTreesClassifier               0.89               0.89    None      0.88   
ExtraTreeClassifier                0.82               0.83    None      0.81   
KNeighborsClassifier               0.81               0.82    None      0.80   
CalibratedClassifierCV             0.74               0.74    None      0.74   
LogisticRegression                 0.74               0.74    None      0.74   
LinearSVC                          0.74 




In [4]:
# Display the `models` table returned by clf.fit as the cell's rich output
models

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
XGBClassifier,0.95,0.96,,0.95,0.9
LGBMClassifier,0.94,0.95,,0.94,0.28
BaggingClassifier,0.93,0.93,,0.93,0.07
RandomForestClassifier,0.92,0.92,,0.92,0.31
DecisionTreeClassifier,0.9,0.9,,0.89,0.02
ExtraTreesClassifier,0.89,0.89,,0.88,0.19
ExtraTreeClassifier,0.82,0.83,,0.81,0.01
KNeighborsClassifier,0.81,0.82,,0.8,0.02
CalibratedClassifierCV,0.74,0.74,,0.74,0.72
LogisticRegression,0.74,0.74,,0.74,0.07


## XGBoost Classifier

In [6]:
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

# Fit an XGBClassifier with default settings on the training data
model = XGBClassifier()
model.fit(X_train, y_train)

# Predict the test rows; every predicted value is passed through round()
y_pred = model.predict(X_test)
predictions = list(map(round, y_pred))

# Score the predictions against the test labels and report as a percentage
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.4f%%" % (accuracy * 100.0))

Accuracy: 94.5000%


In [7]:
# Inspect the container type of the training features
type(X_train)

numpy.ndarray