In [2]:
import pickle

# Path of the pickled model to evaluate
filename = 'models/model_86.sav'

# Load the model from disk. The context manager closes the file handle as soon
# as unpickling finishes instead of leaving it open for the garbage collector.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)



In [21]:
def prepare_data(filepath, test_size=0.20, random_state=25):
    """Load the page-protection CSV and split it into train/test sets.

    Steps performed:
      1. Read the CSV at ``filepath``.
      2. Encode the ``protection_status`` column as integers
         (unprotected=0, autoconfirmed=1, extendedconfirmed=2, sysop=3).
      3. Drop the identifier columns and the target from the features.
      4. Replace the textual placeholders in the two watcher columns with
         numeric codes (-1 / -2), fill remaining missing values with -5 and
         cast the four numeric columns to float.
      5. Split with ``sklearn.model_selection.train_test_split``.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file.
    test_size : float, default 0.20
        Forwarded to ``train_test_split``.
    random_state : int, default 25
        Forwarded to ``train_test_split``.

    Returns
    -------
    train_features, test_features, train_labels, test_labels
        The four values returned by ``train_test_split``; features are passed
        in as a NumPy array and labels as a list of ints.

    Raises
    ------
    ValueError
        If ``protection_status`` contains a value (including a missing value)
        that is not one of the four known labels. Skipping such rows silently
        would leave fewer labels than feature rows and misalign the two.
    """
    import numpy as np
    import pandas as pd
    from sklearn.model_selection import train_test_split

    # Integer code for each protection level
    label_codes = {
        "unprotected": 0,
        "autoconfirmed": 1,
        "extendedconfirmed": 2,
        "sysop": 3,
    }
    # Textual placeholders found in the watcher columns -> numeric stand-in
    placeholder_codes = {
        'page_watchers': ("Fewer than 30 watchers", -1),
        'page_watchers_recent_edits': (
            "There may or may not be a watching user visiting recent edits",
            -2,
        ),
    }
    numeric_columns = ['page_length', 'edit_count', 'page_watchers', 'page_watchers_recent_edits']
    list_drop_columns = ['page_id', 'page_title', 'page_id_scrapped', 'protection_status']

    df = pd.read_csv(filepath)
    labels = df['protection_status']

    # Validate before encoding so every feature row keeps its own label
    unknown_labels = sorted({repr(item) for item in labels if item not in label_codes})
    if unknown_labels:
        raise ValueError(
            "Unknown protection_status value(s): " + ", ".join(unknown_labels)
        )
    labels_encoded = [label_codes[item] for item in labels]

    features = df.drop(columns=list_drop_columns)

    # Swap placeholders for their numeric codes on an object-dtype copy, so the
    # replacement does not depend on the column's dtype accepting an integer.
    for column, (placeholder, code) in placeholder_codes.items():
        values = features[column].astype(object)
        features[column] = values.where(values != placeholder, code)

    # Remaining missing values get a sentinel distinct from the placeholder codes
    features = features.fillna(-5)

    # Numeric columns may still hold digit strings; cast them to float
    features = features.astype({column: float for column in numeric_columns})

    # Convert to numpy array
    features = np.array(features)

    return train_test_split(
        features, labels_encoded, test_size=test_size, random_state=random_state
    )

In [22]:
from sklearn.metrics import accuracy_score

# Dataset used for this evaluation run
filepath = "../protect-wiki/dataset/balanced_dataset2.csv"
X_train, X_test, y_train, y_test = prepare_data(filepath)

# Predict on the held-out split and round each prediction to an integer
y_pred = loaded_model.predict(X_test)
predictions = list(map(round, y_pred))

# Evaluate: share of held-out rows whose rounded prediction equals the label
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 37.13%


In [16]:
import pandas as pd

# Re-read the raw CSV for inspection. `features` is a second name for the same
# DataFrame object as `df` (not a copy), so edits through one show up in the other.
features = df = pd.read_csv(filepath)

In [17]:
# Show the raw column: object dtype because it mixes counts with a text placeholder
features['page_watchers']

0                         194
1                         135
2                         264
3                          32
4                          46
                ...          
828                       112
829    Fewer than 30 watchers
830                        45
831                        46
832    Fewer than 30 watchers
Name: page_watchers, Length: 833, dtype: object

In [18]:
# Replace the textual placeholder with the numeric code -1 (modifies `features` in place)
features.loc[
    features['page_watchers'] == "Fewer than 30 watchers",
    'page_watchers',
] = -1

In [19]:
# Show the column again to check that the placeholder rows now hold -1
features['page_watchers']

0      194
1      135
2      264
3       32
4       46
      ... 
828    112
829     -1
830     45
831     46
832     -1
Name: page_watchers, Length: 833, dtype: object

In [20]:
# Preview the column cast to float; the result is only displayed, `features` is not reassigned
features['page_watchers'].astype('float64')

0      194.0
1      135.0
2      264.0
3       32.0
4       46.0
       ...  
828    112.0
829     -1.0
830     45.0
831     46.0
832     -1.0
Name: page_watchers, Length: 833, dtype: float64