<a href="https://colab.research.google.com/github/benjuarez8/CS81C-Research-Bolivia/blob/main/cs81c_ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Imports

In [1]:
pip install Unidecode

Collecting Unidecode
  Downloading Unidecode-1.3.4-py3-none-any.whl (235 kB)
[?25l[K     |█▍                              | 10 kB 24.5 MB/s eta 0:00:01[K     |██▉                             | 20 kB 29.7 MB/s eta 0:00:01[K     |████▏                           | 30 kB 28.4 MB/s eta 0:00:01[K     |█████▋                          | 40 kB 19.8 MB/s eta 0:00:01[K     |███████                         | 51 kB 8.7 MB/s eta 0:00:01[K     |████████▍                       | 61 kB 10.1 MB/s eta 0:00:01[K     |█████████▊                      | 71 kB 9.6 MB/s eta 0:00:01[K     |███████████▏                    | 81 kB 9.9 MB/s eta 0:00:01[K     |████████████▌                   | 92 kB 10.9 MB/s eta 0:00:01[K     |██████████████                  | 102 kB 9.1 MB/s eta 0:00:01[K     |███████████████▎                | 112 kB 9.1 MB/s eta 0:00:01[K     |████████████████▊               | 122 kB 9.1 MB/s eta 0:00:01[K     |██████████████████              | 133 kB 9.1 MB/s eta 0:00

In [53]:
import pandas as pd
import numpy as np
import csv
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import RocCurveDisplay
from xgboost.sklearn import XGBClassifier
from unidecode import unidecode

#Load Data

In [3]:
training_data = pd.read_csv("https://raw.githubusercontent.com/benjuarez8/Bolivia-SURF-2020/master/CS81C/training_data.csv")
training_data.rename( columns={"Unnamed: 0":"ID"}, inplace=True )
testing_data = pd.read_csv("https://raw.githubusercontent.com/benjuarez8/Bolivia-SURF-2020/master/CS81C/testing_data.csv")
testing_data.rename( columns={"Unnamed: 0":"ID"}, inplace=True )

#Data Processing

In [4]:
training_data["Fraud_num"] = [ -1 if type == "VS" else 1 if type == "BBS" else 0 for type in training_data["Fraud.type"] ]

In [5]:
training_departments = training_data["Department"].str.get_dummies()
training_data = pd.concat([training_data, training_departments], axis=1)
testing_departments = testing_data["Department"].str.get_dummies()
testing_data = pd.concat([testing_data, testing_departments], axis=1)
testing_data.rename( columns={"Potosí":unidecode("Potosí")}, inplace=True)

In [6]:
train_labels = training_data["Fraud_num"]
train = training_data[["Beni", "Chuquisaca", "Cochabamba", "La Paz", "Oruro", "Pando", "Potosi", "Santa Cruz", "Tarija", "Turnout", "MAS.vote"]]
test = testing_data[["Beni", "Chuquisaca", "Cochabamba", "La Paz", "Oruro", "Pando", "Potosi", "Santa Cruz", "Tarija", "Turnout", "MAS.vote"]]

In [7]:
X_train = np.array(train)
y_train = np.array(train_labels)
X_test = np.array(test)

#Classification Models

In [8]:
rf_model = RandomForestClassifier(n_estimators=400, random_state=1)
gb_model = GradientBoostingClassifier()
knn_model = KNeighborsClassifier()
logr_model = LogisticRegression(max_iter = 1000)
xgb_model = XGBClassifier()

# Ensemble (averaging approach)

In [9]:
seed = 8
np.random.seed(seed)
rf_model.fit(X_train, y_train)
gb_model.fit(X_train, y_train)
knn_model.fit(X_train, y_train)
logr_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)

XGBClassifier(objective='multi:softprob')

In [10]:
rf_pred = rf_model.predict(X_test)
gb_pred = gb_model.predict(X_test)
knn_pred = knn_model.predict(X_test)
logr_pred = logr_model.predict(X_test)
xgb_pred = xgb_model.predict(X_test)

In [11]:
avg_pred = (rf_pred + gb_pred + knn_pred + logr_pred + xgb_pred) // 5

In [12]:
avg_ensemble_test = test.copy()
pd.options.mode.chained_assignment = None
avg_ensemble_test["Pred_Fraud_num"] = avg_pred.tolist()
avg_ensemble_test["Pred_Fraud_type"] = [ "VS" if type == -1 else "BBS" if 
                                        type == 1 else "Clean" for type in 
                                        avg_ensemble_test["Pred_Fraud_num"] ]
avg_ensemble_test["Department"] = testing_data["Department"]
avg_ensemble_test["Precinct"] = testing_data["Precinct"]
avg_ensemble_test["Municipality"] = testing_data["Municipality"]

In [13]:
avg_ensemble_test.to_csv("ensemble_avg_results.csv", index=False)

In [14]:
col_names = ["Clean", "At_Risk", "BBS_Risk", "VS_Risk", "Avg_Turnout", 
             "Avg_MAS_vote"]
avg_ensemble_results = pd.DataFrame(columns = col_names)
counts = avg_ensemble_test.Pred_Fraud_type.value_counts()
clean = counts[0] / (counts[0] + counts[1] + counts[2])
tot_risk = (counts[1] + counts[2]) / (counts[0] + counts[1] + counts[2])
bbs = counts[1] / (counts[0] + counts[1] + counts[2])
vs = counts[2] / (counts[0] + counts[1] + counts[2])
avg_turnout = avg_ensemble_test["Turnout"].mean()
avg_vote = avg_ensemble_test["MAS.vote"].mean()
avg_ensemble_results.loc[len(avg_ensemble_results)] = [round(100*clean,2), 
                                                       round(100*tot_risk,2), 
                                                       round(100*bbs,2), 
                                                       round(100*vs,2), 
                                                       round(100*avg_turnout,2), 
                                                       round(avg_vote,2)]

In [15]:
avg_ensemble_results.to_csv("ensemble_avg_tot_results.csv", index=False)
avg_ensemble_results

Unnamed: 0,Clean,At_Risk,BBS_Risk,VS_Risk,Avg_Turnout,Avg_MAS_vote
0,89.74,10.26,5.62,4.64,89.8,0.46


In [16]:
col_names = ["Department", "Clean", "At_Risk", "BBS_Risk", "VS_Risk", 
             "Avg_Turnout", "Avg_MAS_vote"]
avg_ensemble_dep_results = pd.DataFrame(columns = col_names)
for dep in avg_ensemble_test["Department"].unique():
    counts = avg_ensemble_test[avg_ensemble_test["Department"] == 
                               dep].Pred_Fraud_type.value_counts()
    clean = counts[0] / (counts[0] + counts[1] + counts[2])
    tot_risk = (counts[1] + counts[2]) / (counts[0] + counts[1] + counts[2])
    bbs = counts[1] / (counts[0] + counts[1] + counts[2])
    vs = counts[2] / (counts[0] + counts[1] + counts[2])
    avg_turnout = avg_ensemble_test[avg_ensemble_test["Department"] == 
                                    dep]["Turnout"].mean()
    avg_vote = avg_ensemble_test[avg_ensemble_test["Department"] == 
                                 dep]["MAS.vote"].mean()
    avg_ensemble_dep_results.loc[len(avg_ensemble_dep_results)] = [dep, 
                                                      round(100*clean,2), 
                                                      round(100*tot_risk,2), 
                                                      round(100*bbs,2), 
                                                      round(100*vs,2), 
                                                      round(100*avg_turnout,2), 
                                                      round(avg_vote,2)]
avg_ensemble_dep_results = avg_ensemble_dep_results.sort_values(by = "Clean")
avg_ensemble_dep_results.reset_index(drop=True, inplace=True)

In [17]:
avg_ensemble_dep_results.to_csv("ensemble_avg_dep_results.csv", index=False)
avg_ensemble_dep_results

Unnamed: 0,Department,Clean,At_Risk,BBS_Risk,VS_Risk,Avg_Turnout,Avg_MAS_vote
0,Cochabamba,82.5,17.5,12.87,4.63,90.28,0.57
1,Potosí,86.49,13.51,9.96,3.55,88.87,0.46
2,La Paz,89.18,10.82,6.88,3.94,91.75,0.53
3,Oruro,91.1,8.9,6.11,2.8,91.09,0.47
4,Chuquisaca,91.78,8.22,6.37,1.85,88.41,0.4
5,Santa Cruz,93.74,6.26,3.74,2.52,88.83,0.34
6,Tarija,94.3,5.7,3.45,2.25,87.26,0.39
7,Pando,95.04,4.96,3.05,1.91,84.31,0.43
8,Beni,95.34,4.66,2.98,1.68,86.17,0.34


In [18]:
col_names = ["Municipality", "Department", "Clean", "At_Risk", "BBS_Risk", 
             "VS_Risk", "Avg_Turnout", "Avg_MAS_vote"]
avg_ensemble_mun_results = pd.DataFrame(columns = col_names)
for municip in avg_ensemble_test["Municipality"].unique():
  dep = avg_ensemble_test.loc[avg_ensemble_test["Municipality"] == 
                              municip]["Department"].iloc[0]
  counts = avg_ensemble_test[avg_ensemble_test["Municipality"] == 
                             municip].Pred_Fraud_type.value_counts()
  if (len(counts) > 2):
    clean = counts[0] / (counts[0] + counts[1] + counts[2])
    tot_risk = (counts[1] + counts[2]) / (counts[0] + counts[1] + counts[2])
    bbs = counts[1] / (counts[0] + counts[1] + counts[2])
    vs = counts[2] / (counts[0] + counts[1] + counts[2])
    avg_turnout = avg_ensemble_test[avg_ensemble_test["Municipality"] == 
                                    municip]["Turnout"].mean()
    avg_vote = avg_ensemble_test[avg_ensemble_test["Municipality"] == 
                                 municip]["MAS.vote"].mean()
    avg_ensemble_mun_results.loc[len(avg_ensemble_mun_results)] = [municip, dep, 
                                                    round(100*clean,2), 
                                                    round(100*tot_risk,2), 
                                                    round(100*bbs,2), 
                                                    round(100*vs,2), 
                                                    round(100*avg_turnout,2), 
                                                    round(avg_vote,2)]
avg_ensemble_mun_results = avg_ensemble_mun_results.sort_values(by = 
                                                                    "Clean")
avg_ensemble_mun_results.reset_index(drop=True, inplace=True)

In [19]:
avg_ensemble_mun_results.to_csv("ensemble_avg_mun_results.csv", index=False)
avg_ensemble_mun_results

Unnamed: 0,Municipality,Department,Clean,At_Risk,BBS_Risk,VS_Risk,Avg_Turnout,Avg_MAS_vote
0,Chacarilla,La Paz,33.33,66.67,33.33,33.33,90.33,0.81
1,Malla,La Paz,33.33,66.67,33.33,33.33,96.27,0.83
2,Pojo,Cochabamba,42.86,57.14,42.86,14.29,92.67,0.88
3,Sacaca,Potosí,45.45,54.55,30.30,24.24,93.73,0.85
4,Papel Pampa,La Paz,45.45,54.55,36.36,18.18,90.87,0.82
...,...,...,...,...,...,...,...,...
115,Tarija,Tarija,96.18,3.82,3.27,0.55,88.68,0.31
116,Santa Cruz de La Sierra,Santa Cruz,96.50,3.50,3.37,0.13,88.90,0.28
117,Nuestra Señora de La Paz,La Paz,96.67,3.33,2.95,0.39,91.00,0.40
118,Montero,Santa Cruz,96.69,3.31,2.21,1.10,88.64,0.39


#Ensemble (weighted average approach)

In [20]:
seed = 8
np.random.seed(seed)
rf_model.fit(X_train, y_train)
gb_model.fit(X_train, y_train)
knn_model.fit(X_train, y_train)
logr_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)

XGBClassifier(objective='multi:softprob')

In [21]:
rf_pred = rf_model.predict(X_test)
gb_pred = gb_model.predict(X_test)
knn_pred = knn_model.predict(X_test)
logr_pred = logr_model.predict(X_test)
xgb_pred = xgb_model.predict(X_test)

In [22]:
rf_weight = 0.35
gb_weight = 0.15
knn_weight =  0.15
logr_weight = 0.05
xgb_weight = 0.3
assert(rf_weight + gb_weight + knn_weight + logr_weight + xgb_weight == 1)
weighted_pred = (rf_pred*rf_weight + gb_pred*gb_weight + knn_pred*knn_weight + 
                 logr_pred*logr_weight + xgb_pred*xgb_weight)

In [23]:
weighted_ensemble_test = test.copy()
pd.options.mode.chained_assignment = None
weighted_ensemble_test["Pred_Fraud_num"] = weighted_pred.tolist()
weighted_ensemble_test["Pred_Fraud_type"] = [ "VS" if type == -1 else "BBS" if 
                                    type == 1 else "Clean" for type in 
                                    weighted_ensemble_test["Pred_Fraud_num"] ]
weighted_ensemble_test["Department"] = testing_data["Department"]
weighted_ensemble_test["Precinct"] = testing_data["Precinct"]
weighted_ensemble_test["Municipality"] = testing_data["Municipality"]

In [24]:
weighted_ensemble_test.to_csv("ensemble_weighted_results.csv", index=False)

In [25]:
col_names = ["Clean", "At_Risk", "BBS_Risk", "VS_Risk", "Avg_Turnout", 
             "Avg_MAS_vote"]
weighted_ensemble_results = pd.DataFrame(columns = col_names)
counts = weighted_ensemble_test.Pred_Fraud_type.value_counts()
clean = counts[0] / (counts[0] + counts[1] + counts[2])
tot_risk = (counts[1] + counts[2]) / (counts[0] + counts[1] + counts[2])
bbs = counts[1] / (counts[0] + counts[1] + counts[2])
vs = counts[2] / (counts[0] + counts[1] + counts[2])
avg_turnout = weighted_ensemble_test["Turnout"].mean()
avg_vote = weighted_ensemble_test["MAS.vote"].mean()
l = len(weighted_ensemble_results)
weighted_ensemble_results.loc[l] = [round(100*clean,2), round(100*tot_risk,2), 
                                      round(100*bbs,2), round(100*vs,2), 
                                      round(100*avg_turnout,2), 
                                      round(avg_vote,2)]

In [26]:
weighted_ensemble_results.to_csv("ensemble_weighted_tot_results.csv", index=False)
weighted_ensemble_results

Unnamed: 0,Clean,At_Risk,BBS_Risk,VS_Risk,Avg_Turnout,Avg_MAS_vote
0,91.93,8.07,4.64,3.43,89.8,0.46


In [27]:
col_names = ["Department", "Clean", "At_Risk", "BBS_Risk", "VS_Risk", 
             "Avg_Turnout", "Avg_MAS_vote"]
weighted_ensemble_dep_results = pd.DataFrame(columns = col_names)
for dep in weighted_ensemble_test["Department"].unique():
    counts = weighted_ensemble_test[weighted_ensemble_test["Department"] == 
                                    dep].Pred_Fraud_type.value_counts()
    clean = counts[0] / (counts[0] + counts[1] + counts[2])
    tot_risk = (counts[1] + counts[2]) / (counts[0] + counts[1] + counts[2])
    bbs = counts[1] / (counts[0] + counts[1] + counts[2])
    vs = counts[2] / (counts[0] + counts[1] + counts[2])
    avg_turnout = weighted_ensemble_test[weighted_ensemble_test["Department"] 
                                         == dep]["Turnout"].mean()
    weighted_vote = weighted_ensemble_test[weighted_ensemble_test["Department"] 
                                           == dep]["MAS.vote"].mean()
    l = len(weighted_ensemble_dep_results)
    weighted_ensemble_dep_results.loc[l] = [dep, round(100*clean,2), 
                                              round(100*tot_risk,2), 
                                              round(100*bbs,2), round(100*vs,2), 
                                              round(100*avg_turnout,2), 
                                              round(avg_vote,2)]
weighted_ensemble_dep_results = weighted_ensemble_dep_results.sort_values(by = 
                                                                        "Clean")
weighted_ensemble_dep_results.reset_index(drop=True, inplace=True)

In [28]:
weighted_ensemble_dep_results.to_csv("ensemble_weighted_dep_results.csv", 
                                     index=False)
weighted_ensemble_dep_results

Unnamed: 0,Department,Clean,At_Risk,BBS_Risk,VS_Risk,Avg_Turnout,Avg_MAS_vote
0,Cochabamba,86.48,13.52,8.89,4.63,90.28,0.46
1,La Paz,91.07,8.93,6.88,2.05,91.75,0.46
2,Potosí,92.18,7.82,4.28,3.55,88.87,0.46
3,Oruro,92.27,7.73,6.11,1.62,91.09,0.46
4,Chuquisaca,93.21,6.79,4.93,1.85,88.41,0.46
5,Santa Cruz,94.85,5.15,3.74,1.41,88.83,0.46
6,Pando,95.42,4.58,3.05,1.53,84.31,0.46
7,Tarija,95.56,4.44,2.25,2.19,87.26,0.46
8,Beni,97.02,2.98,1.68,1.3,86.17,0.46


In [29]:
col_names = ["Municipality", "Department", "Clean", "At_Risk", "BBS_Risk", 
             "VS_Risk", "Avg_Turnout", "Avg_MAS_vote"]
weighted_ensemble_mun_results = pd.DataFrame(columns = col_names)
for municip in weighted_ensemble_test["Municipality"].unique():
  dep = weighted_ensemble_test.loc[weighted_ensemble_test["Municipality"] == 
                                   municip]["Department"].iloc[0]
  counts = weighted_ensemble_test[weighted_ensemble_test["Municipality"] == 
                                  municip].Pred_Fraud_type.value_counts()
  if (len(counts) > 2): # will remove some municipalities from results
    clean = counts[0] / (counts[0] + counts[1] + counts[2])
    tot_risk = (counts[1] + counts[2]) / (counts[0] + counts[1] + counts[2])
    bbs = counts[1] / (counts[0] + counts[1] + counts[2])
    vs = counts[2] / (counts[0] + counts[1] + counts[2])
    mun = "Municipality"
    weighted_turnout = weighted_ensemble_test[weighted_ensemble_test[mun] == 
                                              municip]["Turnout"].mean()
    weighted_vote = weighted_ensemble_test[weighted_ensemble_test[mun] == 
                                           municip]["MAS.vote"].mean()
    l = len(weighted_ensemble_mun_results)
    weighted_ensemble_mun_results.loc[l] = [municip, dep, 
                                                  round(100*clean,2), 
                                                  round(100*tot_risk,2), 
                                                  round(100*bbs,2), 
                                                  round(100*vs,2), 
                                                  round(100*avg_turnout,2), 
                                                  round(avg_vote,2)]
weighted_ensemble_mun_results = weighted_ensemble_mun_results.sort_values(by = 
                                                                        "Clean")
weighted_ensemble_mun_results.reset_index(drop=True, inplace=True)

In [30]:
weighted_ensemble_mun_results.to_csv("ensemble_weighted_mun_results.csv", 
                                     index=False)
weighted_ensemble_mun_results

Unnamed: 0,Municipality,Department,Clean,At_Risk,BBS_Risk,VS_Risk,Avg_Turnout,Avg_MAS_vote
0,Chacarilla,La Paz,33.33,66.67,33.33,33.33,84.31,0.46
1,Malla,La Paz,33.33,66.67,33.33,33.33,84.31,0.46
2,Papel Pampa,La Paz,45.45,54.55,36.36,18.18,84.31,0.46
3,Arque,Cochabamba,47.37,52.63,47.37,5.26,84.31,0.46
4,Pojo,Cochabamba,47.62,52.38,38.10,14.29,84.31,0.46
...,...,...,...,...,...,...,...,...
95,Nuestra Señora de La Paz,La Paz,96.91,3.09,2.95,0.14,84.31,0.46
96,Cliza,Cochabamba,97.06,2.94,1.47,1.47,84.31,0.46
97,San Lorenzo,Tarija,97.22,2.78,1.39,1.39,84.31,0.46
98,Montero,Santa Cruz,97.79,2.21,1.10,1.10,84.31,0.46


#Ensemble (voting approach)

In [31]:
seed = 8
np.random.seed(seed)
voting_model = VotingClassifier(estimators = [("RF", rf_model), 
                                              ("XGB", xgb_model), 
                                              ("KNN", knn_model), 
                                              ("LogR", logr_model), 
                                              ("GB", gb_model)], 
                                voting="hard")
voting_model.fit(X_train, y_train)

VotingClassifier(estimators=[('RF',
                              RandomForestClassifier(n_estimators=400,
                                                     random_state=1)),
                             ('XGB', XGBClassifier(objective='multi:softprob')),
                             ('KNN', KNeighborsClassifier()),
                             ('LogR', LogisticRegression(max_iter=1000)),
                             ('GB', GradientBoostingClassifier())])

In [32]:
voting_pred = voting_model.predict(X_test)

In [33]:
voting_ensemble_test = test.copy()
pd.options.mode.chained_assignment = None
voting_ensemble_test["Pred_Fraud_num"] = voting_pred.tolist()
voting_ensemble_test["Pred_Fraud_type"] = [ "VS" if type == -1 else "BBS" if 
                                  type == 1 else "Clean" for type in 
                                  voting_ensemble_test["Pred_Fraud_num"] ]
voting_ensemble_test["Department"] = testing_data["Department"]
voting_ensemble_test["Precinct"] = testing_data["Precinct"]
voting_ensemble_test["Municipality"] = testing_data["Municipality"]

In [34]:
voting_ensemble_test.to_csv("ensemble_voting_results.csv", index=False)

In [35]:
col_names = ["Clean", "At_Risk", "BBS_Risk", "VS_Risk", "Avg_Turnout", 
             "Avg_MAS_vote"]
voting_ensemble_results = pd.DataFrame(columns = col_names)
counts = voting_ensemble_test.Pred_Fraud_type.value_counts()
clean = counts[0] / (counts[0] + counts[1] + counts[2])
tot_risk = (counts[1] + counts[2]) / (counts[0] + counts[1] + counts[2])
bbs = counts[1] / (counts[0] + counts[1] + counts[2])
vs = counts[2] / (counts[0] + counts[1] + counts[2])
avg_turnout = voting_ensemble_test["Turnout"].mean()
avg_vote = voting_ensemble_test["MAS.vote"].mean()
voting_ensemble_results.loc[len(avg_ensemble_results)] = [round(100*clean,2), 
                                                  round(100*tot_risk,2), 
                                                  round(100*bbs,2), 
                                                  round(100*vs,2), 
                                                  round(100*avg_turnout,2), 
                                                  round(avg_vote,2)]

In [36]:
voting_ensemble_results.to_csv("ensemble_voting_tot_results.csv", index=False)
voting_ensemble_results

Unnamed: 0,Clean,At_Risk,BBS_Risk,VS_Risk,Avg_Turnout,Avg_MAS_vote
1,85.86,14.14,9.44,4.7,89.8,0.46


In [37]:
col_names = ["Department", "Clean", "At_Risk", "BBS_Risk", "VS_Risk", 
             "Avg_Turnout", "Avg_MAS_vote"]
voting_ensemble_dep_results = pd.DataFrame(columns = col_names)
for dep in voting_ensemble_test["Department"].unique():
    counts = voting_ensemble_test[voting_ensemble_test["Department"] == 
                                  dep].Pred_Fraud_type.value_counts()
    clean = counts[0] / (counts[0] + counts[1] + counts[2])
    tot_risk = (counts[1] + counts[2]) / (counts[0] + counts[1] + counts[2])
    bbs = counts[1] / (counts[0] + counts[1] + counts[2])
    vs = counts[2] / (counts[0] + counts[1] + counts[2])
    avg_turnout = voting_ensemble_test[voting_ensemble_test["Department"] == 
                                       dep]["Turnout"].mean()
    avg_vote = voting_ensemble_test[voting_ensemble_test["Department"] == 
                                    dep]["MAS.vote"].mean()
    voting_ensemble_dep_results.loc[len(voting_ensemble_dep_results)] = [dep, 
                                                  round(100*clean,2), 
                                                  round(100*tot_risk,2), 
                                                  round(100*bbs,2), 
                                                  round(100*vs,2), 
                                                  round(100*avg_turnout,2), 
                                                  round(avg_vote,2)]
voting_ensemble_dep_results = voting_ensemble_dep_results.sort_values(by = 
                                                                      "Clean")
voting_ensemble_dep_results.reset_index(drop=True, inplace=True)

In [38]:
voting_ensemble_dep_results.to_csv("ensemble_voting_dep_results.csv", 
                                   index=False)
voting_ensemble_dep_results

Unnamed: 0,Department,Clean,At_Risk,BBS_Risk,VS_Risk,Avg_Turnout,Avg_MAS_vote
0,Cochabamba,77.8,22.2,11.51,10.69,90.28,0.57
1,La Paz,84.0,16.0,12.86,3.14,91.75,0.53
2,Potosí,86.74,13.26,6.78,6.48,88.87,0.46
3,Oruro,88.08,11.92,9.79,2.13,91.09,0.47
4,Santa Cruz,89.2,10.8,8.73,2.07,88.83,0.34
5,Chuquisaca,91.43,8.57,5.69,2.88,88.41,0.4
6,Beni,92.26,7.74,5.41,2.33,86.17,0.34
7,Tarija,93.83,6.17,3.38,2.79,87.26,0.39
8,Pando,95.42,4.58,3.05,1.53,84.31,0.43


In [39]:
col_names = ["Municipality", "Department", "Clean", "At_Risk", "BBS_Risk", 
             "VS_Risk", "Avg_Turnout", "Avg_MAS_vote"]
voting_ensemble_mun_results = pd.DataFrame(columns = col_names)
for municip in voting_ensemble_test["Municipality"].unique():
  dep = voting_ensemble_test.loc[voting_ensemble_test["Municipality"] == 
                                 municip]["Department"].iloc[0]
  counts = voting_ensemble_test[voting_ensemble_test["Municipality"] == 
                                municip].Pred_Fraud_type.value_counts()
  if (len(counts) > 2): # will remove some municipalities from results
    clean = counts[0] / (counts[0] + counts[1] + counts[2])
    tot_risk = (counts[1] + counts[2]) / (counts[0] + counts[1] + counts[2])
    bbs = counts[1] / (counts[0] + counts[1] + counts[2])
    vs = counts[2] / (counts[0] + counts[1] + counts[2])
    avg_turnout = voting_ensemble_test[voting_ensemble_test["Municipality"] == municip]["Turnout"].mean()
    avg_vote = voting_ensemble_test[voting_ensemble_test["Municipality"] == municip]["MAS.vote"].mean()
    l = len(voting_ensemble_mun_results)
    voting_ensemble_mun_results.loc[l] = [municip, dep, round(100*clean,2), 
                                          round(100*tot_risk,2), 
                                          round(100*bbs,2), round(100*vs,2), 
                                          round(100*avg_turnout,2), 
                                          round(avg_vote,2)]
voting_ensemble_mun_results = voting_ensemble_mun_results.sort_values(by = 
                                                                      "Clean")
voting_ensemble_mun_results.reset_index(drop=True, inplace=True)

In [40]:
voting_ensemble_mun_results.to_csv("ensemble_voting_mun_results.csv", 
                                       index=False)
voting_ensemble_mun_results

Unnamed: 0,Municipality,Department,Clean,At_Risk,BBS_Risk,VS_Risk,Avg_Turnout,Avg_MAS_vote
0,Chacarilla,La Paz,33.33,66.67,33.33,33.33,90.33,0.81
1,Caripuyo,Potosí,38.46,61.54,38.46,23.08,90.48,0.84
2,Pojo,Cochabamba,42.86,57.14,33.33,23.81,92.67,0.88
3,Papel Pampa,La Paz,45.45,54.55,36.36,18.18,90.87,0.82
4,Sacaca,Potosí,45.45,54.55,27.27,27.27,93.73,0.85
...,...,...,...,...,...,...,...,...
128,Bermejo,Tarija,95.16,4.84,3.23,1.61,84.90,0.45
129,Tarija,Tarija,95.36,4.64,4.37,0.27,88.68,0.31
130,Tupiza,Potosí,96.36,3.64,2.73,0.91,87.35,0.56
131,San Lorenzo,Tarija,97.22,2.78,1.39,1.39,87.64,0.44


#Cross-Validation / AUC / Scoring Comparisons

In [58]:
rf_model = RandomForestClassifier(n_estimators=400, random_state=1)
gb_model = GradientBoostingClassifier()
knn_model = KNeighborsClassifier()
logr_model = LogisticRegression(max_iter = 1000)
xgb_model = XGBClassifier()

In [64]:
X_train, X_test, y_train, y_test = train_test_split(train, train_labels, 
                                                    test_size = 0.3, 
                                                    random_state = seed)

In [65]:
seed = 8
np.random.seed(seed)
rf_model.fit(X_train, y_train)
gb_model.fit(X_train, y_train)
knn_model.fit(X_train, y_train)
logr_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)

XGBClassifier(objective='multi:softprob')

In [68]:
rf_auc = roc_auc_score(y_test, rf_model.predict_proba(X_test), 
                       multi_class="ovr")
xgb_auc = roc_auc_score(y_test, xgb_model.predict_proba(X_test), 
                        multi_class="ovr")
gb_auc = roc_auc_score(y_test, gb_model.predict_proba(X_test), 
                       multi_class="ovr")
knn_auc = roc_auc_score(y_test, knn_model.predict_proba(X_test), 
                        multi_class="ovr")
logr_auc = roc_auc_score(y_test, logr_model.predict_proba(X_test), 
                         multi_class="ovr")
# voting_auc = roc_auc_score(y_test, voting_model.predict_proba(X_test), 
#                            multi_class="ovr")
print('Random Forest ROC AUC: %.3f' % (rf_auc))
print('XGBoost ROC AUC: %.3f' % (xgb_auc))
print('Gradient Boosting ROC AUC: %.3f' % (gb_auc))
print('K-Nearest Neighbors ROC AUC: %.3f' % (knn_auc))
print('Logistic Regression ROC AUC: %.3f' % (logr_auc))

Random Forest ROC AUC: 0.997
XGBoost ROC AUC: 0.998
Gradient Boosting ROC AUC: 0.998
K-Nearest Neighbors ROC AUC: 0.995
Logistic Regression ROC AUC: 0.997


In [60]:
cv = KFold(n_splits=10, random_state=seed, shuffle=True)
rf_scores = cross_val_score(rf_model, X_train, y_train, scoring="roc_auc", cv=cv, n_jobs=1)
xgb_scores = cross_val_score(xgb_model, X_train, y_train, scoring="roc_auc", cv=cv, n_jobs=1)
logr_scores = cross_val_score(logr_model, X_train, y_train, scoring="roc_auc", cv=cv, n_jobs=1)
gb_scores = cross_val_score(gb_model, X_train, y_train, scoring="roc_auc", cv=cv, n_jobs=1)
knn_scores = cross_val_score(knn_model, X_train, y_train, scoring="roc_auc", cv=cv, n_jobs=1)
voting_scores = cross_val_score(voting_model, X_train, y_train, scoring="roc_auc", cv=cv, n_jobs=1)

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_scorer.py", line 103, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_scorer.py", line 349, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: multiclass format is not supported



KeyboardInterrupt: ignored

In [57]:
rf_disp = RocCurveDisplay.from_estimator(rf_model, X_test, y_test)
ax = plt.gca()
gb_disp = RocCurveDisplay.from_estimator(gb_model, X_test, y_test, ax=ax)
knn_disp = RocCurveDisplay.from_estimator(knn_model, X_test, y_test, ax=ax)
logr_disp = RocCurveDisplay.from_estimator(logr_model, X_test, y_test, ax=ax)
xgb_disp = RocCurveDisplay.from_estimator(xgb_model, X_test, y_test, ax=ax)
voting_disp = RocCurveDisplay.from_estimator(voting_model, X_test, y_test, 
                                             ax=ax)
plt.show()

ValueError: ignored