In [1]:
import pandas as pd
import numpy as np

In [2]:
sklearn_results_file = '/content/drive/MyDrive/6220 Project/SpamDatasets/processed_data/sklearn_results.csv'
bi_lstm_results_file = '/content/drive/MyDrive/6220 Project/SpamDatasets/processed_data/bidirectional_LSTM_results.csv'
nn_results_file = '/content/drive/MyDrive/6220 Project/SpamDatasets/processed_data/nn_results.csv'
data_file = '/content/drive/MyDrive/6220 Project/SpamDatasets/processed_data/fully_combined_data.csv'

sk_df = pd.read_csv(sklearn_results_file).rename(columns={'Unnamed: 0':'index',
                                                          'Label':'label',
                                                          '0' : 'pred_m1',
                                                          '1' : 'pred_m2',
                                                          '2' : 'pred_m3'}).sort_values('index', ignore_index = True)
sk_df = sk_df[['index', 'label', 'pred_m1', 'pred_m2', 'pred_m3']]

lstm_df = pd.read_csv(bi_lstm_results_file).rename(columns={'pred':'pred_lstm'})
nn_df = pd.read_csv(nn_results_file).rename(columns={'pred':'pred_nn'})
data = pd.read_csv(data_file, header = None).rename(columns={0:'text', 1:'label'})

sk_df.head(5)
# len(sk_df)

Unnamed: 0,index,label,pred_m1,pred_m2,pred_m3
0,0,1,0,0,1
1,1,0,0,0,0
2,2,1,1,1,1
3,3,1,1,0,1
4,4,1,1,0,1


In [3]:
len(sk_df)

33752

In [4]:
# merge for incomplete data - comment out if entire dataset present

# ensemble = sk_df.merge(nn_df, on = 'index')
# ensemble = ensemble.merge(lstm_df, on = 'index' )
# ensemble

In [5]:
# small dataset for testing

# sk_df = sk_df.head(10)
# nn_df = nn_df.head(10)
# lstm_df = lstm_df.head(10)

In [6]:
# join the results 

# pred_m1 is linearSVC, pred_m2 is gaussianNB, pred_m3 is multinomialNB

voting_models = ['pred_m1', 'pred_m2', 'pred_m3','pred_nn','pred_lstm'] # all models
# voting_models = ['pred_m1','pred_nn','pred_lstm']

ensemble = sk_df.join(nn_df['pred_nn'])
ensemble = ensemble.join(lstm_df['pred_lstm'])

In [7]:
# get majority vote

majority_vote = ensemble[voting_models].mode(axis=1).iloc[:,0].astype(int)

ensemble = ensemble.join(majority_vote).rename(columns={0:'pred_ensemble'})
ensemble

Unnamed: 0,index,label,pred_m1,pred_m2,pred_m3,pred_nn,pred_lstm,pred_ensemble
0,0,1,0,0,1,0,1,0
1,1,0,0,0,0,0,0,0
2,2,1,1,1,1,0,1,1
3,3,1,1,0,1,1,1,1
4,4,1,1,0,1,1,1,1
...,...,...,...,...,...,...,...,...
33747,33747,0,0,0,0,0,0,0
33748,33748,1,1,1,1,1,1,1
33749,33749,0,0,0,0,0,0,0
33750,33750,0,0,0,0,0,0,0


In [8]:
ensemble[['label','pred_ensemble']]

Unnamed: 0,label,pred_ensemble
0,1,0
1,0,0
2,1,1
3,1,1
4,1,1
...,...,...
33747,0,0
33748,1,1
33749,0,0
33750,0,0


In [9]:
# get accuracy of the ensemble
# function to calculate true positive, true negative, false positive, false negatives

def perf_measure(y, pred):
    tp, fp, tn, fn = 0, 0, 0, 0
    for i in range(len(y)):
        if y[i] == pred[i] == 1:
            tp += 1
        if y[i] == 1 and y[i] != pred[i]:
            fp += 1
        if y[i] == pred[i] == 0:
            tn += 1
        if y[i] == 0 and y[i] != pred[i]:
            fn += 1
    return tp, fp, tn, fn

y = ensemble['label'].to_numpy()
pred = ensemble['pred_ensemble'].to_numpy()
tp, fp, tn, fn = perf_measure(y, pred)

# accuracy
(tp + tn) / (tp + fp + tn + fn)

0.9472920123251956

In [10]:
# precision

tp / (tp + fp)

0.9522403316485888

In [11]:
# recall

tp / (tp + fn)

0.9470236078165594

In [12]:
# get difficulty of data point, ie. how many classifiers got it right?

data = data.join(ensemble[['pred_m1', 'pred_m2', 'pred_m3', 'pred_nn', 'pred_lstm']])
data

Unnamed: 0,text,label,pred_m1,pred_m2,pred_m3,pred_nn,pred_lstm
0,recommend view accurately someone dream change...,1,0,0,1,0,1
1,dec prod panther pipeline demand charge please...,0,0,0,0,0,0
2,refill notification ref voyage scalp overseer ...,1,1,1,1,0,1
3,reference ohi hello send email days ago want c...,1,1,0,1,1,1
4,important online bank alert dear value tcf ban...,1,1,0,1,1,1
...,...,...,...,...,...,...,...
33747,additional bid criteria ask go back shippers s...,0,0,0,0,0,0
33748,asterisk users asterisk debian lenny without j...,1,1,1,1,1,1
33749,steve trip houston dale richard ready cover co...,0,0,0,0,0,0
33750,conversation speak gregg chicago transactions ...,0,0,0,0,0,0


In [13]:
# data = data.head(10)
# data

In [14]:
row = data.iloc[0]
row

text         recommend view accurately someone dream change...
label                                                        1
pred_m1                                                      0
pred_m2                                                      0
pred_m3                                                      1
pred_nn                                                      0
pred_lstm                                                    1
Name: 0, dtype: object

In [15]:
preds = ['pred_m1', 'pred_m2', 'pred_m3','pred_nn','pred_lstm']

data.loc[:,'correct_count'] = np.nan

for i, row in data.iterrows():
    correct_count = 0
    for p in preds:
        if row[p] == row['label']:
            correct_count += 1
    data.loc[i, 'correct_count'] = int(correct_count)
    # print(correct_count)

data

Unnamed: 0,text,label,pred_m1,pred_m2,pred_m3,pred_nn,pred_lstm,correct_count
0,recommend view accurately someone dream change...,1,0,0,1,0,1,2.0
1,dec prod panther pipeline demand charge please...,0,0,0,0,0,0,5.0
2,refill notification ref voyage scalp overseer ...,1,1,1,1,0,1,4.0
3,reference ohi hello send email days ago want c...,1,1,0,1,1,1,4.0
4,important online bank alert dear value tcf ban...,1,1,0,1,1,1,4.0
...,...,...,...,...,...,...,...,...
33747,additional bid criteria ask go back shippers s...,0,0,0,0,0,0,5.0
33748,asterisk users asterisk debian lenny without j...,1,1,1,1,1,1,5.0
33749,steve trip houston dale richard ready cover co...,0,0,0,0,0,0,5.0
33750,conversation speak gregg chicago transactions ...,0,0,0,0,0,0,5.0


In [16]:
data.to_csv('ensemble_correct_count.csv')