Implementation of Naive Bayes Algorithm found in the paper Beyond Accuracy: ROI-driven Data Analytics of Empirical Data

In [114]:
import pandas as pd

#Import the data from 2 files (My computer runs out of memory with more)

#alldata = pd.read_csv('./dataset/AllData.csv', low_memory=False)
data_part1 = pd.read_csv('./dataset/Data_part1.csv', low_memory=False) #Only this dataset was used to calculate results, but more can easily be added
#data_part1Stage2 = pd.read_csv('./dataset/Data_part1Stage2.csv', low_memory=False)
#data_part2 = pd.read_csv('./dataset/Data_part2.csv', low_memory=False)
#data_part2Stage2 = pd.read_csv('./dataset/Data_part2Stage2.csv', low_memory=False)
#data_part3 = pd.read_csv('./dataset/Data_part3.csv', low_memory=False)
#data_part3Stage2 = pd.read_csv('./dataset/Data_part3Stage2.csv', low_memory=False)
#data_part4 = pd.read_csv('./dataset/Data_part4.csv', low_memory=False)
#data_part4Stage2 = pd.read_csv('./dataset/Data_part4Stage2.csv', low_memory=False)
#data_part5Stage2 = pd.read_csv('./dataset/Data_part5Stage2.csv', low_memory=False)

In [115]:
dataset_names = [data_part1] #[data_part1, data_part1Stage2] #Add more dataset names to this list to create larger dataset
complete_dataset =  pd.concat(dataset_names, ignore_index=True, sort=False).drop(columns=['Unnamed: 0']) #Put the files into one dataframe
complete_dataset = complete_dataset.drop(columns=list(complete_dataset.filter(regex = '^cf'))) #Remove all columns with cf
complete_dataset = complete_dataset.set_index('id') #Create an explicit index. Each row can now be refered to by it's ID

In [116]:
#Get all of the rows which depend on something
index_dataset = complete_dataset[['depends_on']]
index_dataset = index_dataset.dropna()

#If you have multiple values in one cell, split them into different columns (Ex: 2 items seperated by a comma = 2 columns)
targets = index_dataset['depends_on'].str.split(',', expand=True)
#Count the number of instances in each row
result = targets.apply(pd.Series.value_counts)
#Remove NaN values - they throw things off
result = result.fillna(0)

#Sum all the columns, based on row. This will give us the total count for how many entries 
#are dependent on another, with the index being the ID of the "depends_on" entry
result = result.sum(axis='columns').to_frame()

#Find the entries which have the most references in the "depends_on" column
result = result.sort_values(by = 0, ascending=False)

#Get the top 80 referenced entries
result = result.head(100)

#Make a list of the entries
indicies = result.index.values.tolist()

#Make sure the entries are in string format, with all whitespace stripped
indicies = list(map(str,indicies))
indicies = [i.strip() for i in indicies]
print(indicies)

['1130266', '1448703', '1529362', '1319773', '1231208', '1445197', '1575284', '1574241', '1460343', '1470432', '1471622', '1464426', '1444991', '1218018', '1478118', '1477783', '278458', '1217996', '1344091', '1441308', '1467949', '1353360', '1172897', '1476876', '1480504', '1457891', '910412', '1418995', '1472491', '1477638', '1212797', '1461360', '57805', '231429', '1462470', '1490242', '947490', '1449562', '1464828', '1574159', '426727', '1427928', '1350424', '252848', '1492566', '1354083', '1320475', '1476865', '912121', '1456555', '1191460', '790640', '1234485', '1356397', '1196785', '1073717', '1082598', '1436478', '1494403', '1472491', '1270763', '403137', '1467221', '1446830', '1464311', '1466549', '851471', '38447', '1247628', '899013', '903519', '1485081', '1250473', '1334655', '1475252', '1473530', '1250902', '1473772', '1342026', '1464123', '1488620', '1463587', '106592', '1457891', '1457500', '18729', '1357819', '1317102', '1452845', '1425941', '1425484', '1364359', '12722

In [129]:
#Import the library which has the stopwords (if, the, etc...)
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
import random
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

#Give me the depends_on and summary column
complete_dataset = complete_dataset[['depends_on', 'summary']]
print(complete_dataset.shape) #Check how big the dataframe is

#Used to remove the stopwords
def removeStopWords(startingList):
    returnedList = [x for x in startingList if x not in stopwords.words('english')]
    return returnedList

def lemmatize(startingList):
    lmtzr = WordNetLemmatizer()
    returnedList = [lmtzr.lemmatize(x) for x in startingList]
    return returnedList

#Used to remove punctuation from the summary column
tokenizer = RegexpTokenizer(r'\w+')

#Get the summary column
summary_features = complete_dataset['summary']
summary_features = summary_features.to_frame()

#Remove punctuation
summary_features = summary_features.apply(lambda x: tokenizer.tokenize(x['summary']), axis=1)
#Remove stopwords
summary_features = summary_features.apply(lambda x: removeStopWords(x))
#Lemmatize
summary_features = summary_features.apply(lambda x: lemmatize(x))

#Get the "depends_on" column into its own dataframe, change each cell to hold a list of strings (each ID = 1 string) instead of one string
targets = complete_dataset['depends_on'].str.split(',').to_frame()
#Replace all NaN values with empty lists
targets['depends_on'] = targets['depends_on'].apply(lambda x: x if isinstance(x, list) else [])
summary_features = summary_features.to_frame()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/catcyr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/catcyr/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/catcyr/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


(9879, 2)


In [130]:
from sklearn.preprocessing import MultiLabelBinarizer

#Change the summary dataframe to onehot encoding - each row can have more than one "1" because sentence have multiple words
mlb = MultiLabelBinarizer()
features = summary_features.join(pd.DataFrame(mlb.fit_transform(summary_features.pop(0)), columns=mlb.classes_, index=summary_features.index))
original_features = features

#Change the depends_on dataframe to onehot encoding
targets = targets.join(pd.DataFrame(mlb.fit_transform(targets.pop('depends_on')), columns=mlb.classes_, index=targets.index))

#Remove whitespaces in the column names
targets.columns = targets.columns.str.strip()
print(targets)

         1000077  1000199  1000317  1000462  1000745  1000775  1000814  \
id                                                                       
1572867        0        0        0        0        0        0        0   
1572870        0        0        0        0        0        0        0   
1572872        0        0        0        0        0        0        0   
1441804        0        0        0        0        0        0        0   
1048589        0        0        0        0        0        0        0   
...          ...      ...      ...      ...      ...      ...      ...   
1486848        0        0        0        0        0        0        0   
1355787        0        0        0        0        0        0        0   
1486859        0        0        0        0        0        0        0   
1486865        0        0        0        0        0        0        0   
1486866        0        0        0        0        0        0        0   

         1000870  1000879  100090  ..

In [131]:
#Select only the  top  80 dependencies
targets2 = targets[indicies]
#Remove duplicate rows and columns
targets2 = targets2.groupby(level=0, axis=1).sum()
targets2 = targets2.groupby(level=0, axis=0).sum()

#Create a single column of rows that have dependencies in the top 80 and those which do not
targets2 = targets2.max(axis = 1)
print(targets2)

id
915        0
2654       0
2678       0
2800       0
2892       0
          ..
1575563    0
1575565    0
1575567    0
1575575    0
1575582    0
Length: 9879, dtype: int64


In [132]:
#Reduce the overall number of entires
#Select all entries that are equal to 1 (has a dependency) and 0 (does not have a dependency)
zeroes = targets2[targets2 == 0]
ones = targets2[targets2 == 1]
print(len(zeroes))
print(len(ones))

#Sample list of zeroes and select the same as the number of dependency entries
zeroes_sampled = zeroes.sample(n = len(ones))

#combine lists and sort by index
zeroes_ones = pd.concat([zeroes_sampled, ones])
zeroes_ones = zeroes_ones.sort_index(ascending = True)

targets2 = zeroes_ones

print(zeroes_ones)

9451
411
id
2800       0
5704       0
11056      1
19251      0
32023      0
          ..
1575279    1
1575280    1
1575281    1
1575282    1
1575283    1
Length: 822, dtype: int64


In [133]:
#Get the indecies of the selected target values
target_indicies = targets2.index.values.tolist()

#Remove duplicate rows
features = features.groupby(level=0, axis=0).sum()
#Select features which match the target indicies
features = features.loc[target_indicies]


In [122]:
from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report
import numpy as np
from sklearn.model_selection import train_test_split

#10-fold cross validator
kf = KFold(n_splits = 10)

#Create an SVM classifier
svm = SVC(kernel='rbf', C=20.0, gamma='scale')

#List the results of the 10-fold CV
f1_scores = []
prec_scores = []
rec_scores = []

labels = targets2.to_list()

for k, (train, test) in enumerate(kf.split(features, labels)):
    #Fit the SVC classifier
    svm.fit(features.iloc[train], targets2.iloc[train])
    
    #Predict y
    y_predict = svm.predict(features.iloc[test])

    #Collect scores from all splits
    prec_scores.append(precision_score(targets2.iloc[test], y_predict, average='weighted'))
    rec_scores.append(recall_score(targets2.iloc[test], y_predict, average='weighted'))
    f1_scores.append(f1_score(targets2.iloc[test], y_predict, average='weighted'))
    
    #Print confusion matrix and classification report
    print(confusion_matrix(targets2.iloc[test], y_predict))
    print(classification_report(targets2.iloc[test], y_predict, target_names=["Independent", "Dependent"]))
    
#Print mean scores from all splits
print("Mean precision score after 10-fold cross validation: {}".format(np.mean(prec_scores)))
print("Mean recall score after 10-fold cross validation: {}".format(np.mean(rec_scores)))
print("Mean F1 score after 10-fold cross validation: {}".format(np.mean(f1_scores)))


[[43 10]
 [20 10]]
              precision    recall  f1-score   support

 Independent       0.68      0.81      0.74        53
   Dependent       0.50      0.33      0.40        30

    accuracy                           0.64        83
   macro avg       0.59      0.57      0.57        83
weighted avg       0.62      0.64      0.62        83

[[46  9]
 [18 10]]
              precision    recall  f1-score   support

 Independent       0.72      0.84      0.77        55
   Dependent       0.53      0.36      0.43        28

    accuracy                           0.67        83
   macro avg       0.62      0.60      0.60        83
weighted avg       0.65      0.67      0.66        83

[[35  8]
 [22 17]]
              precision    recall  f1-score   support

 Independent       0.61      0.81      0.70        43
   Dependent       0.68      0.44      0.53        39

    accuracy                           0.63        82
   macro avg       0.65      0.62      0.62        82
weighted avg     

In [135]:
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

nb = GaussianNB()
rf = RandomForestClassifier()

nb.fit(features, targets2)
rf.fit(features, targets2)

test_targets = targets.drop(targets2.index)
#Remove duplicate rows
test_features = original_features.groupby(level=0, axis=0).sum()
#Select features which match the target indicies
test_features = test_features.loc[test_targets.index]

nb_predicted = nb.predict(test_features)
rf_predicted = rf.predict(test_features)


In [186]:
print(nb_predicted)
print(rf_predicted)

matching_indices = [np.where(nb_predicted == rf_predicted)]
nb_zeroes = [np.where(nb_predicted == 0)]
nb_ones = [np.where(nb_predicted == 1)]

matching_zeroes = [element for element in matching_indices[0][0] if element in nb_zeroes[0][0]]

matching_ones = [element for element in matching_indices[0][0] if element in nb_ones[0][0]]

smaller_array_size = min(len(matching_zeroes), len(matching_ones))
samples_to_add = min(smaller_array_size, len(labels)) / 2
print(samples_to_add)

[1 1 1 ... 0 0 0]
[0 0 0 ... 0 1 0]
411.0


In [212]:
new_matching_zeroes = matching_zeroes[:(int(samples_to_add) + 1)]
new_matching_ones = matching_ones[:(int(samples_to_add) + 1)]
new_indices = original_features.index.to_list()

independent_indices = [new_indices[index] for index in new_matching_zeroes]
dependent_indices = [new_indices[index] for index in new_matching_ones]

all_new_indices = np.sort(independent_indices + dependent_indices)

new_features = original_features.groupby(level=0, axis=0).sum()
new_features = new_features.loc[all_new_indices]

new_labels = []
for index in all_new_indices:
    if index in independent_indices:
        new_labels.append(0)
    else:
        new_labels.append(1)

In [223]:


updated_features = pd.concat([features, new_features])

#10-fold cross validator
kf = KFold(n_splits = 10)

#Create an SVM classifier
svm = SVC(kernel='rbf', C=20.0, gamma='scale')

#List the results of the 10-fold CV
f1_scores = []
prec_scores = []
rec_scores = []

updated_labels = labels + new_labels
updated_labels = np.array(updated_labels)

for k, (train, test) in enumerate(kf.split(updated_features, updated_labels)):
    #Fit the SVC classifier
    svm.fit(updated_features.iloc[train], updated_labels[train])
    
    #Predict y
    y_predict = svm.predict(updated_features.iloc[test])

    #Collect scores from all splits
    prec_scores.append(precision_score(updated_labels[test], y_predict, average='weighted'))
    rec_scores.append(recall_score(updated_labels[test], y_predict, average='weighted'))
    f1_scores.append(f1_score(updated_labels[test], y_predict, average='weighted'))
    
    #Print confusion matrix and classification report
    print(confusion_matrix(updated_labels[test], y_predict))
    print(classification_report(updated_labels[test], y_predict, target_names=["Independent", "Dependent"]))
    
#Print mean scores from all splits
print("Mean precision score after 10-fold cross validation: {}".format(np.mean(prec_scores)))
print("Mean recall score after 10-fold cross validation: {}".format(np.mean(rec_scores)))
print("Mean F1 score after 10-fold cross validation: {}".format(np.mean(f1_scores)))

[[44 63]
 [21 37]]
              precision    recall  f1-score   support

 Independent       0.68      0.41      0.51       107
   Dependent       0.37      0.64      0.47        58

    accuracy                           0.49       165
   macro avg       0.52      0.52      0.49       165
weighted avg       0.57      0.49      0.50       165

[[35 47]
 [35 48]]
              precision    recall  f1-score   support

 Independent       0.50      0.43      0.46        82
   Dependent       0.51      0.58      0.54        83

    accuracy                           0.50       165
   macro avg       0.50      0.50      0.50       165
weighted avg       0.50      0.50      0.50       165

[[39 41]
 [36 49]]
              precision    recall  f1-score   support

 Independent       0.52      0.49      0.50        80
   Dependent       0.54      0.58      0.56        85

    accuracy                           0.53       165
   macro avg       0.53      0.53      0.53       165
weighted avg     