In [1]:
# Standard Naive Bayes Classifier:

import pandas as pd

df_train = pd.read_csv('trg.csv', sep=',',)
df_train = df_train.rename(columns={'id': 'ID', 'class': 'CLASS','abstract': 'ABSTRACT'})

print(df_train.shape)
df_train.head()

df_train['CLASS'].value_counts(normalize=True)


# Split data set.
df_training = df_train

print(df_training.shape)


# cleaning
df_training['CLASS'].value_counts(normalize=True)

df_training.head(3)

df_training['ABSTRACT'] = df_training['ABSTRACT'].str.replace(
   '\W', ' ') 
df_training['ABSTRACT'] = df_training['ABSTRACT'].str.lower()
df_training.head(3)

df_training['ABSTRACT'] = df_training['ABSTRACT'].str.split()
voca_list = []
for line in df_training['ABSTRACT']:
    for string in line:
        voca_list.append(string)

voca_list = list(set(voca_list))
len(voca_list)
string_counts = {string: [0] * len(df_training['ABSTRACT']) for string in voca_list}

for i, msg in enumerate(df_training['ABSTRACT']):
    for string in msg:
        string_counts[string][i] += 1
string_counts = pd.DataFrame(string_counts)
string_counts.head()
df_training_clean = pd.concat([df_training, string_counts], axis=1)
df_training.head()

# setting estimates
e_CLASS = df_training_clean[df_training_clean['CLASS'] == 'E']
b_CLASS = df_training_clean[df_training_clean['CLASS'] == 'B']
a_CLASS = df_training_clean[df_training_clean['CLASS'] == 'A']
v_CLASS = df_training_clean[df_training_clean['CLASS'] == 'V']


p_e_CLASS = len(e_CLASS) / len(df_training_clean)
p_b_CLASS= len(b_CLASS) / len(df_training_clean)
p_a_CLASS = len(a_CLASS) / len(df_training_clean)
p_v_CLASS = len(v_CLASS) / len(df_training_clean)

words_in_e_CLASS = e_CLASS['CLASS'].apply(len)
n_e = words_in_e_CLASS.sum()

words_in_b_CLASS = b_CLASS['CLASS'].apply(len)
n_b = words_in_b_CLASS.sum()

words_in_a_CLASS = a_CLASS['CLASS'].apply(len)
n_a = words_in_a_CLASS.sum()

words_in_v_CLASS = v_CLASS['CLASS'].apply(len)
n_v = words_in_v_CLASS.sum()

n_voca = len(voca_list)

Laplace_Smoothing = 1

estimates_e = {word:0 for word in voca_list}
estimates_b = {word:0 for word in voca_list}
estimates_a = {word:0 for word in voca_list}
estimates_v = {word:0 for word in voca_list}

# Calculate estimates
for string in voca_list:
    words_given_e = e_CLASS[string].sum() 
    p_word_given_e = (words_given_e + Laplace_Smoothing) / (n_e + Laplace_Smoothing*n_voca)
    estimates_e[string] = p_word_given_e

    words_given_b = b_CLASS[string].sum() 
    p_word_given_b = (words_given_b + Laplace_Smoothing) / (n_b + Laplace_Smoothing*n_voca)
    estimates_b[string] = p_word_given_b
    
    words_given_a = a_CLASS[string].sum() 
    p_word_given_a = (words_given_a + Laplace_Smoothing) / (n_a + Laplace_Smoothing*n_voca)
    estimates_a[string] = p_word_given_a
    
    words_given_v = v_CLASS[string].sum() 
    p_word_given_v = (words_given_v + Laplace_Smoothing) / (n_v + Laplace_Smoothing*n_voca)
    estimates_v[string] = p_word_given_v   


# train the model.
import re
import math
def classification(abstract):

    abstract = re.sub('\W', ' ', abstract)
    abstract = abstract.lower().split()

    p_e_given_abstract = math.log(p_e_CLASS)
    p_b_given_abstract = math.log(p_b_CLASS)
    p_a_given_abstract = math.log(p_a_CLASS)
    p_v_given_abstract = math.log(p_v_CLASS)

    for string in abstract:
        if string in estimates_e:
            p_e_given_abstract += math.log(estimates_e[string])

        if string in estimates_b:
            p_b_given_abstract += math.log(estimatess_b[string])
            
        if string in estimates_a:
            p_a_given_abstract += math.log(estimates_a[string])
            
        if string in estimates_v:
            p_v_given_abstract += math.log(estimates_v[string])

    print('P(E|abstract):', p_e_given_abstract)
    print('P(B|abstract):', p_b_given_abstract)
    print('P(A|abstract):', p_a_given_abstract)
    print('P(V|abstract):', p_v_given_abstract)

    if p_e_given_abstract > p_b_given_abstract and p_e_given_abstract > p_a_given_abstract and p_e_given_abstract > p_v_given_abstract:
        print('CLASS: E')
    elif p_b_given_abstract > p_e_given_abstract and p_b_given_abstract > p_a_given_abstract and p_b_given_abstract > p_v_given_abstract:
        print('CLASS: B')
    elif p_a_given_abstract > p_e_given_abstract and p_a_given_abstract > p_b_given_abstract and p_a_given_abstract > p_v_given_abstract:
        print('CLASS: A')
    elif p_v_given_abstract > p_e_given_abstract and p_v_given_abstract > p_b_given_abstract and p_v_given_abstract > p_a_given_abstract:
        print('CLASS: V')
    else:
        print('classify')

# test the model.

import re
import math
def test_classification(abstract):
   
    abstract = re.sub('\W', ' ', abstract)
    abstract = abstract.lower().split()

    p_e_given_abstract = math.log(p_e_CLASS)
    p_b_given_abstract = math.log(p_b_CLASS)
    p_a_given_abstract = math.log(p_a_CLASS)
    p_v_given_abstract = math.log(p_v_CLASS)

    for string in abstract:
        if string in estimates_e:
            p_e_given_abstract += math.log(estimates_e[string])

        if string in estimates_b:
            p_b_given_abstract += math.log(estimates_b[string])
            
        if string in estimates_a:
            p_a_given_abstract += math.log(estimates_a[string])
            
        if string in estimates_v:
            p_v_given_abstract += math.log(estimates_v[string])

    if p_e_given_abstract > p_b_given_abstract and p_e_given_abstract > p_a_given_abstract and p_e_given_abstract > p_v_given_abstract:
        return ('E')
    elif p_b_given_abstract > p_e_given_abstract and p_b_given_abstract > p_a_given_abstract and p_b_given_abstract > p_v_given_abstract:
        return ('B')
    elif p_a_given_abstract > p_e_given_abstract and p_a_given_abstract > p_b_given_abstract and p_a_given_abstract > p_v_given_abstract:
        return ('A')
    elif p_v_given_abstract > p_e_given_abstract and p_v_given_abstract > p_b_given_abstract and p_v_given_abstract > p_a_given_abstract:
        return ('V')
    else:
        return 'classify'


df_test= pd.read_csv('trg.csv', sep=',')
df_test = df_test.rename(columns={'id': 'ID', 'class': 'CLASS','abstract': 'ABSTRACT'})

df_test['predicted'] = df_test['ABSTRACT'].apply(test_classification)
df_test.head()


count = 0
length = df_test.shape[0]

for line in df_test.iterrows():
    line = line[1]
    if line['CLASS'] == line['predicted']:
        count += 1

print('Standard Naive Bayes:')
print('TP + TN:', count)
print('FP + FN:', length - count)
print('Accuracy:', count/length)

(4000, 3)
(4000, 3)


  df_training['ABSTRACT'] = df_training['ABSTRACT'].str.replace(


Standard Naive Bayes:
TP + TN: 3535
FP + FN: 465
Accuracy: 0.88375


In [2]:
import pandas as pd
import numpy as np
import imblearn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

df_train = pd.read_csv('trg.csv', sep=',')
df_train = df_train.rename(columns={'id': 'ID', 'class': 'CLASS','abstract': 'ABSTRACT'})

In [3]:
df_train['CLASS'].value_counts()

E    2144
B    1602
A     128
V     126
Name: CLASS, dtype: int64

In [4]:
# Improved Naive Bayes Classifier: we try the best out 10 random sampling model.
for i in range(1,11):
    
    # we use oversample to balanced the data set.
    max_size = df_train['CLASS'].value_counts().max()
    lst = [df_train]
    for class_index, group in df_train.groupby('CLASS'):
        lst.append(group.sample(max_size-len(group), replace=True))
    frame_new = pd.concat(lst)

    df_training = frame_new
    df_training['CLASS'].value_counts(normalize=True)
    df_training.head(3)
    
    # we clean the data set removing punctuation and transform in lowercase.
    df_training['ABSTRACT'] = df_training['ABSTRACT'].str.replace(
       '\W', ' ') 
    df_training['ABSTRACT'] = df_training['ABSTRACT'].str.lower()
    df_training.head() 

    # we create a vocabulary of words excluding digits and common words for both the vocabulary and the data set.
    text = df_training['ABSTRACT']
    voca_list = []
    new_train = []
    for line in text:
        words = [string for string in line.split() if string.lower() not in ENGLISH_STOP_WORDS and not string.isdigit()]
        new_text = " ".join(words)
        new_train.append(new_text)
        for string in new_text.split():
            voca_list.append(string)     
    voca_list = list(set(voca_list))

    # we replace the new text into the dataset.
    df_new_text = pd.DataFrame(new_train, columns =['ABSTRACT'])
    df_training = df_training[['ID','CLASS']]
    df_training['ABSTRACT'] = df_new_text

    df_training['ABSTRACT'] = df_training['ABSTRACT'].str.split()
    word_counts = {string: [0] * len(df_training['ABSTRACT']) for string in voca_list}

    for i, msg in enumerate(df_training['ABSTRACT']):
        for string in msg:
            word_counts[string][i] += 1
    word_counts = pd.DataFrame(word_counts)
    df_training_clean = pd.concat([df_training.reset_index(drop=True), word_counts.reset_index(drop=True)], axis=1, sort=False)

    # Group the data by CLass
    e_CLASS = df_training_clean[df_training_clean['CLASS'] == 'E']
    b_CLASS = df_training_clean[df_training_clean['CLASS'] == 'B']
    a_CLASS = df_training_clean[df_training_clean['CLASS'] == 'A']
    v_CLASS = df_training_clean[df_training_clean['CLASS'] == 'V']

    # Calculating the probabilities.
    p_e_CLASS = len(e_CLASS) / len(df_training_clean)
    p_b_CLASS= len(b_CLASS) / len(df_training_clean)
    p_a_CLASS = len(a_CLASS) / len(df_training_clean)
    p_v_CLASS = len(v_CLASS) / len(df_training_clean)

    # counting each class.
    words_in_e_CLASS = e_CLASS['CLASS'].apply(len)
    n_e = words_in_e_CLASS.sum()
    words_in_b_CLASS = b_CLASS['CLASS'].apply(len)
    n_b = words_in_b_CLASS.sum()
    words_in_a_CLASS = a_CLASS['CLASS'].apply(len)
    n_a = words_in_a_CLASS.sum()
    words_in_v_CLASS = v_CLASS['CLASS'].apply(len)
    n_v = words_in_v_CLASS.sum()

    # number of word in the vocabulary
    n_voca = len(voca_list)

    # Laplace smoothing
    Laplace_Smoothing = 1

    # Initialize the estimates.
    estimates_e = {word:0 for word in voca_list}
    estimates_b = {word:0 for word in voca_list}
    estimates_a = {word:0 for word in voca_list}
    estimates_v = {word:0 for word in voca_list}

    # Calculating estimates.
    for string in voca_list:
        words_given_e = e_CLASS[string].sum() 
        p_word_given_e = (words_given_e + Laplace_Smoothing) / (n_e + Laplace_Smoothing*n_voca)
        estimates_e[string] = p_word_given_e

        words_given_b = b_CLASS[string].sum() 
        p_word_given_b = (words_given_b + Laplace_Smoothing) / (n_b + Laplace_Smoothing*n_voca)
        estimates_b[string] = p_word_given_b

        words_given_a = a_CLASS[string].sum() 
        p_word_given_a = (words_given_a + Laplace_Smoothing) / (n_a + Laplace_Smoothing*n_voca)
        estimates_a[string] = p_word_given_a

        words_given_v = v_CLASS[string].sum() 
        p_word_given_v = (words_given_v + Laplace_Smoothing) / (n_v + Laplace_Smoothing*n_voca)
        estimates_v[string] = p_word_given_v   
    
    # function to calculate the probabilities.
    import re
    import math
    def test_classification(abstract):
   
        abstract = re.sub('\W', ' ', abstract)
        abstract = abstract.lower().split()

        p_e_given_abstract = math.log(p_e_CLASS)
        p_b_given_abstract = math.log(p_b_CLASS)
        p_a_given_abstract = math.log(p_a_CLASS)
        p_v_given_abstract = math.log(p_v_CLASS)

        for word in abstract:
            if word in estimates_e:
                p_e_given_abstract += math.log(estimates_e[word])

            if word in estimates_b:
                p_b_given_abstract += math.log(estimates_b[word])

            if word in estimates_a:
                p_a_given_abstract += math.log(estimates_a[word])

            if word in estimates_v:
                p_v_given_abstract += math.log(estimates_v[word])

        if p_e_given_abstract > p_b_given_abstract and p_e_given_abstract > p_a_given_abstract and p_e_given_abstract > p_v_given_abstract:
            return ('E')
        elif p_b_given_abstract > p_e_given_abstract and p_b_given_abstract > p_a_given_abstract and p_b_given_abstract > p_v_given_abstract:
            return ('B')
        elif p_a_given_abstract > p_e_given_abstract and p_a_given_abstract > p_b_given_abstract and p_a_given_abstract > p_v_given_abstract:
            return ('A')
        elif p_v_given_abstract > p_e_given_abstract and p_v_given_abstract > p_b_given_abstract and p_v_given_abstract > p_a_given_abstract:
            return ('V')
        else:
            return 'classify'

    
    # calculating the acurracy for the training set.
    
    df_test= pd.read_csv('trg.csv', sep=',',)
    df_test = df_test.rename(columns={'id': 'ID', 'class': 'CLASS','abstract': 'ABSTRACT'})
    df_test['predicted'] = df_test['ABSTRACT'].apply(test_classification)

    count = 0
    length = df_test.shape[0]

    for line in df_test.iterrows():
        line = line[1]
        if line['CLASS'] == line['predicted']:
            count += 1
        
    print('Test number'+ str(i))
    print('TP + TN:', count)
    print('FP + FN:', length - count)
    print('Accuracy:', count/length)
    
    # predicting values for the test set.
    
    df_test= pd.read_csv('tst.csv', sep=',',)
    df_test = df_test.rename(columns={'id': 'ID', 'class': 'CLASS','abstract': 'ABSTRACT'})
    df_test['predicted'] = df_test['ABSTRACT'].apply(test_classification)
    filename = 'prediction' + str(i) + '.csv'

    #exporting the prediction to a csv file.
    
    df_test.to_csv(filename)

  df_training['ABSTRACT'] = df_training['ABSTRACT'].str.replace(


Test number8575
TP + TN: 3961
FP + FN: 39
Accuracy: 0.99025


KeyboardInterrupt: 