In [1]:
#Imports and libraries 
import numpy as np
import pandas as pd
import math
import csv
import re

In [2]:
#Loading Training Data
path = "trg.csv"

#Not Pandas
with open(path) as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    data = list(readCSV)
    headers = data[0]
    data = data[1:]
    data = np.array(data)
    
path = "tst.csv"
with open(path) as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    dataTesting = list(readCSV)
    headersTesting = dataTesting[0]
    dataTesting = dataTesting[1:]
    dataTesting = np.array(dataTesting)


In [3]:
#Preprocessing - NO PANDAS
#Format - Uppercase letters
for row in data:
    row[2]=row[2].upper()
    
#Remove negavite numbers and special characters (')
for row in data:
    row[2]=re.sub(r"\s(-*\d*)*\s",' ',row[2])
    row[2]=re.sub(r"[']",' ',row[2])

#Split abstract test by words
index = 0
abstracts = {}
for row in data:
    abstracts[index] = np.array(row[2].split(' '))
    index += 1
    
#Remove common words and sort the array
#Function to remove common words
def removeCommon(words):
    #List of common words
    remove = ['THE','OF','A','AN','FOR','THAT','WITH','BY','AND',
          'OR','IS','ARE','BY','WAS','WERE','IT','ITS', 'TO',
             'WHICH', 'IN', 'HAVE', 'HAS', 'NO','NOT','AS', 'ALSO']
    
    #Loop to remove common words & numeric values in the abstract
    control=0
    while control<len(words):
        if words[control] in remove:
            words[control]="-DELETE-"
        control += 1
    words = np.delete(words,np.argwhere(words=='-DELETE-')[:,0])
    
    #Return the sorted array    
    words = np.sort(words)
    return words

#Words Collector
# Identify different words in the abstracts 
# and count number of abstracts in which their appears (frequency in data instances - NOT Individual)
def wordCollector(column):
    globalWords = {}
    for text in column:
        rowWords = {}
        for new in column[text]:
            if not new in rowWords.keys():
                rowWords[new]=1;
        for word in rowWords.keys():
            if word in globalWords.keys():
                globalWords[word] += 1
            else:
                globalWords[word] = 1
    return globalWords

#Remove Outliers
def removeUnique(allWords, limit=1):
    words = list(allWords)
    for word in words:
        if allWords[word]<=limit:
            allWords.pop(word)
    return allWords       

#Calculate probability of each class in the given class column of the dataset
def classProbability(classes):
    classProbs = {}
    uniqueClass = np.unique(classes)
    for c in uniqueClass:
        classProbs[c] = np.count_nonzero(classes==c)/classes.size
    return classProbs

#Function to group all preprocessing steps
def preprocessing(data, unlabel=False):
    if unlabel==True:
        index=1
    else:
        index=2
    #Format - Uppercase letters
    for row in data:
        row[index]=row[index].upper()
        
    #Remove negavite numbers and special characters (')
    for row in data:
        row[index]=re.sub(r"\s(-*\d*)*\s",' ',row[index])
        row[index]=re.sub(r"[']",' ',row[index])
        
    #Split abstract string
    num = 0
    abstracts = {}
    for row in data:
        abstracts[num] = np.array(row[index].split(' '))
        num += 1
        
    #RemoveCommon words
    for element in abstracts:
        abstracts[element] = removeCommon(abstracts[element])
        
    return abstracts

In [4]:
#Naive Bayes Model - NO PANDAS

#General Functions

#Calculate count of word of instances for a given class
#target = class to count words
#data = dataset - abstracts
#instances = index of instances of the target class
def classWords(target, data, instances):
    wordFrequency = {}
    cWords = {}
    cWords['total']=0
    for row in instances:
        wordsRow = np.unique(data[row])
        for word in data[row]:
            if not word in cWords.keys():
                cWords[word] = 1
                wordFrequency[word]=1
                wordsRow = np.delete(wordsRow,np.where(wordsRow==word)[0])
            else:
                cWords[word] += 1
                if word in wordsRow:
                    wordFrequency[word]+=1
                    wordsRow = np.delete(wordsRow,np.where(wordsRow==word)[0])
            cWords['total']+=1
    
    wordFrequency = removeUnique(wordFrequency, 1)
    wordFrequency['total']=cWords['total']
    return cWords, wordFrequency

#Calculate the prior probabilities of an instances for all classe
#words = Vocabulary of the dataset
#instance = instances | row evaluated
#targets = predictable classes
#targetwords = count of word of all instances of the same class
def conditionalProb(words, instances, targets, targetWords):
    classPriorProbs = {}
    
    #Data Vocabulary (Count of unique words)
    wordCount = len(words)
    abstractWords = np.unique(instances)
    
    # Evaluation per class
    for target in targets:        
        classPriorProbs[target] = {}
        totalClassWords = targetWords[target]['total']
        temp = targetWords[target].pop('total')

        #Evaluation of each word in the instances        
        for word in abstractWords:
            if not word in targetWords[target].keys():
                classPriorProbs[target][word] = 1 / (totalClassWords + wordCount)
            else:
                classPriorProbs[target][word] = \
                (targetWords[target][word] + 1 ) / (totalClassWords + wordCount)
        targetWords[target]['total']=temp
    return classPriorProbs

#Classifier - Calculate class probability of and instance
#instance = instance evaluated
#condProb = conditional probabilities - classPriorProbs - prob of word for each class
#priorProb = Prior probabilities for each class
def classProb(instance, condProbs, priorProb):
    postProb = {}
    words = {}
    
    # Count unique word of an instance
    rowWords = np.unique(instance)
    for word in rowWords:        
        words[word] = np.count_nonzero(instance==word)
        
    # Count unique words in a class
    for x in priorProb.keys():
        postProb[x] = math.log(priorProb[x])
        
        # Calculate posterior probability of instance
        for word in words:
            postProb[x] += words[word]*math.log(condProbs[x][word])   
    return postProb

#Performances - Asses acurracy of model
#predictions - Array with predicted classes
#real - array of data with target classes from the original dataset (id, class, abstract)
def modelPerformance(predictions, real):
    valid=0;
    total = len(predictions)
    for prediction in range(len(predictions)):
        if predictions[prediction]==real[prediction][1]:
            valid += 1
    performance=valid/total
    return performance
   

In [5]:
#Cross-Validation Test
folds = 10
samplesData = {}
cycle = 0
performance = np.zeros(folds)

#Split data 
#Creating Folds
foldSize= int(len(data)/folds)
initialSample = 0
finalSample = foldSize
for fold in range(folds):
    if fold == folds:
        samplesData[fold] = data[initialSample:,0:3]
    else:
        samplesData[fold] = data[initialSample:finalSample,0:3]
        initialSample = finalSample
        finalSample += foldSize
        
#Evaluate model performances with cross-validation
for sample in range(folds):
    trainingData = []   # id, class
    print("Fold: " + str(sample))
    testData = samplesData[sample]     #id, class, abstracts
    for trainingSample in samplesData:        
        if sample!=trainingSample:
            for x in range(len(samplesData[trainingSample])):
                trainingData.append(samplesData[trainingSample][x])

    trainingData = np.array(trainingData)
    
    #Determine Instances
    numInstances = len(trainingData)
    print("Fold lenght:" + str(numInstances))
    
    #Preprocessing training Data
    training = preprocessing(trainingData)
    test = preprocessing(testData)
    
    #Select training
    #training = abstracts
    
    #Determine classes
    classes = np.unique(trainingData[:,1])
    
    #Determine class probabilities
    classProbs = classProbability(trainingData[:,1])
    
    #Calculate words frequency
    allWords = wordCollector(training)
    print("Total number of words:" + str(len(allWords)))
    
    allWords = removeUnique(allWords, 1)
    print("Total number of words after remove unique :" + str(len(allWords)))
    
    #Create and array with all words
    words = list(allWords)
    words.sort()
    
    #Get words count of instances of each class value
    classesWords = {}
    frequentWords = {}
    for c in classes:
        classesWords[c], frequentWords[c]= classWords(c, training, np.where(trainingData==c)[0])
   
    testSamples=len(test)
    predictions=[]
    for elem in range(len(test)):
        
        #Get conditional probabilities of rows
        condProbs = conditionalProb(words,test[elem],classes,classesWords)
        
        #Classifier - Selecting a class
        classifier = classProb(test[elem], condProbs, classProbs)
        
        #Select final classification
        predictions.append(max(classifier.keys(), key=(lambda k: classifier[k])))
    
    #Measure fold performance
    performance[sample]= modelPerformance(predictions,testData)
    averagePerformance = sum(performance)/len(performance)
print(averagePerformance)

    

Fold: 0
Fold lenght:3600
Total number of words:27651
Total number of words after remove unique :12358
Fold: 1
Fold lenght:3600
Total number of words:27678
Total number of words after remove unique :12279
Fold: 2
Fold lenght:3600
Total number of words:27584
Total number of words after remove unique :12183
Fold: 3
Fold lenght:3600
Total number of words:27725
Total number of words after remove unique :12297
Fold: 4
Fold lenght:3600
Total number of words:27762
Total number of words after remove unique :12237
Fold: 5
Fold lenght:3600
Total number of words:27921
Total number of words after remove unique :12317
Fold: 6
Fold lenght:3600
Total number of words:27946
Total number of words after remove unique :12314
Fold: 7
Fold lenght:3600
Total number of words:27873
Total number of words after remove unique :12266
Fold: 8
Fold lenght:3600
Total number of words:27811
Total number of words after remove unique :12328
Fold: 9
Fold lenght:3600
Total number of words:27781
Total number of words after r

In [6]:
#Generate Solution and Export to CSV
with open('trainingResults.csv', mode='w', newline='') as csv_file:
    fieldnames = headers
    writer = csv.writer(csv_file)


    writer.writerow([headers[0],headers[1],"Real"])
    for index in range(len(predictions)):
        writer.writerow([index+1, predictions[index],testData[index][1]])

In [7]:
#Classify test data tst.csv

test = preprocessing(dataTesting, True)

testSamples=len(test)
predictions=[]
for elem in range(len(test)):
        
    #Get conditional probabilities of rows
    condProbs = conditionalProb(words,test[elem][1],classes,classesWords)
        
    #Classifier - Selecting a class
    classifier = classProb(test[elem][1], condProbs, classProbs)
        
    #Select final classification
    predictions.append(max(classifier.keys(), key=(lambda k: classifier[k])))
    
print(predictions)


['E', 'B', 'B', 'E', 'E', 'E', 'B', 'E', 'E', 'E', 'A', 'E', 'B', 'B', 'E', 'E', 'E', 'E', 'E', 'E', 'B', 'B', 'B', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'B', 'E', 'E', 'E', 'B', 'E', 'B', 'E', 'B', 'B', 'E', 'E', 'B', 'E', 'E', 'E', 'E', 'E', 'B', 'E', 'E', 'E', 'E', 'B', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'B', 'E', 'E', 'E', 'E', 'E', 'E', 'B', 'E', 'E', 'E', 'E', 'E', 'E', 'B', 'E', 'E', 'E', 'E', 'B', 'E', 'E', 'B', 'B', 'E', 'E', 'E', 'E', 'E', 'B', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'B', 'B', 'B', 'E', 'E', 'B', 'E', 'E', 'B', 'B', 'E', 'B', 'E', 'E', 'E', 'B', 'E', 'B', 'E', 'E', 'B', 'B', 'E', 'E', 'E', 'E', 'E', 'B', 'E', 'E', 'E', 'E', 'B', 'E', 'E', 'B', 'E', 'E', 'B', 'E', 'E', 'E', 'E', 'E', 'E', 'B', 'E', 'B', 'B', 'E', 'E', 'E', 'E', 'E', 'B', 'B', 'B', 'B', 'E', 'E', 'E', 'E', 'E', 'E', 'B', 'B', 'B', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'B', 'E', 'B', 'E', 'B', 'E', 'E', 'B', 'B', 'E', 'B', 'A', 'B', 'E', 'E', 'E',

In [8]:
#Generate Solution and Export to CSV
with open('amon897.csv', mode='w', newline='') as csv_file:
    fieldnames = headers
    writer = csv.writer(csv_file)


    writer.writerow([headers[0],headers[1]])
    for index in range(len(predictions)):
        writer.writerow([index+1, predictions[index]])