In [268]:
import pandas as pd

trg = pd.read_csv("H:\\COMPSCI 361\\trg.csv", index_col = 0)

# split each abstract value into a list:
for i in range(len(trg)):
    trg.iloc[i,1] = trg.iloc[i,1].split()
    
training = trg.copy()

In [207]:
def get_features(dataset):
    
    # split the trg dataframe into 4 dataframes for each class
    df_groupby_classes = [v for k, v in trg.groupby('class')] # list of dataframes
    
    # remove useless words:
    remove_more = ['with', 'that', 'from', 'were', 'which', 'that', 'have', 'these', 'been', 'other', 'the', 'this', 'found', 'more', 'three', 'also', 'only', 'open', 'there']
    for X_df in df_groupby_classes: # X_df is a single a dataframe
        for i in range(len(X_df)): # range(len(X_df)) is an int
            for word in X_df.iloc[i,1]: 
                if len(word) < 4 or word in remove_more:
                    X_df.iloc[i,1].pop(X_df.iloc[i,1].index(word))

    # get frequency of words in each classes
    word_count_per_classes = [dict(),dict(),dict(),dict()]
    for n in range(4):
        for i in range(len(df_groupby_classes[n])): # this is the length of the nth class dataframe
            for each in df_groupby_classes[n].iloc[i,1]: # each word in the ith row of the nth dataframe
                if each not in word_count_per_classes[n].keys(): #  key of the nth dictionary
                    word_count_per_classes[n][each] = 1
                else:
                    word_count_per_classes[n][each] +=1
                    
    # get the n most commonly occurring words for each of the classes
    words_class_sorted = [[],[],[],[]]
    for n in range(4):
        words_class_sorted[n] = sorted(word_count_per_classes[n].items(), key=lambda x: x[1], reverse= True)
        
    most_common_words_by_class = [[],[],[],[]]
    for n in range(4):
        end = int(len(words_class_sorted[n])*0.33)
        for i in range(end):
            most_common_words_by_class[n].append(words_class_sorted[n][0:end][i][0])
            
    # use sets to remove common words that are in all four dictionaries and therefore get a list of features that are composed of only unique words from each class type:
    a,b,e,v = set(most_common_words_by_class[0]), set(most_common_words_by_class[1]), set(most_common_words_by_class[2]), set(most_common_words_by_class[3])
    features = list((a|b|e|v) - ((a&b)|(a&e)|(a&v)|(b&e)|(b&v)|(e&v)) - ((a&b&e)|(a&b&v)|(b&e&v)) - (a&b&e&v))
    # = list(((a-(b|e|v))|(b-(a|e|v))|(e-(a|b|v))|(v-(a|b|e)))|((a&b&e)|(a&b&v)|(b&e&v)|(a&e&v)))
    if "class" in features:
        features.remove("class")
        
    return features
    

In [190]:
features = get_features(training)

In [280]:
# cross_validate will eventually return the result
def cross_validate(training, features, folds):
    start = 0
    end = int(len(training)/folds)
    for i in range(folds):
        Y_class = training[start:end].drop('abstract',1)
        Y_train = training[start:end].drop('class',1)
        X_train = training.drop(Y_train.index, inplace=False)
        
        df = X_train.copy()
        for x in features:
            df[x] = 0
        df = df.drop('abstract',1)
        
        for i in range(len(X_train)):
            for word in X_train.iloc[i,1]:
                if word in features:
                    colnumber = features.index(word) + 1
                    df.iloc[i,colnumber] += 1
                    
        class_priors = {'A':0, 'B':0, 'E':0, 'V':0}
        total_class_frequencies = df['class'].value_counts(sort = False)
        for key in class_priors:
            class_priors[key] = (total_class_frequencies[key]+1)/(sum(total_class_frequencies)+4)
        
        sorted_df = [v for k, v in df.groupby('class')]
        sorted_A = sorted_df[0]
        sorted_B = sorted_df[1]
        sorted_E = sorted_df[2]
        sorted_V = sorted_df[3]

        # P(xi|A):
        xgivenA = {}
        total_word_count_xgivenA = len(features)
        for x in features:
            xgivenA[x] = sum(sorted_A[x])+1
            total_word_count_xgivenA += xgivenA[x]

        for x in xgivenA:
            xgivenA[x] = xgivenA[x] / (total_word_count_xgivenA+1)

        # P(xi|B):
        xgivenB = {}
        total_word_count_xgivenB = len(features)
        for x in features:
            xgivenB[x] = sum(sorted_B[x])+1
            total_word_count_xgivenB += xgivenB[x]

        for x in xgivenA:
            xgivenB[x] = xgivenB[x] / (total_word_count_xgivenB+1)

        # P(xi|E):
        xgivenE = {}
        total_word_count_xgivenE = len(features)
        for x in features:
            xgivenE[x] = sum(sorted_E[x])+1
            total_word_count_xgivenE += xgivenE[x]

        for x in xgivenE:
            xgivenE[x] = xgivenE[x] / (total_word_count_xgivenE+1)

        # P(xi|V):
        xgivenV = {}
        total_word_count_xgivenV = len(features)
        for x in features:
            xgivenV[x] = sum(sorted_V[x])+1
            total_word_count_xgivenV += xgivenV[x]

        for x in xgivenV:
            xgivenV[x] = xgivenV[x] / (total_word_count_xgivenV+1)
            
        import math

        # caluclate the probability that example is some class Ck given that we have the set of counts:

        def get_priori(word):
            priori_A = xgivenA[word]
            priori_B = xgivenB[word]
            priori_E = xgivenE[word]
            priori_V = xgivenV[word]
            return priori_A, priori_B, priori_E, priori_V

        def get_multiplicative_total_probability_of_x_given_class(x):
            # x = {x1, ... , xn}
            total_A = class_priors['A']
            total_B = class_priors['B']
            total_E = class_priors['E']
            total_V = class_priors['V']
            for xi in x:
                if xi in features:
                    total_A *= xgivenA[xi]
                    total_B *= xgivenB[xi]
                    total_E *= xgivenE[xi]
                    total_V *= xgivenV[xi]
            return math.log(total_A),math.log(total_B), math.log(total_E), math.log(total_V)

        predicted = []
        for i in range(len(Y_train)):
            classes_list = ['A','B','E','V']
            for x in Y_train.iloc[i,0:]:
                pa,pb,pe,pv = get_multiplicative_total_probability_of_x_given_class(x)
                predicted.append(classes_list[([pa,pb,pe,pv].index(max(pa,pb,pe,pv)))])
                
        # calculate the error of my predictions against the training set
        error = 0 
        for each in list(zip(predicted,Y_class)):
            if each[0] != each[1]:
                error +=1

        print(error)
        print(100 - error/len(Y_class)*100)
        start = end
        end += int(len(training)/folds)

In [None]:
cross_validate(training,features,2)

1
99.95
