p()

In [1]:
import numpy as np

In [2]:
PATH_TO_DATA = "naivebayes-21\\trg.csv"
PATH_TO_TEST = "naivebayes-21\\tst.csv"

In [3]:
from collections import defaultdict

In [4]:
list_data = []
class_freq = defaultdict(lambda: 0)
corpus = []

with open(PATH_TO_DATA) as csv_file:
    for line in csv_file:
        line = line.replace('\"', '')

        line_lst = line.split(',')
        line_lst[-1] = line_lst[-1].replace('\n', '')

        class_freq[line_lst[1]] += 1 

        corpus.append(line_lst[-1])

        list_data.append(line_lst)


full_csv_data = np.array(list_data)
class_freq.pop('class')


labels = class_freq.keys()
headers = full_csv_data[0]
data = np.asarray(full_csv_data[1:,1:])

In [5]:
from numpy.random import default_rng

ABSTRT_I = 1
LBL_I = 0

class_to_int = {lbl:i for i, lbl in enumerate(labels)}
int_to_class = {i:lbl for lbl, i in class_to_int.items()}

In [6]:
 def get_stratified_kfold_splits(data, k=10):
    instances_of_class = dict()

    # Create a dictionary with key being label, and the value being an array of instances of that class. 
    for i, label in enumerate(labels):
        instances_of_class[label] = data[data[:, 0] == label]
    
    stratified_splits = dict()
    
    # Upsample to divisible by k
    for class_label, class_instances in instances_of_class.items():
        n = len(class_instances)
        upsample_amt = k - (n % k)

        random_indices = np.random.choice(class_instances[:, ABSTRT_I], size=upsample_amt, replace=False)
        random_indices.resize((random_indices.shape[0], 2), refcheck=False)

        random_indices[:, -1] = class_label

        random_indices[:,[0, 1]] = random_indices[:,[1, 0]]

        upsampled_class_instances = np.concatenate((class_instances, random_indices), 0)
        assert len(upsampled_class_instances) % k == 0, "num examples should be divisible by k"

        stratified_splits[class_label] = np.split(upsampled_class_instances, k)

    for i in range(k):
        kth_test_lst = [stratified_splits[label][i] for label in labels]
        kth_train_lst = [stratified_splits[label][j] for label in labels for j in range(k) if j != i]

        # Check golden rule preserved
        for label in labels:
            try:
                kth_train_lst.index(stratified_splits[label][i])
                assert False, "GOLDEN RULE BROKEN!" 
            except ValueError:
                pass  
    
        kth_train_data = np.concatenate(kth_train_lst)
        kth_test_data = np.concatenate(kth_test_lst)
        np.random.shuffle(kth_train_data)
        np.random.shuffle(kth_test_data)

        yield kth_train_data, kth_test_data

In [7]:
def word_freq_k_best(train, test = None, k = 500):
    X_train, y_train = get_X_y(train)
    if type(test) != type(None):
        X_test, y_test = get_X_y(test)

    train_words_i = get_word_indexes(X_train)

    frequency_matrix_train = get_word_counts(X_train, train_words_i)
    k_best_i = select_k_best(frequency_matrix_train, corr = False, k=k)
    X_train = get_k_best(frequency_matrix_train, k_best_i)

    if type(test) != type(None):
        frequency_matrix_test = get_word_counts(X_test, train_words_i)
        X_test = get_k_best(frequency_matrix_test, k_best_i)

    if type(test) != type(None):
        return X_train, y_train, X_test, y_test
    else:
        return X_train, y_train        

In [8]:
def get_word_indexes(all_words : set):
    all_words_set = get_set_all_words(all_words)
    return {word:i for i, word in enumerate(all_words_set)}

In [9]:
def get_set_all_words(X):
    all_words = set(word for i in range(len(X)) for word in X[i].split() if word) 

    return all_words

In [10]:
def get_word_counts(abstracts, word_indexes):    

    word_frequencies_matrix = np.ones((len(abstracts), len(word_indexes)))

    for i in range(len(abstracts)):
        for word in abstracts[i].split():
            try:
                word_frequencies_matrix[i, word_indexes[word]] += 1
            except KeyError:
                # word not in training words
                pass

    return word_frequencies_matrix

In [11]:
def corr(arr, i):
    ''' Calcualtes the correlation between one column (class) and the rest for the input matrix. Credit to FBruzzesi (https://stackoverflow.com/users/12411536/fbruzzesi)'''
    mean_t = np.mean(arr, axis=0)
    std_t = np.std(arr, axis=0)

    mean_i = mean_t[i]
    std_i = std_t[i]

    mean_xy = np.mean(arr*arr[:,i][:,None], axis=0)

    corr = (mean_xy - mean_i * mean_t)/(std_i * std_t)
    return corr

In [12]:
def select_k_best(word_frequencies_matrix, corre=False, k=10):
    if corre:
        R = corr(word_frequencies_matrix, -1)
        class_correlations = abs(R)
        k_best_i = np.argpartition(class_correlations, -(k+1))[-(k+1):]
        
        # removes self column
        k_best_i = k_best_i[:-1]

    else:
        class_correlations = word_frequencies_matrix.sum(axis=0)  
        k_best_i = np.argpartition(class_correlations, -k)[-k:]
    
    return k_best_i

In [13]:
def get_k_best(freq_mat, k_best_i):
    X = freq_mat[:, k_best_i]

    return X

In [14]:
def get_X_y(combinedXy):
    return combinedXy[:, ABSTRT_I], combinedXy[:, LBL_I]

In [15]:
## TF-IDF
def abstract_to_dict(abstract):
    '''word:count for words in a particular abstract'''
    abstractDict = defaultdict(lambda: 0)
    for word in abstract.split(' '):
        if word:
            abstractDict[word] += 1 
    return abstractDict 

def termFrequency(abstractDict : dict):
    """(# of repetitions of word in a document) / (# of words in a document)"""
    termFrequencies = {}
    numWords = len(abstractDict)
    
    for word, count in abstractDict.items():
        termFrequencies[word] = count/numWords
    return termFrequencies

def get_inverse_document_frequency(abstract_dict_list : list):
    """ used to calculate the weight of rare words across all documents in the corpus
        idf(w) = log(num_docs/freq_word_all_docs)"""
    idf = defaultdict(lambda: 0)
    numAbstracts = len(abstract_dict_list)

    # calculte number of docs containing word
    for abstractDict in abstract_dict_list:
        for word, count in abstractDict.items():
            if count > 0:
                idf[word] += 1 

    from math import log10
    for word, num_docs_containing_word in idf.items():
        idf[word] = np.log(numAbstracts+1/num_docs_containing_word)+1

    return idf

def get_row_tfidf(tf, idf, word_indexes):
    ''' num occurrences of word i in doc j * log(total docs / number of documents containing i) '''
    row = np.zeros((1, len(word_indexes)))
    for word, numOccurences in tf.items():
        word_index = word_indexes[word]
        row[0, word_index] = numOccurences*idf[word] 
    return row

In [16]:
def calculate_tfidf(X):
    abstract_dict_list = []
    term_frequency_lst = []
    all_words = get_set_all_words(X)

    new_X = np.zeros((len(X), len(all_words)))
    word_indexes = {word: i for i, word in enumerate(all_words)}

    for abstract in X:
        abstract_dict = abstract_to_dict(abstract)
        abstract_dict_list.append(abstract_dict)
        term_frequency_lst.append(termFrequency(abstract_dict))

    print("converted abstracts, calculating idf")    
    idf = get_inverse_document_frequency(abstract_dict_list)

    print("calculating tfidf")
    for i,tf in enumerate(term_frequency_lst):
        new_X[i] = get_row_tfidf(tf, idf, word_indexes)

    return new_X

In [17]:

class NaiveBayes():

    def __init__(self, alpha=1):
        self.alpha = alpha

    def fit(self, X_train: np.array, y_train: np.array):
        self.num_classes = len(np.unique(y_train))
        self.num_instances, self.num_features = X_train.shape

        self.classes_to_int = {label:i for i, label in enumerate(np.unique(y_train))}
        self.int_to_classes = {i:label for label, i in self.classes_to_int.items()}

        y_train = np.asarray([self.classes_to_int[label] for label in y_train])

        # initalises log cond probability array
        self.log_cond_by_class = np.zeros((self.num_classes, self.num_features))

        # initalises total_word_count_by_class array
        self.total_word_count_by_class = np.zeros((self.num_classes, 1))

        # initialises num examples by class
        self.num_examples_in_class = np.zeros((self.num_classes, 1))

        for c in range(self.num_classes):
            # splits X into a list of arrays containing instances of a particular class
            mask = (y_train == c)
            instances_from_class = X_train[mask,:]

            word_freq_for_class = np.sum(instances_from_class, axis=0) + self.alpha
            assert 0 not in word_freq_for_class, 'word_freq_should all be > 0'

            self.total_word_count_by_class[c] = np.sum(word_freq_for_class) 
            assert 0 not in self.total_word_count_by_class[c], 'total_word_count must all be > 0'

            self.log_cond_by_class[c, :] = np.log(word_freq_for_class / self.total_word_count_by_class[c])

            self.num_examples_in_class[c] = instances_from_class.shape[0]

        total_word_count = np.sum(self.total_word_count_by_class)

        self.prior_by_class = np.log(self.num_examples_in_class / self.num_instances)

    
    def predict(self, X_test):
        num_instances = len(X_test)
        y = np.zeros(num_instances)

        for i in range(num_instances):
            p_by_class = np.copy(self.prior_by_class)

            for c in range(self.num_classes):
                for word_i in range(X_test.shape[1]):
                    log_cond_prob = self.log_cond_by_class[c][word_i]

                    freq = X_test[i,word_i]
                    p_by_class[c] += log_cond_prob * freq
        
            y[i] = np.argmax(p_by_class, axis = 0)[0]
        return np.asarray([self.int_to_classes[c] for c in y])

In [18]:
class NaiveBayesGaussian():

    def __init__(self, alpha=1):
        self.alpha = alpha

    def fit(self, X_train: np.array, y_train: np.array):
        self.num_classes = len(np.unique(y_train))
        self.num_instances, self.num_features = X_train.shape

        self.classes_to_int = {label:i for i, label in enumerate(np.unique(y_train))}
        self.int_to_classes = {i:label for label, i in self.classes_to_int.items()}

        y_train = np.asarray([self.classes_to_int[label] for label in y_train])

        # initalises log cond probability array
        self.log_cond_by_class = np.zeros((self.num_classes, self.num_features))

        # initalises total_word_count_by_class array
        self.total_word_count_by_class = np.zeros((self.num_classes, 1))

        # initialises num examples by class
        self.num_examples_in_class = np.zeros((self.num_classes, 1))

        for c in range(self.num_classes):
            # splits X into a list of arrays containing instances of a particular class
            instances_from_class = X_train[y_train == c]

            word_freq_for_class = np.sum(instances_from_class, axis=0) + self.alpha
            assert 0 not in word_freq_for_class, 'word_freq_should all be > 0'

            self.total_word_count_by_class[c] = np.sum(word_freq_for_class) 
            assert 0 not in self.total_word_count_by_class[c], 'total_word_count must all be > 0'

            self.log_cond_by_class[c, :] = np.log(word_freq_for_class / self.total_word_count_by_class[c])

            self.num_examples_in_class[c] = instances_from_class.shape[0]

        total_word_count = np.sum(self.total_word_count_by_class)

        self.prior_by_class = np.log(self.num_examples_in_class / self.num_instances)

    
    def predict(self, X_test):
        num_instances = len(X_test)
        y = np.zeros(num_instances)

        for i in range(num_instances):
            p_by_class = np.copy(self.prior_by_class)

            for c in range(self.num_classes):
                for word_i in range(X_test.shape[1]):
                    log_cond_prob = self.log_cond_by_class[c][word_i]

                    freq = X_test[i,word_i]
                    p_by_class[c] += log_cond_prob * freq
        
            y[i] = np.argmax(p_by_class, axis = 0)[0]
        return np.asarray([self.int_to_classes[c] for c in y])

In [19]:
def fit_and_test_classifier(X_train, y_train, X_test, y_test):
    clf = NaiveBayes(alpha = 1)
    clf.fit(X_train, y_train)

    predict_y = clf.predict(X_test)

    accuracy = np.count_nonzero(y_test[predict_y == y_test])/len(y_test)

    return accuracy 

In [20]:
def tfidf_get_X_y(train, test, k=None):
    X_train = train[:, ABSTRT_I]
    y_train = train[:, LBL_I]

    X_test = test[:, ABSTRT_I]
    y_test = test[:, LBL_I]

    X_train = calculate_tfidf(X_train)
    X_test = calculate_tfidf(X_test) 
    
    return X_train, y_train, X_test, y_test

In [21]:
def cv(k = 500, tfidf = False):
    stratified_data = get_stratified_kfold_splits(data)
    results = 0
    count = 0
    while True:
        try:
            train, test = next(stratified_data)
            print('-'*10, "Run {}".format(count+1), '-'*10)
            print("Calculating Word Frequencies")
            feature_fn = tfidf_get_X_y if tfidf else word_freq_k_best 
            X_train, y_train, X_test, y_test = feature_fn(train, test, k=k)

            print("Fitting and Testing.")
            accuracy = fit_and_test_classifier(X_train, y_train, X_test, y_test)
            results += accuracy
            print("Fold Accuracy: ", accuracy)
            count += 1
        except StopIteration:
            break
    
    print('-'*10, "Complete", "-"*10)
    print("Classifier Accuracy: ", results/count)

In [22]:
cv(1500, tfidf = True)

---------- Run 1 ----------
Calculating Word Frequencies
converted abstracts, calculating idf
calculating tfidf
converted abstracts, calculating idf
calculating tfidf
Fitting and Testing.


ValueError: not enough values to unpack (expected 2, got 1)

In [68]:
cv(1500, tfidf = False)

---------- Run 1 ----------
Calculating Word Frequencies
Fitting and Testing.
Fold Accuracy:  0.8258706467661692
---------- Run 2 ----------
Calculating Word Frequencies
Fitting and Testing.
Fold Accuracy:  0.7736318407960199
---------- Run 3 ----------
Calculating Word Frequencies
Fitting and Testing.
Fold Accuracy:  0.8034825870646766
---------- Run 4 ----------
Calculating Word Frequencies
Fitting and Testing.
Fold Accuracy:  0.7985074626865671
---------- Run 5 ----------
Calculating Word Frequencies
Fitting and Testing.
Fold Accuracy:  0.7711442786069652
---------- Run 6 ----------
Calculating Word Frequencies
Fitting and Testing.
Fold Accuracy:  0.8706467661691543
---------- Run 7 ----------
Calculating Word Frequencies
Fitting and Testing.
Fold Accuracy:  0.8109452736318408
---------- Run 8 ----------
Calculating Word Frequencies
Fitting and Testing.
Fold Accuracy:  0.8308457711442786
---------- Run 9 ----------
Calculating Word Frequencies
Fitting and Testing.
Fold Accuracy:  0.

KeyboardInterrupt: 

In [94]:
def get_tst_data():
    with open(PATH_TO_TEST) as csv_file:
        list_data = []
        for line in csv_file:
            line = line.replace('\"', '')

            line_lst = line.split(',')
            line_lst[-1] = line_lst[-1].replace('\n', '')

            corpus.append(line_lst[-1])

            list_data.append(line_lst)

        full_csv_data = np.array(list_data)
        X_kaggle_test = full_csv_data[1:,1:]

    return X_kaggle_test

In [95]:
def get_kaggle_predictions():
    clf = NaiveBayes(alpha = 1)

    X_train, y_train = word_freq_k_best(data, None, 2000)
    clf.fit(X_train, y_train)
    
    X_kaggle_test = get_tst_data()

    predict_y = clf.predict(X_kaggle_test)

    np.savetxt('predictions.csv', predict_y, delimiter=',')

In [96]:
get_kaggle_predictions()

UFuncTypeError: ufunc 'multiply' did not contain a loop with signature matching types (dtype('<U837'), dtype('<U837')) -> dtype('<U837')