In [34]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import csv
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from scipy.special import logsumexp
# from sklearn.feature_selection import SelectKBest, chi2
from nltk.corpus import stopwords
import pickle
import os

In [5]:
# Read the csv file
training_df = pd.read_csv('resources/training.csv')

In [55]:
words = pd.read_csv('resources/vocabulary.txt', delimiter='\t', header=None)

In [19]:
scaler = MinMaxScaler()
x = training_df.iloc[:, 1:-1].values # doc vector data
y = training_df.iloc[:, -1].values # labels

x = scaler.fit_transform(x)



x_train, x_validation, y_train, y_validation = train_test_split(x, y, test_size=0.2)

In [20]:
class NaiveBayesClassifier:

    def __init__(self, alpha=1):
        self.alpha = alpha
        self.log_prior = None
        self.log_likelihood = None

    def fit(self, X, y):
        # Estimate P(Y) using MLE
        self.classes, class_counts = np.unique(y, return_counts=True)
        self.log_prior = np.log(class_counts / y.shape[0])
        # Estimate P(X|Y) using MAP with Dirichlet prior
        V = X.shape[1]
        self.log_likelihood = np.zeros((len(self.classes), V))
        for i, c in enumerate(self.classes):
            X_c = X[y == c]
            # Add alpha to each count for smoothing
            word_counts = abs(X_c.sum(axis=0) + self.alpha)
            total_counts = abs(word_counts.sum() + 1e-9)
            self.log_likelihood[i, :] = np.log(word_counts / total_counts)

    def predict(self, Xnew):
        # Calculate log-probabilities for each class
        log_probs = np.zeros(len(self.classes))
        for i, c in enumerate(self.classes):
            log_probs[i] = self.log_prior[i] + \
                (self.log_likelihood[i, :] * Xnew).sum()

        # Return the class with the highest log-probability
        return self.classes[np.argmax(log_probs)]

    def find_accuracy(original_df, pred_df):
        num_correct = 0
        index = 0
        for i, doc in pred_df.iterrows():
            if (doc['pred'] == original_df.iloc[index, -1]):
                num_correct += 1
            index += 1

        return num_correct/len(original_df)


In [None]:
if os.path.isfile('resources/naive_bayes_model.pkl'):
  # Load the saved model from the file
  with open('resources/naive_bayes_model.pkl', 'rb') as file:
    nb = pickle.load(file)

else:
  nb = NaiveBayesClassifier()
  nb.fit(x_train, y_train)

  # Save the trained model to a file
  with open('resources/naive_bayes_model.pkl', 'wb') as file:
    pickle.dump(nb, file)

In [21]:
nb = NaiveBayesClassifier()
nb.fit(x_train, y_train)

In [22]:
validation_preds = np.array([nb.predict(x) for x in x_validation])
accuracy = np.mean(validation_preds == y_validation)
print(accuracy)

0.8158333333333333


In [11]:
testing_df = pd.read_csv('resources/testing.csv')

In [23]:
x_test = testing_df.iloc[:, 1:].values
x_test = scaler.fit_transform(x_test)

In [24]:
test_preds = np.array([int(nb.predict(x)) for x in x_test]).tolist()
ids = list(testing_df.iloc[:, :1].values)

if int(ids[0][0]) != 12001:
    ids.insert(0, np.array([12001]))
    test_preds.insert(0, 9)

In [25]:
with open('output.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['id', 'class'])

    for id, pred in zip(ids, test_preds):
        writer.writerow([id[0], pred])

#### Ranking words

In [49]:
def mutual_information(x, y, words, stopwords, penalty=0.5):
    n_samples, n_features = x.shape
    n_classes = len(np.unique(y))

    # Compute the frequency of each word in each class
    word_freq = np.zeros((n_features, n_classes))
    for i in range(n_classes):
        X_class = x[y == i, :]
        word_freq[:, i] = np.sum(X_class, axis=0)

    # Compute the total number of documents in each class and the total number of documents
    class_freq = np.sum(word_freq, axis=0)
    total_freq = np.sum(class_freq)

    # Compute the MI between each word and each class
    mi_scores = np.zeros(n_features)
    for j in range(n_features):
        n_j = np.sum(x[:, j] > 0)
        for i in range(n_classes):
            n_ij = word_freq[j, i]
            if n_ij == 0:
                continue
            p_ij = n_ij / n_samples
            p_i = class_freq[i] / n_samples
            p_j = n_j / n_samples
            mi_scores[j] += p_ij * np.log2(p_ij / (p_i * p_j))

        # Penalize stopwords in the MI score
        if words[j] in stopwords:
            mi_scores[j] -= penalty

    # Normalize the MI scores to have a range of [0, 1]
    mi_scores_norm = (mi_scores - np.min(mi_scores)) / \
        (np.max(mi_scores) - np.min(mi_scores))

    return mi_scores_norm


In [63]:
stop_words = set(stopwords.words('english'))
word_values = words.iloc[:, 0].values
mi = mutual_information(x_train, y_train, word_values, stop_words, penalty=0.8)

In [64]:
# print table of 10 most informative and 10 least informative words
print('Most Informative Words')
print(words.iloc[np.argsort(mi)[-10:]])
print('Least Informative Words')
print(words.iloc[np.argsort(mi)[:10]])

# write 100 most informative words to file
with open('resources/most_informative_words.txt', 'w') as file:
    for word in words.iloc[np.argsort(mi)[-100:]].values:
        file.write(word[0] + '\n')

Most Informative Words
                   0
27987         layton
27990      bloodshot
27992        bwsmith
22500         guykuo
28059  polypropylene
27897           maxx
28063        prothan
27804    shatterstar
26798           khoh
56592            jth
Least Informative Words
        0
921  have
29     in
32     to
41     it
28    the
977    am
51   with
143    be
437  only
59     is
MI Scores of Stopwords
of 0.06192881235371666
from 0.10182668153081469
and 0.09482550541296586
other 0.11191653664509267
are 0.1077173114351087
the 0.03769413396773495
in 0.0011474582383969984
to 0.029168881650963485
it 0.037320953655183335
on 0.08786818466986307
their 0.13880554679644946
but 0.06410081985744231
with 0.0546548631953915
is 0.06138748682033983
can 0.10120777093539329
for 0.07869139565389022
who 0.14420638259912863
so 0.09665312338971446
such 0.16229772008042267
by 0.16929777899011295
itself 0.17761658538062886
or 0.13059632422789824
below 0.1998684957315541
an 0.08724368290381708
which 0.11

-------------------------------------------------

In [None]:
# classes = np.unique(validation_df.iloc[:, -1].values, return_counts=False)
# cm = confusion_matrix(validation_pred_df['pred'].values, validation_df.iloc[:, -1].values, labels=classes)

In [None]:
# print(cm)
# sns.heatmap(cm, annot=True, cmap='Blues')
# plt.xlabel('Predicted label')
# plt.ylabel('True label')
# plt.show()