# Naive Bayes

In [None]:
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from naive_bayes import MultinomialNaiveBayes, PoissonNaiveBayes
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [None]:
def split(X, y, ratio=0.2):
    """Split samples and labels into a training part and a held-out part.

    Args:
        X: Samples to split.
        y: Labels aligned with ``X``.
        ratio: Forwarded to ``train_test_split`` as ``test_size``.

    Returns:
        A tuple ``(train_data, test_data, train_labels, test_labels)``.
    """
    # Imported here because the notebook's import cell never brings
    # ``train_test_split`` into scope; without this the call below raises
    # NameError.
    from sklearn.model_selection import train_test_split

    # The fixed random_state makes the split reproducible across runs.
    train_data, test_data, train_labels, test_labels = train_test_split(
        X, y, test_size=ratio, random_state=105)
    return train_data, test_data, train_labels, test_labels

In [None]:
def train(classifier, X, y, use_tfidf=True):
    """Wrap ``classifier`` in a text pipeline and fit it on ``X`` and ``y``.

    The pipeline always starts with a ``CountVectorizer`` configured for
    word-level 1- to 5-grams with the built-in English stop-word list, and
    always ends with ``classifier``. A ``TfidfTransformer`` step is placed
    between the two when ``use_tfidf`` is truthy.

    Args:
        classifier: Estimator used as the final pipeline step.
        X: Training documents passed to ``Pipeline.fit``.
        y: Training labels passed to ``Pipeline.fit``.
        use_tfidf: Whether to insert the TF-IDF step.

    Returns:
        The result of ``Pipeline.fit(X, y)``.
    """
    steps = [
        ("vectorizer", CountVectorizer(analyzer="word",
                                       stop_words="english",
                                       ngram_range=(1, 5))),
    ]
    if use_tfidf:
        steps.append(("transformer", TfidfTransformer()))
    steps.append(("classifier", classifier))

    return Pipeline(steps).fit(X, y)

In [None]:
def score(pred, y):
    """Print evaluation metrics and return the confusion matrix.

    Prints the fraction of positions where ``pred`` equals ``y``, followed
    by ``metrics.classification_report(y, pred)``.

    Args:
        pred: Predicted labels.
        y: True labels.

    Returns:
        The result of ``metrics.confusion_matrix(y, pred)``.
    """
    accuracy = np.mean(pred == y)
    print("accuracy: {}".format(accuracy))

    report = metrics.classification_report(y, pred)
    print(report)

    return metrics.confusion_matrix(y, pred)

In [None]:
def plot(matrix, title):
    """Draw ``matrix`` as an annotated heatmap, save it, and display it.

    Rows and columns are labelled with a fixed list of eight category
    names, so ``matrix`` must be 8x8 for the DataFrame construction to
    succeed.

    Args:
        matrix: Confusion matrix values to display.
        title: Passed to ``plt.savefig`` as the output file name.
    """
    labels = [
        "hockey", "movies", "nba", "news",
        "nfl", "politics", "soccer", "worldnews",
    ]
    frame = pd.DataFrame(matrix, index=labels, columns=labels)

    plt.figure(figsize=(10, 7))
    # fmt="g" selects general number formatting for the cell annotations.
    sns.heatmap(frame, annot=True, fmt="g")
    # Save before show() so the figure is written while it is still current.
    plt.savefig(title)
    plt.show()

## Split Data

In [None]:
# Load the "conversation" column of the training input and the "category"
# column of the training output; these become the samples and labels.
X = pd.read_csv("../data/train_input.csv")["conversation"]
y = pd.read_csv("../data/train_output.csv")["category"]
# Test-set conversations; not referenced by the other cells shown here.
X_test = pd.read_csv("../data/test_input.csv")["conversation"]

In [None]:
# Hold out part of the labelled data for evaluation, using split()'s
# default ratio of 0.2.
train_X, test_X, train_y, test_y = split(X, y)

## Results With TF-IDF

### Multinomial Naive Bayes

In [None]:
# Fit a MultinomialNaiveBayes pipeline with the TF-IDF step enabled.
# `mnb` is rebound to whatever train() returns.
mnb = MultinomialNaiveBayes()
mnb = train(mnb, train_X, train_y, use_tfidf=True)

In [None]:
# Predict labels for the held-out conversations.
mnb_prediction = mnb.predict(test_X)

In [None]:
# Print the metrics, then plot the confusion matrix and save the figure
# under the given name.
mnb_confusion_matrix = score(mnb_prediction, test_y)
plot(mnb_confusion_matrix, "MultinomialNaiveBayesTFIDF")

### Poisson Naive Bayes

In [None]:
# Fit a PoissonNaiveBayes pipeline with the TF-IDF step enabled.
# `pnb` is rebound to whatever train() returns.
pnb = PoissonNaiveBayes()
pnb = train(pnb, train_X, train_y, use_tfidf=True)

In [None]:
# Predict labels for the held-out conversations.
pnb_prediction = pnb.predict(test_X)

In [None]:
# Print the metrics, then plot the confusion matrix and save the figure
# under the given name.
pnb_confusion_matrix = score(pnb_prediction, test_y)
plot(pnb_confusion_matrix, "PoissonNaiveBayesTFIDF")

## Results Without TF-IDF

### Multinomial Naive Bayes

In [None]:
# Refit MultinomialNaiveBayes on raw counts (no TF-IDF step); this
# replaces the earlier `mnb` binding.
mnb = MultinomialNaiveBayes()
mnb = train(mnb, train_X, train_y, use_tfidf=False)

In [None]:
# Predict labels for the held-out conversations with the count-only model.
mnb_prediction = mnb.predict(test_X)

In [None]:
# Print the metrics, then plot the confusion matrix and save the figure
# under a name distinct from the TF-IDF run.
mnb_confusion_matrix = score(mnb_prediction, test_y)
plot(mnb_confusion_matrix, "MultinomialNaiveBayesNoTFIDF")

### Poisson Naive Bayes

In [None]:
# Refit PoissonNaiveBayes on raw counts (no TF-IDF step); this replaces
# the earlier `pnb` binding.
pnb = PoissonNaiveBayes()
pnb = train(pnb, train_X, train_y, use_tfidf=False)

In [None]:
# Predict labels for the held-out conversations with the count-only model.
pnb_prediction = pnb.predict(test_X)

In [None]:
# Print the metrics, then plot the confusion matrix and save the figure
# under a name distinct from the TF-IDF run.
pnb_confusion_matrix = score(pnb_prediction, test_y)
plot(pnb_confusion_matrix, "PoissonNaiveBayesNoTFIDF")