In [139]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix

from nltk import pos_tag
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords, wordnet
from nltk.stem import *

In [180]:
X_train = pd.read_pickle('X_train.pkl')
X_test = pd.read_pickle('X_test.pkl')
y_train = pd.read_pickle('y_train.pkl')

In [181]:
y_train.shape

(36,)

In [182]:
# 
#     Eliminating capital letters and punctuation;
#     Using a stemmer or a lemmatizer;
#     Tokenizing; and
#     Employing CountVectorizer() or TfidfVectorizer()
# 

In [183]:
def clean(doc):
    d = doc.split(' ')
    regex = RegexpTokenizer(r"([a-zA-Z]+(?:’[a-z]+)?)")
    s_stemmer = SnowballStemmer(language = 'english')
    
    d = regex.tokenize(doc)
    d = [word.lower() for word in d]
    d = [s_stemmer.stem(word) for word in d]
    d = ' '.join(d)
    return d

X_train = X_train.map(clean)
X_test = X_test.map(clean)

In [184]:
X_t, X_val, y_t, y_val = train_test_split(X_train, y_train, test_size = 0.20, random_state = 42)

In [185]:
cv = CountVectorizer()
X_t_vec = cv.fit_transform(X_t)
X_t_vec = pd.DataFrame.sparse.from_spmatrix(X_t_vec)
X_t_vec.columns = sorted(cv.vocabulary_)
X_t_vec.set_index(y_t.index, inplace = True)

X_val_vec = cv.transform(X_val)
X_val_vec = pd.DataFrame.sparse.from_spmatrix(X_val_vec)
X_val_vec.columns = sorted(cv.vocabulary_)
X_val_vec.set_index(y_val.index, inplace = True)

mnb = MultinomialNB()
mnb.fit(X_t_vec, y_t)
y_hat = mnb.predict(X_val_vec)
confusion_matrix(y_val, y_hat)

array([[1, 0, 0, 3],
       [0, 0, 0, 2],
       [0, 0, 0, 1],
       [0, 0, 0, 1]], dtype=int64)

In [186]:
tfidf = TfidfVectorizer()
X_t_vec = tfidf.fit_transform(X_t)
X_t_vec = pd.DataFrame.sparse.from_spmatrix(X_t_vec)
X_t_vec.columns = sorted(cv.vocabulary_)
X_t_vec.set_index(y_t.index, inplace = True)

X_val_vec = tfidf.transform(X_val)
X_val_vec = pd.DataFrame.sparse.from_spmatrix(X_val_vec)
X_val_vec.columns = sorted(cv.vocabulary_)
X_val_vec.set_index(y_val.index, inplace = True)

mnb = MultinomialNB()
mnb.fit(X_t_vec, y_t)
y_hat = mnb.predict(X_val_vec)
confusion_matrix(y_val, y_hat)

array([[0, 0, 0, 4],
       [0, 0, 0, 2],
       [0, 0, 0, 1],
       [0, 0, 0, 1]], dtype=int64)

In [187]:
cv = CountVectorizer(min_df = 0.10, max_df = 0.90)
X_t_vec = cv.fit_transform(X_t)
X_t_vec = pd.DataFrame.sparse.from_spmatrix(X_t_vec)
X_t_vec.columns = sorted(cv.vocabulary_)
X_t_vec.set_index(y_t.index, inplace = True)

X_val_vec = cv.transform(X_val)
X_val_vec = pd.DataFrame.sparse.from_spmatrix(X_val_vec)
X_val_vec.columns = sorted(cv.vocabulary_)
X_val_vec.set_index(y_val.index, inplace = True)

mnb = MultinomialNB()
mnb.fit(X_t_vec, y_t)
y_hat = mnb.predict(X_val_vec)
confusion_matrix(y_val, y_hat)

array([[1, 0, 0, 3],
       [2, 0, 0, 0],
       [0, 0, 0, 1],
       [0, 0, 0, 1]], dtype=int64)

In [188]:
tfidf = TfidfVectorizer(min_df = 0.10, max_df = 0.90)
X_t_vec = tfidf.fit_transform(X_t)
X_t_vec = pd.DataFrame.sparse.from_spmatrix(X_t_vec)
X_t_vec.columns = sorted(cv.vocabulary_)
X_t_vec.set_index(y_t.index, inplace = True)

X_val_vec = tfidf.transform(X_val)
X_val_vec = pd.DataFrame.sparse.from_spmatrix(X_val_vec)
X_val_vec.columns = sorted(cv.vocabulary_)
X_val_vec.set_index(y_val.index, inplace = True)

mnb = MultinomialNB()
mnb.fit(X_t_vec, y_t)
y_hat = mnb.predict(X_val_vec)
confusion_matrix(y_val, y_hat)

array([[0, 0, 0, 4],
       [0, 0, 0, 2],
       [0, 0, 0, 1],
       [0, 0, 0, 1]], dtype=int64)

In [189]:
cv = CountVectorizer(min_df = 0.0, max_df = 0.90)
X_t_vec = cv.fit_transform(X_t)
X_t_vec = pd.DataFrame.sparse.from_spmatrix(X_t_vec)
X_t_vec.columns = sorted(cv.vocabulary_)
X_t_vec.set_index(y_t.index, inplace = True)

X_val_vec = cv.transform(X_val)
X_val_vec = pd.DataFrame.sparse.from_spmatrix(X_val_vec)
X_val_vec.columns = sorted(cv.vocabulary_)
X_val_vec.set_index(y_val.index, inplace = True)

mnb = MultinomialNB()
mnb.fit(X_t_vec, y_t)
y_hat = mnb.predict(X_val_vec)
confusion_matrix(y_val, y_hat)

array([[1, 0, 0, 3],
       [2, 0, 0, 0],
       [0, 0, 0, 1],
       [0, 0, 0, 1]], dtype=int64)

In [190]:
from imblearn.over_sampling import SMOTE

In [128]:
X_t_vec.shape

(28, 1858)

In [133]:
y_t.value_counts()

R     14
D     10
DR     2
F      1
W      1
Name: 1, dtype: int64

In [132]:
sm = SMOTE(k_neighbors = 1)

X_sm, y_sm = sm.fit_resample(X_t_vec, y_t)

ValueError: Expected n_neighbors <= n_samples,  but n_samples = 1, n_neighbors = 2

In [193]:
X_test_vec = cv.transform(X_test)
X_test_vec = pd.DataFrame.sparse.from_spmatrix(X_test_vec)
X_test_vec.columns = sorted(cv.vocabulary_)
# X_test_vec.set_index(y_t.index, inplace = True)

In [194]:
preds = mnb.predict(X_test_vec)

In [195]:
preds

array(['D', 'R', 'R', 'D', 'R', 'R', 'R', 'R', 'R', 'D', 'R', 'R', 'R',
       'R', 'D', 'R', 'R', 'R', 'R', 'R', 'R', 'D'], dtype='<U2')

In [207]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoded_array = encoder.fit_transform(preds)
decoded_array = encoder.inverse_transform(encoded_array)
print(decoded_array)

['D' 'R' 'R' 'D' 'R' 'R' 'R' 'R' 'R' 'D' 'R' 'R' 'R' 'R' 'D' 'R' 'R' 'R'
 'R' 'R' 'R' 'D']


In [208]:
pd.DataFrame(decoded_array).to_pickle("andrew_reuben_predictions.pkl")

In [210]:
import pickle
pickle.dump(decoded_array, open("andrew_reuben_predictions.pkl", "w"))

TypeError: write() argument must be str, not bytes

In [218]:
pickle.dump(np.array(preds), open("andrew_reuben_predictions.pkl", "wb"))

In [215]:
import numpy as np