In [20]:
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
color = sns.color_palette()

import nltk
from textblob import TextBlob
from scipy import sparse

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Paths of the two training files, resolved relative to the notebook's working directory.
train_text = 'training_text'
train_var = 'training_variants'

# The variants file is parsed with pandas' default (comma) separator.
dfv = pd.read_csv(train_var)
# The text file separates its two fields with a literal "||". A multi-character
# separator is interpreted as a regular expression, so the pipes are escaped and
# the pattern is written as a raw string ('\|' is an invalid escape sequence in a
# normal string literal and triggers a warning on recent Python versions).
# Regex separators require the python parsing engine. The first line of the file
# is skipped and the column names are supplied explicitly.
dft = pd.read_csv(train_text, sep=r'\|\|', engine='python', skiprows=1, names=['ID', 'Text'])

In [4]:
# Number of rows per target class, most frequent first.
dfv['Class'].value_counts()

7    953
4    686
1    568
2    452
6    275
5    242
3     89
9     37
8     19
Name: Class, dtype: int64

In [5]:
# (rows, columns) of the text table loaded from the training text file.
dft.shape

(3321, 2)

In [27]:
# Attach each text row to its variant row; only IDs present in both tables are kept.
df = dfv.merge(dft, on='ID', how='inner')

In [34]:
# Persist the first ten merged rows as a small pickled sample.
df_small = df.iloc[:10]
df_small.to_pickle('../Project-three-mcnulty/sample.pkl')

In [35]:
# Targets and features both keep the ID column so that predictions can be
# joined back to their rows later on.
y = df[['ID', 'Class']]
x = df[['ID', 'Text']]

# Hold out 25% of the rows for testing.
# - random_state pins the shuffle so every downstream accuracy is reproducible
#   under Restart & Run All.
# - stratify keeps the class proportions equal in both partitions; the classes
#   are heavily imbalanced (the smallest has 19 rows, the largest 953), so an
#   unstratified split can leave rare classes almost absent from one side.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42, stratify=y['Class']
)

In [36]:
# Training documents as a plain Python list, in x_train row order.
text = x_train['Text'].tolist()

In [37]:
# Training labels as a NumPy array, in the same row order as the training texts.
classes = np.array(y_train['Class'])

# TF-IDF over unigrams, bigrams and trigrams with English stop words removed.
vectorizer = TfidfVectorizer(ngram_range=(1, 3), stop_words='english')

# Learn the vocabulary/IDF weights from the training texts only, then project
# the held-out texts into that same feature space.
doc_vectors = vectorizer.fit_transform(text)
test_vector = vectorizer.transform(x_test['Text'])

In [38]:
# Multinomial Naive Bayes
model = MultinomialNB()
model.fit(doc_vectors, classes)
predictions = model.predict(test_vector)

# Re-join the held-out texts with their true classes. The merge produces a fresh
# 0..n-1 index, which is what lets the positional prediction Series line up
# row-for-row in the concat below.
testing_NB = x_test.merge(y_test, on='ID').dropna(how='all')
predicted_NB = pd.Series(predictions)
predict_NB = pd.concat([testing_NB, predicted_NB], axis=1)
predict_NB.columns = ['ID', 'Text', 'Class', 'Predicted']

# Accuracy = share of rows whose predicted class equals the true class.
hits_NB = predict_NB[predict_NB['Class'] == predict_NB['Predicted']]
accuracy_NB = len(hits_NB) / len(predict_NB)


In [None]:
# Support Vector Classifier
model = LinearSVC()
model.fit(doc_vectors, classes)
predictions = model.predict(test_vector)

# Held-out texts joined with their true classes; the merge yields a fresh
# 0..n-1 index so the prediction Series aligns positionally in the concat.
testing_SVC = x_test.merge(y_test, on='ID').dropna(how='all')
predicted_SVC = pd.Series(predictions)
predict_SVC = pd.concat([testing_SVC, predicted_SVC], axis=1)
predict_SVC.columns = ['ID', 'Text', 'Class', 'Predicted']

# Accuracy = share of rows whose predicted class equals the true class.
hits_SVC = predict_SVC[predict_SVC['Class'] == predict_SVC['Predicted']]
accuracy_SVC = len(hits_SVC) / len(predict_SVC)


In [None]:
# Support Vector Classifier, return probability
model = SVC(kernel='linear', probability=True).fit(doc_vectors, classes)
# One row per test document, one column per class, ordered like model.classes_.
predictions = model.predict_proba(test_vector)

# Held-out texts joined with their true classes, on a clean 0..n-1 index so the
# frames below can be concatenated row-for-row.
testing_SVC1 = (
    pd.merge(x_test, y_test, on='ID')
    .dropna(how='all')
    .reset_index(drop=True)
)

# Name the probability columns after the classes the model actually learned,
# rather than a hard-coded 1..9 list that silently mislabels columns if a class
# is missing from the training partition.
SVC1predictions = pd.DataFrame(predictions, columns=[int(label) for label in model.classes_])

# Most probable class per row. NOTE: this is the argmax of the calibrated
# probabilities; scikit-learn documents that it can occasionally differ from
# model.predict() when probability=True.
predicted_SVC1 = pd.Series(model.classes_[predictions.argmax(axis=1)], name='Predicted')

# Resulting columns: ID, Text, Class, Predicted, then one probability column per
# class. The previous version merged on a reset 'index' column and then assigned
# 13 names starting at 'ID', which shifted every label by one (the row index was
# labelled ID, ID labelled Text, ... and no real Predicted column existed).
predict_SVC1 = pd.concat([testing_SVC1, predicted_SVC1, SVC1predictions], axis=1)


In [None]:
# Random Forest Classifier
# random_state pins the bootstrap samples and per-split feature draws, so the
# accuracy (and the feature importances read from this model afterwards) are
# reproducible across runs.
model = RandomForestClassifier(n_estimators=20, random_state=42).fit(doc_vectors, classes)
predictions = model.predict(test_vector)

# Held-out texts joined with their true classes; the merge yields a fresh
# 0..n-1 index so the prediction Series aligns positionally in the concat.
testing_RF = pd.merge(x_test, y_test, on='ID').dropna(how='all')
predict_RF = pd.concat([testing_RF, pd.Series(predictions)], axis=1)
predict_RF.columns = ['ID', 'Text', 'Class', 'Predicted']

# Accuracy = share of rows whose predicted class equals the true class.
# Kept as plain-Python division so the value stays a built-in float.
accuracy_RF = len(predict_RF[predict_RF.Class == predict_RF.Predicted]) / len(predict_RF)


In [None]:
# NOTE(review): `model` here is whatever the most recently executed model cell
# assigned; this cell only makes sense when that is the random forest.

# decision_path returns a tuple; its first element is a sparse matrix that is
# densified here into one DataFrame row per test document.
decisionpath = model.decision_path(test_vector)
node_indicator = decisionpath[0]
decisions = pd.DataFrame(node_indicator.toarray())

# One importance value per TF-IDF feature, as a single-column frame.
feature_importances = pd.DataFrame(model.feature_importances_)

In [None]:
# Check some out-of-the-box accuracies
# Each value is wrapped in a 1-tuple so %-formatting always treats it as a
# single argument; %r prints the full repr of the accuracy.
accuracy_report = [
    'Naive Bayes Accuracy: %r' % (accuracy_NB,),
    'Support Vector Accuracy: %r' % (accuracy_SVC,),
    'Random Forest Accuracy: %r' % (accuracy_RF,),
]
for report_line in accuracy_report:
    print(report_line)