In [775]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV


import nltk
import time
import string

from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import numpy as np

import pandas as pd

In [432]:
collectedData = pd.read_csv("csvFiles/combined-ratings.csv")
artificialData = pd.read_csv("csvFiles/artificialData.csv")

combinedData = pd.concat([collectedData, artificialData])

combinedDataDF = combinedData['comment']
ratingDataDF = combinedData['label']

ratingDataArray = ratingDataDF.values


In [867]:
import re
def preprocess_text(text):
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL', text)
    text = text.lower()
    text = re.sub('r/','', text)
    text = re.sub('\[.*?\]','',text) # the characters in the bracket will be replaced with nothing
    text = re.sub('[%s]' % re.escape(string.punctuation),' ', text) # Punctuation replaced with nothing
    text = re.sub('\w*\d\w*', '', text) # digits replaced with nothing
    return text.strip()
data = [preprocess_text(t) for t in combinedDataDF]

In [868]:
#creating test and train
#x_train, x_test, y_train, y_test = train_test_split(combinedDataDF, ratingDataArray, test_size=0.15, random_state=42)

x_train, x_test, y_train, y_test = train_test_split(data, ratingDataArray, test_size=0.25, random_state=42)

In [963]:
cv_text = CountVectorizer( ngram_range=(1, 2), min_df=2,max_df=.9, max_features=100)
#cv_text = CountVectorizer(tokenizer=LemmaTokenizer(),max_features = 1000,stop_words = 'english')

x_train_text = cv_text.fit_transform(x_train)
x_test_text = cv_text.transform(x_test)

In [964]:
print(len(cv_text.get_feature_names()))

100


In [965]:
x_train_text_df = pd.DataFrame(x_train_text.todense(), columns=[x+'_comment' for x in cv_text.get_feature_names()])

In [966]:
x_test_text_df = pd.DataFrame(x_test_text.todense(), columns=[x+'_text' for x in cv_text.get_feature_names()])

In [967]:
vecced_train_reddit_posts = x_train_text_df
vecced_test_reddit_posts = x_test_text_df

In [968]:
mn_params = {
            'fit_prior': [True],
            'alpha': [0, 0.5, 1]}
        
M = GridSearchCV(MultinomialNB(),
                mn_params,
                cv = 5,
                verbose = 1,
                n_jobs = -1)

In [969]:
M.fit(vecced_train_reddit_posts.values, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:    0.8s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=MultinomialNB(alpha=1.0, class_prior=None,
                                     fit_prior=True),
             iid='warn', n_jobs=-1,
             param_grid={'alpha': [0, 0.5, 1], 'fit_prior': [True]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=1)

In [970]:
predictions = M.predict(vecced_test_reddit_posts.values)

In [971]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
print(f'Train score = {M.score(vecced_train_reddit_posts.values, y_train)}')
print(f'Test score = {M.score(vecced_test_reddit_posts.values, y_test)}')
print(confusion_matrix(y_test, predictions))

Train score = 0.7323943661971831
Test score = 0.6421052631578947
[[  6   0  22   0   1]
 [  0   1  13   0   0]
 [  8   2 110   1   1]
 [  2   0   8   0   0]
 [  1   0   9   0   5]]


In [972]:
print(M.predict(cv_text.transform(["Buy Calls"])))
print(M.predict(cv_text.transform(["Buy Puts"])))
print(M.predict(cv_text.transform(["SPY Puts"])))
print(M.predict(cv_text.transform(["SPY Calls"])))

[5]
[1]
[3]
[5]


In [973]:
import pickle
f = open('MultinomialNB.pickle', 'wb')
pickle.dump(M, f)
f.close()

In [974]:
f = open('CountVectorizer.pickle', 'wb')
pickle.dump(cv_text, f)
f.close()