In [421]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV


import nltk
import time
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import numpy as np

import pandas as pd

In [432]:
collectedData = pd.read_csv("csvFiles/combined-ratings.csv")
artificialData = pd.read_csv("csvFiles/artificialData.csv")

combinedData = pd.concat([collectedData, artificialData])

combinedDataDF = combinedData['comment']
ratingDataDF = combinedData['label']

ratingDataArray = ratingDataDF.values


In [539]:
import re
def preprocess_text(text):
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL', text)
    text = text.lower()
    return text.strip()
data = [preprocess_text(t) for t in combinedDataDF]

In [751]:
#creating test and train
#x_train, x_test, y_train, y_test = train_test_split(combinedDataDF, ratingDataArray, test_size=0.15, random_state=42)

x_train, x_test, y_train, y_test = train_test_split(data, ratingDataArray, test_size=0.25, random_state=42)

In [650]:
# Import NLP tool kit
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
# Inspired by Sklearn Documentation
# Lemmatize the data through a class
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]



In [702]:
cv_text = CountVectorizer(stop_words='english', ngram_range=(1, 2),lowercase=True, min_df=.02,max_df=.50, max_features=100)
#cv_text = CountVectorizer(tokenizer=LemmaTokenizer(),max_features = 1000,stop_words = 'english')

x_train_text = cv_text.fit_transform(x_train)
x_test_text = cv_text.transform(x_test)

  'stop_words.' % sorted(inconsistent))


In [703]:
print(len(cv_text.get_feature_names()))

99


In [704]:
x_train_text_df = pd.DataFrame(x_train_text.todense(), columns=[x+'_comment' for x in cv_text.get_feature_names()])

In [705]:
x_test_text_df = pd.DataFrame(x_test_text.todense(), columns=[x+'_text' for x in cv_text.get_feature_names()])

In [706]:
vecced_train_reddit_posts = x_train_text_df
vecced_test_reddit_posts = x_test_text_df

In [707]:
 mn_params = {
            'fit_prior': [True],
            'alpha': [0, 0.5, 1]}
        
M = GridSearchCV(MultinomialNB(),
                mn_params,
                cv = 5,
                verbose = 1,
                n_jobs = -1)

In [708]:
M.fit(vecced_train_reddit_posts.values, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:    0.8s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=MultinomialNB(alpha=1.0, class_prior=None,
                                     fit_prior=True),
             iid='warn', n_jobs=-1,
             param_grid={'alpha': [0, 0.5, 1], 'fit_prior': [True]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=1)

In [709]:
predictions = M.predict(vecced_test_reddit_posts.values)

In [710]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
print(f'Train score = {M.score(vecced_train_reddit_posts.values, y_train)}')
print(f'Test score = {M.score(vecced_test_reddit_posts.values, y_test)}')
print(confusion_matrix(y_test, predictions))

Train score = 0.6801242236024845
Test score = 0.5789473684210527
[[ 5  0 12  0  0]
 [ 1  0  8  0  1]
 [ 5  4 59  1  2]
 [ 0  1  5  0  2]
 [ 0  0  6  0  2]]


In [711]:
print(M.predict(cv_text.transform(["Buy Calls"])))
print(M.predict(cv_text.transform(["Buy Puts"])))
print(M.predict(cv_text.transform(["SPY Puts"])))
print(M.predict(cv_text.transform(["SPY Calls"])))

[3]
[3]
[3]
[3]


In [752]:
from sklearn.linear_model            import LogisticRegression

# Create a pipeline
pipe_1 = Pipeline([
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression())
])

In [753]:
# Find the parameters, gridsearch them, use the best parameters to generate a score
pipe_params_1 = {
    'lr__penalty': ['l1','l2'],
    'cvec__max_features': [100,300, 500],
    'cvec__min_df': [2,3],
    'cvec__max_df': [.5,.9],
    'cvec__ngram_range': [(1,1),(1,2),(3,3)]
}

# Gridsearch to find the best parameters and fit to training data
gs_1 = GridSearchCV(pipe_1, param_grid=pipe_params_1,
                  cv=3, 
                  verbose = 1,
                  n_jobs=2)
gs_1.fit(x_train, y_train)
best_1 = gs_1.best_estimator_
best_1.fit(x_train,y_train)
y_test_preds_1 = best_1.predict(x_test)

Fitting 3 folds for each of 72 candidates, totalling 216 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done 202 tasks      | elapsed:    3.7s
[Parallel(n_jobs=2)]: Done 216 out of 216 | elapsed:    3.9s finished


In [754]:
# Develop a score and print
y_train_preds_1 = best_1.predict(x_train)
print(accuracy_score(y_train, y_train_preds_1))
print(accuracy_score(y_test,y_test_preds_1))
dfparams = pd.DataFrame(gs_1.best_params_)
dfparams = dfparams.drop(index = 0).T
dfparams = dfparams.rename(index=str, columns={1: "Best Params"})
dfparams

0.9083850931677019
0.6403508771929824


Unnamed: 0,Best Params
cvec__max_df,0.5
cvec__max_features,500
cvec__min_df,3
cvec__ngram_range,1
lr__penalty,l2


In [755]:
from sklearn.pipeline                import Pipeline
# Code Inspired by Siraj Raval
pipe_3 = Pipeline([
    ('cvec', CountVectorizer()),
    ('mnb', MultinomialNB())
])

In [756]:
# Find the parameters, gridsearch them, use the best features to generate a score
pipe_params_3 = {
    'cvec__max_features': [100,500],
    'cvec__min_df': [2,3],
    'cvec__max_df': [.9,.95],
    'cvec__ngram_range': [(1,1),(1,2),(3,3)]
}

# Gridsearch to find the best parameters and fit to training data
gs_3 = GridSearchCV(pipe_3, 
                   param_grid=pipe_params_3, 
                   cv = 3,
                   verbose = 1,
                   n_jobs = 2)

gs_3.fit(x_train, y_train)
best_3 = gs_3.best_estimator_
best_3.fit(x_train,y_train)
y_test_preds_3 = best_3.predict(x_test)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  72 out of  72 | elapsed:    1.7s finished


In [757]:

# Develop a score and print
y_train_preds_3 = best_3.predict(x_train)
print(accuracy_score(y_train, y_train_preds_3))
print(accuracy_score(y_test,y_test_preds_3))
dfparams_3 = pd.DataFrame(gs_3.best_params_)
dfparams_3 = dfparams_3.drop(index = 0).T
dfparams_3 = dfparams_3.rename(index=str, columns={1: "Best Params"})
dfparams_3

0.8229813664596274
0.5526315789473685


Unnamed: 0,Best Params
cvec__max_df,0.9
cvec__max_features,500.0
cvec__min_df,2.0
cvec__ngram_range,1.0


In [758]:
# Code Inspired by Siraj Raval
pipe_4 = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('mnb', MultinomialNB())
])

In [759]:
# Find the parameters, gridsearch them, use the best features to generate a score
pipe_params_4 = {
    'tfidf__max_features': [100,200,300, 400, 500],
    'tfidf__min_df': [2,3],
    'tfidf__max_df': [.9,.95],
    'tfidf__ngram_range': [(1,1),(1,2),(3,3)]
}

In [760]:
# Gridsearch to find the best parameters and fit to training data
gs_4 = GridSearchCV(pipe_4, 
                   param_grid=pipe_params_4, 
                   cv = 3,
                   verbose = 1,
                   n_jobs = 2)

gs_4.fit(x_train, y_train)
best_4 = gs_4.best_estimator_
best_4.fit(x_train,y_train)
y_test_preds_4 = best_4.predict(x_test)

Fitting 3 folds for each of 60 candidates, totalling 180 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done 174 tasks      | elapsed:    2.9s
[Parallel(n_jobs=2)]: Done 180 out of 180 | elapsed:    3.0s finished


In [761]:
# Develop a score and print
y_train_preds_4 = best_4.predict(x_train)
print(accuracy_score(y_train, y_train_preds_4))
print(accuracy_score(y_test,y_test_preds_4))
dfparams_4 = pd.DataFrame(gs_4.best_params_)
dfparams_4 = dfparams_4.drop(index = 0).T
dfparams_4 = dfparams_4.rename(index=str, columns={1: "Best Params"})
dfparams_4

0.6847826086956522
0.631578947368421


Unnamed: 0,Best Params
tfidf__max_df,0.9
tfidf__max_features,200.0
tfidf__min_df,3.0
tfidf__ngram_range,1.0


In [739]:
from sklearn.ensemble                import RandomForestClassifier

pipe_5 = Pipeline([
    ('cvec', CountVectorizer()),
    ('rfc', RandomForestClassifier())])

In [740]:

# Find the parameters, gridsearch them, use the best features to generate a score
pipe_params_5 = [{
    'cvec__max_features': [300, 400, 500],
    'cvec__min_df': [2,3],
    'cvec__max_df': [.9],
    'cvec__ngram_range': [(1,1),(1,2)],
    'rfc__bootstrap': [False, True],
    'rfc__n_estimators': [100, 110, 120],
    'rfc__max_features': [.5, .6, .7],
    'rfc__min_samples_leaf': [10,12, 14],
    'rfc__min_samples_split':[3,5,7]
}]

In [741]:
# Since random forest has more features, consider how many fits you will have to do before running
lst = []
count = 0
for i in pipe_params_5[0]:
    count = 0
    for j in pipe_params_5[0][i]:
        count += 1
    lst.append(count)

first = lst[0]
num = 1
for i in lst:
    num*=i
print(f'Fits: {num*3}')

Fits: 5832


In [742]:
# Gridsearch to find the best parameters and fit to training data
gs_5 = GridSearchCV(pipe_5, 
                   param_grid=pipe_params_5, 
                   cv = 3,
                   verbose = 1,
                   n_jobs = -1)

gs_5.fit(x_train, y_train)
best_5 = gs_5.best_estimator_
best_5.fit(x_train,y_train)
y_test_preds_5 = best_5.predict(x_test)

Fitting 3 folds for each of 1944 candidates, totalling 5832 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   23.4s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:   42.7s
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 4992 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 5832 out of 5832 | elapsed:  5.4min finished


In [743]:
# Develop a score and print
y_train_preds_5 = best_5.predict(x_train)
print(accuracy_score(y_train, y_train_preds_5))
print(accuracy_score(y_test,y_test_preds_5))
dfparams_5 = pd.DataFrame(gs_5.best_params_)
dfparams_5 = dfparams_5.drop(index = 0).T
dfparams_5 = dfparams_5.rename(index=str, columns={1: "Best Params"})
dfparams_5

0.6901408450704225
0.6631578947368421


Unnamed: 0,Best Params
cvec__max_df,0.9
cvec__max_features,300
cvec__min_df,2
cvec__ngram_range,1
rfc__bootstrap,False
rfc__max_features,0.5
rfc__min_samples_leaf,12
rfc__min_samples_split,3
rfc__n_estimators,110
