In [14]:
import pandas as pd
import numpy as np

# sklearn stuff
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

#extra
from sklearn.neural_network import MLPClassifier

# metrics
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [2]:
# Load the dataset from a pickle file in the working directory. Later cells
# read the "Top1_sent".."Top25_sent" and "stock_label" columns from it.
# NOTE(review): unpickling can execute arbitrary code - only load this file
# if it comes from a trusted source.
data = pd.read_pickle("fully_labeled_data.pkl")

In [5]:
# Build the feature matrix X (sentiment labels) and the target y (stock
# labels) from `data`, then split them into train and test sets.

# Feature columns: "Top1_sent" ... "Top25_sent".
sent_cols = ["Top{}_sent".format(i) for i in range(1, 26)]

# Select every feature column in one vectorised step instead of walking the
# frame with iterrows() and per-cell .at lookups: row order and values are
# the same, but there is no Python-level loop and the result does not rely
# on scalar label lookups (which need unique index labels).
X = data[sent_cols].values          # 2-D array: one row per sample, 25 columns
y = data["stock_label"].tolist()    # kept as a plain list, as before

# Hold out 33% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Gaussian Naive Bayes

In [16]:
# Fit a Gaussian Naive Bayes classifier on the training split, then predict
# labels for the held-out test rows.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
nb_y_pred = gnb.predict(X_test)

# Per-class precision / recall / F1, followed by overall accuracy.
print(classification_report(y_test, nb_y_pred))
print(
    "Accuracy of Gaussian: {}".format(
        accuracy_score(y_test, nb_y_pred)
    )
)

              precision    recall  f1-score   support

           0       0.45      0.35      0.39       296
           1       0.55      0.66      0.60       361

   micro avg       0.52      0.52      0.52       657
   macro avg       0.50      0.50      0.50       657
weighted avg       0.51      0.52      0.51       657

Accuracy of Gaussian: 0.517503805175038


### Logistic Regression

In [17]:
# Logistic regression with the multinomial loss, fitted with the L-BFGS
# solver. LogisticRegression is already imported in the notebook's import
# cell, so it is not re-imported here.
# NOTE(review): `multi_class` appears to be deprecated in recent scikit-learn
# releases - confirm against the installed version before upgrading.
clf = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial').fit(X_train, y_train)
log_y_pred = clf.predict(X_test)

# Per-class precision / recall / F1, followed by overall accuracy.
print(classification_report(y_test, log_y_pred))
print("Accuracy of Log Reg: {}".format(accuracy_score(y_test, log_y_pred)))

              precision    recall  f1-score   support

           0       0.46      0.34      0.39       296
           1       0.56      0.68      0.61       361

   micro avg       0.53      0.53      0.53       657
   macro avg       0.51      0.51      0.50       657
weighted avg       0.51      0.53      0.51       657

Accuracy of Log Reg: 0.5251141552511416


### SVM

In [18]:
# Support vector classifier with a polynomial kernel; gamma='auto' is passed
# explicitly rather than relying on the library default.
# NOTE(review): in the recorded output below, recall is 0.00 for class 0 and
# 1.00 for class 1, i.e. every test row was predicted as class 1 - worth
# checking the kernel / feature scaling before trusting this accuracy.
svc_clf = SVC(kernel="poly", gamma='auto')
svc_clf.fit(X_train, y_train)
svc_y_pred = svc_clf.predict(X_test)

# Per-class precision / recall / F1, followed by overall accuracy.
print(classification_report(y_test, svc_y_pred))
print(
    "Accuracy of SVM: {}".format(
        accuracy_score(y_test, svc_y_pred)
    )
)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       296
           1       0.55      1.00      0.71       361

   micro avg       0.55      0.55      0.55       657
   macro avg       0.27      0.50      0.35       657
weighted avg       0.30      0.55      0.39       657

Accuracy of SVM: 0.5494672754946728


  'precision', 'predicted', average, warn_for)
