# Sentiment Analysis

In this notebook it is showed the workflow of how we built the sentiment analysis model to classify the polarity of the financial tweets.

In [2]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
import joblib
from collections import Counter
import numpy as np
import pandas as pd
from imblearn.datasets import make_imbalance
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline

We used two different datasets of financial tweets to train and test our model:
the first dataset was downloaded from Kaggle at the following link and it contains labelled tweets ;
the test set includes real tweets that were scraped form Twitter and that we manually classified.

In [50]:
from preprocessing.tweet_cleaner import tweet_pruning, remove_special_char

train_data = pd.read_csv('./data/tweets_with_sentiment.csv')

# Preprocessing
train_data['text'] = train_data['text'].astype(str)
train_data['text'] = train_data['text'].str.lower()
train_data['text'] = train_data['text'].apply(remove_special_char)

train_data

Unnamed: 0,text,target
0,video offic mind busi david solomon tell gs in...,neutral
1,price lumber lb f sinc hit ytd high maci turna...,neutral
2,say american dream dead,negative
3,barri silbert extrem optimist bitcoin predict ...,positive
4,satellit avoid attack space junk circl earth paid,negative
...,...,...
28435,fb c f f cb ecf,neutral
28436,btc,neutral
28437,rt hd nuff said tel telcoin telfam crypto bloc...,neutral
28438,btc,neutral


In [44]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Spot-Check Algorithms
classifiers = [
    RandomForestClassifier(n_estimators=200, min_samples_split=200, max_features = 3, random_state=1, max_depth=3),
    XGBClassifier(),
    AdaBoostClassifier(),
    KNeighborsClassifier(3),
    LogisticRegression(),
    MultinomialNB(),
    BernoulliNB
]

# Pipeline Classifier
pipelines = []

for classifier in classifiers:
    
    pipelines.append(Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', classifier)
    ]))
    

In [22]:
x_train, x_test, y_train, y_test = train_test_split(train_data['text'], train_data['target'], test_size=0.2, random_state=11)

print("Before undersampling: ", Counter(y_train))

# Convert x_train to np_array for rebalance
#x_train = x_train.values.reshape(-1, 1)
#x_train, y_train = make_imbalance(x_train, y_train,
                                  #sampling_strategy={'positive': 2000, 'neutral': 2000, 'negative': 2000},
                                  #random_state=0)

# Return to pandas series
x_train = pd.Series(np.squeeze(x_train))
print("After undersampling: ", Counter(y_train))

Before undersampling:  Counter({'neutral': 13883, 'positive': 6785, 'negative': 2084})
After undersampling:  Counter({'neutral': 13883, 'positive': 6785, 'negative': 2084})


In [45]:
import warnings
warnings.filterwarnings('ignore')

for pipe in pipelines:
    # Training the Pipeline Classifier
    pipe.fit(x_train, y_train)

    # Testing of the Pipeline
    predicted = pipe.predict(x_test)

    # Extracting statistics and metrics
    print("\n Evaluation: ", pipe['clf'])
    accuracy = accuracy_score(predicted, y_test)
    print("Accuracy on test set: ", accuracy)
    print("Metrics per class on test set:")

    print("Confusion matrix:")
    metrics.confusion_matrix(y_test, predicted)

    print(metrics.classification_report(y_test, predicted, target_names=["negative", "neutral", "positive"]))


 Evaluation:  RandomForestClassifier(max_depth=3, max_features=3, min_samples_split=200,
                       n_estimators=200, random_state=1)
Accuracy on test set:  0.6060126582278481
Metrics per class on test set:
Confusion matrix:
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00       514
     neutral       0.61      1.00      0.75      3447
    positive       0.00      0.00      0.00      1727

    accuracy                           0.61      5688
   macro avg       0.20      0.33      0.25      5688
weighted avg       0.37      0.61      0.46      5688


 Evaluation:  XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              

TypeError: _BaseDiscreteNB.fit() missing 1 required positional argument: 'y'

XGBoost Classifier has the best performance

In [None]:
# Save the classifier
filename = '../model/sentiment_classifier.pkl'
joblib.dump(pipelines[1], filename)

The performance using Kaggle dataset were excellente but we want to test our model also with tweets that we scraped directly from Twitter. 

In [64]:
pipe = joblib.load('../model/sentiment_classifier.pkl')

# Testing with real tweets
real_tweets = pd.read_json('../data/train/tweets_with_label.json')
print("Number of Tweets: ", Counter(real_tweets['target']))

# Preprocessing
real_tweets = real_tweets.rename(columns={'text': 'Text'})
real_tweets['Text'] = real_tweets['Text'].str.lower()
real_tweets = tweet_pruning(real_tweets, 'amazon', 'AMZN')
real_tweets['Text'] = real_tweets['Text'].apply(remove_special_char)

# Predicting
predicted = pipe.predict(real_tweets['Text'].values)

# Extracting statistics and metrics
accuracy = accuracy_score(real_tweets['target'], predicted)
print("Accuracy on test set: ", accuracy)
print("Metrics per class on test set:")

print("Confusion matrix:")
metrics.confusion_matrix(real_tweets['target'].values, predicted)

print(metrics.classification_report(real_tweets['target'].values, predicted, target_names=["negative", "neutral", "positive"]))

Number of Tweets:  Counter({'neutral': 566, 'positive': 143, 'negative': 50})
Accuracy on test set:  0.7901234567901234
Metrics per class on test set:
Confusion matrix:
              precision    recall  f1-score   support

    negative       0.82      0.63      0.71        43
     neutral       0.83      0.90      0.86        70
    positive       0.72      0.78      0.75        49

    accuracy                           0.79       162
   macro avg       0.79      0.77      0.77       162
weighted avg       0.79      0.79      0.79       162

