# Sentiment Analysis

In this notebook it is showed the workflow of how we built the sentiment analysis model to classify the polarity of the financial tweets.

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
import joblib
from collections import Counter
import numpy as np
import pandas as pd
from imblearn.datasets import make_imbalance
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline

We used two different datasets of financial tweets to train and test our model:
the first dataset was downloaded from Kaggle at the following link and it contains labelled tweets ;
the test set includes real tweets that were scraped form Twitter and that we manually classified.

In [2]:
from preprocessing.tweet_cleaner import tweet_pruning, remove_special_char

train_data = pd.read_csv('./data/tweets_with_sentiment.csv')

# Preprocessing
train_data['text'] = train_data['text'].astype(str)
train_data['text'] = train_data['text'].str.lower()
train_data['text'] = train_data['text'].apply(remove_special_char)

train_data

Unnamed: 0,text,target
0,video offic mind busi david solomon tell gs in...,neutral
1,price lumber lb f sinc hit ytd high maci turna...,neutral
2,say american dream dead,negative
3,barri silbert extrem optimist bitcoin predict ...,positive
4,satellit avoid attack space junk circl earth paid,negative
...,...,...
28435,fb c f f cb ecf,neutral
28436,btc,neutral
28437,rt hd nuff said tel telcoin telfam crypto bloc...,neutral
28438,btc,neutral


In [30]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Spot-Check Algorithms
classifiers = [
    RandomForestClassifier(),
    XGBClassifier(eval_metric='mlogloss'),
    AdaBoostClassifier(),
    KNeighborsClassifier(),
    LogisticRegression(),
    MultinomialNB(),
    BernoulliNB()
]

# Pipeline Classifier
pipelines = []

for classifier in classifiers:
    
    pipelines.append(Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', classifier)
    ]))

In [4]:
print("Before undersampling: ", Counter(y_train))

# Convert x_train to np_array for rebalance
#x_train = x_train.values.reshape(-1, 1)
#x_train, y_train = make_imbalance(x_train, y_train,
                                  #sampling_strategy={'positive': 2000, 'neutral': 2000, 'negative': 2000},
                                  #random_state=0)

# Return to pandas series
x_train = pd.Series(np.squeeze(x_train))
print("After undersampling: ", Counter(y_train))

Before undersampling:  Counter({'neutral': 13883, 'positive': 6785, 'negative': 2084})
After undersampling:  Counter({'neutral': 13883, 'positive': 6785, 'negative': 2084})


In [31]:
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import time
import warnings
warnings.filterwarnings('ignore')

for pipe in pipelines:
    t0 = time.time()
    predicted = cross_val_predict(pipe, train_data['text'], train_data['target'], cv=10)
    t1 = time.time()
    t = (t1-t0)/10
    
    print("\n Evaluation: ", pipe['clf'], " \tTraining time: ", t)
    print(metrics.classification_report(train_data['target'], predicted, target_names=["negative", "neutral", "positive"]))


 Evaluation:  RandomForestClassifier()  	Training time:  24.588970875740053
              precision    recall  f1-score   support

    negative       0.96      0.83      0.89      2598
     neutral       0.96      0.98      0.97     17330
    positive       0.96      0.95      0.96      8512

    accuracy                           0.96     28440
   macro avg       0.96      0.92      0.94     28440
weighted avg       0.96      0.96      0.96     28440


 Evaluation:  XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              enable_categorical=False, eval_metric='mlogloss', gamma=None,
              gpu_id=None, importance_type=None, interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=No

The performance using Kaggle dataset were excellent but we want to test our model also with tweets that we scraped directly from Twitter. 

In [34]:
# Testing with real tweets
real_tweets = pd.read_json('../data/train/tweets_with_label.json')
print("Number of Tweets: ", Counter(real_tweets['target']))

# Preprocessing
real_tweets = real_tweets.rename(columns={'text': 'Text'})
real_tweets['Text'] = real_tweets['Text'].str.lower()
real_tweets = tweet_pruning(real_tweets, 'amazon', 'AMZN')
real_tweets['Text'] = real_tweets['Text'].apply(remove_special_char)

for pipe in pipelines:
    pipe.fit(train_data['text'], train_data['target'])
    
    t0 = time.time()    
    predicted = pipe.predict(real_tweets['Text'].values)
    t1 = time.time()
    t = (t1-t0)
    
    print("\n Evaluation: ", pipe['clf'], " \tPrediction time: ", t)
    print(metrics.classification_report(real_tweets['target'].values, predicted, target_names=["negative", "neutral", "positive"]))

Number of Tweets:  Counter({'neutral': 566, 'positive': 143, 'negative': 50})


NotFittedError: Vocabulary not fitted or provided

XGBoost Classifier has the best performance

In [33]:
# Save the classifier
filename = '../model/sentiment_classifier.pkl'
joblib.dump(pipelines[1], filename)

['../model/sentiment_classifier.pkl']

In [32]:
pipe = joblib.load('../model/sentiment_classifier.pkl')

# Testing with real tweets
real_tweets = pd.read_json('../data/train/tweets_with_label.json')
print("Number of Tweets: ", Counter(real_tweets['target']))

# Preprocessing
real_tweets = real_tweets.rename(columns={'text': 'Text'})
real_tweets['Text'] = real_tweets['Text'].str.lower()
real_tweets = tweet_pruning(real_tweets, 'amazon', 'AMZN')
real_tweets['Text'] = real_tweets['Text'].apply(remove_special_char)

# Predicting
predicted = pipe.predict(real_tweets['Text'].values)


# Extracting statistics and metrics
accuracy = accuracy_score(real_tweets['target'], predicted)
print("Accuracy on test set: ", accuracy)
print("Metrics per class on test set:")

print("Confusion matrix:")
metrics.confusion_matrix(real_tweets['target'].values, predicted)

print(metrics.classification_report(real_tweets['target'].values, predicted, target_names=["negative", "neutral", "positive"]))

Number of Tweets:  Counter({'neutral': 566, 'positive': 143, 'negative': 50})
Accuracy on test set:  0.7926829268292683
Metrics per class on test set:
Confusion matrix:
              precision    recall  f1-score   support

    negative       0.82      0.63      0.71        43
     neutral       0.83      0.90      0.86        70
    positive       0.73      0.78      0.75        51

    accuracy                           0.79       164
   macro avg       0.79      0.77      0.78       164
weighted avg       0.79      0.79      0.79       164

