In [6]:
import pandas as pd
from sklearn.metrics import classification_report
from nltk.sentiment import SentimentIntensityAnalyzer
import operator
import numpy as np
from textblob import TextBlob
from transformers import pipeline


In [7]:
training_data = pd.read_csv('df_english.csv')
training_data = training_data[training_data.is_english == True]


In [8]:
training_data

Unnamed: 0.1,Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,is_english
0,0,86ff1ea1-0b63-43ce-addc-eb43f6193b3b,Yaseen Yaseen,https://play-lh.googleusercontent.com/a/ALm5wu...,Yaeen Yaeen gg,5,0,,2022-10-04 20:32:28,,,True
1,1,3577f7a1-3394-4e77-813d-095a82cf8bcf,Kemar Richardson,https://play-lh.googleusercontent.com/a-/ACNPE...,Great,5,0,26.3.4,2022-10-04 20:32:10,,,True
2,2,7c8c56d9-d8ad-47d4-b24b-5289aa4529ff,Tracy Dunn,https://play-lh.googleusercontent.com/a/ALm5wu...,good,5,0,26.4.3,2022-10-04 20:31:21,,,True
3,3,80db804f-cccd-4b09-b690-abc12cbf0612,SG. Mugo. (Mugoz:),https://play-lh.googleusercontent.com/a-/ACNPE...,Good app,5,0,26.3.4,2022-10-04 20:30:22,,,True
4,4,4ed35e90-0f45-4865-81c4-b3a6f2ea49f7,Mwansa Judy,https://play-lh.googleusercontent.com/a-/ACNPE...,Most amazing app,5,0,26.3.4,2022-10-04 20:29:25,,,True
...,...,...,...,...,...,...,...,...,...,...,...,...
99995,99995,b1e4a582-bf47-46ad-a4dc-884468d8eddd,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Best app in pakistan tik toker Soham anwar Th...,5,1,25.8.5,2022-08-29 23:21:07,,,True
99996,99996,95df7741-4bbf-4d04-a60c-03b11f3ba268,Ethio culture,https://play-lh.googleusercontent.com/a-/ACNPE...,Perfect,4,0,25.9.4,2022-08-29 23:20:52,,,True
99997,99997,3e6ae27e-d761-4079-98a1-b5e95f296546,Zafar Awan,https://play-lh.googleusercontent.com/a/ALm5wu...,So nice,5,1,25.8.5,2022-08-29 23:19:05,,,True
99998,99998,380379d5-7c99-4e0d-a140-6cfd898aa619,Anger Kuiel,https://play-lh.googleusercontent.com/a/ALm5wu...,Its a great app,5,1,25.8.5,2022-08-29 23:18:53,,,True


We will be assigning the ratings of (1-2) to negative, (3) to neutral and (4-5) to positve

In [9]:
def get_rating(df):
    tmp = []
    for _, row in df.iterrows():
        if row["score"] >= 4:
            tmp.append("positive")
        elif row["score"] == 3:
            tmp.append("neutral")
        else:
            tmp.append("negative")
    df["rating_sentiment"] = tmp
get_rating(training_data)

Before we actually do anything with the training dataset, we may want to do some bench marking with some of the existing pre-trained model to find out how accurate these models are able to label the tweets. The labelled data follow the following format 0 being negative, 2 being neutral and 4 being positive. From there we can perform some cleaning on the content and try to observe how different cleaning method should be adopted to tweeter data. Before we actually start training our own model.

Some of the common models that we are going to use for sentiment analysis bench marking are NLTK Vader, text blob (both are rules-based sentiment analyzer), transformer based model from hugging face.

We will then check the performance of each model given different cleaning methods. The performance metrics we will be using are Accuracy (correct predictions over all predictions), Precision (how many of the positive predictions made are correct), Specificity (a measure of how many negative predictions made are correct), and F1-Score (the harmonic mean of precision and recall)



Perform prediction on exisiting training data without any cleaning

Performing test on training data using NLTK Vader

In [10]:

sia = SentimentIntensityAnalyzer()
training_data["nltk_sentiment_score"] = training_data["content"].apply(
    lambda x: sia.polarity_scores(str(x))["compound"])
training_data["nltk_sentiment"] = np.select([training_data["nltk_sentiment_score"] < 0, training_data["nltk_sentiment_score"] == 0, training_data["nltk_sentiment_score"] > 0],
                            ['negative', 'neutral', 'positive'])


Check the prediction accuracy when we use nltk without additional cleaning

In [11]:
predicted = training_data["nltk_sentiment"]
actual = training_data["rating_sentiment"]
target_names = ['positive', 'neutral', 'negative']
print(classification_report(actual, predicted, target_names=target_names))



              precision    recall  f1-score   support

    positive       0.59      0.34      0.43     16534
     neutral       0.05      0.25      0.08      4233
    negative       0.87      0.76      0.81     74144

    accuracy                           0.66     94911
   macro avg       0.50      0.45      0.44     94911
weighted avg       0.79      0.66      0.71     94911



Performing test on training data using text blob

In [12]:
training_data["txt_blob_sentiment_score"] = training_data["content"].apply(
    lambda x: TextBlob(str(x)).sentiment.polarity)
training_data["txt_blob_sentiment"] = np.select([training_data["txt_blob_sentiment_score"] < 0, training_data["txt_blob_sentiment_score"] == 0, training_data["txt_blob_sentiment_score"] > 0],
                            ['negative', 'neutral', 'positive'])


In [13]:
predicted = training_data["txt_blob_sentiment"]
actual = training_data["rating_sentiment"]
target_names = ['positive', 'neutral', 'negative']
print(classification_report(actual, predicted, target_names=target_names))


              precision    recall  f1-score   support

    positive       0.64      0.23      0.34     16534
     neutral       0.05      0.32      0.09      4233
    negative       0.87      0.71      0.78     74144

    accuracy                           0.61     94911
   macro avg       0.52      0.42      0.40     94911
weighted avg       0.79      0.61      0.67     94911



In [14]:

sentiment_pipeline = pipeline("sentiment-analysis")
training_data["pipeline_sentiment_score"] = training_data["content"].apply(
    lambda x: sentiment_pipeline([str(x)])[0]["label"])



No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Downloading: 100%|██████████| 629/629 [00:00<00:00, 89.9kB/s]
Downloading: 100%|██████████| 268M/268M [00:08<00:00, 30.9MB/s] 
Downloading: 100%|██████████| 48.0/48.0 [00:00<00:00, 6.01kB/s]
Downloading: 100%|██████████| 232k/232k [00:04<00:00, 50.2kB/s] 


In [15]:
training_data

Unnamed: 0.1,Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,is_english,rating_sentiment,nltk_sentiment_score,nltk_sentiment,txt_blob_sentiment_score,txt_blob_sentiment,pipeline_sentiment_score
0,0,86ff1ea1-0b63-43ce-addc-eb43f6193b3b,Yaseen Yaseen,https://play-lh.googleusercontent.com/a/ALm5wu...,Yaeen Yaeen gg,5,0,,2022-10-04 20:32:28,,,True,positive,0.2960,positive,0.00,neutral,NEGATIVE
1,1,3577f7a1-3394-4e77-813d-095a82cf8bcf,Kemar Richardson,https://play-lh.googleusercontent.com/a-/ACNPE...,Great,5,0,26.3.4,2022-10-04 20:32:10,,,True,positive,0.6249,positive,0.80,positive,POSITIVE
2,2,7c8c56d9-d8ad-47d4-b24b-5289aa4529ff,Tracy Dunn,https://play-lh.googleusercontent.com/a/ALm5wu...,good,5,0,26.4.3,2022-10-04 20:31:21,,,True,positive,0.4404,positive,0.70,positive,POSITIVE
3,3,80db804f-cccd-4b09-b690-abc12cbf0612,SG. Mugo. (Mugoz:),https://play-lh.googleusercontent.com/a-/ACNPE...,Good app,5,0,26.3.4,2022-10-04 20:30:22,,,True,positive,0.4404,positive,0.70,positive,POSITIVE
4,4,4ed35e90-0f45-4865-81c4-b3a6f2ea49f7,Mwansa Judy,https://play-lh.googleusercontent.com/a-/ACNPE...,Most amazing app,5,0,26.3.4,2022-10-04 20:29:25,,,True,positive,0.6240,positive,0.55,positive,POSITIVE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,99995,b1e4a582-bf47-46ad-a4dc-884468d8eddd,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Best app in pakistan tik toker Soham anwar Th...,5,1,25.8.5,2022-08-29 23:21:07,,,True,positive,0.7964,positive,0.60,positive,POSITIVE
99996,99996,95df7741-4bbf-4d04-a60c-03b11f3ba268,Ethio culture,https://play-lh.googleusercontent.com/a-/ACNPE...,Perfect,4,0,25.9.4,2022-08-29 23:20:52,,,True,positive,0.5719,positive,1.00,positive,POSITIVE
99997,99997,3e6ae27e-d761-4079-98a1-b5e95f296546,Zafar Awan,https://play-lh.googleusercontent.com/a/ALm5wu...,So nice,5,1,25.8.5,2022-08-29 23:19:05,,,True,positive,0.4754,positive,0.60,positive,POSITIVE
99998,99998,380379d5-7c99-4e0d-a140-6cfd898aa619,Anger Kuiel,https://play-lh.googleusercontent.com/a/ALm5wu...,Its a great app,5,1,25.8.5,2022-08-29 23:18:53,,,True,positive,0.6249,positive,0.80,positive,POSITIVE


In [16]:
predicted = [x.lower() for x in training_data["pipeline_sentiment_score"]]
actual = training_data["rating_sentiment"]
target_names = ['positive', 'neutral', 'negative']
print(classification_report(actual, predicted, target_names=target_names))


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

    positive       0.42      0.68      0.52     16534
     neutral       0.00      0.00      0.00      4233
    negative       0.89      0.82      0.85     74144

    accuracy                           0.76     94911
   macro avg       0.44      0.50      0.46     94911
weighted avg       0.77      0.76      0.76     94911



  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# save training_data as csv
training_data.to_csv("trained_data.csv")