In [56]:
import numpy as np
import pandas as pd

### Importing dataset

In [57]:
df = pd.read_csv('data/train.tsv', delimiter = '\t', quoting = 3)

In [58]:
df.shape

(156060, 4)

In [59]:
df.isnull().sum()

PhraseId      0
SentenceId    0
Phrase        0
Sentiment     0
dtype: int64

In [60]:
df['Sentiment'].unique()

array([1, 2, 3, 4, 0], dtype=int64)

In [61]:
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


### Data Preprocessing

In [62]:
import re
import nltk

nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

all_stopwords = stopwords.words('english')
all_stopwords.remove('not')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zianz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Remove all special and numeric character from data and also remove stopwards and apply stemming

In [63]:
corpus=[]

for i in range(0, df.shape[0]):
  review = re.sub('[^a-zA-Z]', ' ', df['Phrase'][i])
  review = review.lower()
  review = review.split()
  review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
  review = ' '.join(review)
  corpus.append(review)

In [64]:
corpus

['seri escapad demonstr adag good goos also good gander occasion amus none amount much stori',
 'seri escapad demonstr adag good goos',
 'seri',
 '',
 'seri',
 'escapad demonstr adag good goos',
 '',
 'escapad demonstr adag good goos',
 'escapad',
 'demonstr adag good goos',
 'demonstr adag',
 'demonstr',
 'adag',
 '',
 'adag',
 'good goos',
 '',
 'good goos',
 '',
 'good goos',
 '',
 'good goos',
 'good',
 'goos',
 '',
 'goos',
 'goos',
 'also good gander occasion amus none amount much stori',
 'also good gander occasion amus none amount much stori',
 'also',
 'also',
 'good gander occasion amus none amount much stori',
 'gander occasion amus none amount much stori',
 'gander occasion amus none amount much stori',
 'gander',
 'gander',
 'gander',
 '',
 'occasion amus none amount much stori',
 '',
 '',
 '',
 '',
 'occasion amus none amount much stori',
 'occasion',
 'amus none amount much stori',
 'amus',
 'none amount much stori',
 '',
 'none amount much stori',
 'none',
 'amount much

### Data transformation

In [65]:
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer(max_features=5000)

In [66]:
X = cv.fit_transform(corpus).toarray()
y = df.iloc[:, -1].values

In [67]:
# Saving BoW dictionary to later use in prediction
import pickle
bow_path = 'c1_BoW_Sentiment_Model.pkl'
pickle.dump(cv, open(bow_path, "wb"))

### Dividing dataset into training and test set

In [68]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

### Model fitting (GaussianNB)

In [69]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB()

In [70]:
# Exporting NB Classifier to later use in prediction
import joblib
joblib.dump(classifier, 'c2_Classifier_Sentiment_Model') 

['c2_Classifier_Sentiment_Model']

### GaussianNB Model performance

In [71]:
y_pred = classifier.predict(X_test)

from sklearn.metrics import accuracy_score , classification_report , confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(accuracy_score(y_test , y_pred))
print(classification_report(y_test , y_pred))

[[1184   69   12    7  144]
 [3380  549   96  121 1381]
 [5286  871  549  706 8227]
 [1156  180  108  494 4769]
 [ 111   18   11   89 1694]]
0.1432141484044598
              precision    recall  f1-score   support

           0       0.11      0.84      0.19      1416
           1       0.33      0.10      0.15      5527
           2       0.71      0.04      0.07     15639
           3       0.35      0.07      0.12      6707
           4       0.10      0.88      0.19      1923

    accuracy                           0.14     31212
   macro avg       0.32      0.39      0.14     31212
weighted avg       0.50      0.14      0.11     31212



### Model fitting (MultinomialNB)

In [72]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.fit(X_train , y_train)
y_pred = mnb.predict(X_test)

### MultinomialNB Model performance

In [73]:
print(accuracy_score(y_test , y_pred))
print(confusion_matrix(y_test , y_pred))
print(classification_report(y_test , y_pred))

0.5741701909521979
[[   63   528   803    21     1]
 [   33  1096  4246   150     2]
 [    5   416 14504   704    10]
 [    1    62  4448  2142    54]
 [    0     3   810   994   116]]
              precision    recall  f1-score   support

           0       0.62      0.04      0.08      1416
           1       0.52      0.20      0.29      5527
           2       0.58      0.93      0.72     15639
           3       0.53      0.32      0.40      6707
           4       0.63      0.06      0.11      1923

    accuracy                           0.57     31212
   macro avg       0.58      0.31      0.32     31212
weighted avg       0.57      0.57      0.51     31212



## GPT 3 Sentiment Analysis through OpenAI

Reference:

https://www.cronj.com/blog/impact-of-gpt-3-on-text-classification-sentiment-analysis/

https://www.width.ai/post/twitter-sentiment-analysis-using-gpt3

In [89]:
import openai
openai.api_key = "Your_API_KEY"

In [90]:
def create_sentiment_prompt(text):

    return f"Please analyze the sentiment of the following text and classify each line as 0 - negative, 1 - somewhat negative, 2 - neutral, 3 - somewhat positive, 4 - positive and \
    return in an array format, : \n {text}"

In [91]:
def analyze_sentiment(text):

    prompt = create_sentiment_prompt(text)

    response = openai.Completion.create(

        engine="text-davinci-003",

        prompt=prompt,

        max_tokens=1024,

        n=1,

        stop=None,

        temperature=0.1,

    )

    sentiment = response.choices[0].text.strip()


    return sentiment

In [92]:
X = df.Phrase
y = df.Sentiment
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

In [93]:
y_pred = []

# due to free API key only can call 60 times epr minutes, we call 60 times, each time classifying 20 phrases, total 1200 values
for i in range(61):
    sentiment_result = analyze_sentiment(X_test[i:i+21])
    y_pred.extend(list(sentiment_result.replace(', ', '').replace('[', '').replace(']', '').replace(',', '').strip()))


y_pred = list(map(int, y_pred))
print(f"Sentiment: {y_pred}")

In [97]:
# GPT 3 sometimes can ignore a small amount of inputs and do not output prediction, so the final y_pred is not 1200 exactly 
cm = confusion_matrix(y_test[0: 1019], y_pred)
print(cm)
print(accuracy_score(y_test[0: 1019] , y_pred))
print(classification_report(y_test[0: 1019] , y_pred))

[[  7   6  11   8  14]
 [ 40  22  47  49  37]
 [ 95  49 134  98 138]
 [ 33  22  57  43  51]
 [  7   4  23  12  12]]
0.2139352306182532
              precision    recall  f1-score   support

           0       0.04      0.15      0.06        46
           1       0.21      0.11      0.15       195
           2       0.49      0.26      0.34       514
           3       0.20      0.21      0.21       206
           4       0.05      0.21      0.08        58

    accuracy                           0.21      1019
   macro avg       0.20      0.19      0.17      1019
weighted avg       0.34      0.21      0.25      1019

