<a href="https://colab.research.google.com/github/AstridZhao/Spring2023DeepLearning/blob/main/Version2_Deep_Learning_Final_Project_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!pip install tensorflow transformers datasets
!pip install pytreebank


# Sentiment classification

In [None]:
import pytreebank
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import numpy as np
import time
import string

# Data acquisition

Import dataset.

In [None]:
# Load the dataset from disk
dataset = pytreebank.load_sst("/path/to/sentiment/")

train = dataset['train']
validation = dataset['dev']
test = dataset['test']

Due to the costs of running ChatGPT queries, we limit the number of training and test samples. 

In [None]:
limit = 10

In [None]:
X_train = []
Y_train = []

train_data = []
for tree in train:
  datapoint = tree.to_labeled_lines()[0]
  train_data.append(datapoint)
  
train_data = train_data[:limit]

for i in range(len(train_data)):
  for label, sentence in train_data:
    X_train.append(sentence)
    Y_train.append(label)

In [None]:
X_valid = []
Y_valid = []

valid_data = []
for tree in validation:
  datapoint = tree.to_labeled_lines()[0]
  valid_data.append(datapoint)

valid_data = valid_data[:limit]

for i in range(len(valid_data)):
  for label, sentence in valid_data:
    X_valid.append(sentence)
    Y_valid.append(label)

In [None]:
X_test = []
Y_test = []

test_data = []
for tree in test:
  datapoint = tree.to_labeled_lines()[0]
  test_data.append(datapoint)

test_data = test_data[:limit]

for i in range(len(test_data)):
  for label, sentence in test_data:
    X_test.append(sentence)
    Y_test.append(label)


# Data Preparation

In [None]:
def create_prompt(sentence, label, prediction=False):
    result = ""
    if label == 0 : sentiment = "very negative" 
    elif label == 1: sentiment = "negative"
    elif label == 2: sentiment = "neutral" 
    elif label == 3: sentiment = "positive" 
    elif label == 4: sentiment = "very positive" 

    result    = f"Sentiment analysis, input text: [{sentence}]\nSentiment: ["
    if prediction==False:
      result  += f"{sentiment}]\n"
    return result

formatted_prompts_train = [create_prompt(sentence, sentiment) for sentence, sentiment in list(zip(X_train, Y_train))]
formatted_prompts_test  = [create_prompt(sentence, sentiment) for sentence, sentiment in list(zip(X_test, Y_test))]

In [None]:
formatted_prompts_test[:10]

['Sentiment analysis, input text: [Effective but too-tepid biopic]\nSentiment: [neutral]\n',
 'Sentiment analysis, input text: [If you sometimes like to go to the movies to have fun , Wasabi is a good place to start .]\nSentiment: [positive]\n',
 "Sentiment analysis, input text: [Emerges as something rare , an issue movie that 's so honest and keenly observed that it does n't feel like one .]\nSentiment: [very positive]\n",
 'Sentiment analysis, input text: [The film provides some great insight into the neurotic mindset of all comics -- even those who have reached the absolute top of the game .]\nSentiment: [neutral]\n',
 'Sentiment analysis, input text: [Offers that rare combination of entertainment and education .]\nSentiment: [very positive]\n',
 'Sentiment analysis, input text: [Perhaps no picture ever made has more literally showed that the road to hell is paved with good intentions .]\nSentiment: [positive]\n',
 "Sentiment analysis, input text: [Steers turns in a snappy screenpla

In [None]:
%%capture
!pip install datasets openai
import openai

openai.api_key = "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" # keep it safe

def get_completion(prompt, model="gpt-3.5-turbo", max_tokens=500, temperature=0.0, n=1):
    messages = [{"role": "user", "content": prompt}]
    if model=="gpt-3.5-turbo":
      response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        n=n)
      result = response.choices[0].message["content"]
    else:
      response = openai.Completion.create(
        model=model,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=n          
      )
      result = response.choices[0].text
    return result

# Evaluation

In [None]:
def evaluate(Y_true,Y_preds):
  print(classification_report(Y_true,Y_preds))
  conf = confusion_matrix(Y_true,Y_preds)
  sns.heatmap(conf, fmt='d', annot=True)

We choose to use Few-shot learning method. 
It is to give a longer prompt and training examples as a prompt for a chat model, and then we could predict the sentiment of the test sentences.

In [None]:
instructions = "\n".join(formatted_prompts_train[:80])
prompt = "You are a movie review expert and identifying the sentiment of review. "\
         " I will define the input text of review sentence in the first bracket[], and then the sentiment is in the second bracket[] between\n and \n tags."\
        "Be as precise, as you can. Identify the sentiment of the input text defined in brackets, " \
          "and return the sentiment in brackets too. The return value should be positive or negative, " \
          " nothing else. Here are some examples, how to do it:\n"
# prompt =  "You are a movie review expert and identifying the sentiment of review. "\
#           "Identifying the sentiment of the input text defined in brackets, " \
#           "and returning the sentiment result in brackets too. The return value should be" \
#           "ONLY the words of one of five words following: " \
#           "positive, very positive, neutral, very negative, negative."\
#           "Here are some examples, how to do it:\n"

# " Your job is returning the sentiment result in the brackets. " \
# " The return value should be ONLY the words of one of five words in the following curly brackets: {positive, very positive, neutral, very negative, negative}" \
          
prompt += instructions


In [None]:
prompt += formatted_prompts_train[0]
get_completion(prompt , model="gpt-3.5-turbo") # initialize a chatgpt model with the prompt

'Sentiment analysis, input text: [The acting in this movie was terrible and the plot was completely nonsensical.]\nSentiment: [negative]'

In [None]:
preds_text2 = []
counter=0 # to be able to continue, if the connection breaks

In [None]:
for i in range(0,80):
   result = ''
   prompt = "You are a movie review expert and identifying the sentiment of review. "\
         " I will define the input text of review sentence in the first bracket[], and then the sentiment is in the second bracket[] between\n and \n tags."\
        "Be as precise, as you can. Identify the sentiment of the input text defined in brackets, " \
          "and return the sentiment in brackets too. The return value should be positive or negative, " \
          " nothing else. Here are some examples, how to do it:\n"
   prompt += formatted_prompts_test[i]
   result = get_completion(prompt, model="gpt-3.5-turbo")
   preds_text2.append(result)
   

In [None]:
def convert_text_to_label(text):
  pos = ['positive','very positive','happy','nostalgic','Amazing','Fantastic', 'Excellent', 'Great', 'Outstanding', 'Brilliant', 'Superb', 'Terrific', 'Impressive', 'Delightful']
  neg = ['negative','very negative','frustration','sad','concerned','embarrassment','Horrible', 'Disappointing', 'Bad', 'Mediocre', 'Dreadful', 'Abysmal', 'Atrocious', 'Lousy']
  net = ['neutral','mixed','Okay', 'Average', 'Decent', 'Fair', 'Ordinary', 'Fine', 'Acceptable', 'Satisfactory', 'Balanced', 'Unremarkable']
  if any([x in text.lower() for x in pos]): 
    return 1
  elif any([x in text.lower() for x in neg]): 
    return -1
  elif any([x in text.lower() for x in net]): 
    return 0
  else:
    return None

In [None]:
Y_preds2 = [convert_text_to_label(t) for t in preds_text2]
Y_preds2 = np.array(Y_preds2)
Y_test   = np.array(Y_test)

In [None]:
Y_test_filtered2  = Y_test[(Y_preds2==1) | (Y_preds2==-1)]
Y_preds_filtered2 = Y_preds2[(Y_preds2==1) | (Y_preds2==-1)].astype(int)
Y_preds_filtered2[Y_preds_filtered2==-1]=0

In [None]:
evaluate(Y_test_filtered2,Y_preds_filtered2)