In [1]:
import os, json, re, contractions
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import numpy as np
from dotenv import load_dotenv
import openai

# Pull settings from a .env file into the process environment, so the API key
# does not have to be written into the notebook itself.
load_dotenv()

# os.getenv returns None when OPENAI_API_KEY is not set; no check is done here.
openai.api_key = os.getenv("OPENAI_API_KEY")

# OpenAI model identifiers. In the visible cells only model_chat is used by
# live code (classify_sentiment); model_davinci is referenced solely from a
# commented-out cell and model_curie is not referenced at all.
model_chat = "gpt-3.5-turbo"    # Model that powers ChatGPT
model_davinci = "text-davinci-003"  # Best model
model_curie = "text-curie-001"  # Second-best, but faster (May be suitable for sentiment classification)

# Relative directories: sampled tweet files are read from input_path and the
# labelled copies are written to output_path.
input_path = "../data/samples/"
output_path = "../data/labelled/"

In [2]:
# Load slang dictionary
# Every key/value pair of the JSON object is registered with `contractions`,
# so contractions.fix() (called in preprocess) also rewrites these entries.
slang_path = "../data/slang.json"
# JSON text is UTF-8 by specification; do not rely on the platform default.
with open(slang_path, "r", encoding="utf-8") as f:
    slang_dicts = json.load(f)

# Registration does not need the file handle, so it runs after the file is closed.
for slang, expansion in slang_dicts.items():
    contractions.add(slang, expansion)

def preprocess(row):
    """Return a cleaned copy of a tweet's text.

    Line breaks and the USERNAME_<digits> / URL_<digits> alias tokens
    (together with any whitespace directly after them) are deleted, then
    ``contractions.fix`` rewrites contractions into their full form.

    Args:
        row: Mapping with a "content" key holding the tweet text.

    Returns:
        A new dict ``{'content': <cleaned text>}``; ``row`` is not modified.
    """
    # Either a run of newlines, or an alias token such as "USERNAME_12 " / "URL_3"
    noise = r"\n+|((?:USERNAME|URL)_\d*\s*)"
    stripped = re.sub(noise, "", row["content"])
    return {'content': contractions.fix(stripped)}

def insert_tweets(tweets_list):
    """Render tweets as one numbered text block.

    Each tweet becomes two lines: its 0-based position followed by a dot,
    then its "content" text. Every line ends with a newline.

    Args:
        tweets_list: Iterable of mappings that each have a "content" key.

    Returns:
        The concatenated block as a single string ("" for an empty input).
    """
    return "".join(
        f"{position}.\n{tweet['content']}\n"
        for position, tweet in enumerate(tweets_list)
    )


In [3]:
input_filename = "samples_230227-230308_#577.json"
# The input is read as JSON Lines: one JSON object per line.
# Decode explicitly as UTF-8 (the standard JSON encoding) so that non-ASCII
# tweet text loads the same way whatever the platform's default encoding is.
with open(input_path+input_filename, "r", encoding="utf-8") as f:
    # Skip blank lines (e.g. a trailing empty line), which json.loads rejects.
    data = [json.loads(line) for line in f if line.strip()]

# Cleaned text only; preprocessed_data[i] is derived from data[i].
preprocessed_data = [preprocess(row) for row in data]

In [6]:
# TextBlob

def get_textblob_sentiment(tweet):
    """Label a tweet from the sign of its TextBlob polarity.

    Args:
        tweet: Mapping with a "content" key holding the tweet text.

    Returns:
        2 when the polarity is above zero (positive), 0 when it is below
        zero (negative), and 1 otherwise (neutral).
    """
    polarity = TextBlob(tweet["content"]).sentiment.polarity
    if polarity > 0:
        return 2    # Positive
    if polarity < 0:
        return 0    # Negative
    return 1        # Neutral

# One TextBlob label per preprocessed tweet, in the same order as the data
textblob_sentiments = list(map(get_textblob_sentiment, preprocessed_data))

In [4]:
# VADER

# Shared analyzer, created on first use. Building a SentimentIntensityAnalyzer
# is loop-invariant work, so it is done once rather than once per tweet.
_VADER_ANALYZER = None


def get_vader_sentiment(tweet, analyzer=None):
    """Label a tweet from its VADER compound score.

    Args:
        tweet: Mapping with a "content" key holding the tweet text.
        analyzer: Optional object exposing ``polarity_scores(text)`` that
            returns a dict with a 'compound' entry. When omitted, a single
            module-wide SentimentIntensityAnalyzer is created lazily and
            reused across calls.

    Returns:
        2 (positive) when compound >= 0.05, 0 (negative) when
        compound <= -0.05, and 1 (neutral) otherwise.
    """
    global _VADER_ANALYZER
    if analyzer is None:
        if _VADER_ANALYZER is None:
            _VADER_ANALYZER = SentimentIntensityAnalyzer()
        analyzer = _VADER_ANALYZER

    compound = analyzer.polarity_scores(tweet['content'])['compound']

    # decide sentiment as positive, negative and neutral
    if compound >= 0.05:
        return 2    # Positive
    if compound <= -0.05:
        return 0    # Negative
    return 1        # Neutral

# One VADER label per preprocessed tweet, in the same order as the data
vader_sentiments = list(map(get_vader_sentiment, preprocessed_data))

## Make request

In [19]:
def classify_sentiment(tweets_list):
    """Ask the chat model to label a batch of tweets in a single request.

    Args:
        tweets_list: Sequence of mappings with a "content" key; they are
            rendered as a numbered block by insert_tweets before being sent.

    Returns:
        The assistant's reply as a raw string. The prompt asks for a JSON
        object keyed by tweet index with 0/1/2 labels, but the reply is not
        parsed or validated here; that is left to the caller.
    """
    system_prompt = "You are an assistant that classify the sentiment in tweets, using 0, 1, 2 to represent negative, neutral and positive respectively. The result must be json format with curly brackets, and property name must be the given index number enclosed in double quotes."
    numbered_tweets = insert_tweets(tweets_list)
    user_prompt = f'Classify the sentiment of the following tweets: "{numbered_tweets}"'

    response = openai.ChatCompletion.create(
        model=model_chat,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0,
    )

    # Only the first choice's message text is returned
    return response['choices'][0]['message']['content']

In [29]:
# Ad-hoc single-tweet example; it is not used by any other visible cell.
test = [{'content': "I’m on day 5 of COVID and overall feeling better BUT WHEN DO MY HIPS STOP HURTING????"}]
# Trial run: send the first 17 preprocessed tweets in one request and keep the raw reply.
temp = classify_sentiment(preprocessed_data[:17])
# Bare expression so the notebook displays the returned string.
temp

'{\n    "0": 2,\n    "1": 0,\n    "2": 1,\n    "3": 2,\n    "4": 0,\n    "5": 0,\n    "6": 2,\n    "7": 0,\n    "8": 0,\n    "9": 2,\n    "10": 0,\n    "11": 0,\n    "12": 1,\n    "13": 0,\n    "14": 1,\n    "15": 0,\n    "16": 1\n}'

In [30]:
# Print the raw reply so its line breaks are rendered instead of shown as "\n".
# NOTE(review): the digits in the trailing comment look like hand-written
# reference labels for comparison — confirm what they are and document them.
print(temp) #1012002002 00102

{
    "0": 2,
    "1": 0,
    "2": 1,
    "3": 2,
    "4": 0,
    "5": 0,
    "6": 2,
    "7": 0,
    "8": 0,
    "9": 2,
    "10": 0,
    "11": 0,
    "12": 1,
    "13": 0,
    "14": 1,
    "15": 0,
    "16": 1
}


In [48]:
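# NOTE: disabled experiment with the Completion API, kept for reference only.
# Its label mapping (0 = positive, 2 = negative) is the reverse of the mapping
# used by classify_sentiment (0 = negative, 2 = positive); do not mix their results.
# prompt = f'''Classify the sentiment in these tweets, using 0, 1, 2 to represent positive, neutral and negative respectively. The result must be json format with curly brackets, and property name must be the given index number enclosed in double quotes.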

# {insert_tweets(preprocessed_data[:5])}

# The result should be json format.
# '''

# response = openai.Completion.create(
#       model=model_davinci,
#       prompt=prompt,
#       temperature=0,
#       max_tokens=500
#     )

# results = json.loads(response["choices"][0]["text"])
# results

In [5]:
print(f'The length of data: {len(preprocessed_data)}')

# Maximum number of tweets sent to the model in one request
num_each_time = 40

# Chunk boundaries: every multiple of num_each_time below the data length,
# closed by the data length itself. Each consecutive pair
# (index[k], index[k+1]) is one slice of preprocessed_data.
index = list(range(0, len(preprocessed_data), num_each_time)) + [len(preprocessed_data)]

# full_batches: number of complete chunks; last_num_tweets: size of the
# trailing partial chunk (0 when the data length is an exact multiple).
full_batches, last_num_tweets = divmod(len(preprocessed_data), num_each_time)

# Number of requests needed: one per consecutive pair of boundaries, so an
# exact multiple of num_each_time is not counted as needing an extra chunk.
iteration = len(index) - 1

print(f"Number each time: {num_each_time} \nIteration: {iteration} \nLast iteration's num: {last_num_tweets}")
print(f'{len(preprocessed_data)} = {full_batches} * {num_each_time} + {last_num_tweets}')

print()
print(index)

The length of data: 577
Number each time: 40 
Iteration: 15 
Last iteration's num: 17
577 = 14 * 40 + 17

[0, 40, 80, 120, 160, 200, 240, 280, 320, 360, 400, 440, 480, 520, 560, 577]


In [None]:
# Store the raw results (string format)
# One request per chunk; replies are kept unparsed here and decoded in a later cell.
results = []
for start, stop in zip(index, index[1:]):
    batch = preprocessed_data[start:stop]
    # "\r" keeps the progress report on a single, overwritten line
    print(f"{start} -> {stop}  (Expected: {len(batch)})", end="\r")
    results.append(classify_sentiment(batch))

In [None]:
sentiments = []
for chunk_no, raw in enumerate(results):
    # Convert the raw reply to JSON here rather than in the request loop, so a
    # badly formatted reply cannot interrupt the requests themselves.
    json_res = json.loads(raw)

    # Number of tweets that were sent in this chunk
    expected = index[chunk_no + 1] - index[chunk_no]

    # Read the labels by their index key ("0", "1", ...) instead of trusting the
    # order and count of the returned properties: a missing or reordered key
    # would otherwise silently shift later labels onto the wrong tweets.
    missing = [str(k) for k in range(expected) if str(k) not in json_res]
    if missing:
        raise ValueError(
            f"Chunk {chunk_no} (tweets {index[chunk_no]}-{index[chunk_no + 1] - 1}): "
            f"reply has no label for index {', '.join(missing)}"
        )

    # Retrieve the sentiment values in tweet order
    sentiments.extend(json_res[str(k)] for k in range(expected))

## Store results in file

In [6]:
# Attach each label to its source record as data[i]["sentiment"]["openai"].
# To store the VADER or TextBlob labels instead, iterate over vader_sentiments /
# textblob_sentiments and use the key "vader" / "textblob".
for i, sentiment in enumerate(sentiments):
    # Create the record's "sentiment" dict on first use, then set this labeller's entry
    data[i].setdefault("sentiment", {})["openai"] = sentiment


In [7]:
# Write the labelled records as JSON Lines (one object per line) to a file
# named after the input file with a "labelled_" prefix.
with open(output_path+"labelled_"+input_filename, "w") as f:
    f.writelines(json.dumps(record) + '\n' for record in data)