# Sentiment Annotator

This notebook annotates the sentiment of tweets using TextBlob, VADER and the OpenAI API.


### Setup

In [20]:
import os, json, re, contractions
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import numpy as np
from dotenv import load_dotenv
import openai

# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

# The API key comes from the environment; os.getenv returns None when it is unset.
openai.api_key = os.getenv("OPENAI_API_KEY")

# OpenAI model identifiers referenced by the annotation cells.
model_chat = "gpt-3.5-turbo"    # Model that powers ChatGPT
model_davinci = "text-davinci-003"  # Best model
model_curie = "text-curie-001"  # Second-best, but faster (may be suitable for sentiment classification)

# Directory prefixes; file names are appended to them by string concatenation.
input_path = "../data/samples/"
output_path = "../data/labelled/"

### Preprocessing

In [21]:
# Register the slang dictionary with the `contractions` library
slang_path = "../data/slang.json"
with open(slang_path, "r") as slang_file:
    slang_dicts = json.load(slang_file)

# Each key is a slang term, each value the text it should be replaced with.
for slang, expansion in slang_dicts.items():
    contractions.add(slang, expansion)

def preprocess(row):
    """Clean one tweet record before sentiment annotation.

    Steps, in order:
      1. Drop anonymised user mentions (``USERNAME_<n>``) together with the
         whitespace that follows them.
      2. Collapse every run of line breaks (and the whitespace around it) into
         a single space, so the words on either side of a line break remain
         separate tokens instead of being glued together; leading and trailing
         whitespace is then stripped.
      3. Replace anonymised links (``URL_<n>``) with the placeholder ``URL``.
      4. Expand contractions (and registered slang) with ``contractions.fix``.

    Args:
        row: Mapping with a ``"content"`` key holding the tweet text.

    Returns:
        A new dict ``{"content": <cleaned text>}``; ``row`` is left untouched.

    Raises:
        KeyError: If ``row`` has no ``"content"`` key.
    """
    content = row["content"]
    # Remove aliases of usernames
    content = re.sub(r"USERNAME_\d*\s*", "", content)
    # Line breaks become one space (deleting them outright merged adjacent words)
    content = re.sub(r"\s*\n+\s*", " ", content).strip()
    # Normalise aliases of URLs to a single placeholder token
    content = re.sub(r"URL_\d*", "URL", content)
    # Convert contractions to full form
    content = contractions.fix(content)
    return {'content': content}

def insert_tweets(tweets_list):
    """Render tweets as a numbered block of text for the model prompt.

    Each tweet contributes its zero-based position followed by a dot on one
    line and its ``content`` on the next, each terminated by a newline.
    An empty list produces an empty string.
    """
    return "".join(
        f"{position}.\n{entry['content']}\n"
        for position, entry in enumerate(tweets_list)
    )


In [22]:
input_filename = "samples_230227-230308_#577.json"

# The file holds one JSON document per line.
# To read from the samples directory instead, open input_path+input_filename.
with open("/home/p11333at/nlp-project/data/golden/positive_230227-230314_#153.json", "r") as source:
    data = [json.loads(record) for record in source]

preprocessed_data = list(map(preprocess, data))

## Annotate sentiment

#### Using `TextBlob`

In [23]:
# TextBlob

def get_textblob_sentiment(tweet):
    """Map the TextBlob polarity of ``tweet["content"]`` to a class label.

    Returns 2 for a polarity above zero (positive), 0 for a polarity below
    zero (negative) and 1 otherwise (neutral).
    """
    polarity = TextBlob(tweet["content"]).sentiment.polarity
    if polarity > 0:
        return 2    # Positive
    if polarity < 0:
        return 0    # Negative
    return 1        # Neutral

textblob_sentiments = list(map(get_textblob_sentiment, preprocessed_data))

#### Using `VADER`

In [9]:
# VADER

def _shared_vader_analyzer():
    """Return a single SentimentIntensityAnalyzer, created on first use and then reused."""
    analyzer = getattr(_shared_vader_analyzer, "_instance", None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        _shared_vader_analyzer._instance = analyzer
    return analyzer


def get_vader_sentiment(tweet, analyzer=None):
    """Map the VADER compound score of ``tweet['content']`` to a class label.

    Args:
        tweet: Mapping with a ``'content'`` key holding the text to score.
        analyzer: Optional object exposing ``polarity_scores(text)``. When
            omitted, one shared ``SentimentIntensityAnalyzer`` is reused for
            every call instead of constructing a new analyzer per tweet.

    Returns:
        2 (positive) when the compound score is >= 0.05, 0 (negative) when it
        is <= -0.05, and 1 (neutral) otherwise.
    """
    if analyzer is None:
        analyzer = _shared_vader_analyzer()

    compound = analyzer.polarity_scores(tweet['content'])['compound']

    # decide sentiment as positive, negative and neutral
    if compound >= 0.05:
        return 2    # Positive
    if compound <= -0.05:
        return 0    # Negative
    return 1        # Neutral

vader_sentiments = list(map(get_vader_sentiment, preprocessed_data))

### Using `OpenAI`

This is a little more complicated than the previous two methods.

#### `GPT-3.5 (ChatGPT)` model

In [4]:
# OpenAI GPT-3.5 (ChatGPT) model

def classify_sentiment(tweets_list):
    """Ask the chat model to label a batch of tweets and return its raw reply.

    The tweets are numbered with ``insert_tweets`` and sent together with a
    system instruction that requests a JSON object keyed by tweet number.
    The reply text of the first choice is returned as-is (a string); decoding
    it as JSON is left to the caller.
    """
    system_prompt = (
        "You are an assistant that classify the sentiment in tweets, "
        "using 0, 1, 2 to represent negative, neutral and positive respectively. "
        "The result must be json format with curly brackets, "
        "and property name must be the given index number enclosed in double quotes."
    )
    user_prompt = f'Classify the sentiment of the following tweets: "{insert_tweets(tweets_list)}"'

    response = openai.ChatCompletion.create(
        model=model_chat,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0,
    )

    first_choice = response['choices'][0]
    return first_choice['message']['content']

#### `Davinci` model

In [48]:
# OpenAI Davinci model

# prompt = f'''Classify the sentiment in these tweets, using 0, 1, 2 to represent negative, neutral and positive respectively. The result must be json format with curly brackets, and property name must be the given index number enclosed in double quotes.

# {insert_tweets(preprocessed_data[:5])}

# The result should be json format.
# '''

# response = openai.Completion.create(
#       model=model_davinci,
#       prompt=prompt,
#       temperature=0,
#       max_tokens=500
#     )

# results = json.loads(response["choices"][0]["text"])
# results

In [5]:
# Compute the slice boundaries used to send the tweets to the API in batches.

total_tweets = len(preprocessed_data)
print(f'The length of data: {total_tweets}')

num_each_time = 40

# Number of full batches, plus one shorter batch only when there is a remainder.
# (Always adding 1 over-reported the iteration count for exact multiples.)
full_batches, last_num_tweets = divmod(total_tweets, num_each_time)
iteration = full_batches + (1 if last_num_tweets else 0)
print(f"Number each time: {num_each_time} \nIteration: {iteration} \nLast iteration's num: {last_num_tweets}")
print(f'{total_tweets} = {full_batches} * {num_each_time} + {last_num_tweets}')

# Batch k covers preprocessed_data[index[k]:index[k+1]]; the final boundary is
# always the total length, so an exact multiple does not create an empty batch.
index = [*range(0, total_tweets, num_each_time), total_tweets]

print()
print(index)

The length of data: 306
Number each time: 40 
Iteration: 8 
Last iteration's num: 26
306 = 7 * 40 + 26

[0, 40, 80, 120, 160, 200, 240, 280, 306]


In [6]:
# Collect the raw reply (string format) for every batch of tweets

results = []
for start, stop in zip(index, index[1:]):
    batch = preprocessed_data[start:stop]
    # Progress line, overwritten in place by the carriage return.
    print(f"{start} -> {stop}  (Expected: {len(batch)})", end="\r")
    results.append(classify_sentiment(batch))

280 -> 306  (Expected: 26)

In [7]:
# Convert the raw results to JSON.
# The conversion is done in this separate step so that a reply in the wrong
# format from the OpenAI API (sometimes happens) can be dealt with here.

openai_sentiments = []
for i, raw_reply in enumerate(results):
    # json.loads raises json.JSONDecodeError if the reply is not valid JSON.
    json_res = json.loads(raw_reply)

    # Each batch must yield exactly one label per tweet; otherwise every later
    # label would silently be attached to the wrong tweet.
    expected = index[i + 1] - index[i]
    if len(json_res) != expected:
        raise ValueError(
            f"Batch {i} (tweets {index[i]} -> {index[i + 1]}): "
            f"expected {expected} labels, got {len(json_res)}"
        )

    # Retrieve the sentiment values ordered by tweet number, not by reply order.
    openai_sentiments.extend(json_res[key] for key in sorted(json_res, key=int))

## Store results in file

As the three models may not be run at the same time, the code stores the results of one model at a time.

This needs to be set up manually. (Sorry for the inconvenience)

In [24]:
# Pick which annotator's labels to store. When switching, change BOTH the
# source list here and the key written inside the loop.
sentiments = openai_sentiments
# sentiments = vader_sentiments
# sentiments = textblob_sentiments
for position, label in enumerate(sentiments):
    # Create the per-tweet "sentiment" dict on first use, then record the label.
    annotations = data[position].setdefault("sentiment", {})
    annotations["openai"] = label
    # annotations["vader"] = label
    # annotations["textblob"] = label


In [26]:
# Reorder the data (Unnecessary)
# Rebuilds each record's "sentiment" dict with a fixed key order.
for line in data:
    scores = line['sentiment']
    line['sentiment'] = {
        "openai": scores['openai'],
        "vader": scores['vader'],
        "textblob": scores['textblob'],
        "human": scores['human'],
    }

#### Write into file

In [27]:
# Alternative target: output_path+"labelled_"+input_filename
with open("/home/p11333at/nlp-project/data/golden/new.json", "w") as sink:
    # One JSON document per line.
    sink.writelines(json.dumps(record) + '\n' for record in data)

THE END

## *(Please ignore the following part)*

Because the data annotation platform I'm using changes the JSON file structure, the cells below are used to re-format the JSON file after the sentiment has been annotated manually.

### Manipulate data & file 

In [3]:
labelled_samples_path = "/home/p11333at/nlp-project/data/golden/golden_230227-230308_#577.json"

# Each line of the file is one JSON document.
with open(labelled_samples_path, 'r') as source:
    data = [json.loads(record) for record in source]

In [None]:
# Alternative that keeps only the rows carrying a human label:
# sentiment = [line['sentiment'] for line in data if 'human' in line['sentiment'].keys()]
sentiment = [line['sentiment'] for line in data]

# One label array per annotator, in the same order as `data`.
# NOTE(review): `openai` here rebinds the name of the imported `openai` module,
# so the import cell must be re-run before calling the API again.
openai, vader, textblob, human = (
    np.array([entry[source] for entry in sentiment])
    for source in ('openai', 'vader', 'textblob', 'human')
)

In [14]:
def count_labels(labels):
    """Count how many labels are negative (0), neutral (1) and positive (2).

    Args:
        labels: Array-like of labels; plain lists and tuples are accepted as
            well as NumPy arrays.

    Returns:
        Tuple ``(neg, neu, pos)`` holding the number of 0s, 1s and 2s.
    """
    # Convert first: comparing a plain list with an int gives a single False
    # instead of an element-wise mask, which made every count come out as 0.
    labels = np.asarray(labels)
    return tuple(np.count_nonzero(labels == code) for code in (0, 1, 2))

def calculate_accuracy(pred, true):
    """Return the fraction of positions at which ``pred`` equals ``true``.

    Args:
        pred: Array-like of predicted labels (lists are accepted).
        true: Array-like of reference labels with the same shape as ``pred``.

    Returns:
        Number of matching positions divided by ``len(true)``.

    Raises:
        ValueError: If ``pred`` and ``true`` have different shapes.
        ZeroDivisionError: If the inputs are empty.
    """
    # Convert first: two plain lists compare as whole objects (one bool), not
    # element by element, which produced a wrong fraction for list inputs.
    pred, true = np.asarray(pred), np.asarray(true)
    if pred.shape != true.shape:
        raise ValueError(
            f"pred and true must have the same shape, got {pred.shape} and {true.shape}"
        )
    return np.count_nonzero(pred == true) / len(true)

# Label distribution per annotator, printed as (negative, neutral, positive) counts.
print("openai", count_labels(openai))
print("vader", count_labels(vader))
print("textblob", count_labels(textblob))
print("human", count_labels(human))

print()

# Share of tweets on which each automatic annotator matches the human label.
acc_openai = calculate_accuracy(openai, human)
acc_vader = calculate_accuracy(vader, human)
acc_textblob = calculate_accuracy(textblob, human)
print("openai", acc_openai)
print("vader", acc_vader)
print("textblob", acc_textblob)

openai (961, 361, 444)
vader (886, 279, 601)



In [15]:
# Agreement rate between the OpenAI and VADER labels (neither is a ground truth here).
calculate_accuracy(openai, vader)

0.7644394110985278

In [16]:
# Positions labelled positive (2) by both OpenAI and VADER.
openai_positive = np.nonzero(openai == 2)[0]
vader_positive = np.nonzero(vader == 2)[0]

double = np.intersect1d(openai_positive, vader_positive)
len(double)

383

In [17]:
# Positions labelled positive (2) by OpenAI but neutral (1) by VADER.
openai_positive = np.nonzero(openai == 2)[0]
vader_neutral = np.nonzero(vader == 1)[0]

single_o = np.intersect1d(openai_positive, vader_neutral)
len(single_o)

11

In [18]:
# Positions labelled neutral (1) by OpenAI but positive (2) by VADER.
openai_neutral = np.nonzero(openai == 1)[0]
vader_positive = np.nonzero(vader == 2)[0]

single_v = np.intersect1d(openai_neutral, vader_positive)
len(single_v)

90

In [25]:
# Alternative target: output_path+"DOUBLE_positive_230227-230314_#383.json"
with open(output_path+"SINGLE_positive_230227-230314_#101.json", "w") as sink:
    # Records selected by single_o first, then those selected by single_v,
    # one JSON document per line.
    sink.writelines(json.dumps(data[position]) + '\n' for position in single_o)
    sink.writelines(json.dumps(data[position]) + '\n' for position in single_v)

### Convert annotated data to formal format

In [15]:
# Previously used source:
# labelled_samples_path = "/home/p11333at/nlp-project/data/golden/golden_230227-230308_#577.json"
labelled_samples_path = "/home/p11333at/nlp-project/data/labelled/all.json"

# Each line of the file is one JSON document.
with open(labelled_samples_path, 'r') as source:
    data = [json.loads(record) for record in source]

In [17]:
# Convert the annotation platform's export into the project's record format,
# keeping only the records whose human label is 'Positive'.
new_data = []

for i, line in enumerate(data):
    creation_date = line['creation_date']
    content = line['text']
    sentiment = line['sentiment']
    try:
        label = line['label'][0]
    except (KeyError, IndexError, TypeError):
        # Missing or empty label: report the offending record and stop so it
        # can be fixed. Only lookup failures are caught, so that e.g. a
        # keyboard interrupt is no longer swallowed by this handler.
        print("[index] :", i)
        print(content)
        break

    # Only 'Positive' is converted; records with any other label are skipped.
    # (The full mapping would be Negative -> 0, Neutral -> 1, Positive -> 2.)
    if label != 'Positive':
        continue
    label = 2

    # Record the human label next to the automatic ones (updates the dict in place).
    sentiment.update({"human": label})

    new_data.append({
        "creation_date": creation_date,
        "content": content,
        "sentiment": sentiment,
    })


# with open(labelled_samples_path, 'r') as f:
#     data = f.readlines()
#     data = [json.loads(line) for line in data]

In [19]:
# Alternative target: labelled_samples_path (overwrites the input file)
with open("/home/p11333at/nlp-project/data/golden/positive_230227-230314_#156.json", 'w') as sink:
    # One JSON document per line.
    sink.writelines(json.dumps(record) + '\n' for record in new_data)