# Sentiment Analysis

## 1. Environment Setup

In [None]:
# Install packages
!pip install transformers

In [None]:
# Import Packages
import pandas as pd
from transformers import pipeline
import re
from google.colab import files

## 2. Load Data

In [None]:
# Comments Data

# Read the comments workbook; row 0 of the sheet supplies the column labels
fp = "2.2_comments_data.xlsx"
comments_data = pd.read_excel(io=fp, header=0)

# Display the frame for a quick visual check
comments_data

In [None]:
# Submissions Data

# Read the submissions workbook; row 0 of the sheet supplies the column labels
fp = "2.1_submissions_data.xlsx"
submissions_data = pd.read_excel(io=fp, header=0)

# Display the frame for a quick visual check
submissions_data

## 3. Sentiment Analysis Model Testing

### 3.0. Specify Test User

In [None]:
# Define a test user: the submission author found in the first row
user = comments_data['submission_author'].iloc[0]

# Define test dataset based on the test user:
# keep rows where the test user is the submission author or the parent-comment
# author, and drop rows where the test user is the comment author.
responds_to_user = ((comments_data['submission_author'] == user) |
                    (comments_data['parent_comment_author'] == user))
written_by_user = comments_data['comment_author'] == user

# .copy() makes the test set an independent frame. Later cells add sentiment
# columns to it; doing that on a bare slice of comments_data can trigger
# pandas' SettingWithCopyWarning.
filtered_comments = comments_data[responds_to_user & ~written_by_user].copy()

# View df
filtered_comments

### 3.1. Model 1: cardiffnlp/twitter-roberta-base-sentiment-latest

In [None]:
# Load the sentiment analysis model
sentiment_analysis = pipeline('sentiment-analysis', model='cardiffnlp/twitter-roberta-base-sentiment-latest')

def get_sentiment(text):
    """Run the model once on `text` and return its prediction dict.

    The pipeline output is indexed as result[0]; that dict holds the
    predicted 'label' and its 'score'.
    """
    return sentiment_analysis(text)[0]

# Function to get sentiment label for a text
def get_sentiment_label(text):
    """Return the sentiment label the model predicts for `text`."""
    return get_sentiment(text)['label']

# Function to get sentiment score for a text
def get_sentiment_score(text):
    """Return the score attached to the model's predicted label for `text`."""
    return get_sentiment(text)['score']

# Run inference once per comment and fill both columns from that single
# prediction; applying the label and score helpers separately would push
# every comment through the model twice.
predictions = [get_sentiment(text) for text in filtered_comments['comment_body']]
filtered_comments['sentiment_label1'] = [p['label'] for p in predictions]
filtered_comments['sentiment_score1'] = [p['score'] for p in predictions]

# Print Results
for index, row in filtered_comments.iterrows():
    print("Comment Body:", row['comment_body'])
    print("Sentiment Label 1:", row['sentiment_label1'])
    print("Sentiment Score 1:", row['sentiment_score1'])
    print()

This model works fairly well. The scores vary widely from comment to comment, but the labels appear to be valid.

### 3.2. Model 2: mwkby/distilbert-base-uncased-sentiment-reddit-crypto

In [None]:
# Load the sentiment analysis model
sentiment_analysis = pipeline('sentiment-analysis', model='mwkby/distilbert-base-uncased-sentiment-reddit-crypto')

def get_sentiment(text):
    """Run the model once on `text` and return its prediction dict.

    The pipeline output is indexed as result[0]; that dict holds the
    predicted 'label' and its 'score'.
    """
    return sentiment_analysis(text)[0]

# Function to get sentiment label for a text
def get_sentiment_label(text):
    """Return the sentiment label the model predicts for `text`."""
    return get_sentiment(text)['label']

# Function to get sentiment score for a text
def get_sentiment_score(text):
    """Return the score attached to the model's predicted label for `text`."""
    return get_sentiment(text)['score']

# Run inference once per comment and fill both columns from that single
# prediction; applying the label and score helpers separately would push
# every comment through the model twice.
predictions = [get_sentiment(text) for text in filtered_comments['comment_body']]
filtered_comments['sentiment_label2'] = [p['label'] for p in predictions]
filtered_comments['sentiment_score2'] = [p['score'] for p in predictions]

# Print Results
for index, row in filtered_comments.iterrows():
    print("Comment Body:", row['comment_body'])
    print("Sentiment Label 2:", row['sentiment_label2'])
    print("Sentiment Score 2:", row['sentiment_score2'])
    print()

The scores are uniformly high and show little variability, which is a bad sign. The labels could work, but there are too many positives and no neutrals at all, which is again a bad sign. Overall, this model did not work very well.

### 3.3. Model 3: minh21/XLNet-Reddit-Sentiment-Analysis

In [None]:
# Load the sentiment analysis model
sentiment_analysis = pipeline('sentiment-analysis', model='minh21/XLNet-Reddit-Sentiment-Analysis')

def get_sentiment(text):
    """Run the model once on `text` and return its prediction dict.

    The pipeline output is indexed as result[0]; that dict holds the
    predicted 'label' and its 'score'.
    """
    return sentiment_analysis(text)[0]

# Function to get sentiment label for a text
def get_sentiment_label(text):
    """Return the sentiment label the model predicts for `text`."""
    return get_sentiment(text)['label']

# Function to get sentiment score for a text
def get_sentiment_score(text):
    """Return the score attached to the model's predicted label for `text`."""
    return get_sentiment(text)['score']

# Run inference once per comment and fill both columns from that single
# prediction; applying the label and score helpers separately would push
# every comment through the model twice.
predictions = [get_sentiment(text) for text in filtered_comments['comment_body']]
filtered_comments['sentiment_label3'] = [p['label'] for p in predictions]
filtered_comments['sentiment_score3'] = [p['score'] for p in predictions]

# Print Results
for index, row in filtered_comments.iterrows():
    print("Comment Body:", row['comment_body'])
    print("Sentiment Label 3:", row['sentiment_label3'])
    print("Sentiment Score 3:", row['sentiment_score3'])
    print()

The labels of this model seem promising; however, no reliable documentation defining their meaning could be found. This makes the model rather unreliable.

### 3.4. Model 4: akshataupadhye/finetuning-sentiment-model-reddit-data

In [None]:
# Load the sentiment analysis model
sentiment_analysis = pipeline('sentiment-analysis', model='akshataupadhye/finetuning-sentiment-model-reddit-data')

def get_sentiment(text):
    """Run the model once on `text` and return its prediction dict.

    The pipeline output is indexed as result[0]; that dict holds the
    predicted 'label' and its 'score'.
    """
    return sentiment_analysis(text)[0]

# Function to get sentiment label for a text
def get_sentiment_label(text):
    """Return the sentiment label the model predicts for `text`."""
    return get_sentiment(text)['label']

# Function to get sentiment score for a text
def get_sentiment_score(text):
    """Return the score attached to the model's predicted label for `text`."""
    return get_sentiment(text)['score']

# Run inference once per comment and fill both columns from that single
# prediction; applying the label and score helpers separately would push
# every comment through the model twice.
predictions = [get_sentiment(text) for text in filtered_comments['comment_body']]
filtered_comments['sentiment_label4'] = [p['label'] for p in predictions]
filtered_comments['sentiment_score4'] = [p['score'] for p in predictions]

# Print Results
for index, row in filtered_comments.iterrows():
    print("Comment Body:", row['comment_body'])
    print("Sentiment Label 4:", row['sentiment_label4'])
    print("Sentiment Score 4:", row['sentiment_score4'])
    print()

Same issue as with the previous model: no reliable documentation defining the meaning of the labels could be found.

In sum, the first model seems to be the most reliable, for three reasons.

1. It was trained on the largest amount of data, namely ~124M tweets.

2. It has the most comprehensive documentation.

3. Its labels proved to be the most accurate when validated by human judgement.

## 4. Run the selected model

### 4.1. Comments

In [None]:
# Preprocess text (username and link placeholders)
def preprocess(text):
    """Replace user mentions and links in `text` with fixed placeholders.

    The text is split on single spaces. A token that starts with '@' and is
    longer than one character becomes '@user'; a token that starts with
    'http' becomes 'http'. The tokens are then re-joined with single spaces.

    Returns:
        The rewritten string, or "" when `text` is not a string.
    """
    if not isinstance(text, str):
        return ""

    def mask(token):
        # A lone '@' is left untouched; only '@' followed by more text is masked.
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(mask(token) for token in text.split(" "))

# Store the preprocessed comment text in a new column next to the original
comments_data['comment_body_prep'] = comments_data['comment_body'].map(preprocess)

# Preview the first rows to check the new column
comments_data.head()

In [None]:
# Build the pipeline for the model chosen in section 3
sentiment_analysis = pipeline(
    task='sentiment-analysis',
    model='cardiffnlp/twitter-roberta-base-sentiment-latest',
)

# Function to map sentiment labels to numeric values
def map_sentiment_label(sentiment):
    """Translate a sentiment label into a number.

    'negative' -> -1, 'neutral' -> 0, 'positive' -> 1.
    Any other value yields None.
    """
    label_values = (('negative', -1), ('neutral', 0), ('positive', 1))
    for known_label, value in label_values:
        if sentiment == known_label:
            return value
    return None  # Unknown sentiment label

# Get sentiment label for a text, handling too long comments
def get_sentiment_label(text):
    """Return the numeric sentiment of `text`.

    The text is scored by the module-level `sentiment_analysis` pipeline and
    the predicted label is converted with `map_sentiment_label`.

    Returns:
        -1, 0 or 1 for a negative, neutral or positive prediction, None when
        the model returns a label `map_sentiment_label` does not know, and
        the string "NA" when scoring raises an exception.
    """
    try:
        # Texts longer than the model's 512-token input limit are truncated
        # so they still receive a label instead of failing inside the model.
        # max_length is passed explicitly rather than relying on the
        # tokenizer's configured default.
        result = sentiment_analysis(text, truncation=True, max_length=512)
        return map_sentiment_label(result[0]['label'])
    except Exception:
        # Best-effort fallback: one text that cannot be scored must not abort
        # the whole run. "NA" is the sentinel value written to the column.
        # NOTE(review): preprocess() turns non-string cells into ""; those are
        # still sent to the model and receive a label — confirm that is intended.
        return "NA"

# Score every preprocessed comment and store the result in a new column
comments_data['sentiment_label'] = comments_data['comment_body_prep'].map(get_sentiment_label)

# Preview the first rows, now including the sentiment column
comments_data.head()

### 4.2. Submissions

In [None]:
# Preprocess text (username and link placeholders)
def preprocess(text):
    """Replace user mentions and links in `text` with fixed placeholders.

    The text is split on single spaces. A token that starts with '@' and is
    longer than one character becomes '@user'; a token that starts with
    'http' becomes 'http'. The tokens are then re-joined with single spaces.

    Returns:
        The rewritten string, or "" when `text` is not a string.
    """
    if not isinstance(text, str):
        return ""

    def mask(token):
        # A lone '@' is left untouched; only '@' followed by more text is masked.
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(mask(token) for token in text.split(" "))

# Store the preprocessed submission text in a new column next to the original
submissions_data['body_prep'] = submissions_data['body'].map(preprocess)

# Preview the first rows to check the new column
submissions_data.head()

In [None]:
# Build the pipeline for the model chosen in section 3
sentiment_analysis = pipeline(
    task='sentiment-analysis',
    model='cardiffnlp/twitter-roberta-base-sentiment-latest',
)

# Function to map sentiment labels to numeric values
def map_sentiment_label(sentiment):
    """Translate a sentiment label into a number.

    'negative' -> -1, 'neutral' -> 0, 'positive' -> 1.
    Any other value yields None.
    """
    label_values = (('negative', -1), ('neutral', 0), ('positive', 1))
    for known_label, value in label_values:
        if sentiment == known_label:
            return value
    return None  # Unknown sentiment label

# Get sentiment label for a text, handling too long comments
def get_sentiment_label(text):
    """Return the numeric sentiment of `text`.

    The text is scored by the module-level `sentiment_analysis` pipeline and
    the predicted label is converted with `map_sentiment_label`.

    Returns:
        -1, 0 or 1 for a negative, neutral or positive prediction, None when
        the model returns a label `map_sentiment_label` does not know, and
        the string "NA" when scoring raises an exception.
    """
    try:
        # Texts longer than the model's 512-token input limit are truncated
        # so they still receive a label instead of failing inside the model.
        # max_length is passed explicitly rather than relying on the
        # tokenizer's configured default.
        result = sentiment_analysis(text, truncation=True, max_length=512)
        return map_sentiment_label(result[0]['label'])
    except Exception:
        # Best-effort fallback: one text that cannot be scored must not abort
        # the whole run. "NA" is the sentinel value written to the column.
        # NOTE(review): preprocess() turns non-string cells into ""; those are
        # still sent to the model and receive a label — confirm that is intended.
        return "NA"

# Score every preprocessed submission body and store the result in a new column
submissions_data['sentiment_label'] = submissions_data['body_prep'].map(get_sentiment_label)

# Preview the first rows, now including the sentiment column
submissions_data.head()

## 5. Save & Export

In [None]:
# Comments: write the scored frame to an Excel workbook, leaving out the row index
comments_out = '4.3.1.1_comments_data_sa.xlsx'
comments_data.to_excel(comments_out, index=False)

# Offer the workbook as a browser download (google.colab helper)
files.download(comments_out)

In [None]:
# Submissions: write the scored frame to an Excel workbook, leaving out the row index
submissions_out = '4.3.1.2_submissions_data_sa.xlsx'
submissions_data.to_excel(submissions_out, index=False)

# Offer the workbook as a browser download (google.colab helper)
files.download(submissions_out)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>