# Loading libraries, the data frame and VADER, and testing it

In [None]:
# ssl and certifi may not be needed on your machine, but they are imported anyway (they are used for the SSL set-up before the VADER download below)

import ssl
import nltk
import certifi
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import re
import numpy as np

In [None]:
# Load the conversations table (this notebook works on the mock CSV file).

mock_csv_path = "mock_df_conversation.csv"
conversations = pd.read_csv(mock_csv_path)

# Show the first ten rows as the cell output.
conversations.head(n=10)

In [None]:
# The SSL set-up below was needed on the author's machine before the VADER lexicon
# could be downloaded; you may only need the download and the analyser creation.

# One SSL context that trusts the CA bundle shipped with certifi.
ssl_context = ssl.create_default_context(cafile=certifi.where())


def _certifi_https_context():
    """Return the shared certifi-backed SSL context created above."""
    return ssl_context


# Replace ssl's default-HTTPS-context factory with the function above.
ssl._create_default_https_context = _certifi_https_context

# Fetch the VADER lexicon through NLTK.
nltk.download('vader_lexicon')

# Analyser instance used by the cells further down.
analyser = SentimentIntensityAnalyzer()

In [None]:
# Demo: VADER scores for the same sentence with and without an exclamation mark.

text1 = "I love Python!"
text2 = "I love Python"

# Score both samples in order; `scores` belongs to text1 and `scores1` to text2.
scores, scores1 = (analyser.polarity_scores(sample) for sample in (text1, text2))

# One score dictionary per line.
print(scores, scores1, sep="\n")

In [None]:
# WORD CLOUD (optional)
# NOTE: the original author reported that this cell did not work on their machine;
# feel free to discard it and build the word cloud differently.

# Join every tweet into one string. Missing values are dropped and the remaining
# entries are cast to str, because str.join() raises TypeError on a NaN/float item.
data = " ".join(conversations['text'].dropna().astype(str))

# Font files that can be used, depending on the operating system:
#   Windows: "C:\\Windows\\Fonts\\Arial.ttf"
#   macOS:   "/Library/Fonts/Arial.ttf"
#   Linux:   "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"
# NOTE(review): font_path is defined here but is NOT passed to WordCloud below, so the
# library's default font is used. Pass font_path=font_path only after checking that the
# file exists on your machine.
font_path = "/System/Library/Fonts/Supplemental/Arial Unicode.ttf"  # machine/OS specific

# Build the word cloud image from the combined text.
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(data)

# Render the image without axes.
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
plt.show()

# Data preprocessing and sentiment analysis

In [None]:
# OPTIONAL / still under discussion: this is only a suggestion and the team has not
# decided whether the step should be kept.
# Removes one leading @mention per tweet: when the text starts with '@', the handle
# and the whitespace that follows it are cut off and the rest of the text is kept.

# Read the raw text from 'text_' when that column exists; otherwise fall back to the
# current 'text' column so the cell can be re-run after 'text_' has been dropped.
mention_source = 'text_' if 'text_' in conversations.columns else 'text'

# Vectorised replacement instead of a positional loop:
#  - it does not rely on the index being 0..n-1 (the loop read with .iloc[i] but
#    wrote with .loc[i, ...]),
#  - a tweet that is only "@handle" becomes "" instead of raising ValueError,
#  - missing values stay missing instead of failing on .startswith().
conversations['text'] = conversations[mention_source].str.replace(r"^@\S*\s*", "", regex=True)

# 'text_' is no longer needed; errors='ignore' keeps the cell idempotent.
conversations = conversations.drop(columns=['text_'], errors='ignore')

conversations.head()

In [None]:
# Data Cleaning (only remove URLs)
# Overrides the existing column "cleaned_text" 

# Function to clean tweet text without removing special characters
def clean_tweet_text(text):
    """Remove URLs from a tweet and leave every other character untouched.

    A URL is either
      * ``http://`` or ``https://`` followed by non-whitespace characters, or
      * a token that starts with ``www.`` followed by non-whitespace characters.

    Requiring ``://`` or a ``www.`` token start keeps ordinary words that merely
    contain "http" or "www" (for example "httpd" or "awwww") intact.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The text with every URL replaced by the empty string. Surrounding
        whitespace is not collapsed.
    """
    return re.sub(r"https?://\S+|\bwww\.\S+", '', text)

# Store the URL-free version of every tweet in the 'cleaned_text' column.
conversations['cleaned_text'] = conversations['text'].map(clean_tweet_text)

In [None]:
# Sentiment analysis (only on English tweets; other languages get a NaN value)
# Creating the 'sentiment_score' column (floats: VADER's compound score, which ranges from -1 to 1)

# Function to get sentiment score
def get_sentiment_score(text, lang):
    """Return VADER's compound score for an English tweet, NaN otherwise.

    Parameters
    ----------
    text : str
        Tweet text to score.
    lang : str
        Language code of the tweet; only the exact value 'en' is scored.

    Returns
    -------
    float
        The 'compound' entry of ``analyser.polarity_scores(text)`` when
        ``lang`` is 'en', else ``np.nan``.
    """
    # Anything that is not tagged as English is left unscored.
    if lang != 'en':
        return np.nan
    return analyser.polarity_scores(text)['compound']

# Score every tweet; the 'lang' value decides whether VADER is applied at all.
# The two columns are walked in parallel instead of using DataFrame.apply(axis=1),
# which builds a Series object for every row.
conversations['sentiment_score'] = [
    get_sentiment_score(tweet_text, tweet_lang)
    for tweet_text, tweet_lang in zip(conversations['cleaned_text'], conversations['lang'])
]

In [None]:
# Creates the "sentiment" column (string format) 
# Other languages have a NaN value

# Function to categorize sentiment based on score
def categorize_sentiment(score):
    """Turn a compound sentiment score into a label.

    Parameters
    ----------
    score : float
        Compound score, or NaN/None when the tweet was not scored.

    Returns
    -------
    str or float
        "Positive" for scores above 0.05, "Negative" for scores below -0.05,
        "Neutral" for everything in between (both bounds included), and
        ``np.nan`` when ``score`` is missing.
    """
    # NaN compares False against both thresholds, so without this guard an
    # unscored tweet would fall through to "Neutral".
    if pd.isna(score):
        return np.nan
    if score > 0.05:
        return "Positive"
    if score < -0.05:
        return "Negative"
    return "Neutral"

# Derive the text label of every tweet from its numeric score.
conversations['sentiment'] = conversations['sentiment_score'].map(categorize_sentiment)

In [None]:
# Preview the first ten rows, including the columns added by the cells above.
conversations.head(10)

# Small exploration of sentiment analysis

In [None]:
# Bar chart: number of tweets per sentiment label.
# value_counts() skips missing labels, so tweets without a label are not counted.
# NOTE(review): if non-English tweets were labelled "Neutral" upstream instead of being
# left missing, the "Neutral" bar is inflated; confirm before interpreting the chart.

# Count the tweets per label (most frequent label first).
sentiment_counts = conversations['sentiment'].value_counts()

# Tie every colour to its label. Because the bars are ordered by frequency, a
# positional colour list would change the colour of a label from one dataset to the next.
sentiment_colors = {'Negative': 'red', 'Positive': 'green', 'Neutral': 'blue'}
bar_colors = [sentiment_colors.get(label, 'gray') for label in sentiment_counts.index]

fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(sentiment_counts.index, sentiment_counts.values, color=bar_colors)
ax.set_xlabel('Sentiment')
ax.set_ylabel('Number of Tweets')
ax.set_title('Sentiment Analysis of Tweets')
plt.show()

In [None]:
# Some statistics

# Work on the score column once instead of indexing the frame three times.
sentiment_scores = conversations['sentiment_score']

# Mean compound score.
average_sentiment_score = sentiment_scores.mean()
print(f"Average Sentiment Score: {average_sentiment_score}")

# Tweets beyond the +/-0.05 thresholds.
positive_count = sentiment_scores.gt(0.05).sum()
negative_count = sentiment_scores.lt(-0.05).sum()

print(f"Number of Positive Scores: {positive_count}")
print(f"Number of Negative Scores: {negative_count}")

In [None]:
# Sentiment score over time.
# NOTE: possibly of little use on the mock data; can be changed once the whole
# dataset is available.

# The CSV is loaded without date parsing, so 'created_at_datetime' is converted
# explicitly and the rows are put in chronological order. Otherwise matplotlib would
# place the raw strings on the x axis in row order. Unparseable dates are dropped.
timeline = (
    conversations[['created_at_datetime', 'sentiment_score']]
    .assign(
        created_at_datetime=lambda frame: pd.to_datetime(
            frame['created_at_datetime'], errors='coerce'
        )
    )
    .dropna(subset=['created_at_datetime'])
    .sort_values('created_at_datetime')
)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    timeline['created_at_datetime'],
    timeline['sentiment_score'],
    marker='o',
    linestyle='-',
    color='b',
)
ax.set_xlabel('Date')
ax.set_ylabel('Sentiment Score')
ax.set_title('Change in Sentiment Score Over Time')
ax.grid(True)
plt.show()