# DEMO: Sentiment Classification of Twitter Data

In [3]:
import kagglehub
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Relative path to the climate-change Twitter dataset (CSV).
data_path = "twitter_data/the_climate_change_twitter_dataset.csv"

# Read the whole CSV into a DataFrame; later cells refer to it as `df`.
df = pd.read_csv(data_path)

In [5]:
# Preview the first 10 rows to check the columns and value formats.
df.head(10)

Unnamed: 0,created_at,id,lng,lat,topic,sentiment,stance,gender,temperature_avg,aggressiveness
0,2006-06-06 16:06:42+00:00,6132,,,Weather Extremes,-0.09718,neutral,female,,aggressive
1,2006-07-23 21:52:30+00:00,13275,-73.949582,40.650104,Weather Extremes,0.575777,neutral,undefined,-1.114768,aggressive
2,2006-08-29 01:52:30+00:00,23160,,,Weather Extremes,0.500479,neutral,male,,aggressive
3,2006-11-07 02:46:52+00:00,57868,,,Weather Extremes,0.032816,neutral,male,,aggressive
4,2006-11-27 14:27:43+00:00,304553,,,Importance of Human Intervantion,-0.090428,neutral,male,,aggressive
5,2006-11-29 23:21:04+00:00,454763,,,Seriousness of Gas Emissions,-0.283467,neutral,male,,aggressive
6,2006-12-11 22:08:14+00:00,971753,,,Ideological Positions on Global Warming,-0.046626,believer,male,,not aggressive
7,2006-12-14 01:39:10+00:00,1092823,-122.41942,37.77493,Ideological Positions on Global Warming,-0.544195,neutral,male,4.22854,aggressive
8,2006-12-17 19:43:09+00:00,1278023,-79.79198,36.07264,Weather Extremes,-0.565028,denier,male,5.478175,aggressive
9,2006-12-21 01:39:01+00:00,1455543,-121.80579,38.00492,Weather Extremes,0.65096,neutral,male,-1.652156,not aggressive


In [6]:
# Distinct labels found in the `topic` column.
topics = df["topic"].unique()

In [7]:
# Display the distinct topic labels.
topics

array(['Weather Extremes', 'Importance of Human Intervantion',
       'Seriousness of Gas Emissions',
       'Ideological Positions on Global Warming',
       'Impact of Resource Overconsumption', 'Global stance', 'Politics',
       'Significance of Pollution Awareness Events',
       'Donald Trump versus Science', 'Undefined / One Word Hashtags'],
      dtype=object)

In [8]:
# Sentiment distribution per topic
def sentiment_per_topic(df, topic, bins=None):
    """Count the sentiment values of all rows that belong to one topic.

    The ``sentiment`` column holds continuous float scores, so counting the
    raw values gives roughly one entry per distinct score.  Pass ``bins`` to
    group the scores into intervals and get a usable histogram instead.

    Args:
        df: DataFrame with a ``topic`` column and a ``sentiment`` column.
        topic: Topic label to select; rows are matched with ``==``.
        bins: Optional bin specification forwarded to
            ``Series.value_counts(bins=...)`` (an int for equal-width bins or
            a sequence of bin edges).  ``None`` (the default) counts the raw
            sentiment values.

    Returns:
        A Series of counts, indexed by sentiment value (``bins=None``) or by
        sentiment interval (``bins`` given).
    """
    # Select the sentiment scores of the requested topic in a single .loc call.
    topic_scores = df.loc[df["topic"] == topic, "sentiment"]
    return topic_scores.value_counts(bins=bins)

In [9]:
# Print the sentiment value counts of every topic, one section per topic.
for topic in topics:
    sentiment_counts = sentiment_per_topic(df, topic)
    # Header line, then the counts, then one blank line as a section separator.
    print(f"Sentiment counts for topic '{topic}':", sentiment_counts, sep="\n", end="\n\n")

Sentiment counts for topic 'Weather Extremes':
sentiment
 0.024858    64258
-0.666741    63668
-0.620578    63257
 0.715171    28513
 0.335571    27947
             ...  
-0.494849        1
 0.350764        1
-0.300648        1
 0.748248        1
-0.411218        1
Name: count, Length: 1685498, dtype: int64

Sentiment counts for topic 'Importance of Human Intervantion':
sentiment
-0.150846    52600
-0.401700    20522
 0.906971    13825
 0.732620    11027
 0.528607    10975
             ...  
-0.520542        1
 0.384815        1
 0.494044        1
-0.634825        1
 0.136584        1
Name: count, Length: 1392627, dtype: int64

Sentiment counts for topic 'Seriousness of Gas Emissions':
sentiment
 0.483494    34781
-0.341937    11307
-0.586555     8986
-0.312827     5875
-0.103262     4049
             ...  
 0.060408        1
-0.135876        1
 0.496893        1
-0.508622        1
-0.056624        1
Name: count, Length: 578521, dtype: int64

Sentiment counts for topic 'Ideological Pos

In [10]:
# Load the second Twitter dataset (CSV) into its own DataFrame.
df2 = pd.read_csv("twitter_data/twitter_sentiment_data.csv")

In [11]:
# Preview the first 10 rows of the second dataset.
df2.head(10)

Unnamed: 0,sentiment,message,tweetid
0,-1,@tiniebeany climate change is an interesting h...,792927353886371840
1,1,RT @NatGeoChannel: Watch #BeforeTheFlood right...,793124211518832641
2,1,Fabulous! Leonardo #DiCaprio's film on #climat...,793124402388832256
3,1,RT @Mick_Fanning: Just watched this amazing do...,793124635873275904
4,2,"RT @cnalive: Pranita Biswasi, a Lutheran from ...",793125156185137153
5,0,Unamshow awache kujinga na iko global warming ...,793125429418815489
6,2,"RT @cnalive: Pranita Biswasi, a Lutheran from ...",793125430236684289
7,2,RT @CCIRiviera: Presidential Candidate #Donald...,793126558688878592
8,0,RT @AmericanIndian8: Leonardo DiCaprio's clima...,793127097854197761
9,1,#BeforeTheFlood Watch #BeforeTheFlood right he...,793127346106753028


### Drafts of two separate functions for applying sentiment analysis:

In [None]:
def apply_twitter_sentiment_analysis(self, twitter_df, batch_size=1000):
    """Apply sentiment analysis specifically for Twitter data.

    Every value of the ``message`` column is passed to
    ``self.analyze_sentiment``, whose result is read by the keys
    ``compound``, ``pos``, ``neu``, ``neg`` and ``sentiment_category``.
    The frame is modified in place and also returned.

    Args:
        twitter_df: DataFrame with one tweet per row and a ``message`` column.
        batch_size: Number of rows processed per progress-bar step; must be a
            positive integer.

    Returns:
        The same ``twitter_df`` object with the columns
        ``sentiment_compound``, ``sentiment_pos``, ``sentiment_neu``,
        ``sentiment_neg`` and ``sentiment_category`` added (or overwritten).

    Raises:
        ValueError: If ``twitter_df`` has no ``message`` column.
    """
    logger.info(f"Applying sentiment analysis to {len(twitter_df)} Twitter posts...")

    # Ensure the 'message' column exists
    if 'message' not in twitter_df.columns:
        logger.error("Twitter dataset must contain a 'message' column.")
        raise ValueError("Twitter dataset must contain a 'message' column.")

    # Result key -> output column.  The numeric scores get a 'sentiment_'
    # prefix; the category key already carries it and is used unchanged.
    column_for_key = {
        'compound': 'sentiment_compound',
        'pos': 'sentiment_pos',
        'neu': 'sentiment_neu',
        'neg': 'sentiment_neg',
        'sentiment_category': 'sentiment_category',
    }
    collected = {key: [] for key in column_for_key}

    # Process in batches so the progress bar advances once per batch.
    messages = twitter_df['message']
    for start in tqdm(range(0, len(twitter_df), batch_size), desc="Analyzing Twitter sentiment"):
        for text in messages.iloc[start:start + batch_size]:
            result = self.analyze_sentiment(text)
            for key, values in collected.items():
                values.append(result[key])

    # Plain lists are assigned positionally, so the frame's index labels
    # (non-default or duplicated) cannot misalign the results.
    for key, column in column_for_key.items():
        twitter_df[column] = collected[key]

    logger.info("Twitter sentiment analysis complete.")
    return twitter_df

In [None]:
def apply_reddit_sentiment_analysis(self, reddit_df, text_column='cleaned_text', batch_size=1000):
    """Apply sentiment analysis specifically for Reddit data.

    Every value of ``text_column`` is passed to ``self.analyze_sentiment``,
    whose result is read by the keys ``compound``, ``pos``, ``neu``, ``neg``
    and ``sentiment_category``.  The frame is modified in place and also
    returned.

    Args:
        reddit_df: DataFrame with one Reddit item per row.
        text_column: Name of the column holding the text to score.
        batch_size: Number of rows processed per progress-bar step; must be a
            positive integer.

    Returns:
        The same ``reddit_df`` object with the columns
        ``sentiment_compound``, ``sentiment_pos``, ``sentiment_neu``,
        ``sentiment_neg`` and ``sentiment_category`` added (or overwritten).

    Raises:
        ValueError: If ``reddit_df`` has no column named ``text_column``.
    """
    logger.info(f"Applying sentiment analysis to {len(reddit_df)} Reddit items...")

    # Ensure the text column exists
    if text_column not in reddit_df.columns:
        logger.error(f"Reddit dataset must contain a '{text_column}' column.")
        raise ValueError(f"Reddit dataset must contain a '{text_column}' column.")

    # Result key -> output column.  The numeric scores get a 'sentiment_'
    # prefix; the category key already carries it and is used unchanged.
    column_for_key = {
        'compound': 'sentiment_compound',
        'pos': 'sentiment_pos',
        'neu': 'sentiment_neu',
        'neg': 'sentiment_neg',
        'sentiment_category': 'sentiment_category',
    }
    collected = {key: [] for key in column_for_key}

    # Process in batches so the progress bar advances once per batch.
    texts = reddit_df[text_column]
    for start in tqdm(range(0, len(reddit_df), batch_size), desc="Analyzing Reddit sentiment"):
        for text in texts.iloc[start:start + batch_size]:
            result = self.analyze_sentiment(text)
            for key, values in collected.items():
                values.append(result[key])

    # Plain lists are assigned positionally, so the frame's index labels
    # (non-default or duplicated) cannot misalign the results.
    for key, column in column_for_key.items():
        reddit_df[column] = collected[key]

    logger.info("Reddit sentiment analysis complete.")
    return reddit_df

In [None]:
def run_complete_analysis(self, reddit_posts_file=None, reddit_comments_file=None, twitter_file=None):
    """
    Drive the sentiment workflow: load the input files, score each available
    dataset, and merge Reddit posts with comments when both were scored.

    NOTE(review): in this draft `outputs` is created but never filled, and
    the function has no return statement - confirm what it should return.
    """
    outputs = {}

    # Step 1: read whichever input files were supplied.
    logger.info("Loading data files...")
    data = self.load_data(reddit_posts_file, reddit_comments_file, twitter_file)

    def has_data(source):
        # A source counts as available only if its key exists and is not None.
        return source in data and data[source] is not None

    # Step 2: score each available source - Twitter first, then Reddit.
    analyzed_data = {}
    if has_data('twitter'):
        analyzed_data['twitter'] = self.apply_twitter_sentiment_analysis(data['twitter'])

    for reddit_source in ('reddit_posts', 'reddit_comments'):
        if has_data(reddit_source):
            analyzed_data[reddit_source] = self.apply_reddit_sentiment_analysis(data[reddit_source])

    # Step 3: build the combined Reddit view only when both parts were scored.
    if {'reddit_posts', 'reddit_comments'} <= analyzed_data.keys():
        logger.info("Combining Reddit posts and comments for analysis...")
        scored_posts = analyzed_data['reddit_posts']
        scored_comments = analyzed_data['reddit_comments']
        analyzed_data['reddit_combined'] = self.combine_reddit_data(scored_posts, scored_comments)

Run scripts:
```bash
python sentiment_analyzer.py --reddit-posts data/raw/reddit_posts_20250506_180725.csv --reddit-comments data/raw/reddit_comments_20250506_180008.csv  --twitter twitter_data/twitter_sentiment_data.csv

```


Updated command (same as above, but with an absolute path to the Twitter dataset):

```bash
python sentiment_analyzer.py --reddit-posts data/raw/reddit_posts_20250506_180725.csv --reddit-comments data/raw/reddit_comments_20250506_180008.csv  --twitter /Users/blueberry/Library/CloudStorage/OneDrive-UniversitätZürichUZH/Studium/FS25/SoComp/SC_RP/rp_implementation/twitter_data/twitter_sentiment_data.csv
```