In [None]:
import re
import csv
import nltk
import numpy as np
import pandas as panda
from bs4 import BeautifulSoup
from mastodon import Mastodon
from dotenv import dotenv_values
from nltk.sentiment import SentimentIntensityAnalyzer

# The VADER analyzer relies on the 'vader_lexicon' resource; fetch it first
nltk.download('vader_lexicon')

# Settings (including the API token) come from dotenv rather than the notebook
env = dotenv_values()

# Authenticated client for the mastodon.uno instance
mastodon = Mastodon(
    api_base_url='https://mastodon.uno',
    access_token=env['ACCESS_TOKEN'],
)

# One analyzer instance, reused for every post
analyzer = SentimentIntensityAnalyzer()

# Accumulates one [content, sentiment] row per post for the CSV export
csv_data = []

# Pagination cursor; stays None until the first page has been processed
max_id = None

def _html_to_text(html):
    """Return the visible text of a status' HTML body.

    A plain ``get_text()`` joins neighbouring text nodes with no separator, so
    the last word of one paragraph and the first word of the next are glued
    together. A space is inserted at every ``<br>`` and at the end of every
    ``<p>`` before extracting the text. Text split across inline tags (for
    example the pieces of a link) is still joined without a separator, so the
    URL-removal regex keeps matching whole URLs.
    """
    soup = BeautifulSoup(html, 'html.parser')
    for line_break in soup.find_all('br'):
        line_break.replace_with(' ')
    for paragraph in soup.find_all('p'):
        paragraph.append(' ')
    return soup.get_text()


def _label_sentiment(compound):
    """Map a compound score to 'Positive', 'Negative' or 'Neutral'.

    Scores >= 0.05 are positive, scores <= -0.05 are negative and everything
    in between is neutral.
    """
    if compound >= 0.05:
        return 'Positive'
    if compound <= -0.05:
        return 'Negative'
    return 'Neutral'


# Walk the home timeline page by page until the server returns an empty page.
while True:
    # Get the home timeline posts with pagination
    timeline = mastodon.timeline_home(limit=40, max_id=max_id)

    if not timeline:
        # Break the loop if no more posts are returned
        break

    for status in timeline:
        # Strip the HTML markup while keeping paragraph/line-break boundaries
        filtered_content = _html_to_text(status['content'])

        # Remove URLs from filtered_content
        filtered_content = re.sub(r'http\S+|www\S+', '', filtered_content)

        # Remove only the '#' sign (the hashtag word itself is kept), lowercase
        filtered_content = filtered_content.replace('#', '').lower()

        # Collapse whitespace runs left behind by removed tags and URLs
        filtered_content = ' '.join(filtered_content.split())

        # Perform sentiment analysis on the filtered content
        sentiment_score = analyzer.polarity_scores(filtered_content)['compound']
        sentiment_label = _label_sentiment(sentiment_score)

        # Add the data to csv_data as a list
        csv_data.append([filtered_content, sentiment_label])

        # Print the post information and sentiment analysis result
        print(f"Content: {filtered_content}")
        print(f"Sentiment: {sentiment_label} ({sentiment_score})")
        print("---")

    # Continue after the last status of this page on the next request
    max_id = timeline[-1]['id']

# Generate csv
# Rows are appended so that repeated runs keep growing the same dataset.
with open('dataset.csv', 'a', newline='', encoding="utf-8") as file:
    writer = csv.writer(file)
    # In append mode the stream is positioned at the end of the file, so a
    # position of 0 means the file is new or empty and still needs its header.
    # Writing the header unconditionally would insert a 'Content,Sentiment'
    # line into the middle of the data on every re-run.
    if file.tell() == 0:
        writer.writerow(['Content', 'Sentiment'])  # Write the column headers
    writer.writerows(csv_data)

In [None]:
# Display dataset
# dataset.csv is written as UTF-8, so it has to be decoded as UTF-8 as well.
# 'unicode_escape' would garble non-ASCII characters and interpret any
# backslash in a post as the start of an escape sequence.
dataset = panda.read_csv("dataset.csv", encoding='utf-8')
dataset

In [69]:
# Clean dataset

# Keep only the text (object-dtype) columns, replace every character that is
# not an ASCII letter with a space, then trim the spaces this leaves at both
# ends of each value. Values are coerced to str first, so a missing value
# becomes the literal string 'nan' at this stage.
# Vectorised .str methods are used instead of the deprecated DataFrame.applymap.
text_data = dataset.select_dtypes(include=['object']).apply(
    lambda column: column.astype(str)
    .str.replace(r'[^a-zA-Z]', ' ', regex=True)
    .str.strip())

# Drop rows without usable text: former missing values (now 'nan') and values
# that are empty after cleaning (e.g. posts made only of emoji, digits or
# links). Empty strings would otherwise come back as NaN on the re-read below.
text_data = text_data.replace(['nan', ''], np.nan).dropna()

# Overwrite dataset.csv with the cleaned rows (index column not written)
text_data.to_csv("dataset.csv", index=False)

# Reload the cleaned file; to_csv writes UTF-8 by default, so read it as UTF-8
dataset = panda.read_csv("dataset.csv", encoding='utf-8')
dataset

Unnamed: 0,Content,Sentiment
0,in bulgaria a demon possessed priest cursed ...,Neutral
1,volodymyr zelenskyi arrived in the czech repub...,Neutral
2,destroyed bts of the armed forces of ukraine...,Negative
3,there is no lack of threads threadsapp in th...,Positive
4,five enemy ammunition depots destroyed in tavr...,Negative
