In [None]:
# Import necessary libraries
import pandas
import praw
from dotenv import dotenv_values, load_dotenv
import os
import nltk
from nltk import word_tokenize
import re
import numpy as np
from bs4 import BeautifulSoup

#nltk.download('punkt')
#nltk.download('stopwords')


# Read the Reddit API settings from the local .env file into a mapping.
env = dotenv_values(".env")

# Each PRAW keyword argument is the lower-cased name of the .env key that
# holds its value; the keys are looked up in the order listed here.
_credential_keys = (
    "CLIENT_ID",
    "CLIENT_SECRET",
    "USER_AGENT",
    "REDIRECT_URI",
    "REFRESH_TOKEN",
)

# Build the authenticated Reddit client used by the scraping code.
reddit = praw.Reddit(**{key.lower(): env[key] for key in _credential_keys})

# Continue from the previously saved posts when the CSV is present; otherwise
# start with an empty frame that already has the expected columns.
csv_file_name = "reddit_posts_with_comments.csv"
if not os.path.exists(csv_file_name):
    print("CSV file does not exist. It will be created after fetching new data.")
    df = pandas.DataFrame(columns=["Title", "Id", "Upvotes", "Comments"])
else:
    print("CSV file already exists. Appending new data and avoiding duplicates.")
    # Load the rows written by an earlier run
    df = pandas.read_csv(csv_file_name)

# Subreddits to scrape and the number of "hot" posts fetched from each one
targetObjects = ['conspiracy',
                 'WhitePeopleTwitter', 'politics', 'Republican', 'worldnews']
posts_per_subreddit = 5

# Compiled once: matches http(s) and www links inside comment text
url_pattern = re.compile(r'http\S+|www\S+')

for subreddit_name in targetObjects:
    subreddit = reddit.subreddit(subreddit_name)

    # Print subreddit name
    print(subreddit.display_name)

    # Submission IDs already stored, so known posts are not fetched again
    known_ids = set(df["Id"])

    # Lists to store information about this subreddit's new submissions
    titles = []
    scores = []
    ids = []
    comments = []

    # Loop through the current "hot" submissions of the subreddit
    for iteration, submission in enumerate(
            subreddit.hot(limit=posts_per_subreddit), start=1):
        print(f"post {iteration}/{posts_per_subreddit}")

        # Skip submissions that are already in the DataFrame
        if submission.id in known_ids:
            continue

        titles.append(submission.title)
        scores.append(submission.score)  # Upvotes
        ids.append(submission.id)

        # Fetch comments for the current submission
        submission.comments.replace_more(limit=25)
        submission_comments = []
        for comment in submission.comments.list():
            # Filter on the author's username. The previous check looked at
            # comment.name, which is the comment's fullname ("t1_<id>") and
            # not the author. author may be None (e.g. deleted accounts).
            author = comment.author
            author_name = author.name if author is not None else ""
            if 'bot' in author_name.lower():
                continue

            # Use BeautifulSoup to remove HTML tags from content
            filtered_content = BeautifulSoup(comment.body, 'lxml').get_text()

            # Remove URLs, then '#' characters, and lower-case the text
            filtered_content = url_pattern.sub('', filtered_content)
            filtered_content = filtered_content.replace('#', '').lower()
            submission_comments.append(filtered_content)
        comments.append(submission_comments)

    # Append this subreddit's new posts once, instead of rebuilding and
    # re-concatenating the cumulative lists after every single post.
    if ids:
        new_data = pandas.DataFrame(
            {"Title": titles, "Id": ids, "Upvotes": scores, "Comments": comments}
        )
        df = pandas.concat([df, new_data], ignore_index=True)

# Drop duplicates based on the 'Id' column (submission IDs), keeping the most
# recently added row, then save the DataFrame to the CSV file.
df.drop_duplicates(subset="Id", keep="last", inplace=True)
df.to_csv(csv_file_name, index=False)

# Print the shape of the DataFrame and display the first 10 rows
print(df.shape)
print(df.head(10))

print(f"CSV file '{csv_file_name}' has been generated/updated with the new Reddit posts and comments while avoiding duplicates.")



In [None]:
import pandas as pd
import re
import numpy as np
import nltk
from nltk import word_tokenize

# File names used by this cell: the posts CSV that is read here, and the
# name reserved for the tokenized output.
csv_input = "reddit_posts_with_comments.csv"
csv_tokenized = "tokenized_csv.csv"

# Load the posts into a DataFrame
df = pd.read_csv(csv_input)

def clean_text(text):
    """Return *text* lower-cased with every non-letter replaced by a space.

    The value is converted with ``str()`` first, so non-string input is
    cleaned through its string form. Leading and trailing whitespace is
    stripped; runs of spaces inside the text are kept as they are.
    """
    letters_only = re.sub(r'[^a-zA-Z]', ' ', str(text))
    return letters_only.strip().lower()

# Normalise the raw comment text, then split it into word tokens
df['Comments'] = df['Comments'].apply(clean_text)
df['tokenized_text'] = df['Comments'].apply(word_tokenize)

# English stopword list used by the token filter below
stopwords_english = set(nltk.corpus.stopwords.words("english"))


def _keep_token(token):
    """Tell whether a token is kept: not a stopword and longer than one character."""
    return token not in stopwords_english and len(token) > 1


# Drop stopwords and single-character tokens in one pass
df['tokenized_text'] = df['tokenized_text'].apply(
    lambda tokens: [token for token in tokens if _keep_token(token)]
)

# Only the token lists are written out; every other column is left behind
df_cleaned = df[['tokenized_text']]
df_cleaned.to_csv(csv_tokenized, index=False)

# Show the size of the result and a preview of the first 10 rows
print(df_cleaned.shape)
print(df_cleaned.head(10))

print(f"CSV file '{csv_tokenized}' has been generated/updated with the tokenized text while avoiding duplicates and cleaning the data.")


In [None]:
import ast
import concurrent.futures
import logging
import os

import pandas as pd
import requests
import torch
from langdetect import detect
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Logging setup: INFO level, each record prefixed with timestamp and level
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# CSV with the scraped posts (input) and CSV for the labelled comments (output)
input_csv, output_csv = (
    "reddit_posts_with_comments.csv",
    "reddit_posts_with_labels.csv",
)

# Model name to use for each supported language code
language_to_model = dict(
    en="IMSyPP/hate_speech_en",
    it="IMSyPP/hate_speech_it",
    nl="IMSyPP/hate_speech_nl",
    sl="IMSyPP/hate_speech_slo",
)

# Model name used for any language code that has no entry in the mapping
default_model_name = "IMSyPP/hate_speech_en"

def load_model(language):
    """Load the tokenizer and classification model for *language*.

    The model name is looked up in ``language_to_model``; unknown language
    codes fall back to ``default_model_name``. Downloads are cached in the
    local ``.cache`` directory. A request timeout is retried up to three
    times; any other error stops the attempts immediately.

    Args:
        language: Language code such as ``'en'``.

    Returns:
        ``(tokenizer, model)`` on success, ``(None, None)`` on every failure
        path, so callers can always unpack the result.
    """
    model_name = language_to_model.get(language, default_model_name)

    # Specify the cache directory for local caching
    cache_dir = ".cache"

    # Number of attempts made when the download times out
    retries = 3

    # NOTE(review): `from_pretrained` documents no `timeout` argument; extra
    # keyword arguments are forwarded towards the config/model constructor,
    # so the former `timeout=30` is no longer passed. Confirm against the
    # installed transformers version if a download timeout is required.
    for _ in range(retries):
        try:
            logger.info(f"Downloading and caching model '{model_name}'...")
            tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
            model = AutoModelForSequenceClassification.from_pretrained(model_name, cache_dir=cache_dir)
        except requests.exceptions.Timeout:
            logger.warning("Request timed out, retrying model download...")
        except Exception as e:
            # Not a timeout: retrying is unlikely to help. Return the same
            # failure value as the exhausted-retries path; previously this
            # path fell out of the function and returned a bare None.
            logger.error(f"An error occurred while downloading the model: {e}")
            return None, None
        else:
            logger.info(f"Model '{model_name}' downloaded and cached successfully.")
            return tokenizer, model

    logger.error(f"Failed to load model '{model_name}' after {retries} retries.")
    return None, None

def analyze_comment(comment, language, tokenizer, model, max_length=512):
    """Classify one comment and return its class probabilities.

    Args:
        comment: Text of the comment.
        language: Language code of the comment; returned unchanged.
        tokenizer: Callable that turns text into model inputs (called with
            ``return_tensors="pt"``).
        model: Callable accepting the tokenizer output as keyword arguments
            and returning an object with a ``logits`` tensor.
        max_length: Maximum number of tokens kept; longer comments are
            truncated instead of overflowing the encoder.
            NOTE(review): 512 is assumed to match the models used here;
            confirm against the model configuration.

    Returns:
        ``(probabilities, language)`` where ``probabilities`` is the list of
        softmax scores for the first (only) input sequence.
    """
    # Tokenize the comment, truncating over-long text to max_length tokens
    inputs = tokenizer(
        comment,
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
    )

    # Model inference without gradient tracking
    with torch.no_grad():
        outputs = model(**inputs)

    # Softmax over the class dimension; take the single sequence in the batch
    probabilities = torch.softmax(outputs.logits, dim=1).tolist()[0]

    return probabilities, language

# Read the CSV file
df = pd.read_csv(input_csv)


def _parse_comment_list(raw):
    """Turn one 'Comments' cell back into a list of comment strings.

    The CSV stores each post's comments as the text form of a Python list, so
    the cell has to be parsed before its comments can be counted or iterated.
    Missing or unparseable cells yield an empty list.
    """
    if isinstance(raw, (list, tuple)):
        return list(raw)
    if not isinstance(raw, str):
        return []  # missing cell (e.g. NaN)
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        logger.warning("Skipping a 'Comments' cell that is not a list literal.")
        return []
    if isinstance(parsed, (list, tuple)):
        return [str(item) for item in parsed]
    return [str(parsed)]


# Replace the text cells with real lists so len() counts comments, not characters
df['Comments'] = df['Comments'].apply(_parse_comment_list)

# Total number of comments to process
total_comments = int(df['Comments'].apply(len).sum())

# Batch size for writing results to CSV
batch_size = 100

def process_batch(comment_list, tokenizer, model):
    """Analyze every comment in *comment_list* and collect the results.

    Args:
        comment_list: Iterable of comment texts.
        tokenizer: Tokenizer passed through to ``analyze_comment``.
        model: Model passed through to ``analyze_comment``.

    Returns:
        Five parallel lists: the comments, the probabilities at class index
        1, 2 and 3 (stored as hate / offensive / violent), and the language
        detected for each comment.
        NOTE(review): the meaning of each class index is assumed from the
        result names; verify it against the model's label mapping.
    """
    result_comments = []
    result_probabilities_hate = []
    result_probabilities_offensive = []
    result_probabilities_violent = []
    result_languages = []

    for comment in comment_list:
        try:
            # Detect the language of the comment
            language = detect(comment)
        except Exception:
            # Detection failed (e.g. text without usable features): fall back
            # to English. `Exception` rather than a bare `except` so that
            # KeyboardInterrupt / SystemExit are not swallowed.
            language = 'en'

        # Analyze the comment with the tokenizer/model supplied by the caller
        probabilities, language = analyze_comment(comment, language, tokenizer, model)

        # Store the results
        result_comments.append(comment)
        result_probabilities_hate.append(probabilities[1])
        result_probabilities_offensive.append(probabilities[2])
        result_probabilities_violent.append(probabilities[3])
        result_languages.append(language)

    return result_comments, result_probabilities_hate, result_probabilities_offensive, result_probabilities_violent, result_languages

# Initialize lists to store final results
final_result_comments = []
final_result_probabilities_hate = []
final_result_probabilities_offensive = []
final_result_probabilities_violent = []
final_result_languages = []

# Processed comment count
processed_comments = 0

# Load the tokenizer and model once, instead of once per CSV row
loaded = load_model('en')  # Default model
if not loaded or loaded[0] is None or loaded[1] is None:
    raise RuntimeError("Could not load the hate-speech model; aborting analysis.")
tokenizer, model = loaded

# Iterate over rows in the CSV
for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing Comments"):
    comment_list = row['Comments']

    # A cell read from CSV is the text form of a Python list; parse it so the
    # code below works on whole comments rather than on single characters.
    if isinstance(comment_list, str):
        try:
            comment_list = ast.literal_eval(comment_list)
        except (ValueError, SyntaxError):
            logger.warning("Skipping row %s: 'Comments' is not a list literal.", index)
            continue
    if not isinstance(comment_list, (list, tuple)):
        # Missing cell (e.g. NaN): nothing to analyze for this post
        continue
    comment_list = [str(comment) for comment in comment_list]

    # Split the comment list into batches
    comment_batches = [comment_list[i:i + batch_size] for i in range(0, len(comment_list), batch_size)]

    # Process batches concurrently
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        futures = [executor.submit(process_batch, batch, tokenizer, model) for batch in comment_batches]
        for future in concurrent.futures.as_completed(futures):
            try:
                # Get the results from each batch
                result_comments, result_probabilities_hate, result_probabilities_offensive, result_probabilities_violent, result_languages = future.result()

                # Append the results to the final lists
                final_result_comments.extend(result_comments)
                final_result_probabilities_hate.extend(result_probabilities_hate)
                final_result_probabilities_offensive.extend(result_probabilities_offensive)
                final_result_probabilities_violent.extend(result_probabilities_violent)
                final_result_languages.extend(result_languages)

                # Update processed comment count
                processed_comments += len(result_comments)

                # Log progress (guarding against an empty total)
                progress_percent = (processed_comments / total_comments) * 100 if total_comments else 0.0
                logger.info(f"Processed {processed_comments}/{total_comments} comments ({progress_percent:.2f}%)")
            except Exception as e:
                # Log errors and continue processing other batches
                logger.error(f"An error occurred while processing a batch: {e}")

# Create a new DataFrame for the final results
result_df = pd.DataFrame({
    'comment': final_result_comments,
    'probabilities_hate': final_result_probabilities_hate,
    'probabilities_offensive': final_result_probabilities_offensive,
    'probabilities_violent': final_result_probabilities_violent,
    'language': final_result_languages
})

# Save the results to a new CSV file
result_df.to_csv(output_csv, index=False)

logger.info("Analysis completed. Results saved to: %s", output_csv)
