In [None]:
# Import necessary libraries
import pandas
import praw
from dotenv import dotenv_values, load_dotenv
import os
import nltk
from nltk import word_tokenize
import re
import numpy as np
from bs4 import BeautifulSoup

# Fetch the NLTK resources this notebook relies on (tokenizer model, stop words)
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)


# Read the key/value pairs stored in the local .env file
env = dotenv_values(".env")

# Collect the Reddit API settings, then build the PRAW client from them.
# A missing key in .env raises KeyError here rather than later during a request.
reddit_credentials = {
    "client_id": env["CLIENT_ID"],
    "client_secret": env["CLIENT_SECRET"],
    "user_agent": env["USER_AGENT"],
    "redirect_uri": env["REDIRECT_URI"],
    "refresh_token": env["REFRESH_TOKEN"],
}
reddit = praw.Reddit(**reddit_credentials)

# Start from the previously saved posts when the CSV is on disk; otherwise
# start from an empty table that already has the expected columns.
csv_file_name = "reddit_posts_with_comments.csv"
csv_already_present = os.path.exists(csv_file_name)
if not csv_already_present:
    print("CSV file does not exist. It will be created after fetching new data.")
    df = pandas.DataFrame(columns=["Title", "Id", "Upvotes", "Comments"])
else:
    print("CSV file already exists. Appending new data and avoiding duplicates.")
    # Load the rows written by an earlier run
    df = pandas.read_csv(csv_file_name)

# Subreddits to scrape, and how many "hot" posts to fetch from each one
targetObjects = ['conspiracy',
                 'WhitePeopleTwitter', 'politics', 'Republican', 'worldnews']
POST_LIMIT = 5

# Submission IDs that are already stored. Compared as strings so that an Id
# column read back from CSV still matches the string IDs returned by PRAW.
known_ids = set(df["Id"].astype(str))

# One dict per newly scraped submission, collected across all subreddits so the
# DataFrame is built and concatenated once instead of on every loop iteration.
new_rows = []

for subreddit_name in targetObjects:
    subreddit = reddit.subreddit(subreddit_name)

    # Print subreddit name
    print(subreddit.display_name)

    # Loop through the current "hot" submissions in the subreddit
    for iteration, submission in enumerate(subreddit.hot(limit=POST_LIMIT), start=1):
        print(f"post {iteration}/{POST_LIMIT}")

        # Skip submissions that are already stored (or were seen earlier in this run)
        if submission.id in known_ids:
            continue
        known_ids.add(submission.id)

        # Fetch comments for the current submission
        submission.comments.replace_more(limit=25)
        submission_comments = []
        for comment in submission.comments.list():
            # Filter on the author's username. `comment.name` is the comment's
            # own fullname (e.g. "t1_..."), not the username, and `author` is
            # None when the account was deleted.
            author = comment.author
            author_name = author.name if author is not None else ""
            if 'bot' in author_name.lower():
                continue

            # Use BeautifulSoup to remove HTML tags from content
            filtered_content = BeautifulSoup(comment.body, 'lxml').get_text()

            # Remove URLs from filtered_content
            filtered_content = re.sub(
                r'http\S+|www\S+', '', filtered_content)

            # Remove only the '#' characters, then lowercase the text
            filtered_content = re.sub(r'#', '', filtered_content).lower()
            submission_comments.append(filtered_content)

        new_rows.append({
            "Title": submission.title,
            "Id": submission.id,
            "Upvotes": submission.score,
            "Comments": submission_comments,
        })

if new_rows:
    new_data = pandas.DataFrame(
        new_rows, columns=["Title", "Id", "Upvotes", "Comments"])
    # Avoid concatenating with an empty frame (first run, no CSV yet)
    df = new_data if df.empty else pandas.concat(
        [df, new_data], ignore_index=True)

# Drop duplicates based on the 'Id' column (submission IDs), keeping the newest row
df = df.drop_duplicates(subset="Id", keep="last")

# Save the DataFrame to the CSV file
df.to_csv(csv_file_name, index=False)

# Print the shape of the DataFrame and display the first 10 rows
print(df.shape)
print(df.head(10))

print(f"CSV file '{csv_file_name}' has been generated/updated with the new Reddit posts and comments while avoiding duplicates.")



In [1]:
import pandas as pd
import re
import numpy as np
from nltk import word_tokenize
import nltk

# Load the necessary NLTK resources
# NOTE(review): newer NLTK releases may additionally need the 'punkt_tab'
# resource for word_tokenize — confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')

csv_tokenized = "tokenized_csv.csv"
csv_input = "reddit_posts_with_comments.csv"
df = pd.read_csv(csv_input)


def _letters_only(value):
    """Return str(value) with every non-letter replaced by a space, then stripped."""
    return re.sub(r'[^a-zA-Z]', ' ', str(value)).strip()


# Clean every text (object) column in a single pass. Series.map is applied per
# column because DataFrame.applymap is deprecated in recent pandas releases.
df_cleaned = df.select_dtypes(include=['object']).apply(
    lambda column: column.map(_letters_only))

# Missing values were stringified to 'nan' above; turn them back into NaN and
# drop every row that has one in any of the text columns.
df_cleaned = df_cleaned.replace('nan', np.nan).dropna()

stopwords_english = set(nltk.corpus.stopwords.words("english"))


def _tokenize(text):
    """Tokenize text, dropping English stop words and single-character tokens."""
    return [
        token for token in word_tokenize(str(text))
        if token.lower() not in stopwords_english and len(token) > 1
    ]


# Tokenize the text data (stop-word and short-token removal happen in the same pass)
df_cleaned['tokenized_text'] = df_cleaned['Comments'].apply(_tokenize)

# Drop the unnecessary columns (keep only the 'tokenized_text' column)
df_cleaned = df_cleaned[['tokenized_text']]

# Save the cleaned DataFrame to the CSV file
df_cleaned.to_csv(csv_tokenized, index=False)

# Print the shape of the DataFrame and display the first 10 rows
print(df_cleaned.shape)
print(df_cleaned.head(10))

print(
    f"CSV file '{csv_tokenized}' has been generated/updated with the tokenized text while avoiding duplicates and cleaning the data."
)


[nltk_data] Downloading package punkt to C:\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


KeyboardInterrupt: 

In [8]:
import pandas as pd
from hatesonar import Sonar

import ast  # standard library; parses the stored comment lists without executing code

# Load the dataset from the CSV file
csv_file = "reddit_posts_with_comments.csv"
df = pd.read_csv(csv_file)

# Initialize the Sonar hate speech analyzer
sonar = Sonar()

# List to store the results of hate speech analysis for each comment
results = []

# Loop through the stored comment lists, one per submission
for raw_comments in df["Comments"]:
    # A missing cell is read back as NaN (a float), which cannot be parsed
    if not isinstance(raw_comments, str):
        continue

    # The column holds the string representation of a Python list.
    # SECURITY: this used to be eval(), which would execute arbitrary code
    # embedded in the CSV (its content originates from scraped Reddit text).
    # ast.literal_eval only accepts Python literals and raises on anything else.
    comments = ast.literal_eval(raw_comments)

    for comment in comments:
        result = sonar.ping(text=comment)
        results.append(result)

# Optionally, you can add the results to a new DataFrame if you want to analyze them further
results_df = pd.DataFrame(results)

# Print the results DataFrame
print(results_df)

# Optionally, you can save the results DataFrame to a new CSV file
results_csv_file = "results_hate_speech_analysis.csv"
results_df.to_csv(results_csv_file, index=False)


ModuleNotFoundError: No module named 'sklearn.linear_model.logistic'

: 