In [13]:
# Import necessary libraries
import praw
from dotenv import dotenv_values, load_dotenv
import os
import pandas
import nltk
from nltk import word_tokenize
import re
import numpy as np
from bs4 import BeautifulSoup

nltk.download('punkt')
nltk.download('stopwords')


# Load environment variables from .env file
env = dotenv_values(".env")

# Authenticate with Reddit using PRAW
reddit = praw.Reddit(
    client_id=env["CLIENT_ID"],
    client_secret=env["CLIENT_SECRET"],
    user_agent=env["USER_AGENT"],
    redirect_uri=env["REDIRECT_URI"],
    refresh_token=env["REFRESH_TOKEN"],
)

# Check if the CSV file already exists
csv_file_name = "reddit_posts_with_comments.csv"
if os.path.exists(csv_file_name):
    print("CSV file already exists. Appending new data and avoiding duplicates.")
    df = pandas.read_csv(csv_file_name)  # Read existing CSV into a DataFrame
else:
    print("CSV file does not exist. It will be created after fetching new data.")
    df = pandas.DataFrame(columns=["Title", "Id", "Upvotes", "Comments"])

# Create a subreddit instance
targetObjects = ['conspiracy', 'disney',
                 'WhitePeopleonTwitter', 'politics', 'Republican', 'worldnews']
for subreddit_name in targetObjects:
    subreddit = reddit.subreddit(subreddit_name)

    # Print subreddit name
    print(subreddit.display_name)

    # Lists to store submission information
    titles = []
    scores = []
    ids = []
    comments = []

    # Loop through the newest 21 submissions in the subreddit
    for iteration, submission in enumerate(subreddit.hot(limit=25)):
        print(f"post {iteration}/25")
        # Check if the submission ID already exists in the DataFrame to avoid duplication
        if submission.id not in df["Id"].values:
            # Add submission title to the titles list
            titles.append(submission.title)
            scores.append(submission.score)  # Add upvotes to the scores list
            ids.append(submission.id)  # Add submission ID to the ids list

            # Fetch comments for the current submission
            submission.comments.replace_more(limit=25)
            submission_comments = []
            for comment in submission.comments.list():
                # Check if the comment author's username contains "bot"
                if 'bot' not in comment.name:
                    # Use BeautifulSoup to remove HTML tags from content
                    soup = BeautifulSoup(comment.body, 'html.parser')
                    filtered_content = soup.get_text()

                    # Remove URLs from filtered_content
                    filtered_content = re.sub(
                        r'http\S+|www\S+', '', filtered_content)

                    # Remove only #
                    filtered_content = re.sub(r'#', '', filtered_content).lower()
                    submission_comments.append(filtered_content)
            comments.append(submission_comments)

        # Create a DataFrame with the new data
        new_data = pandas.DataFrame(
            {"Title": titles, "Id": ids, "Upvotes": scores, "Comments": comments}
        )

        # Append/concat the new data to the existing DataFrame
        df = pandas.concat([df, new_data], ignore_index=True)

        # Drop duplicates based on the 'Id' column (submission IDs)
        df.drop_duplicates(subset="Id", keep="last", inplace=True)

# Clean dataset
df_cleaned = df.copy()
df_cleaned = df_cleaned.select_dtypes(include=['object']).applymap(
    lambda x: re.sub(r'[^a-zA-Z]', ' ', str(x)))
df_cleaned = df_cleaned.applymap(
    lambda x: x.strip() if isinstance(x, str) else x)
df_cleaned = df_cleaned.replace('nan', np.nan).dropna()

# Tokenize the text data
df_cleaned['tokenized_text'] = df_cleaned['Comments'].apply(
    lambda x: word_tokenize(str(x)))

# Removal of stopwords
stopwords_english = set(nltk.corpus.stopwords.words("english"))
stopwords_italian = set(nltk.corpus.stopwords.words("italian"))
df_cleaned['tokenized_text'] = df_cleaned['tokenized_text'].apply(lambda tokens: [
    token for token in tokens if token.lower() not in stopwords_english and token.lower() not in stopwords_italian])

# Save the cleaned DataFrame to the CSV file
df_cleaned.to_csv(csv_file_name, index=False)

# Print the shape of the DataFrame and display the first 10 rows
print(df_cleaned.shape)
print(df_cleaned.head(10))

print(
    f"CSV file '{csv_file_name}' has been generated/updated with the new Reddit posts and comments while avoiding duplicates and cleaning the text data."
)

[nltk_data] Downloading package punkt to C:\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


CSV file does not exist. It will be created after fetching new data.
conspiracy
post 0/25


  soup = BeautifulSoup(comment.body, 'html.parser')
  soup = BeautifulSoup(comment.body, 'html.parser')


post 1/25


  soup = BeautifulSoup(comment.body, 'html.parser')
  soup = BeautifulSoup(comment.body, 'html.parser')


post 2/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 3/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 4/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 5/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 6/25
post 7/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 8/25


  soup = BeautifulSoup(comment.body, 'html.parser')
  soup = BeautifulSoup(comment.body, 'html.parser')


post 9/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 10/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 11/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 12/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 13/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 14/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 15/25
post 16/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 17/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 18/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 19/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 20/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 21/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 22/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 23/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 24/25


  soup = BeautifulSoup(comment.body, 'html.parser')


disney
post 0/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 1/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 2/25
post 3/25
post 4/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 5/25
post 6/25
post 7/25
post 8/25
post 9/25
post 10/25
post 11/25
post 12/25
post 13/25
post 14/25
post 15/25
post 16/25
post 17/25
post 18/25
post 19/25
post 20/25
post 21/25
post 22/25
post 23/25
post 24/25
WhitePeopleonTwitter
politics
post 0/25


  soup = BeautifulSoup(comment.body, 'html.parser')
  soup = BeautifulSoup(comment.body, 'html.parser')


post 1/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 2/25


  soup = BeautifulSoup(comment.body, 'html.parser')
  soup = BeautifulSoup(comment.body, 'html.parser')


post 3/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 4/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 5/25


  soup = BeautifulSoup(comment.body, 'html.parser')
  soup = BeautifulSoup(comment.body, 'html.parser')


post 6/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 7/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 8/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 9/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 10/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 11/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 12/25


  soup = BeautifulSoup(comment.body, 'html.parser')
  soup = BeautifulSoup(comment.body, 'html.parser')


post 13/25


  soup = BeautifulSoup(comment.body, 'html.parser')
  soup = BeautifulSoup(comment.body, 'html.parser')


post 14/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 15/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 16/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 17/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 18/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 19/25
post 20/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 21/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 22/25


  soup = BeautifulSoup(comment.body, 'html.parser')
  soup = BeautifulSoup(comment.body, 'html.parser')


post 23/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 24/25


  soup = BeautifulSoup(comment.body, 'html.parser')


Republican
post 0/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 1/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 2/25
post 3/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 4/25
post 5/25
post 6/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 7/25
post 8/25
post 9/25
post 10/25
post 11/25
post 12/25
post 13/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 14/25
post 15/25
post 16/25
post 17/25
post 18/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 19/25
post 20/25
post 21/25
post 22/25
post 23/25
post 24/25


  soup = BeautifulSoup(comment.body, 'html.parser')


worldnews
post 0/25


  soup = BeautifulSoup(comment.body, 'html.parser')
  soup = BeautifulSoup(comment.body, 'html.parser')


post 1/25


  soup = BeautifulSoup(comment.body, 'html.parser')
  soup = BeautifulSoup(comment.body, 'html.parser')


post 2/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 3/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 4/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 5/25


  soup = BeautifulSoup(comment.body, 'html.parser')
  soup = BeautifulSoup(comment.body, 'html.parser')


post 6/25
post 7/25
post 8/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 9/25
post 10/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 11/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 12/25
post 13/25
post 14/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 15/25
post 16/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 17/25


  soup = BeautifulSoup(comment.body, 'html.parser')
  soup = BeautifulSoup(comment.body, 'html.parser')


post 18/25


  soup = BeautifulSoup(comment.body, 'html.parser')
  soup = BeautifulSoup(comment.body, 'html.parser')


post 19/25
post 20/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 21/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 22/25
post 23/25


  soup = BeautifulSoup(comment.body, 'html.parser')


post 24/25


  soup = BeautifulSoup(comment.body, 'html.parser')
  soup = BeautifulSoup(comment.body, 'html.parser')


(125, 5)
                                               Title     Id Upvotes  \
0                          Anything but the DEATHVAX    bwo           
1  The US sent      billion dollars in aid to Ukr...   c fc           
2  showing his Russian work visa off on Twitter  ...  c ypx           
3  World Cup star      grabs her chest and collap...  c kow           
4                         The Jussie Smollett Filter   bx u           
5  Dont forget about these    deaths Travis Scott...  c itt           
6  Biden Administration denies Secret Service pro...  by  j           
7  Incredibly fishy news article today about   ma...  brhly           
8  They told us they found  Aliens  before they f...   ba d           
9  Dear Reddit  All information must be debated  ...  bxbgu           

                                            Comments  \
0  meta  sticky comment n n rule        does not ...   
1  meta  sticky comment n n rule        does not ...   
2  meta  sticky comment n n rule        d