## Preprocessing the data scraped from Reddit for further topic modelling and sentiment analysis

In [48]:
#Import packages
import pandas as pd
import os
import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

#Set working directory
cwd = os.getcwd()

In [2]:
#Import the data
df_submission1 = pd.read_csv(cwd + "/data/F1sub_chatgpt.csv")
df_submission2 = pd.read_csv(cwd + "/data/F1sub_subreddits.csv")
df_comment1 = pd.read_csv(cwd+ '/data/F1comments_chatgpt.csv')
df_comment2 = pd.read_csv(cwd+ '/data/F1comments_subreddits.csv')

### Preprocessing of submissions

In [3]:
#Combining the two submission datasets
df_submission = pd.concat([df_submission1, df_submission2], ignore_index=True)

In [5]:
#Removing the empty values
df_submission = df_submission.fillna(' ')
#Removing the \n from the 'Text of URL Post' column
df_submission['Text of URL Post'] = df_submission['Text of URL Post'].str.replace('\n', ' ')
#Removing the links from the 'Post Text' column
df_submission['Post Text'] = df_submission['Post Text'].str.replace("http\S+", ' ', regex=True)
df_submission['Post Text'] = df_submission['Post Text'].str.replace(r'\bLink\b', ' ', regex=True)

#Removing mentions of common words
replacement_post = ["Reddit", "reddit",
                    "ChatGPT", "chatgpt",
                    "chatgpt3", "ChatGPT3", "chatgpt-3", "ChatGPT-3", "GPT3", "gpt3", "GPT-3"
                    "chatgpt3.5", "ChatGPT3.5", "chatgpt-3.5", "ChatGPT-3.5", "GPT3.5", "gpt3.5", "GPT-3.5",
                    "chatgpt4", "ChatGPT4", "chatgpt-4", "ChatGPT-4", "GPT4", "gpt4", "GPT-4",
                    #Following message appeared when the retrieval of the data failed
                    "Sorry, for some reason reddit can't be reached.",
                    #Following messages appeared when cookie policies were encountered
                    "and its partners use cookies and similar technologies to provide you with a better experience.", 
                    "By accepting all cookies, you agree to our use of cookies to deliver and maintain our services and site, improve the quality of , personalize  content and advertising, and measure the effectiveness of advertising.",
                    "By rejecting non-essential cookies,  may still use certain cookies to ensure the proper functionality of our platform.",
                    "For more information, please see our",
                    "Cookie Notice",
                    "and our",
                    "Privacy Policy.",
                    "Sub to discuss about",
                    "Not affiliated with OpenAI."
                ]

for index1, row1 in enumerate(df_submission["Text of URL Post"]):
    for rep1 in replacement_post:
        df_submission.loc[index1, "Text of URL Post"] = df_submission["Text of URL Post"][index1].replace(rep1, "")
        

In [6]:
#Checking if the preprocessing has worked
result1 = df_submission['Post Text'].str.contains('NaN')
result2 = df_submission['Text of URL Post'].str.contains("ChatGPT")
result3 = df_submission['Text of URL Post'].str.contains("\n")
result4 = df_submission['Text of URL Post'].str.contains("Sorry, for some reason reddit can't be reached.")
result5 = df_submission['Text of URL Post'].str.contains("By rejecting non-essential cookies,")
result6 = df_submission['Post Text'].str.contains("http\S+")

print(f'NaN in Post Text: {result1.any()}')
print(f'ChatGPT in Text of URL Post: {result2.any()}')
print(f"\\n in Text of URL Post: {result3.any()}")
print(f"Sorry... in Text of URL Post: {result4.any()}")
print(f'Cookies in Text of URL Post: {result5.any()}')
print(f'Links in Post Text: {result6.any()}')

NaN in Post Text: False
ChatGPT in Text of URL Post: False
\n in Text of URL Post: False
Sorry... in Text of URL Post: False
Cookies in Text of URL Post: False
Links in Post Text: False


In [15]:
#Saving the preprocessed data
df_submission.to_csv('F1post.csv', index=False)

### Preprocessing of comments

In [35]:
#Combining the two submission datasets
df_comment = pd.concat([df_comment1, df_comment2], ignore_index=True)

In [36]:
#Removing the links from the 'Post Text' column
df_comment['Comment Text'] = df_comment['Comment Text'].str.replace("http\S+", ' ', regex=True)

#Removing every row where the text in "Comment Text" starts with >, as these are replies to other comments, not to the post
df_comment.loc[df_comment['Comment Text'].astype(str).str.startswith(">"), 'Comment Text'] = ' '
df_comment = df_comment[~(df_comment['Comment Text'].astype(str).str.strip() == ' ')]

#Removing mentions of common words
replacement_comment = ["Reddit", "reddit",
                    "ChatGPT", "chatgpt",
                    "chatgpt3", "ChatGPT3", "chatgpt-3", "ChatGPT-3", "GPT3", "gpt3", "GPT-3"
                    "chatgpt3.5", "ChatGPT3.5", "chatgpt-3.5", "ChatGPT-3.5", "GPT3.5", "gpt3.5", "GPT-3.5",
                    "chatgpt4", "ChatGPT4", "chatgpt-4", "ChatGPT-4", "GPT4", "gpt4", "GPT-4", "GPT 3.5", "gpt 3.5", 
                    r"\n", r"\n\n", r"\n\n\n", r"\n\n\n\n", 
                    "[verwijderd]" #appeared when a comment was removed
                ]

for index2, row2 in enumerate(tqdm.tqdm(df_comment["Comment Text"], desc="Replacing common words in comments", total=len(df_comment))):
    for rep2 in replacement_comment:
        try:
            df_comment.at[index2, 'Comment Text'] = df_comment['Comment Text'][index2].replace(rep2, ' ')
        except AttributeError:
            print(f"Error: Cannot replace {rep2} in row {index2}. Skipping...") #For some rows the comment text cant be replaced



Replacing common words in comments:  31%|███       | 100036/323799 [01:25<03:07, 1192.01it/s]

Error: Cannot replace Reddit in row 99758. Skipping...
Error: Cannot replace reddit in row 99758. Skipping...
Error: Cannot replace ChatGPT in row 99758. Skipping...
Error: Cannot replace chatgpt in row 99758. Skipping...
Error: Cannot replace chatgpt3 in row 99758. Skipping...
Error: Cannot replace ChatGPT3 in row 99758. Skipping...
Error: Cannot replace chatgpt-3 in row 99758. Skipping...
Error: Cannot replace ChatGPT-3 in row 99758. Skipping...
Error: Cannot replace GPT3 in row 99758. Skipping...
Error: Cannot replace gpt3 in row 99758. Skipping...
Error: Cannot replace GPT-3chatgpt3.5 in row 99758. Skipping...
Error: Cannot replace ChatGPT3.5 in row 99758. Skipping...
Error: Cannot replace chatgpt-3.5 in row 99758. Skipping...
Error: Cannot replace ChatGPT-3.5 in row 99758. Skipping...
Error: Cannot replace GPT3.5 in row 99758. Skipping...
Error: Cannot replace gpt3.5 in row 99758. Skipping...
Error: Cannot replace GPT-3.5 in row 99758. Skipping...
Error: Cannot replace chatgpt4 in

Replacing common words in comments:  31%|███       | 101125/323799 [01:26<03:06, 1195.30it/s]

Error: Cannot replace Reddit in row 100872. Skipping...
Error: Cannot replace reddit in row 100872. Skipping...
Error: Cannot replace ChatGPT in row 100872. Skipping...
Error: Cannot replace chatgpt in row 100872. Skipping...
Error: Cannot replace chatgpt3 in row 100872. Skipping...
Error: Cannot replace ChatGPT3 in row 100872. Skipping...
Error: Cannot replace chatgpt-3 in row 100872. Skipping...
Error: Cannot replace ChatGPT-3 in row 100872. Skipping...
Error: Cannot replace GPT3 in row 100872. Skipping...
Error: Cannot replace gpt3 in row 100872. Skipping...
Error: Cannot replace GPT-3chatgpt3.5 in row 100872. Skipping...
Error: Cannot replace ChatGPT3.5 in row 100872. Skipping...
Error: Cannot replace chatgpt-3.5 in row 100872. Skipping...
Error: Cannot replace ChatGPT-3.5 in row 100872. Skipping...
Error: Cannot replace GPT3.5 in row 100872. Skipping...
Error: Cannot replace gpt3.5 in row 100872. Skipping...
Error: Cannot replace GPT-3.5 in row 100872. Skipping...
Error: Cannot re

Replacing common words in comments:  32%|███▏      | 104885/323799 [01:29<03:00, 1210.29it/s]

Error: Cannot replace Reddit in row 104552. Skipping...
Error: Cannot replace reddit in row 104552. Skipping...
Error: Cannot replace ChatGPT in row 104552. Skipping...
Error: Cannot replace chatgpt in row 104552. Skipping...
Error: Cannot replace chatgpt3 in row 104552. Skipping...
Error: Cannot replace ChatGPT3 in row 104552. Skipping...
Error: Cannot replace chatgpt-3 in row 104552. Skipping...
Error: Cannot replace ChatGPT-3 in row 104552. Skipping...
Error: Cannot replace GPT3 in row 104552. Skipping...
Error: Cannot replace gpt3 in row 104552. Skipping...
Error: Cannot replace GPT-3chatgpt3.5 in row 104552. Skipping...
Error: Cannot replace ChatGPT3.5 in row 104552. Skipping...
Error: Cannot replace chatgpt-3.5 in row 104552. Skipping...
Error: Cannot replace ChatGPT-3.5 in row 104552. Skipping...
Error: Cannot replace GPT3.5 in row 104552. Skipping...
Error: Cannot replace gpt3.5 in row 104552. Skipping...
Error: Cannot replace GPT-3.5 in row 104552. Skipping...
Error: Cannot re

Replacing common words in comments:  46%|████▌     | 148178/323799 [02:05<02:25, 1209.59it/s]

Error: Cannot replace Reddit in row 147867. Skipping...
Error: Cannot replace reddit in row 147867. Skipping...
Error: Cannot replace ChatGPT in row 147867. Skipping...
Error: Cannot replace chatgpt in row 147867. Skipping...
Error: Cannot replace chatgpt3 in row 147867. Skipping...
Error: Cannot replace ChatGPT3 in row 147867. Skipping...
Error: Cannot replace chatgpt-3 in row 147867. Skipping...
Error: Cannot replace ChatGPT-3 in row 147867. Skipping...
Error: Cannot replace GPT3 in row 147867. Skipping...
Error: Cannot replace gpt3 in row 147867. Skipping...
Error: Cannot replace GPT-3chatgpt3.5 in row 147867. Skipping...
Error: Cannot replace ChatGPT3.5 in row 147867. Skipping...
Error: Cannot replace chatgpt-3.5 in row 147867. Skipping...
Error: Cannot replace ChatGPT-3.5 in row 147867. Skipping...
Error: Cannot replace GPT3.5 in row 147867. Skipping...
Error: Cannot replace gpt3.5 in row 147867. Skipping...
Error: Cannot replace GPT-3.5 in row 147867. Skipping...
Error: Cannot re

Replacing common words in comments:  79%|███████▉  | 255002/323799 [03:36<00:57, 1192.92it/s]

Error: Cannot replace Reddit in row 254854. Skipping...
Error: Cannot replace reddit in row 254854. Skipping...
Error: Cannot replace ChatGPT in row 254854. Skipping...
Error: Cannot replace chatgpt in row 254854. Skipping...
Error: Cannot replace chatgpt3 in row 254854. Skipping...
Error: Cannot replace ChatGPT3 in row 254854. Skipping...
Error: Cannot replace chatgpt-3 in row 254854. Skipping...
Error: Cannot replace ChatGPT-3 in row 254854. Skipping...
Error: Cannot replace GPT3 in row 254854. Skipping...
Error: Cannot replace gpt3 in row 254854. Skipping...
Error: Cannot replace GPT-3chatgpt3.5 in row 254854. Skipping...
Error: Cannot replace ChatGPT3.5 in row 254854. Skipping...
Error: Cannot replace chatgpt-3.5 in row 254854. Skipping...
Error: Cannot replace ChatGPT-3.5 in row 254854. Skipping...
Error: Cannot replace GPT3.5 in row 254854. Skipping...
Error: Cannot replace gpt3.5 in row 254854. Skipping...
Error: Cannot replace GPT-3.5 in row 254854. Skipping...
Error: Cannot re

Replacing common words in comments:  81%|████████  | 261936/323799 [03:42<00:59, 1042.33it/s]

Error: Cannot replace Reddit in row 261743. Skipping...
Error: Cannot replace reddit in row 261743. Skipping...
Error: Cannot replace ChatGPT in row 261743. Skipping...
Error: Cannot replace chatgpt in row 261743. Skipping...
Error: Cannot replace chatgpt3 in row 261743. Skipping...
Error: Cannot replace ChatGPT3 in row 261743. Skipping...
Error: Cannot replace chatgpt-3 in row 261743. Skipping...
Error: Cannot replace ChatGPT-3 in row 261743. Skipping...
Error: Cannot replace GPT3 in row 261743. Skipping...
Error: Cannot replace gpt3 in row 261743. Skipping...
Error: Cannot replace GPT-3chatgpt3.5 in row 261743. Skipping...
Error: Cannot replace ChatGPT3.5 in row 261743. Skipping...
Error: Cannot replace chatgpt-3.5 in row 261743. Skipping...
Error: Cannot replace ChatGPT-3.5 in row 261743. Skipping...
Error: Cannot replace GPT3.5 in row 261743. Skipping...
Error: Cannot replace gpt3.5 in row 261743. Skipping...
Error: Cannot replace GPT-3.5 in row 261743. Skipping...
Error: Cannot re

Replacing common words in comments: 100%|██████████| 323799/323799 [04:35<00:00, 1177.14it/s]


In [22]:
df_comment.loc[df_comment['Comment Text'].astype(str).str.startswith(">"), 'Comment Text'] = ' '

In [23]:
# Count the initial number of rows
initial_count = len(df_comment)

# Remove rows consisting only of spaces
df_comment = df_comment[~(df_comment['Comment Text'].astype(str).str.strip() == '')]

# Count the final number of rows
final_count = len(df_comment)

# Calculate the count of removed rows
removed_count = initial_count - final_count

# Print the updated DataFrame
print(df_comment)

# Print the count of removed rows
print(f"Removed {removed_count} rows.")

       Subreddit ID of Post   
0        ChatGPT    12cobqr  \
1        ChatGPT    12cobqr   
2        ChatGPT    12cobqr   
3        ChatGPT    12cobqr   
4        ChatGPT    12cobqr   
...          ...        ...   
323794       ufc    11i3nxy   
323795       ufc    11i3nxy   
323796       ufc    11i3nxy   
323797       ufc    11i3nxy   
323798       ufc    11i3nxy   

                                             Comment Text  Score   
0         Recognizing ASCII shrek is the last step to AGI   2180  \
1                              That's GPT 3.5, green icon    644   
2                                   It’s not GPT-4 though    358   
3                                        This is not GPT4    526   
4       Can someone in the comments please do this aga...    138   
...                                                   ...    ...   
323794  And he's not just a tall lanky can like James ...      1   
323795              Easiest match Jones ever had tonight?      5   
323796  Remembe

In [37]:
#Checking if the preprocessing has worked
result7 = df_comment['Comment Text'].str.contains("ChatGPT")
result8 = df_comment['Comment Text'].str.contains("\n")
result9 = df_comment['Comment Text'].str.startswith('>')

print(f'ChatGPT in Comment Text: {result7.any()}')
print(f"\\n in Comment Text: {result8.any()}") #The technique was not able to remove all
print(f'Comment Text starts with >: {result9.any()}')

ChatGPT in Comment Text: False
\n in Comment Text: True
Comment Text starts with >: False


In [40]:
#Checking if the preprocessing has wroked
result9 = df_comment['Comment Text'].str.contains('[verwijderd]')
print(f"[verwijderd] in Comment Text: {result9.any()}\n")

#This presents that the preprocessing of [verwijderd] has not worked
#However, it did:
df_comment_test = pd.read_csv(cwd+ '/data/F1comments_chatgpt.csv')
print(f'The original dataset:\n{df_comment_test["Comment Text"][:9]}\n')
print(f'The preprocessed dataset:\n{df_comment["Comment Text"][:9]}')

[verwijderd] in Comment Text: True

The original dataset:
0      Recognizing ASCII shrek is the last step to AGI
1                           That's GPT 3.5, green icon
2                                It’s not GPT-4 though
3                                     This is not GPT4
4    Can someone in the comments please do this aga...
5    GPT4 thinks otherwise\n\nhttps://preview.redd....
6                 Famous: yes\n\nMona Lisa: absolutely
7                                         [verwijderd]
8    I think you discovered a secret way for humans...
Name: Comment Text, dtype: object

The preprocessed dataset:
0      Recognizing ASCII shrek is the last step to AGI
1                                 That's  , green icon
2                                    It’s not   though
3                                        This is not  
4    Can someone in the comments please do this aga...
5                                thinks otherwise\n\n 
6                 Famous: yes\n\nMona Lisa: absolutely
7

In [60]:
#Saving the preprocessed data
df_comment.to_csv(cwd+'/data/Comment_dataset_processed.csv', index=False)