In [None]:
import os
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json

In [None]:
!pwd

/content


In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive/ so that the
# 'drive/My Drive/...' paths used by this notebook resolve.
# Only works inside Google Colab (the google.colab package is not available elsewhere).
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# Paths of the raw Reddit dumps for the PCOS subreddit (submissions and comments).
# They are relative to the working directory (/content on Colab) and point into the
# mounted Google Drive. Each file is parsed one line at a time with json.loads,
# i.e. it holds one JSON object per line rather than a single JSON document.
file_path_post = 'drive/My Drive/Projects/LSE-Symptomatology/Reddit Raw Data/PCOS_submissions'
file_path_comment = 'drive/My Drive/Projects/LSE-Symptomatology/Reddit Raw Data/PCOS_comments'

In [None]:
# Load the submissions dump: one JSON object per line (NDJSON)
data_submission = []

# Explicit UTF-8 (the JSON interchange encoding) so decoding does not depend on
# the locale of the machine running the notebook.
with open(file_path_post, 'r', encoding='utf-8') as file:
    for line in file:
        # Skip blank lines (e.g. a trailing newline) instead of crashing in json.loads
        if line.strip():
            data_submission.append(json.loads(line))

In [None]:
# Load the comments dump: one JSON object per line (NDJSON)
data_comments = []

# Explicit UTF-8 (the JSON interchange encoding) so decoding does not depend on
# the locale of the machine running the notebook.
with open(file_path_comment, 'r', encoding='utf-8') as file:
    for line in file:
        # Skip blank lines (e.g. a trailing newline) instead of crashing in json.loads
        if line.strip():
            data_comments.append(json.loads(line))

In [None]:
# Turn each list of parsed JSON objects into a DataFrame (one row per object,
# one column per key).
df_submission, df_comments = (
    pd.DataFrame(records) for records in (data_submission, data_comments)
)

In [None]:
# Convert 'created_utc' (epoch seconds) to pandas datetimes.
#
# The values are cast to a numeric dtype first: pandas warns when to_datetime is
# given unit='s' together with values it has to parse from strings, and the
# recorded output of this cell shows a warning on both conversions
# (presumably that one -- NOTE(review): confirm against the full warning text).
#
# The dtype guard makes the cell safe to re-run: a column that is already
# datetime is left untouched instead of being re-interpreted as seconds.
for _frame in (df_submission, df_comments):
    if not pd.api.types.is_datetime64_any_dtype(_frame['created_utc']):
        _frame['created_utc'] = pd.to_datetime(pd.to_numeric(_frame['created_utc']), unit='s')

  df_submission['created_utc'] = pd.to_datetime(df_submission['created_utc'], unit='s')
  df_comments['created_utc'] = pd.to_datetime(df_comments['created_utc'], unit='s')


In [None]:
# Keep only the raw fields that the rest of the notebook works with
submission_fields = [
    'id', 'author', 'created_utc', 'link_flair_text', 'title',
    'selftext', 'num_comments', 'score', 'ups', 'downs',
]
comment_fields = ['id', 'author', 'created_utc', 'body', 'parent_id', 'link_id']

df_submission = df_submission[submission_fields]
df_comments = df_comments[comment_fields]

In [None]:
# Give the selected columns the names used throughout the rest of the notebook.
# The mappings are kept as named dicts and applied by assignment rather than in place.
submission_column_names = {
    'id': 'postID',
    'author': 'authorID',
    'created_utc': 'date',
    'link_flair_text': 'flairName',
    'title': 'title',
    'selftext': 'post',
    'num_comments': 'num_comments',
    'score': 'score',
    'ups': 'upvote',
    'downs': 'downvote'
}
comment_column_names = {
    'id': 'commentID',
    'author': 'authorID',
    'created_utc': 'date',
    'body': 'comment',
    'parent_id': 'parentID',
    'link_id': 'linkID',
}

df_submission = df_submission.rename(columns=submission_column_names)
df_comments = df_comments.rename(columns=comment_column_names)

In [None]:
df_submission.tail()

Unnamed: 0,postID,authorID,date,flairName,title,post,num_comments,score,upvote,downvote
81090,1006k7h,Comfortable-Cat-4323,2022-12-31 23:30:45,General Health,Is this what insulin resistance feels like?,"I have not been diagnosed with PCOS, but I hav...",2,2,,
81091,1006s76,Existing-Cherry4948,2022-12-31 23:42:26,General/Advice,First Gyno Consult. Advice,My appointment with the gyno is long overdue l...,5,4,,
81092,1006vwo,Guilty-Chipmunk2939,2022-12-31 23:48:02,General/Advice,adrenal pcos,"i’ve recently been diagnosed with pcos, but i ...",5,8,,
81093,1006w1s,Longlastingsorrow,2022-12-31 23:48:14,Diet - Not Keto,Bad appetite during period,[removed],0,1,,
81094,1006yru,[deleted],2022-12-31 23:52:15,Research/Survey,Amh 7.1,[deleted],2,2,,


In [None]:
df_comments.head()

Unnamed: 0,commentID,authorID,date,comment,parentID,linkID
0,c0uubmc,improbablywrong,2010-07-21 22:15:28,"I've tried several items on this list, like Me...",t3_cs80y,t3_cs80y
1,c0ux9q2,emmster,2010-07-22 20:41:36,"I eat a generally lower carb diet (That's ""die...",t3_cs80y,t3_cs80y
2,c0uxcw1,vixiera,2010-07-22 21:13:36,"I self-diagnosed myself a few years ago, but I...",t3_cs80y,t3_cs80y
3,c0uxxpr,improbablywrong,2010-07-23 00:55:28,"I haven't visited their forum for a while, but...",t3_csm11,t3_csm11
4,c0uy2d1,Amberkins72187,2010-07-23 01:48:48,I have had that site bookmarked forever. I hav...,t3_csm11,t3_csm11


In [None]:
len(df_submission), len(df_comments)

(81095, 675577)

**Steps: Process Posts**

In [None]:
# (1) Remove submissions whose 'authorID' is "[deleted]" AND whose 'post' body is
#     either "[deleted]" or "[removed]" (both conditions must hold).
author_is_deleted = df_submission['authorID'].eq("[deleted]")
post_body_missing = df_submission['post'].isin(["[deleted]", "[removed]"])
overlap_rows_indices = df_submission.index[author_is_deleted & post_body_missing]
df_submission = df_submission.drop(index=overlap_rows_indices)

In [None]:
len(df_submission)

64136

In [None]:

# (2) Delete rows whose 'post' body has FIVE WORDS OR FEWER.
# NOTE(review): the previous comment said "fewer than five words", but the code has
# always used `<= 5`, and the row counts recorded below were produced with it.
# The executed behaviour is kept; change MAX_SHORT_POST_WORDS to 4 if a strict
# "fewer than five" cut-off is what the study protocol specifies.
MAX_SHORT_POST_WORDS = 5

# astype(str) mirrors the original str(x): a missing body becomes the single
# word 'nan' and is therefore dropped as well.
post_word_counts = df_submission['post'].astype(str).str.split().str.len()
short_posts_indices = df_submission.index[post_word_counts <= MAX_SHORT_POST_WORDS]
df_submission = df_submission.drop(short_posts_indices)



In [None]:
len(df_submission)

60775

In [None]:
# Define personal experience keywords (first-person pronouns and contractions)
personal_experience_keywords = ["I", "my", "me", "mine", "myself", "I've", "I'm", "I'd"]

# Thresholds used by classify_post
MIN_PERSONAL_KEYWORDS = 2   # at least this many first-person words ...
MAX_URL_FRACTION = 0.4      # ... and URLs may take up at most this share of the characters

# Compiled once instead of on every call. The URL pattern is unchanged.
_URL_PATTERN = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
# A word: a run of letters, optionally followed by an apostrophe and more letters ("I've").
_WORD_PATTERN = re.compile(r"[^\W\d_]+(?:'[^\W\d_]+)?")


# Define a function to classify posts
def classify_post(text):
    """Label a post as 'Personal Experience' or 'URL Suggestion'.

    A post is a 'Personal Experience' when it contains at least
    MIN_PERSONAL_KEYWORDS first-person keywords AND its URLs account for at most
    MAX_URL_FRACTION of its characters; every other post is a 'URL Suggestion'.

    Keyword counting works on real words rather than whitespace-separated tokens,
    so "My", "me," / "me." and lowercase "i" are recognised, as are contractions
    typed with a curly apostrophe ("I’ve"). URLs are removed before counting so
    that path segments such as ".../my/..." are not mistaken for prose.

    Parameters
    ----------
    text : the post body. Non-string values are converted with str().

    Returns
    -------
    str : 'Personal Experience' or 'URL Suggestion'.
    """
    text = str(text)

    # Extract URLs and calculate their total length
    urls = _URL_PATTERN.findall(text)
    url_length = sum(len(url) for url in urls)

    # Count the personal experience keywords in the non-URL part of the text.
    # The keyword list is read at call time so later edits to it are honoured.
    prose = _URL_PATTERN.sub(' ', text).replace('\u2019', "'")
    keywords = {keyword.lower() for keyword in personal_experience_keywords}
    keyword_count = sum(
        1 for word in _WORD_PATTERN.findall(prose) if word.lower() in keywords
    )

    # Classify based on the given conditions
    if keyword_count >= MIN_PERSONAL_KEYWORDS and url_length <= MAX_URL_FRACTION * len(text):
        return 'Personal Experience'
    return 'URL Suggestion'

In [None]:
# (3) Among the posts that contain a URL, drop those classified as 'URL Suggestion'
has_url = df_submission['post'].astype(str).str.contains(r'http[s]?://', regex=True)

# Work on an explicit copy: adding a column to a slice of df_submission is what
# triggered the SettingWithCopyWarning shown in this cell's recorded output.
rows_with_urls = df_submission.loc[has_url].copy()
rows_with_urls['Classification'] = rows_with_urls['post'].apply(classify_post)

# Count the Personal experience posts and URL suggestions
classification_counts = rows_with_urls['Classification'].value_counts()

# Drop the link-sharing posts and renumber the index from 0
url_suggestion_index = rows_with_urls.index[rows_with_urls['Classification'] == 'URL Suggestion']
df_submission = df_submission.drop(url_suggestion_index).reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rows_with_urls['Classification'] = rows_with_urls['post'].apply(classify_post)


In [None]:
# Number of posts left after the three filtering steps above
remaining_number_of_rows_after_deletion = df_submission.shape[0]
remaining_number_of_rows_after_deletion

59962

**Steps: Process Comments**

In [None]:
# (1) Remove comments whose 'authorID' is "[deleted]" AND whose 'comment' body is
#     either "[deleted]" or "[removed]" (both conditions must hold).
comment_author_deleted = df_comments['authorID'].eq("[deleted]")
comment_body_missing = df_comments['comment'].isin(["[deleted]", "[removed]"])
overlap_rows_indices_comments = df_comments.index[comment_author_deleted & comment_body_missing]

# Drop those rows, then renumber the index sequentially from 0
df_comments = df_comments.drop(index=overlap_rows_indices_comments).reset_index(drop=True)

In [None]:
len(df_comments)

641441

In [None]:
df_comments.head()

Unnamed: 0,commentID,authorID,date,comment,parentID,linkID
0,c0uubmc,improbablywrong,2010-07-21 22:15:28,"I've tried several items on this list, like Me...",t3_cs80y,t3_cs80y
1,c0ux9q2,emmster,2010-07-22 20:41:36,"I eat a generally lower carb diet (That's ""die...",t3_cs80y,t3_cs80y
2,c0uxcw1,vixiera,2010-07-22 21:13:36,"I self-diagnosed myself a few years ago, but I...",t3_cs80y,t3_cs80y
3,c0uxxpr,improbablywrong,2010-07-23 00:55:28,"I haven't visited their forum for a while, but...",t3_csm11,t3_csm11
4,c0uy2d1,Amberkins72187,2010-07-23 01:48:48,I have had that site bookmarked forever. I hav...,t3_csm11,t3_csm11


**Count: Unique authors from entire dataset**

In [None]:
# Count the distinct accounts that wrote at least one post OR one comment.
# "[deleted]" is Reddit's placeholder for removed accounts, not a real user, and
# rows with a "[deleted]" author but a real body survive the filtering above, so
# the placeholder (and any missing author) is excluded from the count.
PLACEHOLDER_AUTHOR = "[deleted]"

# Extract unique authors from posts and comments
unique_authors_submissions = set(df_submission['authorID'].dropna().unique())
unique_authors_comments = set(df_comments['authorID'].dropna().unique())

# Union = authors of posts or comments (not only those who did both)
all_unique_authors = (unique_authors_submissions | unique_authors_comments) - {PLACEHOLDER_AUTHOR}

number_of_unique_authors = len(all_unique_authors)

print("Number of unique users who post or comment:", number_of_unique_authors)

Number of unique users who post and comment: 68010


In [None]:
# df is pandas dataframe
def dataframe_to_ndjson(df, file_path):
    """Write a DataFrame to `file_path` as NDJSON (one JSON object per line).

    The caller's DataFrame is not modified. Datetime columns are written as
    strings. Missing or non-finite numbers (NaN, pd.NA, +/-inf) are written as
    JSON `null`: json.dumps would otherwise emit the bare tokens NaN / Infinity,
    which are not valid JSON and are rejected by strict parsers.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to export; one output line per row.
    file_path : str or path-like
        Destination file. It is overwritten if it already exists.
    """
    import math

    def _json_safe(value):
        # np.float64 subclasses float, so this covers both Python and numpy floats
        if value is pd.NA or (isinstance(value, float) and not math.isfinite(value)):
            return None
        return value

    # Work on a copy so the datetime -> str conversion does not leak to the caller
    df = df.copy()

    # Convert all datetime columns to strings (Timestamps are not JSON serialisable)
    for col in df.columns:
        if pd.api.types.is_datetime64_any_dtype(df[col]):
            df[col] = df[col].astype(str)

    # Write the DataFrame to NDJSON
    with open(file_path, 'w', encoding='utf-8') as f:
        for record in df.to_dict(orient='records'):
            clean_record = {key: _json_safe(value) for key, value in record.items()}
            f.write(json.dumps(clean_record) + '\n')

In [None]:
# Persist the cleaned posts and comments as NDJSON files on Drive
for cleaned_frame, output_path in (
    (df_submission, 'drive/My Drive/Projects/LSE-Symptomatology/Data/PrimaryLabeled_Dataset(Post)'),
    (df_comments, 'drive/My Drive/Projects/LSE-Symptomatology/Data/PrimaryLabeled_Dataset(Comment)'),
):
    dataframe_to_ndjson(cleaned_frame, output_path)

In [None]:
# The ten most frequent flairs among the remaining posts, most common first
flair_counts = df_submission['flairName'].value_counts()
top_flairs = flair_counts.iloc[:10]

In [None]:
top_flairs


Unnamed: 0_level_0,count
flairName,Unnamed: 1_level_1
General/Advice,14180
Meds/Supplements,7218
Period,3742
Rant/Venting,2933
PLEASE ADD FLAIR,2798
General Health,2776
Weight,2032
Hair Loss/Thinning,1872
Hirsutism,1631
Fertility,1512


In [None]:
# Keep only the posts tagged with one of the four most frequent flairs for further analysis
top_four_flair_names = top_flairs.index[:4]
top_flairs_filtered = df_submission.loc[df_submission['flairName'].isin(top_four_flair_names)]

In [None]:
# Flairs selected for analysis; each one is exported to its own CSV file
flairs = [
    'General/Advice',
    'Meds/Supplements',
    'Period',
    'Rant/Venting',
]

In [None]:
def process_filename(flair_name):
    """Turn a flair name into a string that is safe to embed in a file name.

    Every character that is not a word character, whitespace or a hyphen is
    removed (so '/' disappears), then each space is replaced by an underscore.
    """
    kept_characters = re.sub(r'[^\w\s-]', '', flair_name)
    return "_".join(kept_characters.split(" "))

# Export the posts of each selected flair to its own CSV file on Drive
for flair in flairs:
    processed_name = process_filename(flair)
    is_current_flair = top_flairs_filtered['flairName'].eq(flair)
    flair_data = top_flairs_filtered.loc[is_current_flair]
    flair_data.to_csv(f'drive/My Drive/Projects/LSE-Symptomatology/Data/{processed_name}_flair.csv', index=False)