In [67]:
import pandas as pd
import json
import re
from datetime import datetime

# # Load the JSON file
# with open('cleaning/reddit_submissions.json', 'r') as f:
#     data = json.load(f)

# submissions = data['submissions']

# # Create DataFrame from the submissions data
# df = pd.DataFrame(submissions)

# df = df[["id", "created_utc", "title", "selftext", "score", "upvote_ratio", "num_comments", "num_crossposts", "url"]]

# df.to_csv("reduced_reddit_posts.csv", index=False)

df = pd.read_csv("reduced_reddit_posts.csv")

df = df[(df["selftext"] != '') & (df["selftext"] != "[removed]") & (df["selftext"] != "[deleted]")].dropna(subset=["selftext"]).reset_index(drop=True)

df


Unnamed: 0,id,created_utc,title,selftext,score,upvote_ratio,num_comments,num_crossposts,url
0,vdcl0,1340259437,is this thing on?,anybody awake?,3,,2,,http://www.reddit.com/r/Menopause/comments/vdc...
1,12bic1,1351567414,Turned 40 last July and started have hot flash...,My mother said she went into full blown menopa...,1,,0,,http://www.reddit.com/r/Menopause/comments/12b...
2,13fiwk,1353294608,winter shoes for mom,my mom has osteoarthritis and i want to get he...,1,,0,,http://www.reddit.com/r/Menopause/comments/13f...
3,15jjqu,1356649991,Sex after Menopause?,"My wife went through this ""wonderful"" change, ...",7,,3,,http://www.reddit.com/r/Menopause/comments/15j...
4,19wwpb,1362754970,How does one live with someone going through M...,My wife (early 50s and has been diagnosed by h...,3,,6,,http://www.reddit.com/r/Menopause/comments/19w...
...,...,...,...,...,...,...,...,...,...
11597,1004teb,1672524104,I'm so over this moodiness,I'm in peri and have been for some time. Moodi...,33,0.96,20,0.0,https://www.reddit.com/r/Menopause/comments/10...
11598,10058dp,1672525367,How can we make 2023 kinder to us than 2022?,What are non-medicated ways to ease the depres...,14,1.00,17,0.0,https://www.reddit.com/r/Menopause/comments/10...
11599,1005nki,1672526636,Hot flashes,I'm 54 post meno and taking estrodot 75 mg and...,3,1.00,3,0.0,https://www.reddit.com/r/Menopause/comments/10...
11600,1006cfo,1672528757,Perimenopause - change in vaginal odor?,"Hi everyone,\n\nHas anyone else noticed a chan...",23,0.97,24,0.0,https://www.reddit.com/r/Menopause/comments/10...


In [68]:
def epoch_to_date(epoch_time):
    """
    Convert epoch time to datetime object with only year, month, and day.

    :param epoch_time: Epoch time in seconds
    :type epoch_time: int or float
    :return: Date string in the format 'YYYY-MM-DD'
    :rtype: str
    """
    dt = datetime.fromtimestamp(epoch_time)
    return dt.strftime('%Y-%m-%d')

df["created_utc"] = df["created_utc"].apply(epoch_to_date)

In [69]:
# extract age information
def extract_age(text):
    # Regular expression pattern to extract age information
    age_pattern = r"(?:\b(?:I\s*am|I'm|Im|age|aged|old|young|turn|turning|turned|early|mid|late)\b\s*(\d{1,2})\s*(?:years?|yrs?|s|yo)?\b|\b(\d{1,2})\s*(?:years?|yrs?|yo)\b)"
    match = re.search(age_pattern, text, re.IGNORECASE)
    if match:
        age_group = match.group(1) or match.group(2)
        if age_group:
            return int(age_group)
    else:
        return None


df['title_age'] = df['title'].apply(lambda x: extract_age(x) if pd.notnull(x) else None)
df['selftext_age'] = df['selftext'].apply(lambda x: extract_age(x) if pd.notnull(x) else None)

# Compare age information from title and selftext, take the larger value
def choose_age(row):
    if pd.notnull(row['title_age']) and pd.notnull(row['selftext_age']):
        return max(row['title_age'], row['selftext_age'])
    elif pd.notnull(row['title_age']):
        return row['title_age']
    elif pd.notnull(row['selftext_age']):
        return row['selftext_age']
    else:
        return None

# Create 'age' column with the larger age value from title or selftext
df['age'] = df.apply(choose_age, axis=1)

# Drop intermediate columns
df.drop(['title_age', 'selftext_age'], axis=1, inplace=True)

In [70]:
def find_perimenopause(text):

    # Define the regex pattern to search for perimenopause and postmenopause mentions
    menopause_pattern = r'\b((peri|pre)[- ]?(menop(ausal|ause))?)\b'
    
    # Find all matches of the pattern in the text
    match = re.search(menopause_pattern, text, re.IGNORECASE)
    
    return bool(match)


df['perimenopause'] = df['selftext'].apply(find_perimenopause) | df["title"].apply(find_perimenopause)


In [71]:
def find_postmenopause(text):

    # Define the regex pattern to search for perimenopause and postmenopause mentions
    menopause_pattern = r'\b((post)[- ]?(menop(ausal|ause))|after\s+men(?:opause)?)\b'
    
    # Find all matches of the pattern in the text
    match = re.search(menopause_pattern, text, re.IGNORECASE)
    
    
    return bool(match)

def check_both_menopause(row):
    if row['perimenopause'] and row['postmenopause']:
        row['perimenopause'] = False
    return row

df['postmenopause'] = df['selftext'].apply(find_postmenopause) | df["title"].apply(find_postmenopause)
df = df.apply(check_both_menopause, axis=1)

In [72]:
# hot flashes or night sweats
def search_hot_flashes(text):
    # Regular expression pattern to search for variations of hot flashes
    hot_flashes_pattern = r'\b(?:hot\s*flashes?|hot\s*flash?|flash|flashes|heat|hot|sweat|sweating|sweats|night\s*sweat?|night\s*sweats?)\b'
    match = re.search(hot_flashes_pattern, text, re.IGNORECASE)
    if match:
        return True
    else:
        return False

df['hot_flashes/night_sweats'] = df['title'].apply(search_hot_flashes) | df['selftext'].apply(search_hot_flashes)

In [73]:
# sleeping issues, insomnia
def sleep_disorder(text):
    pattern = r'sleep(?:ing)?|insomni(?:a|ac)'
    
    # Search for the pattern in the text
    matches = re.findall(pattern, text, re.IGNORECASE)
    
    # Return True if any match is found, else False
    return bool(matches)

df["sleep_disorder"] = df['selftext'].apply(sleep_disorder) | df["title"].apply(sleep_disorder)

In [76]:
# Search for mentions of depression
def depression(text):
    
    # Define the regex pattern to search for mentions of depression
    pattern = r'\bdepres'
    
    # Search for the pattern in the text
    matches = re.findall(pattern, text, re.IGNORECASE)
    
    # Return True if any match is found, else False
    return bool(matches)

df["depression"] = df['selftext'].apply(depression) | df["title"].apply(depression)

In [81]:
# Search for mentions of anxiety
def anxiety(text):
    
    # Define the regex pattern to search for mentions of anxiety
    pattern = r'\banxi'
    
    # Search for the pattern in the text
    matches = re.findall(pattern, text, re.IGNORECASE)
    
    # Return True if any match is found, else False
    return bool(matches)

df["anxiety"] = df['selftext'].apply(anxiety) | df["title"].apply(anxiety)

In [82]:
print(df[df["id"] == "t83sca"])
print(len(df[(df["sleep_disorder"] == True)]))
df

          id created_utc                                     title  \
6680  t83sca  2022-03-06  anxiety - where could it be coming from?   

                                               selftext  score  upvote_ratio  \
6680  I started 100mg vaginal progesterone a few mon...     10          0.79   

      num_comments  num_crossposts  \
6680            29             0.0   

                                                    url  age  perimenopause  \
6680  https://www.reddit.com/r/Menopause/comments/t8...  NaN          False   

      postmenopause  hot_flashes/night_sweats  sleep_disorder  depression  \
6680          False                     False           False        True   

      anxiety  
6680     True  
2022


Unnamed: 0,id,created_utc,title,selftext,score,upvote_ratio,num_comments,num_crossposts,url,age,perimenopause,postmenopause,hot_flashes/night_sweats,sleep_disorder,depression,anxiety
0,vdcl0,2012-06-20,is this thing on?,anybody awake?,3,,2,,http://www.reddit.com/r/Menopause/comments/vdc...,,False,False,False,False,False,False
1,12bic1,2012-10-29,Turned 40 last July and started have hot flash...,My mother said she went into full blown menopa...,1,,0,,http://www.reddit.com/r/Menopause/comments/12b...,40.0,False,False,True,False,False,False
2,13fiwk,2012-11-18,winter shoes for mom,my mom has osteoarthritis and i want to get he...,1,,0,,http://www.reddit.com/r/Menopause/comments/13f...,,False,False,False,False,False,False
3,15jjqu,2012-12-27,Sex after Menopause?,"My wife went through this ""wonderful"" change, ...",7,,3,,http://www.reddit.com/r/Menopause/comments/15j...,,False,True,False,False,False,False
4,19wwpb,2013-03-08,How does one live with someone going through M...,My wife (early 50s and has been diagnosed by h...,3,,6,,http://www.reddit.com/r/Menopause/comments/19w...,50.0,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11597,1004teb,2022-12-31,I'm so over this moodiness,I'm in peri and have been for some time. Moodi...,33,0.96,20,0.0,https://www.reddit.com/r/Menopause/comments/10...,,True,False,False,False,False,False
11598,10058dp,2022-12-31,How can we make 2023 kinder to us than 2022?,What are non-medicated ways to ease the depres...,14,1.00,17,0.0,https://www.reddit.com/r/Menopause/comments/10...,,False,False,False,False,True,True
11599,1005nki,2022-12-31,Hot flashes,I'm 54 post meno and taking estrodot 75 mg and...,3,1.00,3,0.0,https://www.reddit.com/r/Menopause/comments/10...,54.0,False,False,True,False,False,False
11600,1006cfo,2022-12-31,Perimenopause - change in vaginal odor?,"Hi everyone,\n\nHas anyone else noticed a chan...",23,0.97,24,0.0,https://www.reddit.com/r/Menopause/comments/10...,,True,False,True,False,False,False
