In [259]:
import pandas as pd
import numpy as np
import json
import re
from datetime import datetime

# # Load the JSON file
# with open('cleaning/reddit_submissions.json', 'r') as f:
#     data = json.load(f)

# submissions = data['submissions']

# # Create DataFrame from the submissions data
# df = pd.DataFrame(submissions)

# df = df[["id", "created_utc", "title", "selftext", "score", "upvote_ratio", "num_comments", "num_crossposts", "url"]]

# df.to_csv("reduced_reddit_posts.csv", index=False)

df = pd.read_csv("reduced_reddit_posts.csv")

df = df[(df["selftext"] != '') & (df["selftext"] != "[removed]") & (df["selftext"] != "[deleted]")].dropna(subset=["selftext"]).reset_index(drop=True)
df = df.drop("num_crossposts", axis=1)
df


Unnamed: 0,id,created_utc,title,selftext,score,upvote_ratio,num_comments,url
0,vdcl0,1340259437,is this thing on?,anybody awake?,3,,2,http://www.reddit.com/r/Menopause/comments/vdc...
1,12bic1,1351567414,Turned 40 last July and started have hot flash...,My mother said she went into full blown menopa...,1,,0,http://www.reddit.com/r/Menopause/comments/12b...
2,13fiwk,1353294608,winter shoes for mom,my mom has osteoarthritis and i want to get he...,1,,0,http://www.reddit.com/r/Menopause/comments/13f...
3,15jjqu,1356649991,Sex after Menopause?,"My wife went through this ""wonderful"" change, ...",7,,3,http://www.reddit.com/r/Menopause/comments/15j...
4,19wwpb,1362754970,How does one live with someone going through M...,My wife (early 50s and has been diagnosed by h...,3,,6,http://www.reddit.com/r/Menopause/comments/19w...
...,...,...,...,...,...,...,...,...
11597,1004teb,1672524104,I'm so over this moodiness,I'm in peri and have been for some time. Moodi...,33,0.96,20,https://www.reddit.com/r/Menopause/comments/10...
11598,10058dp,1672525367,How can we make 2023 kinder to us than 2022?,What are non-medicated ways to ease the depres...,14,1.00,17,https://www.reddit.com/r/Menopause/comments/10...
11599,1005nki,1672526636,Hot flashes,I'm 54 post meno and taking estrodot 75 mg and...,3,1.00,3,https://www.reddit.com/r/Menopause/comments/10...
11600,1006cfo,1672528757,Perimenopause - change in vaginal odor?,"Hi everyone,\n\nHas anyone else noticed a chan...",23,0.97,24,https://www.reddit.com/r/Menopause/comments/10...


In [260]:
def epoch_to_date(epoch_time):
    """
    Convert epoch time to datetime object with only year, month, and day.

    :param epoch_time: Epoch time in seconds
    :type epoch_time: int or float
    :return: Date string in the format 'YYYY-MM-DD'
    :rtype: str
    """
    dt = datetime.fromtimestamp(epoch_time)
    return dt.strftime('%Y-%m-%d')

df["created_utc"] = df["created_utc"].apply(epoch_to_date)

In [316]:
# extract age information
def extract_age(text):
    # Regular expression pattern to extract age information
    age_pattern = r"(?:\b(?:I\s*am|I'?m|am|I'?m\s*at|she'?s|she\s*is|age|aged|turn|turning|turned|early|mid|late)\b\s*(\d{2})\s*(?:years?\s*old|yrs?\s*old|y/o|s|yo|f)?\b|\b(\d{2})\s*(?:years?\s*old|yrs?\s*old|y/o|f)\b)"
    match = re.findall(age_pattern, text, re.IGNORECASE)
    if match:
        maximum = int(max(max(match)))

        if maximum < 15 or maximum > 85:
            return None

        return maximum
    else:
        return None


df['title_age'] = df['title'].apply(lambda x: extract_age(x) if pd.notnull(x) else None)
df['selftext_age'] = df['selftext'].apply(lambda x: extract_age(x) if pd.notnull(x) else None)

# Compare age information from title and selftext, take the larger value
def choose_age(row):
    if pd.notnull(row['title_age']) and pd.notnull(row['selftext_age']):
        return max(row['title_age'], row['selftext_age'])
    elif pd.notnull(row['title_age']):
        return row['title_age']
    elif pd.notnull(row['selftext_age']):
        return row['selftext_age']
    else:
        return None

# Create 'age' column with the larger age value from title or selftext
df['age'] = df.apply(choose_age, axis=1)

# Drop intermediate columns
df.drop(['title_age', 'selftext_age'], axis=1, inplace=True)

df["age"] = df["age"].fillna(np.nan)


In [262]:
def find_perimenopause(text):

    # Define the regex pattern to search for perimenopause and postmenopause mentions
    menopause_pattern = r'\b((peri|pre)[- ]?(menop(ausal|ause))?)\b'
    
    # Find all matches of the pattern in the text
    match = re.search(menopause_pattern, text, re.IGNORECASE)
    
    return bool(match)


df['perimenopause'] = df['selftext'].apply(find_perimenopause) | df["title"].apply(find_perimenopause)


In [263]:
def find_postmenopause(text):

    # Define the regex pattern to search for perimenopause and postmenopause mentions
    menopause_pattern = r'\b((post)[- ]?(menop(ausal|ause))|after\s+men(?:opause)?)\b|hyster'
    
    # Find all matches of the pattern in the text
    match = re.search(menopause_pattern, text, re.IGNORECASE)
    
    return bool(match)

def check_both_menopause(row):
    if row['perimenopause'] and row['postmenopause']:
        row['perimenopause'] = False
    return row

df['postmenopause'] = df['selftext'].apply(find_postmenopause) | df["title"].apply(find_postmenopause)
df = df.apply(check_both_menopause, axis=1)

In [264]:
# hot flashes or night sweats
def search_hot_flashes(text):
    # Regular expression pattern to search for variations of hot flashes
    hot_flashes_pattern = r'\b(?:hot\s*flashes?|hot\s*flash?|flash|flashes|hot|sweat|sweating|sweats|heat|night\s*sweats?)'
    match = re.search(hot_flashes_pattern, text, re.IGNORECASE)
    if match:
        return True
    else:
        return False

df['hot_flashes/night_sweats'] = df['title'].apply(search_hot_flashes) | df['selftext'].apply(search_hot_flashes)

In [265]:
# sleeping issues, insomnia
def sleep_disorder(text):
    pattern = r'sleep(?:ing)?|insomni(?:a|ac)'
    
    # Search for the pattern in the text
    matches = re.findall(pattern, text, re.IGNORECASE)
    
    # Return True if any match is found, else False
    return bool(matches)

df["sleep_disorder"] = df['selftext'].apply(sleep_disorder) | df["title"].apply(sleep_disorder)

In [266]:
# Search for mentions of depression
def depression(text):
    
    # Define the regex pattern to search for mentions of depression
    pattern = r'\bdepres'
    
    # Search for the pattern in the text
    matches = re.findall(pattern, text, re.IGNORECASE)
    
    # Return True if any match is found, else False
    return bool(matches)

df["depression"] = df['selftext'].apply(depression) | df["title"].apply(depression)

In [267]:
# Search for mentions of anxiety
def anxiety(text):
    
    # Define the regex pattern to search for mentions of anxiety
    pattern = r'\banxi'
    
    # Search for the pattern in the text
    matches = re.findall(pattern, text, re.IGNORECASE)
    
    # Return True if any match is found, else False
    return bool(matches)

df["anxiety"] = df['selftext'].apply(anxiety) | df["title"].apply(anxiety)

In [268]:
# Search for mentions of mood problems
def mood(text):
    
    # Define the regex pattern to search for mentions of anxiety
    pattern = r'\bmood|rage|raging|bitch|suici|emotion'
    
    # Search for the pattern in the text
    matches = re.search(pattern, text, re.IGNORECASE)
    
    # Return True if any match is found, else False
    return bool(matches)

df["mood_problems"] = df['selftext'].apply(mood) | df["title"].apply(mood)

In [269]:
def sexual_activity(text):
    pattern = r'\bsex|orgasm|libido'

     # Search for the pattern in the text
    matches = re.findall(pattern, text, re.IGNORECASE)
    
    # Return True if any match is found, else False
    return bool(matches)

df["sexual_problems"] = df['selftext'].apply(sexual_activity) | df["title"].apply(sexual_activity)

In [270]:
def vaginal_dryness(text):
    vag_pattern = r'\bvagin'
    dry_pattern = r'\bdry|burn|itch|irrit'
     # Search for the pattern in the text
    
    # Return True if any match is found, else False
    return bool(re.search(vag_pattern, text, re.IGNORECASE) and re.search(dry_pattern, text, re.IGNORECASE))

df["vaginal_dryness"] = df['selftext'].apply(vaginal_dryness) | df["title"].apply(vaginal_dryness)

In [271]:
def aches_pains(text):
    pattern = r"\bpain|ach|hurt|joint|arthri"

    matches = re.findall(pattern, text, re.IGNORECASE)
    
    # Return True if any match is found, else False
    return bool(matches)

df["aches_and_pains"] = df['selftext'].apply(aches_pains) | df["title"].apply(aches_pains)    

In [272]:
def skin_problems(text):
    pattern = r'\bskin|acne|sag|wrinkle'

    return bool(re.search(pattern, text, re.IGNORECASE))

df["skin_problems"] = df['selftext'].apply(skin_problems) | df["title"].apply(skin_problems)

In [273]:
def bladder_control_problems(text):
    pattern = r"\bpee|bladder|urine"

    return bool(re.search(pattern, text, re.IGNORECASE))

df["bladder_control_problems"] = df['selftext'].apply(bladder_control_problems) | df["title"].apply(bladder_control_problems)

In [274]:
def heavy_irregular_periods(text):
    # Define the regex pattern to search for "period" and "heavy" in any order
    period_pattern = r"\bperiod|bleed|bled|blood|cycle|clot"
    period_match = re.search(period_pattern, text, re.IGNORECASE)
    
    # heavy_irreg_pattern = r"\bheav|irreg"
    # heavy_irreg_match = re.search(heavy_irreg_pattern, text, re.IGNORECASE)
    # Check if the pattern is found in the text

    return bool(period_match)

df["heavy/irregular_periods"] = df["selftext"].apply(heavy_irregular_periods) | df["title"].apply(heavy_irregular_periods)

In [275]:
def brain_fog(text):
    pattern = r"\bbrain\s*fog|forget"

    matches = re.search(pattern, text, re.IGNORECASE)
    
    # Return True if any match is found, else False
    return bool(matches)

df["brain_fog"] = df['selftext'].apply(brain_fog) | df["title"].apply(brain_fog)

In [276]:
def weight_gain(text):
    pattern = r"\bweight"

    matches = re.search(pattern, text, re.IGNORECASE)
    
    # Return True if any match is found, else False
    return bool(matches)

df["weight_gain"] = df['selftext'].apply(weight_gain) | df["title"].apply(weight_gain)

In [277]:
def hormone(text):
    pattern = r"\b(patch|dhea|hrt|estrace|estradiol|estro|estriol|progesterone|testo|steroid|hormone\s*repl|hormone\s*thera)"

    matches = re.search(pattern, text, re.IGNORECASE)
    
    # Return True if any match is found, else False
    return bool(matches)
df["hormone_therapy"] = df['selftext'].apply(hormone) | df["title"].apply(hormone)

In [278]:
def antidepressants(text):
    pattern = r"\bantide|ssri|snri|zoloft|prozac|paxil|pexeva|celexa|lexapro|Cymbalta|Drizalma|effexor|Pristiq|fetzima"

    matches = re.findall(pattern, text, re.IGNORECASE)
    
    # Return True if any match is found, else False
    return bool(matches)
df["antidepressants"] = df['selftext'].apply(antidepressants) | df["title"].apply(antidepressants)

In [279]:
def vitamins(text):
    pattern = r"\bvitamin"

    matches = re.search(pattern, text, re.IGNORECASE)
    
    # Return True if any match is found, else False
    return bool(matches)
df["vitamins"] = df['selftext'].apply(vitamins) | df["title"].apply(vitamins)

In [280]:
def exercise(text):
    pattern = r"\bexerci"

    matches = re.search(pattern, text, re.IGNORECASE)
    
    # Return True if any match is found, else False
    return bool(matches)
df["exercise"] = df['selftext'].apply(exercise) | df["title"].apply(exercise)

In [317]:
print(df[df["id"] == "mh5pcl"])
print(len(df[(df["bladder_control_problems"] == True)]))

df[df["age"] > 70]
# filtered_df = df[(~df.iloc[:, 9:]).all(axis=1)]
# filtered_df.iloc[500:550]


          id created_utc                                           title  \
3888  mh5pcl  2021-03-31  Anyone going through peri/meno with a toddler?   

                                               selftext  score  upvote_ratio  \
3888  I feel hopeless right now, I thought he would ...     21          0.94   

      num_comments                                                url  age  \
3888            36  https://www.reddit.com/r/Menopause/comments/mh...  NaN   

      perimenopause  ...  aches_and_pains  skin_problems  \
3888           True  ...            False          False   

      bladder_control_problems  heavy/irregular_periods  brain_fog  \
3888                     False                    False      False   

      weight_gain  hormone_therapy  antidepressants  vitamins  exercise  
3888        False            False            False     False     False  

[1 rows x 28 columns]
366


Unnamed: 0,id,created_utc,title,selftext,score,upvote_ratio,num_comments,url,age,perimenopause,...,aches_and_pains,skin_problems,bladder_control_problems,heavy/irregular_periods,brain_fog,weight_gain,hormone_therapy,antidepressants,vitamins,exercise
765,a7dz37,2018-12-18,raise your hand if you're ever...,checked the thermostat to make sure it wasn't ...,48,,32,https://www.reddit.com/r/Menopause/comments/a7...,80.0,False,...,False,False,False,False,False,False,False,False,False,False
914,atlzy8,2019-02-22,"Joint pain, tendon pain...just feel like I'm 8...","55 years old. 5'5"" 130 lbs. Active and eat pre...",25,,26,https://www.reddit.com/r/Menopause/comments/at...,80.0,False,...,True,False,False,True,False,False,False,False,False,False
1278,ca4plb,2019-07-07,Today was enlightening,"It's not so much about menopause, even though ...",44,,20,https://www.reddit.com/r/Menopause/comments/ca...,85.0,False,...,True,False,False,False,False,False,False,False,False,False
1729,dt2mqb,2019-11-07,"“Menopause was so easy for me, I really don’t ...",My MIL is constantly “amazed” at what a diffic...,65,,40,https://www.reddit.com/r/Menopause/comments/dt...,75.0,True,...,False,False,False,False,False,False,False,False,False,False
2068,ey5a3j,2020-02-03,A way out for the open minded - Ayelet Waldman...,Microdosing psychedelic mushrooms is helping m...,22,,12,https://www.reddit.com/r/Menopause/comments/ey...,80.0,True,...,True,False,False,False,False,False,False,False,False,False
2357,gb1rmo,2020-04-30,Confused estrogen - too much? too little? AND ...,"Hello, everyone -\n\nI am desperate for answer...",0,0.5,0,https://www.reddit.com/r/Menopause/comments/gb...,80.0,False,...,True,False,False,True,False,True,True,False,False,True
2719,i63z6t,2020-08-08,"Follow up: 2 days on HRT, wow!",original post: https://www.reddit.com/r/Menop...,63,0.98,23,https://www.reddit.com/r/Menopause/comments/i6...,80.0,False,...,False,False,False,False,False,False,True,False,False,False
2990,j5dxbm,2020-10-04,At what age does HRT stop?,I'm wondering what people know/think about how...,6,1.0,8,https://www.reddit.com/r/Menopause/comments/j5...,82.0,False,...,False,False,False,False,False,False,True,False,False,False
4109,n0054k,2021-04-27,I miss dry underwear,Panty liners are fine for the occasional damp ...,60,0.98,62,https://www.reddit.com/r/Menopause/comments/n0...,80.0,False,...,False,False,False,True,False,False,False,False,False,False
4258,nf1g2j,2021-05-17,"""You have the vagina of a 75 year old""","Not sure where to start, a little heartbroken....",18,1.0,23,https://www.reddit.com/r/Menopause/comments/nf...,75.0,False,...,True,False,False,True,False,False,False,False,False,False
