In [205]:
import pandas as pd
import json
import re
from datetime import datetime

# # Load the JSON file
# with open('cleaning/reddit_submissions.json', 'r') as f:
#     data = json.load(f)

# submissions = data['submissions']

# # Create DataFrame from the submissions data
# df = pd.DataFrame(submissions)

# df = df[["id", "created_utc", "title", "selftext", "score", "upvote_ratio", "num_comments", "num_crossposts", "url"]]

# df.to_csv("reduced_reddit_posts.csv", index=False)

df = pd.read_csv("reduced_reddit_posts.csv")

df = df[(df["selftext"] != '') & (df["selftext"] != "[removed]") & (df["selftext"] != "[deleted]")].dropna(subset=["selftext"]).reset_index(drop=True)

df


Unnamed: 0,id,created_utc,title,selftext,score,upvote_ratio,num_comments,num_crossposts,url
0,vdcl0,1340259437,is this thing on?,anybody awake?,3,,2,,http://www.reddit.com/r/Menopause/comments/vdc...
1,12bic1,1351567414,Turned 40 last July and started have hot flash...,My mother said she went into full blown menopa...,1,,0,,http://www.reddit.com/r/Menopause/comments/12b...
2,13fiwk,1353294608,winter shoes for mom,my mom has osteoarthritis and i want to get he...,1,,0,,http://www.reddit.com/r/Menopause/comments/13f...
3,15jjqu,1356649991,Sex after Menopause?,"My wife went through this ""wonderful"" change, ...",7,,3,,http://www.reddit.com/r/Menopause/comments/15j...
4,19wwpb,1362754970,How does one live with someone going through M...,My wife (early 50s and has been diagnosed by h...,3,,6,,http://www.reddit.com/r/Menopause/comments/19w...
...,...,...,...,...,...,...,...,...,...
11597,1004teb,1672524104,I'm so over this moodiness,I'm in peri and have been for some time. Moodi...,33,0.96,20,0.0,https://www.reddit.com/r/Menopause/comments/10...
11598,10058dp,1672525367,How can we make 2023 kinder to us than 2022?,What are non-medicated ways to ease the depres...,14,1.00,17,0.0,https://www.reddit.com/r/Menopause/comments/10...
11599,1005nki,1672526636,Hot flashes,I'm 54 post meno and taking estrodot 75 mg and...,3,1.00,3,0.0,https://www.reddit.com/r/Menopause/comments/10...
11600,1006cfo,1672528757,Perimenopause - change in vaginal odor?,"Hi everyone,\n\nHas anyone else noticed a chan...",23,0.97,24,0.0,https://www.reddit.com/r/Menopause/comments/10...


In [206]:
def epoch_to_date(epoch_time):
    """
    Convert epoch time to datetime object with only year, month, and day.

    :param epoch_time: Epoch time in seconds
    :type epoch_time: int or float
    :return: Date string in the format 'YYYY-MM-DD'
    :rtype: str
    """
    dt = datetime.fromtimestamp(epoch_time)
    return dt.strftime('%Y-%m-%d')

df["created_utc"] = df["created_utc"].apply(epoch_to_date)

In [207]:
# extract age information
def extract_age(text):
    # Regular expression pattern to extract age information
    age_pattern = r"(?:\b(?:I\s*am|I'm|Im|age|aged|old|young|turn|turning|turned|early|mid|late)\b\s*(\d{1,2})\s*(?:years?|yrs?|s|yo|f|m)?\b|\b(\d{1,2})\s*(?:years?|yrs?|yo|f|m)\b)"
    match = re.search(age_pattern, text, re.IGNORECASE)
    if match:
        age_group = match.group(1) or match.group(2)
        if age_group:
            return int(age_group)
    else:
        return None


df['title_age'] = df['title'].apply(lambda x: extract_age(x) if pd.notnull(x) else None)
df['selftext_age'] = df['selftext'].apply(lambda x: extract_age(x) if pd.notnull(x) else None)

# Compare age information from title and selftext, take the larger value
def choose_age(row):
    if pd.notnull(row['title_age']) and pd.notnull(row['selftext_age']):
        return max(row['title_age'], row['selftext_age'])
    elif pd.notnull(row['title_age']):
        return row['title_age']
    elif pd.notnull(row['selftext_age']):
        return row['selftext_age']
    else:
        return None

# Create 'age' column with the larger age value from title or selftext
df['age'] = df.apply(choose_age, axis=1)

# Drop intermediate columns
df.drop(['title_age', 'selftext_age'], axis=1, inplace=True)

In [208]:
def find_perimenopause(text):

    # Define the regex pattern to search for perimenopause and postmenopause mentions
    menopause_pattern = r'\b((peri|pre)[- ]?(menop(ausal|ause))?)\b'
    
    # Find all matches of the pattern in the text
    match = re.search(menopause_pattern, text, re.IGNORECASE)
    
    return bool(match)


df['perimenopause'] = df['selftext'].apply(find_perimenopause) | df["title"].apply(find_perimenopause)


In [209]:
def find_postmenopause(text):

    # Define the regex pattern to search for perimenopause and postmenopause mentions
    menopause_pattern = r'\b((post)[- ]?(menop(ausal|ause))|after\s+men(?:opause)?)\b'
    
    # Find all matches of the pattern in the text
    match = re.search(menopause_pattern, text, re.IGNORECASE)
    
    return bool(match)

def check_both_menopause(row):
    if row['perimenopause'] and row['postmenopause']:
        row['perimenopause'] = False
    return row

df['postmenopause'] = df['selftext'].apply(find_postmenopause) | df["title"].apply(find_postmenopause)
df = df.apply(check_both_menopause, axis=1)

In [210]:
# hot flashes or night sweats
def search_hot_flashes(text):
    # Regular expression pattern to search for variations of hot flashes
    hot_flashes_pattern = r'\b(?:hot\s*flashes?|hot\s*flash?|flash|flashes|heat|hot|sweat|sweating|sweats|night\s*sweat?|night\s*sweats?)\b'
    match = re.search(hot_flashes_pattern, text, re.IGNORECASE)
    if match:
        return True
    else:
        return False

df['hot_flashes/night_sweats'] = df['title'].apply(search_hot_flashes) | df['selftext'].apply(search_hot_flashes)

In [211]:
# sleeping issues, insomnia
def sleep_disorder(text):
    pattern = r'sleep(?:ing)?|insomni(?:a|ac)'
    
    # Search for the pattern in the text
    matches = re.findall(pattern, text, re.IGNORECASE)
    
    # Return True if any match is found, else False
    return bool(matches)

df["sleep_disorder"] = df['selftext'].apply(sleep_disorder) | df["title"].apply(sleep_disorder)

In [212]:
# Search for mentions of depression
def depression(text):
    
    # Define the regex pattern to search for mentions of depression
    pattern = r'\bdepres'
    
    # Search for the pattern in the text
    matches = re.findall(pattern, text, re.IGNORECASE)
    
    # Return True if any match is found, else False
    return bool(matches)

df["depression"] = df['selftext'].apply(depression) | df["title"].apply(depression)

In [213]:
# Search for mentions of anxiety
def anxiety(text):
    
    # Define the regex pattern to search for mentions of anxiety
    pattern = r'\banxi'
    
    # Search for the pattern in the text
    matches = re.findall(pattern, text, re.IGNORECASE)
    
    # Return True if any match is found, else False
    return bool(matches)

df["anxiety"] = df['selftext'].apply(anxiety) | df["title"].apply(anxiety)

In [214]:
# Search for mentions of mood problems
def mood(text):
    
    # Define the regex pattern to search for mentions of anxiety
    pattern = r'\bmood'
    
    # Search for the pattern in the text
    matches = re.findall(pattern, text, re.IGNORECASE)
    
    # Return True if any match is found, else False
    return bool(matches)

df["mood_problems"] = df['selftext'].apply(mood) | df["title"].apply(mood)

In [215]:
def sexual_activity(text):
    pattern = r'\bsex|orgasm|libido'

     # Search for the pattern in the text
    matches = re.findall(pattern, text, re.IGNORECASE)
    
    # Return True if any match is found, else False
    return bool(matches)

df["sexual_problems"] = df['selftext'].apply(sexual_activity) | df["title"].apply(sexual_activity)

In [216]:
def aches_pains(text):
    pattern = r"\bpain|ach|hurt"

    matches = re.findall(pattern, text, re.IGNORECASE)
    
    # Return True if any match is found, else False
    return bool(matches)

df["aches_and_pains"] = df['selftext'].apply(aches_pains) | df["title"].apply(aches_pains)    

In [217]:
def heavy_irregular_periods(text):
    # Define the regex pattern to search for "period" and "heavy" in any order
    period_pattern = r"\bperiod|bleed|cycle|clot"
    period_match = re.search(period_pattern, text, re.IGNORECASE)
    
    heavy_irreg_pattern = r"\bheav|irreg"
    heavy_irreg_match = re.search(heavy_irreg_pattern, text, re.IGNORECASE)
    # Check if the pattern is found in the text

    return bool(period_match and heavy_irreg_match)

df["heavy/irregular_periods"] = df["selftext"].apply(heavy_irregular_periods) | df["title"].apply(heavy_irregular_periods)

In [218]:
def hormone(text):
    pattern = r"\bpatch|hrt|estradiol|estro|progesterone|testo|steroid|hormone\s*repl|hormone\s*thera"

    matches = re.findall(pattern, text, re.IGNORECASE)
    
    # Return True if any match is found, else False
    return bool(matches)
df["hormone_therapy"] = df['selftext'].apply(mood) | df["title"].apply(mood)

In [219]:
def antidepressants(text):
    pattern = r"\bantide|ssri|snri|zoloft|prozac|paxil|pexeva|celexa|lexapro|Cymbalta|Drizalma|effexor|Pristiq|fetzima"

    matches = re.findall(pattern, text, re.IGNORECASE)
    
    # Return True if any match is found, else False
    return bool(matches)
df["antidepressants"] = df['selftext'].apply(antidepressants) | df["title"].apply(antidepressants)

In [226]:
print(df[df["id"] == "rt7m8a"])
print(len(df[(df["heavy/irregular_periods"] == True)]))
df[(df["sexual_problems"] == True)]

          id created_utc                                 title  \
6073  rt7m8a  2021-12-31  I bled everywhere and I feel better.   

                                               selftext  score  upvote_ratio  \
6073  This is linked to earlier posts. I've had insa...    144          0.99   

      num_comments  num_crossposts  \
6073            37             0.0   

                                                    url  age  ...  \
6073  https://www.reddit.com/r/Menopause/comments/rt...  NaN  ...   

      hot_flashes/night_sweats  sleep_disorder  depression  anxiety  \
6073                     False           False       False    False   

      mood_problems  sexual_problems  aches_and_pains  \
6073          False            False             True   

      heavy/irregular_periods  hormone_therapy  antidepressants  
6073                     True            False            False  

[1 rows x 22 columns]
960


Unnamed: 0,id,created_utc,title,selftext,score,upvote_ratio,num_comments,num_crossposts,url,age,...,hot_flashes/night_sweats,sleep_disorder,depression,anxiety,mood_problems,sexual_problems,aches_and_pains,heavy/irregular_periods,hormone_therapy,antidepressants
3,15jjqu,2012-12-27,Sex after Menopause?,"My wife went through this ""wonderful"" change, ...",7,,3,,http://www.reddit.com/r/Menopause/comments/15j...,,...,False,False,False,False,False,True,False,False,False,False
5,1bzh5x,2013-04-09,How to live with yourself going through menopause,I've learned that every woman has a different ...,3,,8,,http://www.reddit.com/r/Menopause/comments/1bz...,55.0,...,True,False,False,False,False,True,False,False,False,False
8,1jfo3w,2013-07-31,"I feel like such a whiny bitch about this, but...","I'm 49, and menopause started last year. At fi...",8,,7,,http://www.reddit.com/r/Menopause/comments/1jf...,49.0,...,True,True,True,False,False,True,True,False,False,True
16,1quvya,2013-11-17,Perimenopausal Rage,"Early 50s. I've had lengthy, but regular perio...",10,,7,,http://www.reddit.com/r/Menopause/comments/1qu...,50.0,...,True,False,False,False,True,True,False,False,True,False
23,1vvnp2,2014-01-22,Looking for women 45+ for an online survey abo...,"Hello, my name is Heather VanZuylen and I'm a ...",3,,4,,http://www.reddit.com/r/Menopause/comments/1vv...,,...,True,True,False,False,False,True,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11573,zz9zl1,2022-12-30,Stem cell transplant/medically induced menopause,I'm currently mid process of a stem cell trans...,5,1.0,3,0.0,https://www.reddit.com/r/Menopause/comments/zz...,,...,False,False,False,False,False,True,False,False,False,False
11580,zzi4xj,2022-12-30,Rant incoming (sorry)!,"Ladies— I don't know where to turn, I'm sorry ...",16,1.0,23,0.0,https://www.reddit.com/r/Menopause/comments/zz...,55.0,...,True,False,False,True,False,True,False,False,False,False
11587,zzvmji,2022-12-31,Hi guys! Penetration has become sore:-(,"Hi, for a long while now, since menopause star...",8,1.0,12,0.0,https://www.reddit.com/r/Menopause/comments/zz...,7.0,...,False,False,False,False,False,True,False,False,False,False
11590,zzyf9q,2022-12-31,Been struggling a lot lately - I'm 32 and neve...,"I'm exhausted. I was so driven before, I rolle...",20,1.0,21,0.0,https://www.reddit.com/r/Menopause/comments/zz...,32.0,...,True,False,True,False,True,True,True,False,True,False
