In [1]:
import pandas as pd

# Load the psychosocial health dataset and keep only the rows that either
# category column marks as stress or anxiety; every kept row is labelled 1.
df_pha = pd.read_csv('datasets/Psychosocial_Health_Analysis.csv')

# 'psychological_catehory' (sic) is the column name this notebook reads from the CSV.
category_cols = ['problem_category', 'psychological_catehory']
target_categories = ['stress', 'anxiety']

# Normalise case and surrounding whitespace so the comparison below is exact.
for col in category_cols:
    df_pha[col] = df_pha[col].str.lower().str.strip()

# A row qualifies when at least one of the two columns holds a target category.
# Missing values never match, exactly as with a plain == comparison.
is_target = df_pha[category_cols].isin(target_categories).any(axis=1)

# Take an explicit copy: assigning a new column to a boolean-filtered slice is
# the pattern that triggers pandas' SettingWithCopyWarning.
df_pha = df_pha[is_target].copy()

df_pha['label'] = 1
df_pha = df_pha.rename(columns={'Problem_description': 'text'})

df_pha = df_pha.drop(columns=['Age', 'Gender', 'problem_summary'] + category_cols)

df_pha.head()

Unnamed: 0,text,label
0,A woman is sharing their challenges as the on...,1
3,"How to increase confidence? Feeling insecure, ...",1
5,"The user is in a complicated situation, caught...",1
6,The user was previously in a relationship but ...,1
8,The user is experiencing emotional turmoil due...,1


In [2]:
# Load the stress dataset and discard the metadata columns that are not
# needed downstream.
stress_metadata_cols = ['subreddit', 'post_id', 'sentence_range', 'confidence', 'social_timestamp']

df_stress = pd.read_csv('datasets/Stress.csv').drop(columns=stress_metadata_cols)

df_stress.head()

Unnamed: 0,text,label
0,"He said he had not felt that way before, sugge...",1
1,"Hey there r/assistance, Not sure if this is th...",0
2,My mom then hit me with the newspaper and it s...,1
3,"until i met my new boyfriend, he is amazing, h...",1
4,October is Domestic Violence Awareness Month a...,1


In [3]:
# Load the mental-health conversations: every row is labelled 1, the
# 'Context' column becomes 'text', 'Response' is dropped, and rows that are
# exact duplicates after that are removed.
df_mhc = (
    pd.read_csv('datasets/NLP_Mental_Health_Conversations.csv')
    .assign(label=1)
    .rename(columns={'Context': 'text'})
    .drop(columns='Response')
    .drop_duplicates()
)

df_mhc.head()

Unnamed: 0,text,label
0,I'm going through some things with my feelings...,1
23,I have so many issues to address. I have a his...,1
70,I have been feeling more and more down for ove...,1
72,I’m facing severe depression and anxiety and I...,1
81,How can I get to a place where I can be conten...,1


In [4]:
# Read in reddit posts from subreddits that are deemed positive. Note: Some datasets are too large, and are split into parts instead.
import re

# (archive path, number of posts to sample from it)
filepaths = [
    ("datasets/reddit/MadeMeSmile_submissions00.zip", 90),
    ("datasets/reddit/MadeMeSmile_submissions01.zip", 90),
    ("datasets/reddit/MadeMeSmile_submissions02.zip", 90),
    ("datasets/reddit/MadeMeSmile_submissions03.zip", 90),
    ("datasets/reddit/MadeMeSmile_submissions04.zip", 90),
    ("datasets/reddit/MadeMeSmile_submissions05.zip", 90),
    ("datasets/reddit/wholesome_submissions.zip", 600)
]

# Fixed seed so the sampled posts (the label-0 class) are identical on every run.
RANDOM_SEED = 42
# Posts with fewer whitespace-separated words than this are discarded.
MIN_WORDS = 10
# Body values that carry no usable text.
PLACEHOLDER_TEXTS = ['', '[deleted]', '[removed]']

# Collect one sampled Series per archive and concatenate once at the end,
# instead of growing a DataFrame inside the loop.
sampled_posts = []

for path, n_samples in filepaths:
    selftext = pd.read_json(path, lines=True)['selftext']

    # Strip URLs first so that link-only posts become empty and are filtered out.
    # The vectorised .str methods propagate missing values instead of raising.
    selftext = selftext.str.replace(r'http\S+', '', regex=True)

    has_content = ~selftext.isin(PLACEHOLDER_TEXTS)
    # A missing body yields NaN here, and NaN >= MIN_WORDS is False, so it is dropped.
    long_enough = selftext.str.split().str.len() >= MIN_WORDS

    sampled_posts.append(
        selftext[has_content & long_enough].sample(n_samples, random_state=RANDOM_SEED)
    )

df_positive = pd.concat(sampled_posts, ignore_index=True).to_frame(name='text')
df_positive['label'] = 0

df_positive.head()

Unnamed: 0,text,label
0,This post will be posted here for the rest of ...,0
1,I very often get lonely and pensive during the...,0
2,I can be super sappy but I’ll sore you the who...,0
3,Shes 40 last week.\n\nWe had a talk about grow...,0
4,"I work at a bank, and I was just helping a wom...",0


In [5]:
# Stack the four source frames into one dataframe.
# ignore_index=True already yields a fresh 0..n-1 index, so no separate
# reset_index step is required.
source_frames = [df_pha, df_stress, df_mhc, df_positive]
df = pd.concat(source_frames, ignore_index=True)

df.head()

Unnamed: 0,text,label
0,a woman is sharing their challenges as the onl...,1
1,"how to increase confidence? feeling insecure, ...",1
2,"the user is in a complicated situation, caught...",1
3,the user was previously in a relationship but ...,1
4,the user is experiencing emotional turmoil due...,1


In [65]:
# Spot-check ten randomly chosen rows of the combined frame.
df.sample(n=10)

Unnamed: 0,text,label
3018,"when i'm around people, i sometimes think some...",1
267,post-text: don't bother with this part that's ...,0
3622,i got sick really bad and was throwing up for ...,1
1636,i'm not polyamorous at all and he never mentio...,1
3973,you’re all doing an amazing job! to everyone d...,0
4492,so a while ago i had some money from my birthd...,0
1450,when i was eighteen i lived with my father as ...,0
2863,"even when she hated me, i didn't hate her. the...",1
677,i feel like i have no personality or sense of ...,1
4630,83 year old grandmother showed up to the 4th o...,0


In [92]:
import nltk
import re
from urllib.parse import urlparse
from spacy import load
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# NLTK resources fetched before any text processing; nltk.download reports a
# package as up-to-date when it is already installed.
NLTK_PACKAGES = (
    'omw-1.4',
    'wordnet',
    'wordnet2022',
    'punkt',
    'stopwords',
    'averaged_perceptron_tagger',
)
for package in NLTK_PACKAGES:
    nltk.download(package)

# Shared lemmatizer instance and English stop-word list used by later cells.
lemmatizer = WordNetLemmatizer()
stop_words = list(stopwords.words('english'))

[nltk_data] Downloading package omw-1.4 to /home/brian/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /home/brian/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet2022 to
[nltk_data]     /home/brian/nltk_data...
[nltk_data]   Package wordnet2022 is already up-to-date!
[nltk_data] Downloading package punkt to /home/brian/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/brian/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/brian/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [94]:
# Clean up and lemmatize the texts.

# Character class of emoji / pictographic code points, compiled once instead
# of on every call. Several ranges overlap; the broad ones also cover
# non-Latin scripts, which the letters-only step would blank out anyway.
_EMOJI_PATTERN = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # box drawing through misc symbols and arrows
        u"\U00002702-\U000027B0"  # dingbats
        u"\U000024C2-\U0001F251"  # broad range, includes CJK and other scripts
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"  # all supplementary planes
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u200d"  # zero width joiner
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # variation selector-16
        u"\u3030"
                      "]+", re.UNICODE)

# First letter of a Penn Treebank tag (the tagset nltk.pos_tag emits by
# default) -> WordNet POS code accepted by WordNetLemmatizer.lemmatize.
# Adjective tags start with 'J' (JJ, JJR, JJS), not 'A'.
_PENN_TO_WORDNET = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}


def process_texts(text):
    """Clean a raw post and return its lemmatized, stop-word-free form.

    Steps, in order: lower-case; strip URLs, @mentions, emojis, HTML tags and
    literal ``\\n`` sequences; replace every non-letter with a space;
    tokenize; drop English stop words; POS-tag; lemmatize each token using
    its part of speech.

    Args:
        text: The raw text. Anything that is not a ``str`` (for example a
            NaN from a missing cell) is treated as empty.

    Returns:
        The processed tokens joined by single spaces; ``''`` when nothing
        remains or the input was not a string.
    """
    # Missing cells arrive as float NaN via DataFrame.apply; there is nothing to clean.
    if not isinstance(text, str):
        return ''

    # Strip and convert text to lower case
    text = text.strip().lower()

    # Remove urls
    text = re.sub(r'http\S+', '', text)

    # Remove @mentions (e.g. "@username")
    text = re.sub(r'\@\w+', '', text)

    # Remove emojis
    text = _EMOJI_PATTERN.sub('', text)

    # Remove html tags
    text = re.sub(r'<.*?>', '', text)

    # Replace a literal backslash-n (a newline that was escaped into the text)
    # with a space. Substituting '' would glue the neighbouring words together,
    # and leaving it would let the stray 'n' survive the letters-only step.
    text = re.sub(r'\\n', ' ', text)

    # Replace everything except letters with a space; this also covers
    # brackets, digits and punctuation.
    text = re.sub(r'[^A-Za-z]', ' ', text)

    # Replace multiple whitespace with single space
    text = re.sub(r'\s\s+', ' ', text)

    # Tokenize the sentence
    tokens = word_tokenize(text)

    # Remove stop words. Build a new list rather than calling tokens.remove()
    # while iterating: removing during iteration skips the token that follows
    # each removed one, so consecutive stop words were left in.
    stop_set = set(stop_words)
    tokens = [token for token in tokens if token not in stop_set]

    # POS tag the remaining words, then lemmatize with the matching WordNet
    # POS. Tokens whose tag has no WordNet equivalent are kept unchanged.
    lemmas = []
    for word, tag in pos_tag(tokens):
        wntag = _PENN_TO_WORDNET.get(tag[:1])
        lemmas.append(lemmatizer.lemmatize(word, wntag) if wntag else word)

    return ' '.join(lemmas)

# Run the cleaning pipeline over every row and store the result next to the
# raw text, then preview ten random rows.
df['processed_text'] = df['text'].apply(process_texts)

df.sample(n=10)
