# MindInsight Classifier: Unveiling Mental Health Patterns in Pandemic Discourse through Data-Driven Analysis

Let us first import the pertinent libraries.

In [70]:
#!pip install wordcloud

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from wordcloud import WordCloud
from collections import Counter

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('mental_disorders_reddit.csv')

In [3]:
df.head()

Unnamed: 0,title,selftext,created_utc,over_18,subreddit
0,Life is so pointless without others,Does anyone else think the most important part...,1650356960,False,BPD
1,Cold rage?,Hello fellow friends 😄\n\nI'm on the BPD spect...,1650356660,False,BPD
2,I don’t know who I am,My [F20] bf [M20] told me today (after I said ...,1650355379,False,BPD
3,HELP! Opinions! Advice!,"Okay, I’m about to open up about many things I...",1650353430,False,BPD
4,help,[removed],1650350907,False,BPD


### Data Preprocessing and Simple EDA (Part 1)

In [4]:
print(df.shape)

(701787, 5)


In [5]:
df.isnull().sum()

title             46
selftext       33691
created_utc        0
over_18            0
subreddit          0
dtype: int64

In [6]:
df = df.dropna(subset=['selftext'], how='any')

In [7]:
df.isnull().sum()

title          42
selftext        0
created_utc     0
over_18         0
subreddit       0
dtype: int64

In [8]:
df['subreddit'].value_counts()

BPD              233125
Anxiety          167059
depression       156717
bipolar           46666
mentalillness     44249
schizophrenia     20280
Name: subreddit, dtype: int64

In [10]:
df['title'] = df['title'].fillna('')

# Calculate the total number of words in 'title'
df['title_total'] = df['title'].apply(lambda x: len(x.split()))

# Define a function to count total characters in a text (excluding spaces)
def count_total_words(text):
    char = 0
    for word in text.split():
        char += len(word)
    return char

# Calculate the total number of characters in 'title'
df['title_chars'] = df['title'].apply(count_total_words)

In [11]:
df.head()

Unnamed: 0,title,selftext,created_utc,over_18,subreddit,title_total,title_chars
0,Life is so pointless without others,Does anyone else think the most important part...,1650356960,False,BPD,6,30
1,Cold rage?,Hello fellow friends 😄\n\nI'm on the BPD spect...,1650356660,False,BPD,2,9
2,I don’t know who I am,My [F20] bf [M20] told me today (after I said ...,1650355379,False,BPD,6,16
3,HELP! Opinions! Advice!,"Okay, I’m about to open up about many things I...",1650353430,False,BPD,3,21
4,help,[removed],1650350907,False,BPD,1,4


In [12]:
df['text_total'] = df['selftext'].apply(lambda x: len(x.split()))

def count_total_words(text):
    char = 0
    for word in text.split():
        char += len(word)
    return char

df['text_chars'] = df["selftext"].apply(count_total_words)

In [13]:
df.head()

Unnamed: 0,title,selftext,created_utc,over_18,subreddit,title_total,title_chars,text_total,text_chars
0,Life is so pointless without others,Does anyone else think the most important part...,1650356960,False,BPD,6,30,74,310
1,Cold rage?,Hello fellow friends 😄\n\nI'm on the BPD spect...,1650356660,False,BPD,2,9,517,2259
2,I don’t know who I am,My [F20] bf [M20] told me today (after I said ...,1650355379,False,BPD,6,16,145,545
3,HELP! Opinions! Advice!,"Okay, I’m about to open up about many things I...",1650353430,False,BPD,3,21,821,3282
4,help,[removed],1650350907,False,BPD,1,4,1,9


### Data Downsizing

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 668096 entries, 0 to 701786
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   title        668096 non-null  object
 1   selftext     668096 non-null  object
 2   created_utc  668096 non-null  int64 
 3   over_18      668096 non-null  bool  
 4   subreddit    668096 non-null  object
 5   title_total  668096 non-null  int64 
 6   title_chars  668096 non-null  int64 
 7   text_total   668096 non-null  int64 
 8   text_chars   668096 non-null  int64 
dtypes: bool(1), int64(5), object(3)
memory usage: 46.5+ MB


The number of data is 666,8096. It is very large and takes a lot of time to process. As we wish to spotlight the posts published during the duration of the COVID-19 pandemic, we will be limiting our data to only include posts from March 2020 onwards. A random sample of 10,000 posts will be taken from the dataset for efficiency.

In [15]:
df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s')

In [16]:
df.head()

Unnamed: 0,title,selftext,created_utc,over_18,subreddit,title_total,title_chars,text_total,text_chars
0,Life is so pointless without others,Does anyone else think the most important part...,2022-04-19 08:29:20,False,BPD,6,30,74,310
1,Cold rage?,Hello fellow friends 😄\n\nI'm on the BPD spect...,2022-04-19 08:24:20,False,BPD,2,9,517,2259
2,I don’t know who I am,My [F20] bf [M20] told me today (after I said ...,2022-04-19 08:02:59,False,BPD,6,16,145,545
3,HELP! Opinions! Advice!,"Okay, I’m about to open up about many things I...",2022-04-19 07:30:30,False,BPD,3,21,821,3282
4,help,[removed],2022-04-19 06:48:27,False,BPD,1,4,1,9


In [17]:
# Filter posts from March 2020 onwards
filtered_df = df[df['created_utc'] >= '2020-03-01']

# Take a random sample of 10,000 posts
sampled_df = filtered_df.sample(n=1000, random_state=42)

In [18]:
sampled_df.head()

Unnamed: 0,title,selftext,created_utc,over_18,subreddit,title_total,title_chars,text_total,text_chars
131450,Looking for hope (feeling fed up),My diagnosis is fairly new and I havent starte...,2020-05-30 22:47:57,False,BPD,6,28,344,1414
691395,Get motivated with determination you can do an...,Like I just managed to cut with a safety razor,2020-05-17 15:31:50,False,mentalillness,8,45,10,37
275676,memory flashes,"so, I used to have a really good memory\n\n&am...",2022-10-13 18:02:41,False,bipolar,2,13,91,424
392360,I'll never get to live in the fantasy land for...,I won't ever get to turn my fantasies into rea...,2022-03-01 07:58:19,False,depression,10,41,72,288
313915,It's my 27 birthday and I don't know wtf with ...,[removed],2022-02-26 21:42:56,False,depression,13,50,1,9


In [19]:
sampled_df['subreddit'].value_counts()

depression       297
Anxiety          275
BPD              247
mentalillness     77
bipolar           72
schizophrenia     32
Name: subreddit, dtype: int64

### Recategorizing 'subreddit'

In [20]:
# def mental_disorders(ex):
#     if ex == 'BPD':
#         return 'BPD'
#     elif ex == 'bipolar':
#         return 'bipolar'
#     elif ex == 'Anxiety':
#         return 'anxiety'
#     elif ex == 'schizophrenia':
#         return 'schizophrenia'
#     elif ex == 'depression':
#         return 'depression'
#     else:
#         return 'others'

def mental_disorders(ex):
    if ex=='BPD':
        return 'BPD'
    elif ex=='bipolar':
        return 'bipolar'
    else:
        return 'others'

In [21]:
sampled_df['subreddit'] = sampled_df['subreddit'].apply(mental_disorders)

In [22]:
sampled_df.head(20)

Unnamed: 0,title,selftext,created_utc,over_18,subreddit,title_total,title_chars,text_total,text_chars
131450,Looking for hope (feeling fed up),My diagnosis is fairly new and I havent starte...,2020-05-30 22:47:57,False,BPD,6,28,344,1414
691395,Get motivated with determination you can do an...,Like I just managed to cut with a safety razor,2020-05-17 15:31:50,False,others,8,45,10,37
275676,memory flashes,"so, I used to have a really good memory\n\n&am...",2022-10-13 18:02:41,False,bipolar,2,13,91,424
392360,I'll never get to live in the fantasy land for...,I won't ever get to turn my fantasies into rea...,2022-03-01 07:58:19,False,others,10,41,72,288
313915,It's my 27 birthday and I don't know wtf with ...,[removed],2022-02-26 21:42:56,False,others,13,50,1,9
538129,tips for managing the AAAA ?,I've been medically diagnosed with a general a...,2021-08-06 01:54:21,False,others,6,23,113,506
293850,Breakup depression and self isolated without r...,[removed],2022-08-06 09:23:21,False,others,7,49,1,9
402501,I really can’t get out of this,The last month my depression reach its lowest ...,2022-07-05 22:33:20,False,others,7,24,168,698
566855,anxiety over such insignificant things….,just had to reschedule a doctor’s appointment ...,2021-08-18 22:51:22,False,others,5,36,192,860
357103,My dog died and I have nothing left.,My marriage isn't doing great. Dog was healthy...,2022-08-25 20:40:17,False,others,8,29,196,757


In [23]:
# We will remove the rows under selftext with have '[removed]'

sampled_df = sampled_df[sampled_df['selftext'] != '[removed]']

In [24]:
sampled_df.head(20)

Unnamed: 0,title,selftext,created_utc,over_18,subreddit,title_total,title_chars,text_total,text_chars
131450,Looking for hope (feeling fed up),My diagnosis is fairly new and I havent starte...,2020-05-30 22:47:57,False,BPD,6,28,344,1414
691395,Get motivated with determination you can do an...,Like I just managed to cut with a safety razor,2020-05-17 15:31:50,False,others,8,45,10,37
275676,memory flashes,"so, I used to have a really good memory\n\n&am...",2022-10-13 18:02:41,False,bipolar,2,13,91,424
392360,I'll never get to live in the fantasy land for...,I won't ever get to turn my fantasies into rea...,2022-03-01 07:58:19,False,others,10,41,72,288
538129,tips for managing the AAAA ?,I've been medically diagnosed with a general a...,2021-08-06 01:54:21,False,others,6,23,113,506
402501,I really can’t get out of this,The last month my depression reach its lowest ...,2022-07-05 22:33:20,False,others,7,24,168,698
566855,anxiety over such insignificant things….,just had to reschedule a doctor’s appointment ...,2021-08-18 22:51:22,False,others,5,36,192,860
357103,My dog died and I have nothing left.,My marriage isn't doing great. Dog was healthy...,2022-08-25 20:40:17,False,others,8,29,196,757
443776,Spiraling out of control,Do you ever get to where you feel fine one min...,2022-05-09 02:03:26,True,others,4,21,146,650
87944,Pms exacerbating neediness for fp,I've been working hard with my therapist on co...,2021-01-18 00:09:14,False,BPD,5,29,90,381


### Text Pre-Processing

In [25]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
import string
import re

string.punctuation
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to C:\Users\Don
[nltk_data]     Bosco\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Don
[nltk_data]     Bosco\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Don
[nltk_data]     Bosco\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Don
[nltk_data]     Bosco\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [26]:
sampled_df.head()

Unnamed: 0,title,selftext,created_utc,over_18,subreddit,title_total,title_chars,text_total,text_chars
131450,Looking for hope (feeling fed up),My diagnosis is fairly new and I havent starte...,2020-05-30 22:47:57,False,BPD,6,28,344,1414
691395,Get motivated with determination you can do an...,Like I just managed to cut with a safety razor,2020-05-17 15:31:50,False,others,8,45,10,37
275676,memory flashes,"so, I used to have a really good memory\n\n&am...",2022-10-13 18:02:41,False,bipolar,2,13,91,424
392360,I'll never get to live in the fantasy land for...,I won't ever get to turn my fantasies into rea...,2022-03-01 07:58:19,False,others,10,41,72,288
538129,tips for managing the AAAA ?,I've been medically diagnosed with a general a...,2021-08-06 01:54:21,False,others,6,23,113,506


In [27]:
sampled_df['all_text'] = sampled_df['title'] + " " + sampled_df['selftext']

df = sampled_df[['all_text', 'subreddit']]
df = df[df['subreddit'] != 'others']

df.head()

Unnamed: 0,all_text,subreddit
131450,Looking for hope (feeling fed up) My diagnosis...,BPD
275676,"memory flashes so, I used to have a really goo...",bipolar
87944,Pms exacerbating neediness for fp I've been wo...,BPD
115086,Is anyone on amitriptyline? What’s your experi...,BPD
140144,A mom learning a lot about BPD and DBT. My 17y...,BPD


In [28]:
# Define the abbreviations dictionary
abbr_dict = {
    "'cause": "because",
    "ain't": "am not",
    "can't": "can not",
    "cannot": "can not",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesnt": "does not",
    "don't": "do not",
    "dont": "do not",
    "gimme": "give me",
    "gotta": "got to",
    "hadn't": "had not",
    "hadnt": "had not",
    "hasn't": "has not",
    "hasnt": "has not",
    "haven't": "have not",
    "havent": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "here's": "here is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'm": "i am",
    "i'll": "i will",
    "i've": "i have",
    "i ve": "i have",
    "imma": "i am going to",
    "isn't": "is not",
    "it'll": "it will",
    "it's": "it is",
    "lemme": "let me",
    "let's": "let us",
    "not've": "not have",
    "shouldn't": "should not",
    "she'll": "she will",
    "she's": "she is",
    "that's": "that is",
    "there's": "there is",
    "there're": "there are",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "wasnt": "was not",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "werent": "were not",
    "what's": "what is",
    "what're": "what are",
    "when's": "when is",
    "when're": "when are",
    "where's": "where is",
    "where're": "where are",
    "who's": "who is",
    "who're": "who are",
    "who've": "who have",
    "won't": "will not",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
}

# Define the function to replace the abbreviations
def replace_abbreviations(text):
    # Replace '’' with '\'
    text = re.sub('’', '\'', text)

    # Remove any word that starts with 'm' or 'f' followed by digits
    text = re.sub(r'\b[mf](\d+)\b', '', text, flags=re.IGNORECASE)

    # Remove any digit that is followed by 'm' or 'f'
    text = re.sub(r'\b(\d+)[mf]\b', '', text, flags=re.IGNORECASE)

    # Replace abbreviations with their full form
    for word in text.split():
        if word.lower() in abbr_dict:
            text = re.sub(r'\b{}\b'.format(word), abbr_dict[word.lower()], text, flags=re.IGNORECASE)
    return text

# Define the function to remove emojis
def remove_emojis(text):
    emoji = re.compile("["
        u"\U0001F600-\U0001F64F"
        u"\U0001F300-\U0001F5FF"
        u"\U0001F680-\U0001F6FF"
        u"\U0001F1E0-\U0001F1FF"
        u"\U00002500-\U00002BEF"
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"
        u"\u3030"
                      "]+", re.UNICODE)
    return re.sub(emoji, '', text)

def remove_html(data):
    html_tag=re.compile(r'<.*?>')
    data=html_tag.sub(r'',data)
    return data

def remove_whitespaces(text):
    text = re.sub(r'[^\w\s\']',' ', text)
    text = re.sub(' +', ' ', text)
    return text.strip().lower()

def remove_digits(text):
    return re.sub(r'\d+', '', text)

In [29]:
# Cleaning and tokenization
def tokenization(text):
    set_stop_words = set(stopwords.words('english'))

    text = replace_abbreviations(text)
    text = text.lower()
    text = re.sub(r'[^\w\s]', ' ', text)
    text = remove_emojis(text)
    text = remove_html(text)
    text = remove_whitespaces(text)
    text = remove_digits(text)
    tokens = word_tokenize(text)

    return [w for w in tokens if w not in set_stop_words]

df['tokens']= df['all_text'].apply(lambda x: tokenization(x))
df.head()

Unnamed: 0,all_text,subreddit,tokens
131450,Looking for hope (feeling fed up) My diagnosis...,BPD,"[looking, hope, feeling, fed, diagnosis, fairl..."
275676,"memory flashes so, I used to have a really goo...",bipolar,"[memory, flashes, used, really, good, memory, ..."
87944,Pms exacerbating neediness for fp I've been wo...,BPD,"[pms, exacerbating, neediness, fp, working, ha..."
115086,Is anyone on amitriptyline? What’s your experi...,BPD,"[anyone, amitriptyline, experience, amitriptyl..."
140144,A mom learning a lot about BPD and DBT. My 17y...,BPD,"[mom, learning, lot, bpd, dbt, yr, old, daught..."


In [31]:
# Lemmatization
word_lemmatizer = WordNetLemmatizer()

def lemmatization(text):
    lemm_text = [word_lemmatizer.lemmatize(word, pos="v") for word in text]
    lemm_text = [word_lemmatizer.lemmatize(word, pos="n") for word in lemm_text]
    lemm_text = [word_lemmatizer.lemmatize(word, pos="a") for word in lemm_text]
    lemm_text = [word_lemmatizer.lemmatize(word, pos="r") for word in lemm_text]
    lemm_text = [word_lemmatizer.lemmatize(word, pos="s") for word in lemm_text]
    return lemm_text

df['lemmatized_tokens'] = df['tokens'].apply(lambda x:lemmatization(x))
df.head(20)

Unnamed: 0,all_text,subreddit,tokens,lemmatized_tokens
131450,Looking for hope (feeling fed up) My diagnosis...,BPD,"[looking, hope, feeling, fed, diagnosis, fairl...","[look, hope, feel, feed, diagnosis, fairly, ne..."
275676,"memory flashes so, I used to have a really goo...",bipolar,"[memory, flashes, used, really, good, memory, ...","[memory, flash, use, really, good, memory, amp..."
87944,Pms exacerbating neediness for fp I've been wo...,BPD,"[pms, exacerbating, neediness, fp, working, ha...","[pm, exacerbate, neediness, fp, work, hard, th..."
115086,Is anyone on amitriptyline? What’s your experi...,BPD,"[anyone, amitriptyline, experience, amitriptyl...","[anyone, amitriptyline, experience, amitriptyl..."
140144,A mom learning a lot about BPD and DBT. My 17y...,BPD,"[mom, learning, lot, bpd, dbt, yr, old, daught...","[mom, learn, lot, bpd, dbt, yr, old, daughter,..."
66199,Just received my diagnosis. Now what? My docto...,BPD,"[received, diagnosis, doctor, gave, bpd, diagn...","[receive, diagnosis, doctor, give, bpd, diagno..."
27126,I have a thing for younger guys First off I’d ...,BPD,"[thing, younger, guys, first, like, state, im,...","[thing, young, guy, first, like, state, im, so..."
145330,I'm scared to get therapy I contacted a couple...,BPD,"[scared, get, therapy, contacted, couple, ther...","[scar, get, therapy, contact, couple, therapis..."
89838,I have so much I need to but all I can do is s...,BPD,"[much, need, sleep, hi, bad, predicament, know...","[much, need, sleep, hi, bad, predicament, know..."
26261,Well I’m off the rails again It’s all going do...,BPD,"[well, rails, going, downhill, flip, switched,...","[well, rail, go, downhill, flip, switch, insid..."


In [32]:
from collections import defaultdict
import copy

# Counting the unique number of tokens for num_words in text_encoding

lemmatized_words = [word for word_list in df['lemmatized_tokens'] for word in word_list]
unique_words = len(set(lemmatized_words))

# Encoding and padding

def text_encoding(lemmatized_texts, num_words):
    vocabulary = defaultdict(int)
    fdist = nltk.FreqDist()

    all_lemmatized_words = [word for word_list in lemmatized_texts for word in word_list]
    
    for word in all_lemmatized_words:
        fdist[word] += 1

    common_words = fdist.most_common(num_words)

    for idx, word in enumerate(common_words):
        vocabulary[word[0]] = (idx + 1)

    encoded_texts = []
    texts4encoding = []

    for tokens in lemmatized_texts:
        temp_codes = []
        temp_words = []

        for word in tokens:
            if word in vocabulary.keys():
                temp_codes.append(vocabulary[word])
                temp_words.append(word)

        encoded_texts.append(temp_codes)
        texts4encoding.append(temp_words)

    vector_size = max(len(x) for x in encoded_texts)

    return encoded_texts, texts4encoding, vector_size

def codes_padding(X_encoded_texts):
    pad_value = 0
    padded_codes = []

    codes_from_texts = copy.deepcopy(X_encoded_texts)
    
    # vector_size in text_encoding
    max_length = max(len(encoded_text) for encoded_text in codes_from_texts)

    for encoded_text in codes_from_texts:
        while len(encoded_text) < max_length:
            encoded_text.append(pad_value)
        padded_codes.append(encoded_text)

    return padded_codes

df['padded_encoding'] = codes_padding(text_encoding(df['lemmatized_tokens'], unique_words)[0])

df.head(20)

Unnamed: 0,all_text,subreddit,tokens,lemmatized_tokens,padded_encoding
131450,Looking for hope (feeling fed up) My diagnosis...,BPD,"[looking, hope, feeling, fed, diagnosis, fairl...","[look, hope, feel, feed, diagnosis, fairly, ne...","[56, 179, 1, 1040, 159, 1041, 126, 27, 303, 18..."
275676,"memory flashes so, I used to have a really goo...",bipolar,"[memory, flashes, used, really, good, memory, ...","[memory, flash, use, really, good, memory, amp...","[535, 875, 73, 11, 21, 535, 354, 590, 385, 170..."
87944,Pms exacerbating neediness for fp I've been wo...,BPD,"[pms, exacerbating, neediness, fp, working, ha...","[pm, exacerbate, neediness, fp, work, hard, th...","[1048, 1847, 1049, 74, 15, 65, 235, 328, 72, 7..."
115086,Is anyone on amitriptyline? What’s your experi...,BPD,"[anyone, amitriptyline, experience, amitriptyl...","[anyone, amitriptyline, experience, amitriptyl...","[26, 1320, 60, 1320, 95, 115, 387, 3, 1850, 11..."
140144,A mom learning a lot about BPD and DBT. My 17y...,BPD,"[mom, learning, lot, bpd, dbt, yr, old, daught...","[mom, learn, lot, bpd, dbt, yr, old, daughter,...","[236, 193, 49, 12, 303, 1857, 185, 662, 284, 8..."
66199,Just received my diagnosis. Now what? My docto...,BPD,"[received, diagnosis, doctor, gave, bpd, diagn...","[receive, diagnosis, doctor, give, bpd, diagno...","[1056, 159, 389, 78, 12, 159, 463, 303, 38, 53..."
27126,I have a thing for younger guys First off I’d ...,BPD,"[thing, younger, guys, first, like, state, im,...","[thing, young, guy, first, like, state, im, so...","[19, 282, 83, 90, 2, 252, 52, 218, 1863, 756, ..."
145330,I'm scared to get therapy I contacted a couple...,BPD,"[scared, get, therapy, contacted, couple, ther...","[scar, get, therapy, contact, couple, therapis...","[97, 3, 150, 271, 194, 235, 239, 357, 1332, 59..."
89838,I have so much I need to but all I can do is s...,BPD,"[much, need, sleep, hi, bad, predicament, know...","[much, need, sleep, hi, bad, predicament, know...","[33, 29, 110, 465, 30, 1878, 5, 272, 1335, 466..."
26261,Well I’m off the rails again It’s all going do...,BPD,"[well, rails, going, downhill, flip, switched,...","[well, rail, go, downhill, flip, switch, insid...","[69, 1883, 6, 1884, 1066, 595, 332, 64, 63, 54..."


### Model 1: CNN

In [33]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [34]:
class TextClassificationCNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes):
        super(TextClassificationCNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.conv = nn.Conv1d(embed_dim, embed_dim, kernel_size=3, stride=1, padding=1)
        self.fc = nn.Linear(embed_dim, num_classes)
    
    def forward(self, text):
        embedded = self.embedding(text).permute(0, 2, 1)
        conved = nn.functional.relu(self.conv(embedded))
        conved = conved.mean(dim=2)
        return self.fc(conved)

In [35]:
vocab_size = unique_words
embed_dim = 100
num_classes = df['subreddit'].nunique()

model = TextClassificationCNN(vocab_size, embed_dim, num_classes)

In [36]:
# Loss Function

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [37]:
num_epochs = 50
batch_size = 128

In [52]:
import torch
from torch.utils.data import DataLoader, TensorDataset

# Assuming df['lemmatized_tokens'] is a list of lists
lemmatized_tokens = df['lemmatized_tokens'].tolist()

# Filter out non-list elements
lemmatized_tokens = [tokens for tokens in lemmatized_tokens if isinstance(tokens, list)]

# Create a vocabulary for your tokens and assign an index to each unique token
vocab = {token: idx for idx, token in enumerate(set(token for tokens in lemmatized_tokens for token in tokens))}

# Convert each token to its corresponding index
X_data = [[vocab[token] for token in tokens] for tokens in lemmatized_tokens]

# Pad sequences to ensure equal length
X_data = torch.nn.utils.rnn.pad_sequence([torch.tensor(tokens) for tokens in X_data], batch_first=True)

# Convert labels to tensor
y_data = torch.tensor(df['subreddit'].astype('category').cat.codes.tolist(), dtype=torch.long)

# Create a DataLoader for batching
dataset = TensorDataset(X_data, y_data)
batch_size = 128  # You can adjust this according to your needs
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Now, you can continue with the rest of your training loop
for epoch in range(num_epochs):
    for inputs, labels in dataloader:
        # Clear the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)

        # Ensure that labels have the correct dimensions (batch_size)
        labels = labels.squeeze(dim=1) if len(labels.size()) > 1 else labels

        # Calculate the loss
        loss = criterion(outputs, labels)

        # Backward pass
        loss.backward()

        # Update the parameters
        optimizer.step()

    print('Epoch {}: Loss: {:.4f}'.format(epoch + 1, loss.item()))

print('Training complete!')

Epoch 1: Loss: 0.5243
Epoch 2: Loss: 0.4921
Epoch 3: Loss: 0.4660
Epoch 4: Loss: 0.5890
Epoch 5: Loss: 0.4848
Epoch 6: Loss: 0.3641
Epoch 7: Loss: 0.6091
Epoch 8: Loss: 0.5017
Epoch 9: Loss: 0.3646
Epoch 10: Loss: 0.5293
Epoch 11: Loss: 0.5462
Epoch 12: Loss: 0.4041
Epoch 13: Loss: 0.4715
Epoch 14: Loss: 0.4221
Epoch 15: Loss: 0.6326
Epoch 16: Loss: 0.4757
Epoch 17: Loss: 0.4973
Epoch 18: Loss: 0.5588
Epoch 19: Loss: 0.4763
Epoch 20: Loss: 0.6429
Epoch 21: Loss: 0.4644
Epoch 22: Loss: 0.5639
Epoch 23: Loss: 0.4535
Epoch 24: Loss: 0.4305
Epoch 25: Loss: 0.3625
Epoch 26: Loss: 0.6861
Epoch 27: Loss: 0.6048
Epoch 28: Loss: 0.5212
Epoch 29: Loss: 0.6750
Epoch 30: Loss: 0.4431
Epoch 31: Loss: 0.5373
Epoch 32: Loss: 0.5439
Epoch 33: Loss: 0.5053
Epoch 34: Loss: 0.6351
Epoch 35: Loss: 0.4503
Epoch 36: Loss: 0.6506
Epoch 37: Loss: 0.5152
Epoch 38: Loss: 0.5195
Epoch 39: Loss: 0.5610
Epoch 40: Loss: 0.4567
Epoch 41: Loss: 0.3440
Epoch 42: Loss: 0.4986
Epoch 43: Loss: 0.3943
Epoch 44: Loss: 0.75

### CNN Model Evaluation

In [54]:
import torchmetrics
from torchmetrics import Accuracy
from torchmetrics import Precision
from torchmetrics import Recall
from torchmetrics import F1Score

In [55]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

# Switch model to evaluation mode
model.eval()

# Lists to store predictions and true labels
all_predictions = []
all_labels = []

with torch.no_grad():
    for inputs, labels in dataloader:
        # Forward pass
        outputs = model(inputs)

        # Calculate predictions and convert to numpy arrays
        predictions = torch.argmax(outputs, dim=1).cpu().numpy()
        labels = labels.cpu().numpy()

        # Append to the lists
        all_predictions.extend(predictions)
        all_labels.extend(labels)

# Convert lists to numpy arrays
all_predictions = np.array(all_predictions)
all_labels = np.array(all_labels)

# Calculate evaluation metrics
accuracy = accuracy_score(all_labels, all_predictions)
precision = precision_score(all_labels, all_predictions, average='weighted')
recall = recall_score(all_labels, all_predictions, average='weighted')
f1 = f1_score(all_labels, all_predictions, average='weighted')

# Print metrics
print("Accuracy: {:.4f}".format(accuracy))
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))
print("F1 Score: {:.4f}".format(f1))

Accuracy: 0.7942
Precision: 0.6308
Recall: 0.7942
F1 Score: 0.7031
