# Based on [This](https://www.kaggle.com/code/vijay20213/stress-identification-nlp-with-best-prediction) Kaggle Notebook

# Setup

In [1]:
import numpy as np
import pandas as pd

# Data Preparation

## Initial Look & Keep Only Text, Target Label Columns

In [2]:
raw_df = pd.read_csv('data/Stress.csv')
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2838 entries, 0 to 2837
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   subreddit         2838 non-null   object 
 1   post_id           2838 non-null   object 
 2   sentence_range    2838 non-null   object 
 3   text              2838 non-null   object 
 4   label             2838 non-null   int64  
 5   confidence        2838 non-null   float64
 6   social_timestamp  2838 non-null   int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 155.3+ KB


In [3]:
raw_df.head(2)

Unnamed: 0,subreddit,post_id,sentence_range,text,label,confidence,social_timestamp
0,ptsd,8601tu,"(15, 20)","He said he had not felt that way before, sugge...",1,0.8,1521614353
1,assistance,8lbrx9,"(0, 5)","Hey there r/assistance, Not sure if this is th...",0,1.0,1527009817


In [4]:
raw_df.sample(n=5)

Unnamed: 0,subreddit,post_id,sentence_range,text,label,confidence,social_timestamp
1601,relationships,7sdyx6,"[100, 105]","Back to my dad, during lunch he wanted to say ...",0,0.571429,1516709505
2447,relationships,7o78rj,"(0, 5)",I’m a Canadian traveling in India and staying ...,0,1.0,1515111891
1471,ptsd,9vaxit,"[0, 5]",November 8 is the anniversary of when I was ki...,1,0.8,1541690897
2589,homeless,7r7wqs,"(11, 16)",I got as far as Illinois when he ghosted me af...,1,0.666667,1516258116
1956,relationships,7pg3a7,"(55, 60)",She didn’t have room for it where she was curr...,0,1.0,1515596406


In [7]:
raw_df.loc[1601, 'text']

"Back to my dad, during lunch he wanted to say some words to my GF, to encourage her to keep studying the doctorate, he said that if she felt that money was an issue, that don't let her that take her down, that doctorates in our country earn very poorly but that getting that degree would open doors to work abroad and land a job she would love. You see, my dad didn't know (well, I've told him plenty of times, but I figured he forgot at the time) that my GF's mom was a doctorate. My GF's mom openly told in front of my that she didn't earn enough for the years of study and work she did, everybody in our country knows that doctorates don't earn enough money for the effort, capacity and dedication they're required to do/have. But when my dad said those words, nobody said anything. I didn't take it as offensive."

In [8]:
ignored_cols = ['subreddit','post_id','sentence_range','confidence','social_timestamp']
df = raw_df.drop(columns=ignored_cols)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2838 entries, 0 to 2837
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    2838 non-null   object
 1   label   2838 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 44.5+ KB


Relatively balanced starting dataset

In [9]:
df['label'].value_counts()

label
1    1488
0    1350
Name: count, dtype: int64

## Text Processing

### Packages Needed

from spacy import load --> errors

In [10]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re
from urllib.parse import urlparse

In [11]:
nltk.download('omw-1.4') # Open Multilingual Wordnet, this is an lexical database 
nltk.download('wordnet') 
nltk.download('wordnet2022')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/yichenzhang/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/yichenzhang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet2022 to
[nltk_data]     /home/yichenzhang/nltk_data...
[nltk_data]   Package wordnet2022 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/yichenzhang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/yichenzhang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Pare Down Words - Token Level: Word

In [12]:
lemmatizer = WordNetLemmatizer()
stop_words = list(stopwords.words('english'))
'dont' in stop_words

False

In case of misspellings that leave out apostrophes:

In [15]:
print(len(stop_words))
for stopword in stop_words:
    if "'" in stopword:
        stop_words.append(re.sub("'", "", stopword))
print(len(stop_words))

179
205


In [22]:
stop_words = set(stop_words)
type(stop_words)

set

In [63]:
def process_text(sent, stopwords=stop_words):
    """Use RegEx to clean raw text data"""
    try:
        # brackets replacing by space
        sent = re.sub('[][)(]',' ',sent)

        # url removing
        sent = [word for word in sent.split() if not urlparse(word).scheme]
        sent = ' '.join(sent)

        # removing escap characters
        sent = re.sub(r"\@\w+", "", sent)

        # removing html tags 
        sent = re.sub(re.compile("<.*?>"),'',sent)

        # getting only characters and numbers from text
        sent = re.sub("[^A-Za-z0-9]",' ',sent)

        # lower case all words
        sent = sent.lower()
        
        # Remove extra whitespace between words
        sent = [word.strip() for word in sent.split()]
        sent = ' '.join(sent)

        # word tokenization
        tokens = word_tokenize(sent)
        
        # removing words which are in stopwords
        tokens = [t for t in tokens if t not in stopwords]
        
        # lemmatization
        sent = [lemmatizer.lemmatize(word) for word in tokens]
        sent = ' '.join(sent)
        return sent
    
    except Exception as ex:
        print(sent,"\n")
        print("Error ",ex)

In [25]:
df['processed_text'] = df['text'].apply(lambda text: process_text(text))
df.sample(n=3)

Unnamed: 0,text,label,processed_text
131,Dr. Fred Penzel's Articles <url> A leading exp...,0,dr fred penzel article leading expert ocd dr p...
654,"&#x200B; Hey everyone, Being that Hurricane Fl...",1,x200b hey everyone hurricane florence occurred...
1178,"We are, by far, the youngest people around mar...",0,far youngest people around marina named beauti...


In [31]:
'than' in stop_words

True

In [34]:
lemmatizer.lemmatize('successfully')

'successfully'

In [32]:
sample_idx = 131
df.loc[sample_idx, 'text']

'Dr. Fred Penzel\'s Articles <url> A leading expert on OCD, Dr. Penzel has a collection of great articles online based on years of successfully treating patients with OCD. Particularly good for highlighting how the "less-obvious" variants of OCD can be treated. Check out Article 12 "Ten Things You Need to Know to Overcome OCD" for starters. Top Tips: Carry out [Exposure and Response Prevention].'

In [33]:
df.loc[sample_idx, 'processed_text']

'dr fred penzel article leading expert ocd dr penzel collection great article online based year successfully treating patient ocd particularly good highlighting le obvious variant ocd treated check article 12 ten thing need know overcome ocd starter top carry exposure response prevention'

In [35]:
from random import randint 

rand_index = randint(0, len(df)-1)
rand_index

464

SEEMINGLY RESOLVED: Not all the stopwords are being removed - possibility to improve here?

In [36]:
print("without process ---> ",df['text'].iloc[rand_index],end='\n\n')
print("after process ---> ",df['processed_text'].iloc[rand_index])

without process --->  My ex thought that show was terrific inspiration. He used it to discuss “what if”s with his guy friends, who I hope had no idea how real he is about that shit. I called it “The Rape Along/Beat Along Show” for the longest time. I still do when Gramma isn’t listening. Last time I unexpectedly came across the DVD set (Grampy accidentally left it at my place on the kitchen table, and, yes, he’d been told not to leave it where I might see it)...

after process --->  ex thought show terrific inspiration used discus guy friend hope idea real shit called rape along beat along show longest time still gramma listening last time unexpectedly came across dvd set grampy accidentally left place kitchen table yes told leave might see


### Vectorize (Word Embedding)

Note: the reference notebook seems to have removed no words. We may not want to remove rare words, as they may be the most helpful in differentiating different texts, but we may not want to keep all the words, either, as they take up more resources.

| MIN_DF | Vocab Size |
| --- | --- |
| 0.01 | 666 |
| 0.001 | 3957 |
| 0.0001 | 10,126 |
| 0 | 10,126 |

In [64]:
MIN_DF = 0.001

#### Bag of Words (Count) Vectorizer

In [54]:
from sklearn.feature_extraction.text import CountVectorizer

In [55]:
cv = CountVectorizer(min_df=MIN_DF)
cv_df = cv.fit_transform(df['processed_text'])
cv_df.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [56]:
cv_df = pd.DataFrame(cv_df.toarray(),columns=cv.get_feature_names_out())
cv_df.head(3)

Unnamed: 0,000,10,100,1000,10pm,10th,11,11th,12,120,...,york,young,younger,youngest,youth,youtube,yr,zero,zoloft,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Term Frequency, Inverse Document Frequency Vectorizer

In [57]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [58]:
tf = TfidfVectorizer(min_df=MIN_DF)
tf_df = tf.fit_transform(df['processed_text'])
tf_df.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.07822263, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [59]:
tf_df = pd.DataFrame(tf_df.toarray(),columns=tf.get_feature_names_out())
tf_df.head(3)

Unnamed: 0,000,10,100,1000,10pm,10th,11,11th,12,120,...,york,young,younger,youngest,youth,youtube,yr,zero,zoloft,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [60]:
tf_df.describe()

Unnamed: 0,000,10,100,1000,10pm,10th,11,11th,12,120,...,york,young,younger,youngest,youth,youtube,yr,zero,zoloft,zone
count,2838.0,2838.0,2838.0,2838.0,2838.0,2838.0,2838.0,2838.0,2838.0,2838.0,...,2838.0,2838.0,2838.0,2838.0,2838.0,2838.0,2838.0,2838.0,2838.0,2838.0
mean,0.000677,0.003727,0.002011,0.000667,0.000229,0.000227,0.001028,0.000281,0.001725,0.000288,...,0.000279,0.002343,0.001621,0.000592,0.000308,0.000672,0.000891,0.000726,0.000812,0.000288
std,0.012414,0.02424,0.019533,0.012899,0.007261,0.007039,0.014251,0.007515,0.018149,0.007704,...,0.009003,0.019467,0.017246,0.011429,0.009648,0.012501,0.013893,0.012343,0.013497,0.008084
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.417446,0.34731,0.3238,0.349854,0.284812,0.247687,0.274153,0.218368,0.376203,0.220154,...,0.38347,0.269158,0.260087,0.272226,0.364786,0.373744,0.276237,0.253284,0.366865,0.322071


In [65]:
cv_df.describe()

Unnamed: 0,000,10,100,1000,10pm,10th,11,11th,12,120,...,york,young,younger,youngest,youth,youtube,yr,zero,zoloft,zone
count,2838.0,2838.0,2838.0,2838.0,2838.0,2838.0,2838.0,2838.0,2838.0,2838.0,...,2838.0,2838.0,2838.0,2838.0,2838.0,2838.0,2838.0,2838.0,2838.0,2838.0
mean,0.003524,0.02537,0.012333,0.003524,0.001057,0.001057,0.005638,0.001409,0.010571,0.001409,...,0.001057,0.015504,0.009866,0.002819,0.001057,0.003524,0.004228,0.003524,0.004228,0.001409
std,0.059266,0.161694,0.119582,0.07016,0.032501,0.032501,0.074886,0.037523,0.105678,0.037523,...,0.032501,0.126388,0.105746,0.053028,0.032501,0.064942,0.064899,0.059266,0.070121,0.037523
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0,1.0,...,1.0,2.0,2.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0


# Build Model

In [66]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')

In [67]:
random_seed = randint(0, 50)
random_seed

37

## BOW Embedding

In [68]:
# Default is 3:1 train:test
X_train, X_test, y_train, y_test = train_test_split(
    cv_df, df['label'], random_state=random_seed, stratify=df['label']
 )
X_train.shape, y_train.shape

((2128, 3957), (710,))

In [69]:
X_test.shape, y_test.shape

((710, 3957), (2128,))

### Logistic Regression

In [70]:
# Definitely overfitting
model_lr = LogisticRegression().fit(X_train,y_train)
model_lr.score(X_train,y_train),model_lr.score(X_test,y_test)

(0.9948308270676691, 0.719718309859155)

### Naive Bayes

In [71]:
# Test accuracy better, but still overfitting
model_nb = MultinomialNB().fit(X_train,y_train)
model_nb.score(X_train,y_train),model_nb.score(X_test,y_test)

(0.8923872180451128, 0.7352112676056338)

### Random Forest

In [72]:
# Surprising non-performant compared to the others
model_rf = RandomForestClassifier().fit(X_train,y_train)
model_rf.score(X_train,y_train),model_rf.score(X_test,y_test)

(0.9985902255639098, 0.6957746478873239)

## TF-IDF Embedding

In [73]:
X_train_td, X_test_td, y_train_td, y_test_td = train_test_split(
    tf_df, df['label'], random_state=random_seed, stratify=df['label']
)
X_train_td.shape, y_train_td.shape

((2128, 3957), (2128,))

In [75]:
X_test_td.shape, y_test_td.shape

((710, 3957), (710,))

### Logistic Regression

In [76]:
lr_td = LogisticRegression().fit(X_train_td, y_train_td)
lr_td.score(X_test_td, y_test_td)

0.7253521126760564

### Naive Bayes

In [77]:
nb_td = MultinomialNB().fit(X_train_td, y_train_td)
nb_td.score(X_test_td, y_test_td)

0.7295774647887324

### Random Forest

In [78]:
rf_td = RandomForestClassifier().fit(X_train_td, y_train_td)
rf_td.score(X_test_td, y_test_td)

0.6929577464788732

# Confusion Matrix & Classification Report

In [79]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix,classification_report

## Bag-of-Words Embedding

## TF-IDF Embedding