# Setup

In [1]:
import numpy as np
import pandas as pd

# Data Preparation

## Initial Look & Keep Only Text, Target Label Columns

In [2]:
raw_df = pd.read_csv('data/Stress.csv')
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2838 entries, 0 to 2837
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   subreddit         2838 non-null   object 
 1   post_id           2838 non-null   object 
 2   sentence_range    2838 non-null   object 
 3   text              2838 non-null   object 
 4   label             2838 non-null   int64  
 5   confidence        2838 non-null   float64
 6   social_timestamp  2838 non-null   int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 155.3+ KB


In [3]:
raw_df.head(2)

Unnamed: 0,subreddit,post_id,sentence_range,text,label,confidence,social_timestamp
0,ptsd,8601tu,"(15, 20)","He said he had not felt that way before, sugge...",1,0.8,1521614353
1,assistance,8lbrx9,"(0, 5)","Hey there r/assistance, Not sure if this is th...",0,1.0,1527009817


In [4]:
raw_df.sample(n=5)

Unnamed: 0,subreddit,post_id,sentence_range,text,label,confidence,social_timestamp
2392,survivorsofabuse,61vztv,"(25, 30)",It turns out they had been giving our grandpar...,0,0.8,1490659859
2209,stress,8b9j8h,"(5, 10)",Your responses to the survey items will be ano...,0,1.0,1523381550
1436,domesticviolence,9f9x15,"[25, 30]",While he constantly texts and asks what I'm do...,0,0.571429,1536774082
2757,anxiety,70o1bt,"[0, 5]",I'm 26. Tuesday is day one of therapy. Day one...,1,1.0,1505661182
405,relationships,7s6fc2,"(0, 5)",I'll try to keep this short and succint so it'...,0,1.0,1516634575


In [5]:
ignored_cols = ['subreddit','post_id','sentence_range','confidence','social_timestamp']
df = raw_df.drop(columns=ignored_cols)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2838 entries, 0 to 2837
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    2838 non-null   object
 1   label   2838 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 44.5+ KB


Relatively balanced starting dataset

In [6]:
df['label'].value_counts()

label
1    1488
0    1350
Name: count, dtype: int64

## Text Processing

from spacy import load --> errors

In [9]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re
from urllib.parse import urlparse

In [10]:
nltk.download('omw-1.4') # Open Multilingual Wordnet, this is an lexical database 
nltk.download('wordnet') 
nltk.download('wordnet2022')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/yichenzhang/nltk_data...
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/yichenzhang/nltk_data...
[nltk_data] Downloading package wordnet2022 to
[nltk_data]     /home/yichenzhang/nltk_data...
[nltk_data]   Unzipping corpora/wordnet2022.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     /home/yichenzhang/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/yichenzhang/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [11]:
lemmatizer = WordNetLemmatizer()
stop_words = list(stopwords.words('english'))
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [13]:
def process_text(sent):
    """Use RegEx to clean raw text data"""
    try:
        # brackets replacing by space
        sent = re.sub('[][)(]',' ',sent)

        # url removing
        sent = [word for word in sent.split() if not urlparse(word).scheme]
        sent = ' '.join(sent)

        # removing escap characters
        sent = re.sub(r'\@\w+','',sent)

        # removing html tags 
        sent = re.sub(re.compile("<.*?>"),'',sent)

        # getting only characters and numbers from text
        sent = re.sub("[^A-Za-z0-9]",' ',sent)

        # lower case all words
        sent = sent.lower()
        
        # strip all words from sentences
        sent = [word.strip() for word in sent.split()]
        sent = ' '.join(sent)

        # word tokenization
        tokens = word_tokenize(sent)
        
        # removing words which are in stopwords
        for word in tokens:
            if word in stop_words:
                tokens.remove(word)
        
        # lemmatization
        sent = [lemmatizer.lemmatize(word) for word in tokens]
        sent = ' '.join(sent)
        return sent
    
    except Exception as ex:
        print(sent,"\n")
        print("Error ",ex)

In [14]:
df['processed_text'] = df['text'].apply(lambda text: process_text(text))
df.head(2)

Unnamed: 0,text,label,processed_text
0,"He said he had not felt that way before, sugge...",1,said had felt way suggeted go rest so trigger ...
1,"Hey there r/assistance, Not sure if this is th...",0,hey r assistance sure right place post but go ...


In [15]:
from random import randint 

rand_index = randint(0, len(df)-1)
rand_index

1644

Not all the stopwords are being removed - possibility to improve here?

In [16]:
print("without process ---> ",df['text'].iloc[rand_index],end='\n\n')
print("after process ---> ",df['processed_text'].iloc[rand_index])

without process --->  >Like, I budget and we are responsible but the rent and bills and then gas and medical costs are so high we just cant do it >Im even studying coding to try and fight out of poverty but even thats tough because I am always at a physically demanding job >And the "good" resources cost money I dont have >I havent seen my family in two years. I have a niece growing up not knowing me because I cant afford to even miss a day of work let alone a week to visit them

after process --->  like budget responsible the rent bill then gas medical cost so high we just cant it im even studying coding try fight poverty even thats tough always physically demanding job the good resource cost money dont havent seen family two year i have niece growing not knowing because i cant afford even miss day of work let alone a week visit


In [17]:
from sklearn.feature_extraction.text import CountVectorizer
MIN_DF = 0.01 # Used to remove terms that appear too infrequently (here < 1% of docs)

In [18]:
cv = CountVectorizer(min_df=MIN_DF)
cv_df = cv.fit_transform(df['processed_text'])
cv_df.toarray()

array([[0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 1, 2],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [19]:
cv_df = pd.DataFrame(cv_df.toarray(),columns=cv.get_feature_names_out())
cv_df.head(3)

Unnamed: 0,10,100,12,15,18,20,30,able,about,absolutely,...,writing,wrong,x200b,year,yes,yesterday,yet,you,young,your
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [21]:
tf = TfidfVectorizer(min_df=MIN_DF)
tf_df = tf.fit_transform(df['processed_text'])
tf_df.toarray()

array([[0.        , 0.        , 0.        , ..., 0.11081108, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.08052805, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.09993405, 0.        , 0.        , ..., 0.        , 0.11013288,
        0.17343281],
       [0.        , 0.        , 0.        , ..., 0.12818576, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [22]:
tf_df = pd.DataFrame(tf_df.toarray(),columns=tf.get_feature_names_out())
tf_df.head(3)

Unnamed: 0,10,100,12,15,18,20,30,able,about,absolutely,...,writing,wrong,x200b,year,yes,yesterday,yet,you,young,your
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.110811,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.080528,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.071966,0.0,0.150222,0.0,0.0,0.0,0.0


In [23]:
tf_df.describe()

Unnamed: 0,10,100,12,15,18,20,30,able,about,absolutely,...,writing,wrong,x200b,year,yes,yesterday,yet,you,young,your
count,2838.0,2838.0,2838.0,2838.0,2838.0,2838.0,2838.0,2838.0,2838.0,2838.0,...,2838.0,2838.0,2838.0,2838.0,2838.0,2838.0,2838.0,2838.0,2838.0,2838.0
mean,0.004721,0.002462,0.00218,0.002766,0.00363,0.002963,0.002974,0.009523,0.00998,0.002895,...,0.003229,0.005138,0.003447,0.024714,0.002678,0.002189,0.004199,0.022016,0.002794,0.008884
std,0.030589,0.024067,0.022581,0.024958,0.027127,0.025976,0.024819,0.040154,0.038116,0.024772,...,0.029075,0.031476,0.034415,0.056172,0.024287,0.021551,0.028466,0.067082,0.023226,0.045767
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.359104,0.426,0.464568,0.302589,0.488688,0.381681,0.303773,0.37126,0.382298,0.301222,...,0.551892,0.338648,0.586544,0.406039,0.387032,0.331589,0.397938,0.621206,0.340199,0.625197


In [24]:
cv_df.shape

(2838, 778)

In [25]:
tf_df.shape

(2838, 778)