# Import Libararies

In [2]:
#!pip install jupyterlab_widgets

# PyTorch
#!pip install torch torchvision torchaudio

# Optional: Hugging Face Transformers
#!pip install transformers

In [3]:
# OR TensorFlow
#!pip install tensorflow

In [4]:
import pandas as pd
from transformers import AutoTokenizer
from datasets import Dataset
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

df = pd.read_csv("Combined Data.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,statement,status
0,0,oh my gosh,Anxiety
1,1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,3,I've shifted my focus to something else but I'...,Anxiety
4,4,"I'm restless and restless, it's been a month n...",Anxiety


In [5]:
# 1. Quick summary: count of missing values per column
print(df.isnull().sum())

# 2. Percentage of missing values per column
print((df.isnull().mean() * 100).round(2))

# 3. Total number of missing values in the entire DataFrame
print(df.isnull().sum().sum())


Unnamed: 0      0
statement     362
status          0
dtype: int64
Unnamed: 0    0.00
statement     0.68
status        0.00
dtype: float64
362


In [6]:
df = df.dropna(subset=['statement'])
print(df['statement'].isnull().sum())  # Should be 0

0


# Text Preprocessing

In [7]:
# Clean text column
df['statement'] = df['statement'].astype(str)  # Force all values to string
df = df[df['statement'].notnull()]        # Remove rows where text is NaN

In [8]:
# Download necessary NLTK resources (do it once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

# Processing Steps
nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to C:\Users\Yap
[nltk_data]     Jack\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Yap
[nltk_data]     Jack\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Yap
[nltk_data]     Jack\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Yap Jack\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Yap Jack\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to C:\Users\Yap
[nltk_data]     Jack\

True

In [9]:
# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Create stopwords set but keep negative and pronouns
stop_words = set(stopwords.words('english'))
negation_words = {"no", "nor", "not", "don", "don't", "ain", "aren", "aren't",
                  "couldn", "couldn't", "didn", "didn't", "doesn", "doesn't",
                  "hadn", "hadn't", "hasn", "hasn't", "haven", "haven't",
                  "isn", "isn't", "mightn", "mightn't", "mustn", "mustn't",
                  "needn", "needn't", "shan", "shan't", "shouldn", "shouldn't",
                  "wasn", "wasn't", "weren", "weren't", "won", "won't",
                  "wouldn", "wouldn't"}

pronouns_to_keep = {'i', 'you', 'he', 'she', 'we', 'they', 'me', 'him', 'her', 'us', 'them'}

stop_words = stop_words - negation_words - pronouns_to_keep

# Expand contractions function
def expand_contractions(text):
    contractions = {
        "i'm": "i am", "i've": "i have", "i'll": "i will", "don't": "do not",
        "can't": "cannot", "won't": "will not", 
        "it isn't": "it is not", "isn't it": "is it not",
        "wasn't it": "was it not", "it wasn't": "it was not",
        "they aren't": "they are not", "aren't they": "are they not",
        "aren't we": "are we not", "we aren't": "we are not", "aren't i": "am i not",
        "weren't they": "were they not", "they weren't": "they were not",
        "you're": "you are", "he's": "he is", "she's": "she is", "it's": "it is",
        "we're": "we are", "they're": "they are"
    }

    # Create regex pattern with word boundaries, ignore case
    pattern = re.compile(r'\b(' + '|'.join(map(re.escape, contractions.keys())) + r')\b', flags=re.IGNORECASE)

    def match_case(original, replacement):
        if original.isupper():
            return replacement.upper()
        elif original[0].isupper():
            return replacement.capitalize()
        else:
            return replacement

    def replace(match):
        original = match.group(0)
        contraction = original.lower()
        expanded = contractions.get(contraction, contraction)
        return match_case(original, expanded)

    return pattern.sub(replace, text)


# Helper to convert nltk POS tags to WordNet POS tags for lemmatizer
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # Default to noun

def preprocess_text(text):
    text = expand_contractions(text)

    # Sentence tokenize
    sentences = sent_tokenize(text)

    # Remove non-alphanumeric characters (excluding apostrophes)
    cleaned_sentences = [re.sub(r"[^a-zA-Z0-9\s']", " ", s) for s in sentences]

    # Tokenization
    tokens = []
    for s in cleaned_sentences:
        tokens.extend(word_tokenize(s))

    # POS tagging on original tokens (before case folding)
    # I acknowledge that spaCy provides more accurate POS tagging compared to NLTK's pos_tag.
    # However, due to resource constraints and familiarity with NTLK, pos_tag is used here.
    pos_tags = nltk.pos_tag(tokens)

    # Case folding AFTER POS tagging
    pos_tags = [(word.lower(), tag) for word, tag in pos_tags]

    # Lemmatization using POS
    lemmatized = [
        lemmatizer.lemmatize(word, get_wordnet_pos(pos))
        for word, pos in pos_tags
    ]

    # Stopword filtering
    filtered = [word for word in lemmatized if word not in stop_words]

    return ' '.join(filtered)

# Sample test
sample_text = "I'm not sure if you're going to like this! I've been feeling quite down lately — nothing seems to work, and I can't say I'm happy. However, I don't want to give up. Isn't it strange how life changes so quickly? We're trying our best, aren't we? Don't let anyone tell you otherwise."

cleaned_text = preprocess_text(sample_text)
print("\nFinal preprocessed text:\n", cleaned_text)


Final preprocessed text:
 i not sure you go like i feel quite lately nothing seem work i not say i happy however i not want give not strange life change quickly we try best we not not let anyone tell you otherwise


In [10]:
# This code is only for testing and debugging purposes 

"""
def preprocess_text(text):
    print("Original Text:\n", text, "\n")

    # 1. Expand contractions
    text = expand_contractions(text)
    print("After Contraction Expansion:\n", text, "\n")

    # 2. Sentence tokenization
    sentences = sent_tokenize(text)
    print("Sentences:\n", sentences, "\n")

    # 3. Remove non-alphanumeric characters (except apostrophes)
    cleaned_sentences = [re.sub(r"[^a-zA-Z0-9\s']", " ", s) for s in sentences]
    print("After Removing Punctuation:\n", cleaned_sentences, "\n")

    # 4. Word tokenization
    tokens = []
    for s in cleaned_sentences:
        tokens.extend(word_tokenize(s))
    print("Tokens:\n", tokens, "\n")

    # 5. POS tagging
    pos_tags = nltk.pos_tag(tokens)
    print("POS Tags:\n", pos_tags, "\n")

    # 6. Case folding after POS tagging
    pos_tags = [(word.lower(), tag) for word, tag in pos_tags]
    print("POS Tags After Case Folding:\n", pos_tags, "\n")

    # 7. Lemmatization using POS
    lemmatized = [
        lemmatizer.lemmatize(word, get_wordnet_pos(pos))
        for word, pos in pos_tags
    ]
    print("Lemmatized:\n", lemmatized, "\n")

    # 8. Remove stopwords (excluding negation and pronouns)
    filtered = [word for word in lemmatized if word not in stop_words]
    print("Filtered (No Stopwords):\n", filtered, "\n")

    # Final output
    final_output = ' '.join(filtered)
    print("Final Output:\n", final_output, "\n")
    return final_output

# Sample test
sample_text = "I'm not sure if you're going to like this! I've been feeling quite down lately — nothing seems to work, and I can't say I'm happy. However, I don't want to give up. Isn't it strange how life changes so quickly? We're trying our best, aren't we? Don't let anyone tell you otherwise."

cleaned_text = preprocess_text(sample_text)
print("\nFinal preprocessed text:\n", cleaned_text)
"""

'\ndef preprocess_text(text):\n    print("Original Text:\n", text, "\n")\n\n    # 1. Expand contractions\n    text = expand_contractions(text)\n    print("After Contraction Expansion:\n", text, "\n")\n\n    # 2. Sentence tokenization\n    sentences = sent_tokenize(text)\n    print("Sentences:\n", sentences, "\n")\n\n    # 3. Remove non-alphanumeric characters (except apostrophes)\n    cleaned_sentences = [re.sub(r"[^a-zA-Z0-9\\s\']", " ", s) for s in sentences]\n    print("After Removing Punctuation:\n", cleaned_sentences, "\n")\n\n    # 4. Word tokenization\n    tokens = []\n    for s in cleaned_sentences:\n        tokens.extend(word_tokenize(s))\n    print("Tokens:\n", tokens, "\n")\n\n    # 5. POS tagging\n    pos_tags = nltk.pos_tag(tokens)\n    print("POS Tags:\n", pos_tags, "\n")\n\n    # 6. Case folding after POS tagging\n    pos_tags = [(word.lower(), tag) for word, tag in pos_tags]\n    print("POS Tags After Case Folding:\n", pos_tags, "\n")\n\n    # 7. Lemmatization using POS

In [11]:
df['cleaned_text'] = df['statement'].apply(preprocess_text)
df[['statement', 'cleaned_text', 'status']]


Unnamed: 0,statement,cleaned_text,status
0,oh my gosh,oh gosh,Anxiety
1,"trouble sleeping, confused mind, restless hear...",trouble sleep confuse mind restless heart tune,Anxiety
2,"All wrong, back off dear, forward doubt. Stay ...",wrong back dear forward doubt stay restless re...,Anxiety
3,I've shifted my focus to something else but I'...,i shift focus something else i still worried,Anxiety
4,"I'm restless and restless, it's been a month n...",i restless restless month boy you mean,Anxiety
...,...,...,...
53038,Nobody takes me seriously I’ve (24M) dealt wit...,nobody take me seriously i 24m dealt depressio...,Anxiety
53039,"selfishness ""I don't feel very good, it's lik...",selfishness i not feel good like i not belong ...,Anxiety
53040,Is there any way to sleep better? I can't slee...,way sleep good i not sleep night med n't help,Anxiety
53041,"Public speaking tips? Hi, all. I have to give ...",public speak tip hi i give presentation work n...,Anxiety


# Exploratory Data Analysis

In [13]:
from collections import Counter

print(Counter(df["status"]))

Counter({'Normal': 16343, 'Depression': 15404, 'Suicidal': 10652, 'Anxiety': 3841, 'Bipolar': 2777, 'Stress': 2587, 'Personality disorder': 1077})


# Text Processing 

In [18]:
# !pip install textblob
# !pip install vaderSentiment
# !pip install afinn
# !pip install pywsd nltk
# !pip install spacy

## Named Entity Recognition (NER)

In [27]:
import spacy

nlp = spacy.load("en_core_web_sm")

def extract_ner(text):
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

df["named_entities"] = df["statement"].apply(extract_ner)

## Word Sense Disambiguation (WSD)

In [29]:
from nltk.wsd import lesk
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
import nltk

nltk.download('punkt')
nltk.download('wordnet')

def disambiguate(text, word):
    tokens = word_tokenize(text)
    sense = lesk(tokens, word)
    return sense.definition() if sense else None

# Example: Apply WSD to the word "stress" (you can adjust this)
df["wsd_stress"] = df["statement"].apply(lambda x: disambiguate(x, "stress"))


[nltk_data] Downloading package punkt to C:\Users\Yap
[nltk_data]     Jack\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Yap
[nltk_data]     Jack\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Sentiment Lexicon

In [21]:
from nltk.sentiment import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')

sia = SentimentIntensityAnalyzer()

df["score"] = df["cleaned_text"].apply(lambda x: sia.polarity_scores(x)["compound"])

df = df[['statement','cleaned_text','score','status']]
df

[nltk_data] Downloading package vader_lexicon to C:\Users\Yap
[nltk_data]     Jack\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0.1,Unnamed: 0,statement,status,cleaned_text,score
0,0,oh my gosh,Anxiety,oh gosh,0.0000
1,1,"trouble sleeping, confused mind, restless hear...",Anxiety,trouble sleep confuse mind restless heart tune,-0.6908
2,2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety,wrong back dear forward doubt stay restless re...,-0.7351
3,3,I've shifted my focus to something else but I'...,Anxiety,i shift focus something else i still worried,-0.2960
4,4,"I'm restless and restless, it's been a month n...",Anxiety,i restless restless month boy you mean,-0.4939
...,...,...,...,...,...
53038,53038,Nobody takes me seriously I’ve (24M) dealt wit...,Anxiety,nobody take me seriously i 24m dealt depressio...,0.8696
53039,53039,"selfishness ""I don't feel very good, it's lik...",Anxiety,selfishness i not feel good like i not belong ...,-0.9830
53040,53040,Is there any way to sleep better? I can't slee...,Anxiety,way sleep good i not sleep night med n't help,0.1635
53041,53041,"Public speaking tips? Hi, all. I have to give ...",Anxiety,public speak tip hi i give presentation work n...,-0.6249


## Polarity

In [26]:
sia = SentimentIntensityAnalyzer()

df["score"] = df["cleaned_text"].apply(lambda x: sia.polarity_scores(x)["compound"])

df = df[['statement','cleaned_text','score','status']]
df

from textblob import TextBlob

blob = TextBlob()

df["Polarity"] = df["cleaned_text"].apply(lambda x: sia.polarity_scores(x)["compound"])
print("Polarity:", blob.sentiment.polarity)
print("Subjectivity:", blob.sentiment.subjectivity)


Unnamed: 0,statement,cleaned_text,score,status
0,oh my gosh,oh gosh,0.0000,Anxiety
1,"trouble sleeping, confused mind, restless hear...",trouble sleep confuse mind restless heart tune,-0.6908,Anxiety
2,"All wrong, back off dear, forward doubt. Stay ...",wrong back dear forward doubt stay restless re...,-0.7351,Anxiety
3,I've shifted my focus to something else but I'...,i shift focus something else i still worried,-0.2960,Anxiety
4,"I'm restless and restless, it's been a month n...",i restless restless month boy you mean,-0.4939,Anxiety
...,...,...,...,...
53038,Nobody takes me seriously I’ve (24M) dealt wit...,nobody take me seriously i 24m dealt depressio...,0.8696,Anxiety
53039,"selfishness ""I don't feel very good, it's lik...",selfishness i not feel good like i not belong ...,-0.9830,Anxiety
53040,Is there any way to sleep better? I can't slee...,way sleep good i not sleep night med n't help,0.1635,Anxiety
53041,"Public speaking tips? Hi, all. I have to give ...",public speak tip hi i give presentation work n...,-0.6249,Anxiety


## Sentence Sentiment Classification

In [30]:
from textblob import TextBlob

def get_sentiment_label(text):
    score = TextBlob(text).sentiment.polarity
    if score > 0.1:
        return "positive"
    elif score < -0.1:
        return "negative"
    else:
        return "neutral"

df["sentiment_label"] = df["statement"].apply(get_sentiment_label)


# Feature Selection

# Model Construction