In [1]:
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np 
from tqdm import tqdm  # library used for simple and convenient way to add progress bars to loops and iterable objects
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,classification_report
import re
import gensim
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('Combined Data.csv')
df = df.iloc[:,1:]
df = df[df['statement'].isnull()==False]
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,statement,status
0,oh my gosh,Anxiety
1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,I've shifted my focus to something else but I'...,Anxiety
4,"I'm restless and restless, it's been a month n...",Anxiety


#  Word 2 Vec Implementation 

In [3]:
## pip install gensim  ##need to install this library if you have not already

In [4]:
lemmatizer=WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def process_text_lem(text):
    'Applying lemmetization and removing english stop words'
    cleaned_text = re.sub('[^a-zA-Z]', " ", text)
    cleaned_text = cleaned_text.lower()
    words = cleaned_text.split()
    processed_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    processed_text = " ".join(processed_words)
    return processed_text

In [5]:
batch_size = 10000
corpus = []

for start in range(0, len(df), batch_size):
    end = start + batch_size
    batch = df.iloc[start:end]
    batch['processed_text'] = batch['statement'].apply(process_text_lem)
    corpus.extend(batch['processed_text'].tolist())

In [6]:
# Lists to store indices and tokens
empty_token_indices = []  # List to store indices of empty tokens
words = []  # List to store non-empty tokens
appended_indices = []  # List to store indices of corpus items appended to words

for i, sent in enumerate(tqdm(corpus)):
    sent_token = sent_tokenize(sent)
    for sent in sent_token:
        tokens = simple_preprocess(sent, min_len=2, max_len=15)
        if not tokens:  # If the token list is empty
            empty_token_indices.append(i)  # Store the index
        else:
            words.append(tokens)
            appended_indices.append(i)  # Store the index of the original corpus

100%|██████████| 52681/52681 [00:04<00:00, 11228.86it/s]


In [7]:
# Convert appended_indices to a set for faster lookup
indices_to_keep = set(appended_indices)
# Filter the DataFrame to keep only rows whose index is in appended_indices
df_filtered = df[df.index.isin(indices_to_keep)]
# Alternatively, if you prefer to remove the rows not in appended_indices:
#df_filtered = df.drop(df.index.difference(appended_indices))

In [8]:
#Train the word 2 Vec from scratch
model_w2v = gensim.models.Word2Vec(words, window=7,min_count=5)

In [9]:
model_w2v.wv.index_to_key  #to check the vocabulary 

['like',
 'feel',
 'want',
 'know',
 'life',
 'get',
 'time',
 'even',
 'people',
 'would',
 'day',
 'year',
 'thing',
 'really',
 'one',
 'cannot',
 'going',
 'think',
 'go',
 'friend',
 'make',
 'much',
 'never',
 'help',
 'feeling',
 'could',
 'work',
 'thought',
 'anymore',
 'anxiety',
 'back',
 'anything',
 'way',
 'take',
 'still',
 'depression',
 'something',
 'good',
 'got',
 'always',
 'everything',
 'anyone',
 'need',
 'better',
 'every',
 'see',
 'nothing',
 'month',
 'someone',
 'also',
 'family',
 'bad',
 'since',
 'job',
 'hate',
 'right',
 'week',
 'say',
 'last',
 'love',
 'end',
 'live',
 'fucking',
 'getting',
 'keep',
 'talk',
 'die',
 'lot',
 'ever',
 'care',
 'everyone',
 'long',
 'person',
 'try',
 'trying',
 'tired',
 'point',
 'pain',
 'started',
 'hard',
 'around',
 'else',
 'well',
 'tell',
 'school',
 'first',
 'find',
 'away',
 'happy',
 'felt',
 'shit',
 'told',
 'alone',
 'come',
 'made',
 'tried',
 'parent',
 'said',
 'depressed',
 'ago',
 'stop',
 'sleep

In [10]:
model_w2v.wv.similar_by_word('education')   #function to check the similarty (sementic word) word in the given text data 

[('degree', 0.8473808169364929),
 ('field', 0.8235071897506714),
 ('academic', 0.8171334862709045),
 ('scholarship', 0.8151994943618774),
 ('student', 0.8049637675285339),
 ('finance', 0.8041580319404602),
 ('engineering', 0.802412211894989),
 ('bachelor', 0.7974151372909546),
 ('career', 0.788594126701355),
 ('salary', 0.7884148359298706)]

In [11]:
#Now creating function for  Average word2vec for simpler model
def avg_word2vec(doc, model_w2v, vector_size):
    valid_words = [model_w2v.wv[word] for word in doc if word in model_w2v.wv.index_to_key]
    if len(valid_words) > 0:
        return np.mean(valid_words, axis=0)
    else:
        # Return a zero vector if no valid words are found
        return np.zeros(vector_size)

In [12]:
#apply Average Word2vec for the entire corpus
X = []
vector_size = model_w2v.vector_size  # Get the size of the word2vec vectors
for i in tqdm(range(len(words))):
    X.append(avg_word2vec(words[i], model_w2v, vector_size))

100%|██████████| 52497/52497 [01:20<00:00, 648.15it/s] 


In [13]:
X = np.array(X)
y = df_filtered['status']

In [14]:
#Train Test Split
w2v_X_train , w2v_X_test, w2v_y_train, w2v_y_test = train_test_split(X, y, test_size=0.2, random_state = 9)

# Modelling
**Random Forest Classifier with default parameters**

In [15]:
from sklearn.ensemble import RandomForestClassifier

Emotion_classifier_model_W2V = RandomForestClassifier(min_samples_split=10)
Emotion_classifier_model_W2V.fit(w2v_X_train,w2v_y_train)

#Prediction
ypred_test = Emotion_classifier_model_W2V.predict(w2v_X_test)
ypred_train = Emotion_classifier_model_W2V.predict(w2v_X_train)

# Evaluation

print("Train Accuracy", accuracy_score(w2v_y_train,ypred_train))
print("Test Accuracy", accuracy_score(w2v_y_test,ypred_test))
report = classification_report(w2v_y_test, ypred_test)
print("Classification Report:\n", report)

Train Accuracy 0.9772364692716147
Test Accuracy 0.685047619047619
Classification Report:
                       precision    recall  f1-score   support

             Anxiety       0.81      0.60      0.69       811
             Bipolar       0.75      0.41      0.53       538
          Depression       0.56      0.71      0.63      2993
              Normal       0.79      0.94      0.86      3332
Personality disorder       1.00      0.24      0.38       216
              Stress       0.92      0.21      0.34       514
            Suicidal       0.64      0.51      0.56      2096

            accuracy                           0.69     10500
           macro avg       0.78      0.52      0.57     10500
        weighted avg       0.70      0.69      0.67     10500



In [16]:
# Trying it on the new text

# New sentence to classify
new_sentence = input("Enter your new sentence here: ")

# Preprocess the new sentence
def preprocess_single_sentence(text):
    'Applying lemmetization and removing english stop words'
    cleaned_text = re.sub('[^a-zA-Z]', " ", text)
    cleaned_text = cleaned_text.lower()
    words = cleaned_text.split()
    processed_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    processed_text = " ".join(processed_words)
    return processed_text

processed_sentence = preprocess_single_sentence(new_sentence)

processed_sentence = simple_preprocess(processed_sentence, min_len=2, max_len=15)

# Step 3: Transform the sentence using the avg_word2vec function
new_sentence_vector = avg_word2vec(processed_sentence, model_w2v, vector_size).reshape(1, -1)  # Reshape to 2D array

# Make prediction using the trained model
prediction = Emotion_classifier_model_W2V.predict(new_sentence_vector)
prediction_proba = Emotion_classifier_model_W2V.predict_proba(new_sentence_vector)

# Map prediction to emotions
mental_state = {
    0: "Anxiety",
    1: "Bipolar",
    2: "Depression",
    3: "Normal",
    4: "Personality disorder",
    5: "Stress",
    6: "Suicidal"
}

for index, proba in enumerate(prediction_proba[0]):
    emotion = mental_state[index]
    print(f"{emotion}: {proba * 100:.2f}%")
print("Prediction probabilities of your current mental state is  :", prediction)

Enter your new sentence here:  Tired of living daily like everything okay


Anxiety: 4.82%
Bipolar: 3.47%
Depression: 34.53%
Normal: 13.97%
Personality disorder: 0.83%
Stress: 5.78%
Suicidal: 36.61%
Prediction probabilities of your current mental state is  : ['Suicidal']
