# Setup

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import emoji
from warnings import filterwarnings

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.utils import compute_class_weight
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import MultinomialNB

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, GRU, GlobalMaxPooling1D
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

filterwarnings('ignore')
pd.set_option('display.max_columns', None)
plt.style.use('ggplot')

In [2]:
PATH = "../data/data_cleaned.csv"
df = pd.read_csv(PATH)
df.head()

Unnamed: 0,status,cleaned_text,statement length,contains_num,capital_ratio,word_count,avg_word_length,sentence_count,avg_sentence_length,lemmatized_text
0,Anxiety,oh my gosh,10,0,0.0,3,2.666667,1,3.0,oh gosh
1,Anxiety,trouble sleeping confused mind restless heart ...,64,0,0.019231,10,4.230769,2,5.0,trouble sleeping confused mind restless heart ...
2,Anxiety,all wrong back off dear forward doubt stay in ...,78,0,0.032258,14,3.823529,2,7.0,wrong back dear forward doubt stay restless re...
3,Anxiety,i have shifted my focus to something else but ...,61,0,0.040816,13,3.923077,1,11.0,shifted focus something else still worried
4,Anxiety,i am restless and restless it is been a month ...,72,0,0.037736,16,2.95,2,7.0,restless restless month boy mean


In [10]:
df[df['lemmatized_text'].isnull()]

Unnamed: 0,status,cleaned_text,statement length,contains_num,capital_ratio,word_count,avg_word_length,sentence_count,avg_sentence_length,lemmatized_text
722,Normal,what should i do,20,0,0.153846,4,2.285714,1,5.0,
1203,Normal,just on again,13,0,0.090909,3,3.666667,1,3.0,
1842,Normal,where are you,14,0,0.000000,3,3.000000,1,3.0,
2321,Normal,where is this,13,0,0.090909,3,3.666667,1,3.0,
2596,Normal,can you not,12,0,0.111111,3,2.500000,1,3.0,
...,...,...,...,...,...,...,...,...,...,...
33413,Normal,how did you do that,20,0,0.000000,5,2.666667,1,5.0,
33473,Normal,how can he do that,19,0,0.000000,5,2.500000,1,5.0,
33559,Normal,why did they do that,21,0,0.000000,5,2.833333,1,5.0,
35858,Suicidal,,47,1,0.294118,8,7.666667,1,2.0,


In [11]:
df.dropna(subset=['lemmatized_text'], inplace=True)

In [12]:
LABEL = 'status'

In [13]:
X = df.drop(LABEL, axis=1)
y = df[LABEL]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

## Tf-idf Vectorizer and Label Encoding

In [14]:
# Train vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_feat = X_train.drop(['lemmatized_text'], axis=1).values
X_train_text = tfidf_vectorizer.fit_transform(X_train['lemmatized_text']).toarray()
X_train_all = np.hstack((X_train_feat, X_train_text))

X_test_feat = X_test.drop(['lemmatized_text'], axis=1).values
X_test_text = tfidf_vectorizer.transform(X_test['lemmatized_text']).toarray()
X_test_all = np.hstack((X_test_feat, X_test_text))

encoder = LabelEncoder()
y_train_encoded = encoder.fit_transform(y_train)
y_test_encoded = encoder.transform(y_test)

## Baseline Model

In [16]:
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42),
    'Multinomial Naive Bayes': MultinomialNB(),
    'LightGBM': LGBMClassifier(random_state=42, verbose=-1)
}

def modelling(X_train, y_train, X_test, y_test, models):
    name_l, f1_scores = [], []
    for name, model in models.items():
        print(f"Model: {name}")
        name_l.append(name)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        f1 = f1_score(y_test, y_pred, average='weighted')
        print(f"F1 Score: {f1:.4f}")
        f1_scores.append(f1)
        print(classification_report(y_test, y_pred, target_names=encoder.classes_))
        cm = confusion_matrix(y_test, y_pred)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=encoder.classes_, yticklabels=encoder.classes_)
        plt.title(f'Confusion Matrix for {name}')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.show()
    return pd.DataFrame({'Model': name_l, 'F1 Score': f1_scores})

results = modelling(X_train_all, y_train_encoded, X_test_all, y_test_encoded, models)
results.sort_values(by='F1 Score', ascending=False, inplace=True)
        

Model: Random Forest


MemoryError: Unable to allocate 779. MiB for an array with shape (40795, 5008) and data type float32

## Deep Learning Model

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y,random_state=42)

In [None]:
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

seqs = tokenizer.texts_to_sequences(X_train)
MAXLEN = max(len(seq) for seq in seqs)
X_train = pad_sequences(seqs, padding='post', maxlen=MAXLEN)

seqs = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(seqs, padding='post', maxlen=MAXLEN)

print(len(tokenizer.word_index))

In [None]:
def create_model(X, y, input_length, num_words=10000):
  model = Sequential([
      Embedding(input_dim=num_words, output_dim=100, input_length=input_length),
      Bidirectional(LSTM(128, return_sequences=True)),
      GlobalMaxPooling1D(),
      # Dropout(0.2),
      # GRU(64),
      Dropout(0.3),
      Dense(32, activation='relu'),
      Dropout(0.3),
      Dense(df[LABEL].nunique(), activation='softmax')
  ])

  model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

  return model

model = create_model(X_train, y_train, X_train.shape[1])
model.summary()

In [None]:
model.fit(X_train, y_train, epochs=10, validation_split=0.2, verbose=1, batch_size=32)
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=1)
print(classification_report(y_test, y_pred))