# LOGISTIC REGRESSION

## TFIDF

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the preprocessed dataset
dataset = pd.read_csv('preprocessed_mental_health.csv')

# Check if 'cleaned_text' and 'mental_health_issue' columns exist
if 'cleaned_text' not in dataset.columns or 'mental_health_issue' not in dataset.columns:
    raise ValueError("The dataset must have 'cleaned_text' and 'mental_health_issue' columns.")

# Remove rows with missing values in 'cleaned_text' column
dataset.dropna(subset=['cleaned_text'], inplace=True)

# Initialize the TfidfVectorizer and fit/transform the cleaned text
TFIDFvectorizer = TfidfVectorizer()
X = TFIDFvectorizer.fit_transform(dataset['cleaned_text'])

# Prepare the target variable
y = dataset['mental_health_issue']

# Split the dataset into Training and Test Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Logistic Regression model
LRmodel = LogisticRegression(multi_class='ovr', max_iter=5000)

# Train the model
LRmodel.fit(X_train, y_train)

# Make predictions on the test set
y_pred = LRmodel.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))




Accuracy: 86.02%
Classification Report:
               precision    recall  f1-score   support

     anxiety       0.85      0.70      0.77       379
     bipolar       0.85      0.47      0.60       384
  depression       0.75      0.76      0.76       373
      normal       0.87      1.00      0.93      2183
        ptsd       0.91      0.73      0.81       394

    accuracy                           0.86      3713
   macro avg       0.85      0.73      0.77      3713
weighted avg       0.86      0.86      0.85      3713

Confusion Matrix:
 [[ 266    7   32   67    7]
 [   6  180   33  157    8]
 [  20   19  284   39   11]
 [   1    2    3 2176    1]
 [  21    5   27   53  288]]


## LIWC

In [2]:
!pip install empath

Collecting empath
  Downloading empath-0.89.tar.gz (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: empath
  Building wheel for empath (setup.py) ... [?25l[?25hdone
  Created wheel for empath: filename=empath-0.89-py3-none-any.whl size=57798 sha256=71173d3f3659c6d8d02dff6c2076564436526401cddc902e4342944e2c4c2b9c
  Stored in directory: /root/.cache/pip/wheels/92/b3/83/9eb2c6199881e2385a59d99bd911363475060ebeb4bdb27242
Successfully built empath
Installing collected packages: empath
Successfully installed empath-0.89


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from empath import Empath

# Initialize Empath
lexicon = Empath()

# Load the preprocessed dataset
dataset = pd.read_csv('preprocessed_mental_health.csv')

# Check if 'cleaned_text' and 'mental_health_issue' columns exist
if 'cleaned_text' not in dataset.columns or 'mental_health_issue' not in dataset.columns:
    raise ValueError("The dataset must have 'cleaned_text' and 'mental_health_issue' columns.")

# Remove rows with missing values in 'cleaned_text' column
dataset.dropna(subset=['cleaned_text'], inplace=True)

# Function to get Empath features
def get_empath_features(text):
    analysis = lexicon.analyze(text, normalize=True)  # Normalize to get proportions
    return analysis

# Generate Empath features for each text
empath_features = dataset['cleaned_text'].apply(get_empath_features)

# Convert Empath features to a DataFrame
X = pd.DataFrame(empath_features.tolist())

# Prepare the target variable
y = dataset['mental_health_issue']

# Split the dataset into Training and Test Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Logistic Regression model
LRmodel = LogisticRegression(multi_class='ovr', max_iter=5000)

# Train the model
LRmodel.fit(X_train, y_train)

# Make predictions on the test set
y_pred = LRmodel.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))




Accuracy: 66.36%
Classification Report:
               precision    recall  f1-score   support

     anxiety       0.64      0.50      0.56       379
     bipolar       0.47      0.05      0.09       384
  depression       0.52      0.25      0.33       373
      normal       0.68      0.97      0.80      2183
        ptsd       0.59      0.12      0.20       394

    accuracy                           0.66      3713
   macro avg       0.58      0.38      0.40      3713
weighted avg       0.63      0.66      0.59      3713

Confusion Matrix:
 [[ 191    4   11  165    8]
 [  30   18   33  292   11]
 [  29    4   92  244    4]
 [  24   10   23 2115   11]
 [  24    2   19  301   48]]


## WORD2VEC

In [12]:
import pandas as pd
import gensim
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.tokenize import word_tokenize
import nltk
import numpy as np

# Download NLTK punkt tokenizer if not already downloaded
nltk.download('punkt')
nltk.download('punkt_tab')
# Load the preprocessed dataset
dataset = pd.read_csv('preprocessed_mental_health.csv')

# Check if 'cleaned_text' and 'mental_health_issue' columns exist
if 'cleaned_text' not in dataset.columns or 'mental_health_issue' not in dataset.columns:
    raise ValueError("The dataset must have 'cleaned_text' and 'mental_health_issue' columns.")

# Remove rows with missing values in 'cleaned_text' column
dataset.dropna(subset=['cleaned_text'], inplace=True)

# Tokenize the text data into words
def tokenize_text(text):
    return word_tokenize(text.lower())

dataset['tokens'] = dataset['cleaned_text'].apply(tokenize_text)

# Train a Word2Vec model using the tokenized data
word2vec_model = gensim.models.Word2Vec(sentences=dataset['tokens'], vector_size=100, window=5, min_count=1, workers=4)

# Function to average Word2Vec vectors for each document
def get_document_vector(tokens):
    # Filter out words not in the Word2Vec vocabulary
    valid_tokens = [word for word in tokens if word in word2vec_model.wv]
    if len(valid_tokens) == 0:
        return [0] * word2vec_model.vector_size  # Return a zero vector if no valid tokens
    # Average the Word2Vec vectors of the words in the document
    vectors = [word2vec_model.wv[word] for word in valid_tokens]
    return list(np.mean(vectors, axis=0))

# Convert the text data into document vectors
X = dataset['tokens'].apply(get_document_vector)

# Prepare the target variable
y = dataset['mental_health_issue']

# Split the dataset into Training and Test Sets
X_train, X_test, y_train, y_test = train_test_split(list(X), y, test_size=0.2, random_state=42)

# Initialize the Logistic Regression model
LRmodel = LogisticRegression(multi_class='ovr', max_iter=5000)

# Train the model
LRmodel.fit(X_train, y_train)

# Make predictions on the test set
y_pred = LRmodel.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Accuracy: 79.40%
Classification Report:
               precision    recall  f1-score   support

     anxiety       0.75      0.66      0.70       379
     bipolar       0.65      0.42      0.51       384
  depression       0.59      0.66      0.63       373
      normal       0.86      0.96      0.91      2183
        ptsd       0.70      0.49      0.57       394

    accuracy                           0.79      3713
   macro avg       0.71      0.64      0.66      3713
weighted avg       0.78      0.79      0.78      3713

Confusion Matrix:
 [[ 252   18   41   53   15]
 [  20  161   51  122   30]
 [  24   35  248   49   17]
 [  16   21   31 2095   20]
 [  24   11   48  119  192]]


## N-GRAM (N=3)

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the preprocessed dataset
dataset = pd.read_csv('preprocessed_mental_health.csv')

# Check if 'cleaned_text' and 'mental_health_issue' columns exist
if 'cleaned_text' not in dataset.columns or 'mental_health_issue' not in dataset.columns:
    raise ValueError("The dataset must have 'cleaned_text' and 'mental_health_issue' columns.")

# Remove rows with missing values in 'cleaned_text' column
dataset.dropna(subset=['cleaned_text'], inplace=True)

# Initialize the CountVectorizer with n-grams (e.g., bi-grams)
vectorizer = CountVectorizer(ngram_range=(1, 3))  # Change (1, 2) to (1, 3) for tri-grams or higher-order n-grams

# Fit and transform the cleaned text data
X = vectorizer.fit_transform(dataset['cleaned_text'])

# Prepare the target variable
y = dataset['mental_health_issue']

# Split the dataset into Training and Test Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Logistic Regression model
LRmodel = LogisticRegression(multi_class='ovr', max_iter=5000)

# Train the model
LRmodel.fit(X_train, y_train)

# Make predictions on the test set
y_pred = LRmodel.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))




Accuracy: 87.13%
Classification Report:
               precision    recall  f1-score   support

     anxiety       0.83      0.77      0.80       379
     bipolar       0.75      0.53      0.62       384
  depression       0.76      0.75      0.75       373
      normal       0.91      0.99      0.95      2183
        ptsd       0.89      0.74      0.81       394

    accuracy                           0.87      3713
   macro avg       0.83      0.76      0.79      3713
weighted avg       0.87      0.87      0.86      3713

Confusion Matrix:
 [[ 293   17   24   33   12]
 [   7  205   32  131    9]
 [  24   31  279   26   13]
 [   2    8    4 2167    2]
 [  27   11   28   37  291]]


# NAIVE BAYES

## TFIDF

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the preprocessed dataset
dataset = pd.read_csv('preprocessed_mental_health.csv')

# Check if 'cleaned_text' and 'mental_health_issue' columns exist
if 'cleaned_text' not in dataset.columns or 'mental_health_issue' not in dataset.columns:
    raise ValueError("The dataset must have 'cleaned_text' and 'mental_health_issue' columns.")

# Remove rows with missing values in 'cleaned_text' column
dataset.dropna(subset=['cleaned_text'], inplace=True)

# Initialize the TfidfVectorizer and fit/transform the cleaned text
TFIDFvectorizer = TfidfVectorizer()
X = TFIDFvectorizer.fit_transform(dataset['cleaned_text'])

# Prepare the target variable
y = dataset['mental_health_issue']

# Split the dataset into Training and Test Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Naive Bayes model
NBmodel = MultinomialNB()

# Train the model
NBmodel.fit(X_train, y_train)

# Make predictions on the test set
y_pred = NBmodel.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 79.42%
Classification Report:
               precision    recall  f1-score   support

     anxiety       0.84      0.50      0.63       379
     bipolar       0.96      0.17      0.30       384
  depression       0.69      0.73      0.71       373
      normal       0.80      1.00      0.89      2183
        ptsd       0.77      0.60      0.68       394

    accuracy                           0.79      3713
   macro avg       0.81      0.60      0.64      3713
weighted avg       0.81      0.79      0.76      3713

Confusion Matrix:
 [[ 190    0   43  122   24]
 [  16   67   49  213   39]
 [  11    1  274   79    8]
 [   0    0    3 2180    0]
 [   9    2   27  118  238]]


## LIWC

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from empath import Empath

# Initialize Empath
lexicon = Empath()

# Load the preprocessed dataset
dataset = pd.read_csv('preprocessed_mental_health.csv')

# Check if 'cleaned_text' and 'mental_health_issue' columns exist
if 'cleaned_text' not in dataset.columns or 'mental_health_issue' not in dataset.columns:
    raise ValueError("The dataset must have 'cleaned_text' and 'mental_health_issue' columns.")

# Remove rows with missing values in 'cleaned_text' column
dataset.dropna(subset=['cleaned_text'], inplace=True)

# Function to get Empath features
def get_empath_features(text):
    analysis = lexicon.analyze(text, normalize=True)  # Normalize to get proportions
    return analysis

# Generate Empath features for each text
empath_features = dataset['cleaned_text'].apply(get_empath_features)

# Convert Empath features to a DataFrame
X = pd.DataFrame(empath_features.tolist())

# Prepare the target variable
y = dataset['mental_health_issue']

# Split the dataset into Training and Test Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Naive Bayes model
NBmodel = MultinomialNB()

# Train the model
NBmodel.fit(X_train, y_train)

# Make predictions on the test set
y_pred = NBmodel.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 59.63%
Classification Report:
               precision    recall  f1-score   support

     anxiety       0.55      0.08      0.14       379
     bipolar       0.00      0.00      0.00       384
  depression       0.48      0.03      0.06       373
      normal       0.60      0.99      0.75      2183
        ptsd       1.00      0.00      0.01       394

    accuracy                           0.60      3713
   macro avg       0.53      0.22      0.19      3713
weighted avg       0.56      0.60      0.46      3713

Confusion Matrix:
 [[  31    0    1  347    0]
 [  11    0    6  367    0]
 [   2    0   11  360    0]
 [   7    2    3 2171    0]
 [   5    0    2  386    1]]


## WORD2VEC

In [17]:
!pip install scikit-learn
import pandas as pd
import gensim
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.tokenize import word_tokenize
import nltk
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Download NLTK punkt tokenizer if not already downloaded
nltk.download('punkt')
nltk.download('punkt_tab')

# Load the preprocessed dataset
dataset = pd.read_csv('preprocessed_mental_health.csv')

# Check if 'cleaned_text' and 'mental_health_issue' columns exist
if 'cleaned_text' not in dataset.columns or 'mental_health_issue' not in dataset.columns:
    raise ValueError("The dataset must have 'cleaned_text' and 'mental_health_issue' columns.")

# Remove rows with missing values in 'cleaned_text' column
dataset.dropna(subset=['cleaned_text'], inplace=True)

# Tokenize the text data into words
def tokenize_text(text):
    return word_tokenize(text.lower())

dataset['tokens'] = dataset['cleaned_text'].apply(tokenize_text)

# Train a Word2Vec model using the tokenized data
word2vec_model = gensim.models.Word2Vec(sentences=dataset['tokens'], vector_size=100, window=5, min_count=1, workers=4)

# Function to average Word2Vec vectors for each document
def get_document_vector(tokens):
    # Filter out words not in the Word2Vec vocabulary
    valid_tokens = [word for word in tokens if word in word2vec_model.wv]
    if len(valid_tokens) == 0:
        return [0] * word2vec_model.vector_size  # Return a zero vector if no valid tokens
    # Average the Word2Vec vectors of the words in the document
    vectors = [word2vec_model.wv[word] for word in valid_tokens]
    return list(np.mean(vectors, axis=0))

# Convert the text data into document vectors
X = dataset['tokens'].apply(get_document_vector)

# Prepare the target variable
y = dataset['mental_health_issue']

# Split the dataset into Training and Test Sets
X_train, X_test, y_train, y_test = train_test_split(list(X), y, test_size=0.2, random_state=42)

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Scale the training and test data to the range [0, 1]
X_train = scaler.fit_transform(X_train) # Fit the scaler on training data and transform
X_test = scaler.transform(X_test) # Transform the test data using the fitted scaler


# Initialize the Naive Bayes model (MultinomialNB)
NBmodel = MultinomialNB()

# Train the model
NBmodel.fit(X_train, y_train)

# Make predictions on the test set
y_pred = NBmodel.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Accuracy: 60.44%
Classification Report:
               precision    recall  f1-score   support

     anxiety       0.55      0.03      0.06       379
     bipolar       0.00      0.00      0.00       384
  depression       0.44      0.19      0.26       373
      normal       0.61      0.99      0.76      2183
        ptsd       0.33      0.00      0.01       394

    accuracy                           0.60      3713
   macro avg       0.39      0.24      0.22      3713
weighted avg       0.50      0.60      0.48      3713

Confusion Matrix:
 [[  11    0   25  343    0]
 [   6    0   26  351    1]
 [   1    0   71  301    0]
 [   0    0   21 2161    1]
 [   2    0   20  371    1]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## N-GRAM (N=3)

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the preprocessed dataset
dataset = pd.read_csv('preprocessed_mental_health.csv')

# Check if 'cleaned_text' and 'mental_health_issue' columns exist
if 'cleaned_text' not in dataset.columns or 'mental_health_issue' not in dataset.columns:
    raise ValueError("The dataset must have 'cleaned_text' and 'mental_health_issue' columns.")

# Remove rows with missing values in 'cleaned_text' column
dataset.dropna(subset=['cleaned_text'], inplace=True)

# Initialize the CountVectorizer with n-grams (e.g., bi-grams)
vectorizer = CountVectorizer(ngram_range=(1, 3))  # Change (1, 2) to (1, 3) for tri-grams or higher-order n-grams

# Fit and transform the cleaned text data
X = vectorizer.fit_transform(dataset['cleaned_text'])

# Prepare the target variable
y = dataset['mental_health_issue']

# Split the dataset into Training and Test Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Naive Bayes model (MultinomialNB)
NBmodel = MultinomialNB()

# Train the model
NBmodel.fit(X_train, y_train)

# Make predictions on the test set
y_pred = NBmodel.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 81.71%
Classification Report:
               precision    recall  f1-score   support

     anxiety       0.72      0.68      0.70       379
     bipolar       0.94      0.23      0.37       384
  depression       0.54      0.89      0.67       373
      normal       0.95      0.93      0.94      2183
        ptsd       0.64      0.84      0.72       394

    accuracy                           0.82      3713
   macro avg       0.76      0.71      0.68      3713
weighted avg       0.85      0.82      0.81      3713

Confusion Matrix:
 [[ 257    0   72    4   46]
 [  36   89  104   92   63]
 [  22    1  331    0   19]
 [  27    3   66 2026   61]
 [  14    2   44    3  331]]


# SVM

## TFIDF

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the preprocessed dataset
dataset = pd.read_csv('preprocessed_mental_health.csv')

# Check if 'cleaned_text' and 'mental_health_issue' columns exist
if 'cleaned_text' not in dataset.columns or 'mental_health_issue' not in dataset.columns:
    raise ValueError("The dataset must have 'cleaned_text' and 'mental_health_issue' columns.")

# Remove rows with missing values in 'cleaned_text' column
dataset.dropna(subset=['cleaned_text'], inplace=True)

# Initialize the TfidfVectorizer and fit/transform the cleaned text
TFIDFvectorizer = TfidfVectorizer()
X = TFIDFvectorizer.fit_transform(dataset['cleaned_text'])

# Prepare the target variable
y = dataset['mental_health_issue']

# Split the dataset into Training and Test Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the SVM model
SVMmodel = SVC(kernel='linear', random_state=42)  # You can experiment with other kernels like 'rbf'

# Train the model
SVMmodel.fit(X_train, y_train)

# Make predictions on the test set
y_pred = SVMmodel.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 88.26%
Classification Report:
               precision    recall  f1-score   support

     anxiety       0.81      0.78      0.80       379
     bipolar       0.76      0.61      0.68       384
  depression       0.74      0.77      0.76       373
      normal       0.93      0.99      0.96      2183
        ptsd       0.90      0.76      0.82       394

    accuracy                           0.88      3713
   macro avg       0.83      0.78      0.80      3713
weighted avg       0.88      0.88      0.88      3713

Confusion Matrix:
 [[ 295   18   32   23   11]
 [   8  236   31  101    8]
 [  30   30  286   14   13]
 [   2   10    7 2162    2]
 [  27   16   28   25  298]]


## LIWC

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from empath import Empath

# Initialize Empath
lexicon = Empath()

# Load the preprocessed dataset
dataset = pd.read_csv('preprocessed_mental_health.csv')

# Check if 'cleaned_text' and 'mental_health_issue' columns exist
if 'cleaned_text' not in dataset.columns or 'mental_health_issue' not in dataset.columns:
    raise ValueError("The dataset must have 'cleaned_text' and 'mental_health_issue' columns.")

# Remove rows with missing values in 'cleaned_text' column
dataset.dropna(subset=['cleaned_text'], inplace=True)

# Function to get Empath features
def get_empath_features(text):
    analysis = lexicon.analyze(text, normalize=True)  # Normalize to get proportions
    return analysis

# Generate Empath features for each text
empath_features = dataset['cleaned_text'].apply(get_empath_features)

# Convert Empath features to a DataFrame
X = pd.DataFrame(empath_features.tolist())

# Prepare the target variable
y = dataset['mental_health_issue']

# Split the dataset into Training and Test Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Support Vector Classifier (SVC) model
SVCmodel = SVC(kernel='linear', random_state=42)  # You can try other kernels like 'rbf' as well

# Train the model
SVCmodel.fit(X_train, y_train)

# Make predictions on the test set
y_pred = SVCmodel.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 68.14%
Classification Report:
               precision    recall  f1-score   support

     anxiety       0.69      0.56      0.62       379
     bipolar       0.41      0.08      0.13       384
  depression       0.56      0.30      0.39       373
      normal       0.70      0.97      0.81      2183
        ptsd       0.56      0.15      0.24       394

    accuracy                           0.68      3713
   macro avg       0.58      0.41      0.44      3713
weighted avg       0.64      0.68      0.62      3713

Confusion Matrix:
 [[ 214    6   12  134   13]
 [  23   30   38  277   16]
 [  23   15  112  215    8]
 [  24   15   19 2114   11]
 [  28    7   19  280   60]]


## WORD2VEC

In [21]:
!pip install scikit-learn
import pandas as pd
import gensim
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.tokenize import word_tokenize
import nltk
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Download NLTK punkt tokenizer if not already downloaded
nltk.download('punkt')
nltk.download('punkt_tab')

# Load the preprocessed dataset
dataset = pd.read_csv('preprocessed_mental_health.csv')

# Check if 'cleaned_text' and 'mental_health_issue' columns exist
if 'cleaned_text' not in dataset.columns or 'mental_health_issue' not in dataset.columns:
    raise ValueError("The dataset must have 'cleaned_text' and 'mental_health_issue' columns.")

# Remove rows with missing values in 'cleaned_text' column
dataset.dropna(subset=['cleaned_text'], inplace=True)

# Tokenize the text data into words
def tokenize_text(text):
    return word_tokenize(text.lower())

dataset['tokens'] = dataset['cleaned_text'].apply(tokenize_text)

# Train a Word2Vec model using the tokenized data
word2vec_model = gensim.models.Word2Vec(sentences=dataset['tokens'], vector_size=100, window=5, min_count=1, workers=4)

# Function to average Word2Vec vectors for each document
def get_document_vector(tokens):
    # Filter out words not in the Word2Vec vocabulary
    valid_tokens = [word for word in tokens if word in word2vec_model.wv]
    if len(valid_tokens) == 0:
        return [0] * word2vec_model.vector_size  # Return a zero vector if no valid tokens
    # Average the Word2Vec vectors of the words in the document
    vectors = [word2vec_model.wv[word] for word in valid_tokens]
    return list(np.mean(vectors, axis=0))

# Convert the text data into document vectors
X = dataset['tokens'].apply(get_document_vector)

# Prepare the target variable
y = dataset['mental_health_issue']

# Split the dataset into Training and Test Sets
X_train, X_test, y_train, y_test = train_test_split(list(X), y, test_size=0.2, random_state=42)

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Scale the training and test data to the range [0, 1]
X_train = scaler.fit_transform(X_train)  # Fit the scaler on training data and transform
X_test = scaler.transform(X_test)  # Transform the test data using the fitted scaler

# Initialize the Support Vector Classifier (SVC) model
SVCmodel = SVC(kernel='linear', random_state=42)  # You can also try 'rbf' kernel for better performance

# Train the model
SVCmodel.fit(X_train, y_train)

# Make predictions on the test set
y_pred = SVCmodel.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Accuracy: 77.89%
Classification Report:
               precision    recall  f1-score   support

     anxiety       0.68      0.59      0.63       379
     bipolar       0.55      0.37      0.44       384
  depression       0.56      0.68      0.62       373
      normal       0.89      0.95      0.92      2183
        ptsd       0.57      0.51      0.54       394

    accuracy                           0.78      3713
   macro avg       0.65      0.62      0.63      3713
weighted avg       0.77      0.78      0.77      3713

Confusion Matrix:
 [[ 222   39   50   39   29]
 [  37  143   50  102   52]
 [  22   36  255   29   31]
 [  17   20   38 2070   38]
 [  30   23   59   80  202]]


## N-GRAM (N=3)

In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the preprocessed dataset
dataset = pd.read_csv('preprocessed_mental_health.csv')

# Check if 'cleaned_text' and 'mental_health_issue' columns exist
if 'cleaned_text' not in dataset.columns or 'mental_health_issue' not in dataset.columns:
    raise ValueError("The dataset must have 'cleaned_text' and 'mental_health_issue' columns.")

# Remove rows with missing values in 'cleaned_text' column
dataset.dropna(subset=['cleaned_text'], inplace=True)

# Initialize the CountVectorizer with n-grams (e.g., bi-grams)
vectorizer = CountVectorizer(ngram_range=(1, 3))  # Change (1, 2) to (1, 3) for tri-grams or higher-order n-grams

# Fit and transform the cleaned text data
X = vectorizer.fit_transform(dataset['cleaned_text'])

# Prepare the target variable
y = dataset['mental_health_issue']

# Split the dataset into Training and Test Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Support Vector Classifier (SVC) model
SVCmodel = SVC(kernel='linear', random_state=42)  # You can also try 'rbf' kernel for better performance

# Train the model
SVCmodel.fit(X_train, y_train)

# Make predictions on the test set
y_pred = SVCmodel.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 84.33%
Classification Report:
               precision    recall  f1-score   support

     anxiety       0.74      0.74      0.74       379
     bipolar       0.58      0.56      0.57       384
  depression       0.71      0.73      0.72       373
      normal       0.93      0.95      0.94      2183
        ptsd       0.83      0.73      0.77       394

    accuracy                           0.84      3713
   macro avg       0.76      0.74      0.75      3713
weighted avg       0.84      0.84      0.84      3713

Confusion Matrix:
 [[ 279   26   33   24   17]
 [  19  215   39   98   13]
 [  31   34  272   16   20]
 [  17   70    8 2079    9]
 [  30   25   33   20  286]]


# KNN

## TFIDF

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the preprocessed dataset
dataset = pd.read_csv('preprocessed_mental_health.csv')

# Check if 'cleaned_text' and 'mental_health_issue' columns exist
if 'cleaned_text' not in dataset.columns or 'mental_health_issue' not in dataset.columns:
    raise ValueError("The dataset must have 'cleaned_text' and 'mental_health_issue' columns.")

# Remove rows with missing values in 'cleaned_text' column
dataset.dropna(subset=['cleaned_text'], inplace=True)

# Initialize the TfidfVectorizer and fit/transform the cleaned text
TFIDFvectorizer = TfidfVectorizer()
X = TFIDFvectorizer.fit_transform(dataset['cleaned_text'])

# Prepare the target variable
y = dataset['mental_health_issue']

# Split the dataset into Training and Test Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the KNN model
KNNmodel = KNeighborsClassifier(n_neighbors=5)  # You can experiment with different values for 'n_neighbors'

# Train the model
KNNmodel.fit(X_train, y_train)

# Make predictions on the test set
y_pred = KNNmodel.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 75.06%
Classification Report:
               precision    recall  f1-score   support

     anxiety       0.68      0.54      0.60       379
     bipolar       0.45      0.53      0.48       384
  depression       0.55      0.54      0.54       373
      normal       0.84      0.95      0.89      2183
        ptsd       0.82      0.26      0.39       394

    accuracy                           0.75      3713
   macro avg       0.67      0.56      0.58      3713
weighted avg       0.75      0.75      0.73      3713

Confusion Matrix:
 [[ 204   41   40   89    5]
 [  23  202   37  120    2]
 [  20   70  201   69   13]
 [  14   78   11 2078    2]
 [  38   58   76  120  102]]


## LIWC

In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from empath import Empath

# Initialize Empath
lexicon = Empath()

# Load the preprocessed dataset
dataset = pd.read_csv('preprocessed_mental_health.csv')

# Check if 'cleaned_text' and 'mental_health_issue' columns exist
if 'cleaned_text' not in dataset.columns or 'mental_health_issue' not in dataset.columns:
    raise ValueError("The dataset must have 'cleaned_text' and 'mental_health_issue' columns.")

# Remove rows with missing values in 'cleaned_text' column
dataset.dropna(subset=['cleaned_text'], inplace=True)

# Function to get Empath features
def get_empath_features(text):
    analysis = lexicon.analyze(text, normalize=True)  # Normalize to get proportions
    return analysis

# Generate Empath features for each text
empath_features = dataset['cleaned_text'].apply(get_empath_features)

# Convert Empath features to a DataFrame
X = pd.DataFrame(empath_features.tolist())

# Prepare the target variable
y = dataset['mental_health_issue']

# Split the dataset into Training and Test Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the KNN model
KNNmodel = KNeighborsClassifier(n_neighbors=5)  # You can experiment with different values for 'n_neighbors'

# Train the model
KNNmodel.fit(X_train, y_train)

# Make predictions on the test set
y_pred = KNNmodel.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 70.70%
Classification Report:
               precision    recall  f1-score   support

     anxiety       0.54      0.59      0.56       379
     bipolar       0.31      0.23      0.26       384
  depression       0.44      0.47      0.45       373
      normal       0.86      0.91      0.88      2183
        ptsd       0.50      0.36      0.42       394

    accuracy                           0.71      3713
   macro avg       0.53      0.51      0.52      3713
weighted avg       0.69      0.71      0.70      3713

Confusion Matrix:
 [[ 225   33   47   48   26]
 [  43   89   48  170   34]
 [  52   64  175   42   40]
 [  33   63   53 1995   39]
 [  66   39   76   72  141]]


## WORD2VEC

In [40]:

import pandas as pd
import gensim
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.tokenize import word_tokenize
import nltk
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Download NLTK punkt tokenizer if not already downloaded
nltk.download('punkt')
nltk.download('punkt_tab')

# Load the preprocessed dataset
dataset = pd.read_csv('preprocessed_mental_health.csv')

# Check if 'cleaned_text' and 'mental_health_issue' columns exist
if 'cleaned_text' not in dataset.columns or 'mental_health_issue' not in dataset.columns:
    raise ValueError("The dataset must have 'cleaned_text' and 'mental_health_issue' columns.")

# Remove rows with missing values in 'cleaned_text' column
dataset.dropna(subset=['cleaned_text'], inplace=True)

# Tokenize the text data into words
def tokenize_text(text):
    return word_tokenize(text.lower())

dataset['tokens'] = dataset['cleaned_text'].apply(tokenize_text)

# Train a Word2Vec model using the tokenized data
word2vec_model = gensim.models.Word2Vec(sentences=dataset['tokens'], vector_size=100, window=5, min_count=1, workers=4)

# Function to average Word2Vec vectors for each document
def get_document_vector(tokens):
    # Filter out words not in the Word2Vec vocabulary
    valid_tokens = [word for word in tokens if word in word2vec_model.wv]
    if len(valid_tokens) == 0:
        return [0] * word2vec_model.vector_size  # Return a zero vector if no valid tokens
    # Average the Word2Vec vectors of the words in the document
    vectors = [word2vec_model.wv[word] for word in valid_tokens]
    return list(np.mean(vectors, axis=0))

# Convert the text data into document vectors
X = dataset['tokens'].apply(get_document_vector)

# Prepare the target variable
y = dataset['mental_health_issue']

# Split the dataset into Training and Test Sets
X_train, X_test, y_train, y_test = train_test_split(list(X), y, test_size=0.2, random_state=42)

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Scale the training and test data to the range [0, 1]
X_train = scaler.fit_transform(X_train)  # Fit the scaler on training data and transform
X_test = scaler.transform(X_test)  # Transform the test data using the fitted scaler

# Initialize the KNN model
KNNmodel = KNeighborsClassifier(n_neighbors=5)  # You can experiment with different values for 'n_neighbors'

# Train the model
KNNmodel.fit(X_train, y_train)

# Make predictions on the test set
y_pred = KNNmodel.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Accuracy: 75.52%
Classification Report:
               precision    recall  f1-score   support

     anxiety       0.52      0.60      0.56       379
     bipolar       0.46      0.36      0.41       384
  depression       0.47      0.59      0.52       373
      normal       0.94      0.93      0.93      2183
        ptsd       0.54      0.47      0.50       394

    accuracy                           0.76      3713
   macro avg       0.59      0.59      0.58      3713
weighted avg       0.76      0.76      0.75      3713

Confusion Matrix:
 [[ 226   33   63   18   39]
 [  59  138   53   85   49]
 [  67   40  221    6   39]
 [  24   38   55 2035   31]
 [  57   48   77   28  184]]


In [41]:
import pickle

# Save the trained KNN model as a pickle file
with open('KNNmodel.pkl', 'wb') as model_file:
    pickle.dump(KNNmodel, model_file)

# Save the fitted scaler as a pickle file
with open('KNNscaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

print("Model and scaler have been saved as 'KNNmodel.pkl' and 'scaler.pkl' respectively.")


Model and scaler have been saved as 'KNNmodel.pkl' and 'scaler.pkl' respectively.


## N-GRAM (N=3)

In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the preprocessed dataset
dataset = pd.read_csv('preprocessed_mental_health.csv')

# Check if 'cleaned_text' and 'mental_health_issue' columns exist
if 'cleaned_text' not in dataset.columns or 'mental_health_issue' not in dataset.columns:
    raise ValueError("The dataset must have 'cleaned_text' and 'mental_health_issue' columns.")

# Remove rows with missing values in 'cleaned_text' column
dataset.dropna(subset=['cleaned_text'], inplace=True)

# Initialize the CountVectorizer with n-grams (e.g., bi-grams)
vectorizer = CountVectorizer(ngram_range=(1, 3))  # Change (1, 2) to (1, 3) for tri-grams or higher-order n-grams

# Fit and transform the cleaned text data
X = vectorizer.fit_transform(dataset['cleaned_text'])

# Prepare the target variable
y = dataset['mental_health_issue']

# Split the dataset into Training and Test Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the KNN model
KNNmodel = KNeighborsClassifier(n_neighbors=5)  # You can experiment with different values of 'n_neighbors'

# Train the model
KNNmodel.fit(X_train, y_train)

# Make predictions on the test set
y_pred = KNNmodel.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 43.06%
Classification Report:
               precision    recall  f1-score   support

     anxiety       0.59      0.12      0.20       379
     bipolar       0.13      0.60      0.22       384
  depression       0.57      0.10      0.18       373
      normal       0.70      0.59      0.64      2183
        ptsd       1.00      0.01      0.02       394

    accuracy                           0.43      3713
   macro avg       0.60      0.29      0.25      3713
weighted avg       0.65      0.43      0.44      3713

Confusion Matrix:
 [[  47  191    6  135    0]
 [   4  231    7  142    0]
 [  10  209   39  115    0]
 [   4  900    1 1278    0]
 [  15  197   16  162    4]]


# RANDOM FOREST

## TFIDF

In [45]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the preprocessed dataset
dataset = pd.read_csv('preprocessed_mental_health.csv')

# Check if 'cleaned_text' and 'mental_health_issue' columns exist
if 'cleaned_text' not in dataset.columns or 'mental_health_issue' not in dataset.columns:
    raise ValueError("The dataset must have 'cleaned_text' and 'mental_health_issue' columns.")

# Remove rows with missing values in 'cleaned_text' column
dataset.dropna(subset=['cleaned_text'], inplace=True)

# Initialize the TfidfVectorizer and fit/transform the cleaned text
TFIDFvectorizer = TfidfVectorizer()
X = TFIDFvectorizer.fit_transform(dataset['cleaned_text'])

# Prepare the target variable
y = dataset['mental_health_issue']

# Split the dataset into Training and Test Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest model with the provided hyperparameters
RFmodel = RandomForestClassifier(
    max_depth=None,
    min_samples_split=20,
    min_samples_leaf=1,
    max_features='sqrt',
    bootstrap=False,
    random_state=42
)

# Train the model
RFmodel.fit(X_train, y_train)

# Make predictions on the test set
y_pred = RFmodel.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 85.73%
Classification Report:
               precision    recall  f1-score   support

     anxiety       0.79      0.70      0.74       379
     bipolar       0.93      0.49      0.64       384
  depression       0.71      0.75      0.73       373
      normal       0.88      0.99      0.93      2183
        ptsd       0.87      0.72      0.79       394

    accuracy                           0.86      3713
   macro avg       0.84      0.73      0.77      3713
weighted avg       0.86      0.86      0.85      3713

Confusion Matrix:
 [[ 264    1   34   66   14]
 [  19  189   42  124   10]
 [  27    3  280   45   18]
 [   2   10    3 2168    0]
 [  21    1   35   55  282]]


In [48]:
import pickle

# Save the trained Random Forest model as a pickle file
with open('RFmodel.pkl', 'wb') as model_file:
    pickle.dump(RFmodel, model_file)

# Save the fitted TfidfVectorizer as a pickle file
with open('RFvectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(TFIDFvectorizer, vectorizer_file)

print("Random Forest model and TfidfVectorizer have been saved as 'RFmodel.pkl' and 'TFIDFvectorizer.pkl' respectively.")


Random Forest model and TfidfVectorizer have been saved as 'RFmodel.pkl' and 'TFIDFvectorizer.pkl' respectively.


## LIWC

In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from empath import Empath

# Initialize Empath
lexicon = Empath()

# Load the preprocessed dataset
dataset = pd.read_csv('preprocessed_mental_health.csv')

# Check if 'cleaned_text' and 'mental_health_issue' columns exist
if 'cleaned_text' not in dataset.columns or 'mental_health_issue' not in dataset.columns:
    raise ValueError("The dataset must have 'cleaned_text' and 'mental_health_issue' columns.")

# Remove rows with missing values in 'cleaned_text' column
dataset.dropna(subset=['cleaned_text'], inplace=True)

# Function to get Empath features
def get_empath_features(text):
    analysis = lexicon.analyze(text, normalize=True)  # Normalize to get proportions
    return analysis

# Generate Empath features for each text
empath_features = dataset['cleaned_text'].apply(get_empath_features)

# Convert Empath features to a DataFrame
X = pd.DataFrame(empath_features.tolist())

# Prepare the target variable
y = dataset['mental_health_issue']

# Split the dataset into Training and Test Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest model with provided parameters
RFmodel = RandomForestClassifier(
    max_depth=None,
    min_samples_split=20,
    min_samples_leaf=1,
    max_features='sqrt',
    bootstrap=False,
    random_state=42
)

# Train the model
RFmodel.fit(X_train, y_train)

# Make predictions on the test set
y_pred = RFmodel.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 76.73%
Classification Report:
               precision    recall  f1-score   support

     anxiety       0.66      0.67      0.66       379
     bipolar       0.47      0.14      0.22       384
  depression       0.59      0.64      0.62       373
      normal       0.84      0.96      0.90      2183
        ptsd       0.63      0.51      0.56       394

    accuracy                           0.77      3713
   macro avg       0.64      0.58      0.59      3713
weighted avg       0.74      0.77      0.74      3713

Confusion Matrix:
 [[ 253    3   28   66   29]
 [  36   54   60  187   47]
 [  37   10  239   57   30]
 [   9   42   19 2103   10]
 [  51    7   57   79  200]]


## WORD2VEC

In [29]:
import pandas as pd
import gensim
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.tokenize import word_tokenize
import nltk
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Download NLTK punkt tokenizer if not already downloaded
nltk.download('punkt')
nltk.download('punkt_tab')

# Load the preprocessed dataset
dataset = pd.read_csv('preprocessed_mental_health.csv')

# Check if 'cleaned_text' and 'mental_health_issue' columns exist
if 'cleaned_text' not in dataset.columns or 'mental_health_issue' not in dataset.columns:
    raise ValueError("The dataset must have 'cleaned_text' and 'mental_health_issue' columns.")

# Remove rows with missing values in 'cleaned_text' column
dataset.dropna(subset=['cleaned_text'], inplace=True)

# Tokenize the text data into words
def tokenize_text(text):
    return word_tokenize(text.lower())

dataset['tokens'] = dataset['cleaned_text'].apply(tokenize_text)

# Train a Word2Vec model using the tokenized data
word2vec_model = gensim.models.Word2Vec(sentences=dataset['tokens'], vector_size=100, window=5, min_count=1, workers=4)

# Function to average Word2Vec vectors for each document
def get_document_vector(tokens):
    # Filter out words not in the Word2Vec vocabulary
    valid_tokens = [word for word in tokens if word in word2vec_model.wv]
    if len(valid_tokens) == 0:
        return [0] * word2vec_model.vector_size  # Return a zero vector if no valid tokens
    # Average the Word2Vec vectors of the words in the document
    vectors = [word2vec_model.wv[word] for word in valid_tokens]
    return list(np.mean(vectors, axis=0))

# Convert the text data into document vectors
X = dataset['tokens'].apply(get_document_vector)

# Prepare the target variable
y = dataset['mental_health_issue']

# Split the dataset into Training and Test Sets
X_train, X_test, y_train, y_test = train_test_split(list(X), y, test_size=0.2, random_state=42)

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Scale the training and test data to the range [0, 1]
X_train = scaler.fit_transform(X_train)  # Fit the scaler on training data and transform
X_test = scaler.transform(X_test)  # Transform the test data using the fitted scaler

# Initialize the Random Forest model with the provided parameters
RFmodel = RandomForestClassifier(
    max_depth=None,
    min_samples_split=20,
    min_samples_leaf=1,
    max_features='sqrt',
    bootstrap=False,
    random_state=42
)

# Train the model
RFmodel.fit(X_train, y_train)

# Make predictions on the test set
y_pred = RFmodel.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Accuracy: 79.88%
Classification Report:
               precision    recall  f1-score   support

     anxiety       0.64      0.59      0.61       379
     bipolar       0.58      0.38      0.46       384
  depression       0.59      0.68      0.63       373
      normal       0.91      0.97      0.94      2183
        ptsd       0.64      0.55      0.59       394

    accuracy                           0.80      3713
   macro avg       0.67      0.63      0.65      3713
weighted avg       0.79      0.80      0.79      3713

Confusion Matrix:
 [[ 222   31   55   43   28]
 [  36  145   42  116   45]
 [  38   26  255   18   36]
 [   5   23   14 2126   15]
 [  44   23   65   44  218]]


## N-GRAM (N=3)

In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the preprocessed dataset
dataset = pd.read_csv('preprocessed_mental_health.csv')

# Check if 'cleaned_text' and 'mental_health_issue' columns exist
if 'cleaned_text' not in dataset.columns or 'mental_health_issue' not in dataset.columns:
    raise ValueError("The dataset must have 'cleaned_text' and 'mental_health_issue' columns.")

# Remove rows with missing values in 'cleaned_text' column
dataset.dropna(subset=['cleaned_text'], inplace=True)

# Initialize the CountVectorizer with n-grams (e.g., bi-grams)
vectorizer = CountVectorizer(ngram_range=(1, 3))  # Change (1, 2) to (1, 3) for tri-grams or higher-order n-grams

# Fit and transform the cleaned text data
X = vectorizer.fit_transform(dataset['cleaned_text'])

# Prepare the target variable
y = dataset['mental_health_issue']

# Split the dataset into Training and Test Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest model with default parameters or specific ones as needed
RFmodel = RandomForestClassifier(
    max_depth=None,
    min_samples_split=20,
    min_samples_leaf=1,
    max_features='sqrt',
    bootstrap=False,
    random_state=42
)

# Train the model
RFmodel.fit(X_train, y_train)

# Make predictions on the test set
y_pred = RFmodel.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 79.02%
Classification Report:
               precision    recall  f1-score   support

     anxiety       0.77      0.52      0.62       379
     bipolar       0.95      0.33      0.49       384
  depression       0.69      0.63      0.66       373
      normal       0.79      1.00      0.88      2183
        ptsd       0.85      0.50      0.63       394

    accuracy                           0.79      3713
   macro avg       0.81      0.60      0.66      3713
weighted avg       0.80      0.79      0.77      3713

Confusion Matrix:
 [[ 196    1   29  142   11]
 [  15  127   39  188   15]
 [  26    2  236  101    8]
 [   0    3    1 2178    1]
 [  18    1   37  141  197]]


# XGBOOST

## BOW

In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import xgboost as xgb
from sklearn.feature_extraction.text import CountVectorizer

# Load the dataset
data = pd.read_csv('preprocessed_mental_health.csv')

# Separate features and target
X = data['text']
y = data['mental_health_issue']

# Encode target labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text data to numerical data using CountVectorizer
vectorizer = CountVectorizer(max_features=5000, ngram_range=(1, 3))  # Using unigrams, bigrams, and trigrams
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Define the XGBoost classifier
xgb_clf = xgb.XGBClassifier(objective='multi:softmax', num_class=5, eval_metric='mlogloss', use_label_encoder=False)

# Train the model
xgb_clf.fit(X_train, y_train)

# Predict on the test set
y_pred = xgb_clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)

print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", report)

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Parameters: { "use_label_encoder" } are not used.



Accuracy: 87.42%
Classification Report:
               precision    recall  f1-score   support

     anxiety       0.83      0.74      0.78       403
     bipolar       0.78      0.60      0.68       397
  depression       0.74      0.79      0.76       387
      normal       0.92      0.99      0.95      2137
        ptsd       0.86      0.76      0.81       396

    accuracy                           0.87      3720
   macro avg       0.83      0.78      0.80      3720
weighted avg       0.87      0.87      0.87      3720

Confusion Matrix:
 [[ 299   24   32   30   18]
 [   9  237   37  104   10]
 [  30   12  307   20   18]
 [   3   17    6 2107    4]
 [  20   13   34   27  302]]


## LISW

In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import xgboost as xgb
from empath import Empath
from sklearn.preprocessing import LabelEncoder # Import LabelEncoder

# Initialize Empath
lexicon = Empath()

# Load the preprocessed dataset
dataset = pd.read_csv('preprocessed_mental_health.csv')

# Check if 'cleaned_text' and 'mental_health_issue' columns exist
if 'cleaned_text' not in dataset.columns or 'mental_health_issue' not in dataset.columns:
    raise ValueError("The dataset must have 'cleaned_text' and 'mental_health_issue' columns.")

# Remove rows with missing values in 'cleaned_text' column
dataset.dropna(subset=['cleaned_text'], inplace=True)

# Function to get Empath features
def get_empath_features(text):
    analysis = lexicon.analyze(text, normalize=True)  # Normalize to get proportions
    return analysis

# Generate Empath features for each text
empath_features = dataset['cleaned_text'].apply(get_empath_features)

# Convert Empath features to a DataFrame
X = pd.DataFrame(empath_features.tolist())

# Prepare the target variable
y = dataset['mental_health_issue']

# Encode target labels using LabelEncoder
label_encoder = LabelEncoder() # Initialize LabelEncoder
y = label_encoder.fit_transform(y) # Fit and transform the target variable

# Split the dataset into Training and Test Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the XGBoost model
xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=5,  # Adjust the number of classes based on your dataset
    eval_metric='mlogloss',
    use_label_encoder=False,
    random_state=42
)

# Train the model
xgb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = xgb_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Print classification report
# Use label_encoder.classes_ to get the original labels for the report
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Parameters: { "use_label_encoder" } are not used.



Accuracy: 78.56%
Classification Report:
               precision    recall  f1-score   support

     anxiety       0.68      0.64      0.66       379
     bipolar       0.47      0.23      0.31       384
  depression       0.63      0.62      0.63       373
      normal       0.87      0.97      0.92      2183
        ptsd       0.65      0.58      0.61       394

    accuracy                           0.79      3713
   macro avg       0.66      0.61      0.62      3713
weighted avg       0.76      0.79      0.77      3713

Confusion Matrix:
 [[ 241   15   32   51   40]
 [  30   90   52  174   38]
 [  31   36  233   34   39]
 [   9   27   14 2125    8]
 [  45   24   40   57  228]]


## WORD2VEC

In [36]:
import pandas as pd
import gensim
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.tokenize import word_tokenize
import nltk
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder # Import LabelEncoder

# Download NLTK punkt tokenizer if not already downloaded
nltk.download('punkt')
nltk.download('punkt_tab')

# Load the preprocessed dataset
dataset = pd.read_csv('preprocessed_mental_health.csv')

# Check if 'cleaned_text' and 'mental_health_issue' columns exist
if 'cleaned_text' not in dataset.columns or 'mental_health_issue' not in dataset.columns:
    raise ValueError("The dataset must have 'cleaned_text' and 'mental_health_issue' columns.")

# Remove rows with missing values in 'cleaned_text' column
dataset.dropna(subset=['cleaned_text'], inplace=True)

# Tokenize the text data into words
def tokenize_text(text):
    return word_tokenize(text.lower())

dataset['tokens'] = dataset['cleaned_text'].apply(tokenize_text)

# Train a Word2Vec model using the tokenized data
word2vec_model = gensim.models.Word2Vec(sentences=dataset['tokens'], vector_size=100, window=5, min_count=1, workers=4)

# Function to average Word2Vec vectors for each document
def get_document_vector(tokens):
    # Filter out words not in the Word2Vec vocabulary
    valid_tokens = [word for word in tokens if word in word2vec_model.wv]
    if len(valid_tokens) == 0:
        return [0] * word2vec_model.vector_size  # Return a zero vector if no valid tokens
    # Average the Word2Vec vectors of the words in the document
    vectors = [word2vec_model.wv[word] for word in valid_tokens]
    return list(np.mean(vectors, axis=0))

# Convert the text data into document vectors
X = dataset['tokens'].apply(get_document_vector)

# Prepare the target variable
y = dataset['mental_health_issue']

# Encode target labels using LabelEncoder
label_encoder = LabelEncoder() # Initialize LabelEncoder
y = label_encoder.fit_transform(y) # Fit and transform the target variable

# Split the dataset into Training and Test Sets
X_train, X_test, y_train, y_test = train_test_split(list(X), y, test_size=0.2, random_state=42)

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Scale the training and test data to the range [0, 1]
X_train = scaler.fit_transform(X_train)  # Fit the scaler on training data and transform
X_test = scaler.transform(X_test)  # Transform the test data using the fitted scaler

# Initialize the XGBoost model
xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=5,  # Adjust the number of classes based on your dataset
    eval_metric='mlogloss',
    use_label_encoder=False,
    random_state=42
)

# Train the model
xgb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = xgb_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
Parameters: { "use_label_encoder" } are not used.



Accuracy: 81.01%
Classification Report:
               precision    recall  f1-score   support

           0       0.69      0.65      0.67       379
           1       0.55      0.46      0.50       384
           2       0.62      0.69      0.65       373
           3       0.93      0.96      0.94      2183
           4       0.65      0.58      0.62       394

    accuracy                           0.81      3713
   macro avg       0.69      0.67      0.68      3713
weighted avg       0.80      0.81      0.81      3713

Confusion Matrix:
 [[ 245   31   42   28   33]
 [  29  176   36   98   45]
 [  37   39  257   10   30]
 [   9   40   19 2100   15]
 [  34   35   61   34  230]]


## N-GRAM (N=3)

In [42]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder # Import LabelEncoder

# Load the preprocessed dataset
dataset = pd.read_csv('preprocessed_mental_health.csv')

# Check if 'cleaned_text' and 'mental_health_issue' columns exist
if 'cleaned_text' not in dataset.columns or 'mental_health_issue' not in dataset.columns:
    raise ValueError("The dataset must have 'cleaned_text' and 'mental_health_issue' columns.")

# Remove rows with missing values in 'cleaned_text' column
dataset.dropna(subset=['cleaned_text'], inplace=True)

# Initialize the CountVectorizer with n-grams (e.g., bi-grams)
vectorizer = CountVectorizer(ngram_range=(1, 3))  # Change (1, 2) to (1, 3) for tri-grams or higher-order n-grams

# Fit and transform the cleaned text data
X = vectorizer.fit_transform(dataset['cleaned_text'])

# Prepare the target variable
y = dataset['mental_health_issue']

# Encode target labels using LabelEncoder
label_encoder = LabelEncoder() # Initialize LabelEncoder
y = label_encoder.fit_transform(y) # Fit and transform the target variable

# Split the dataset into Training and Test Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the XGBoost model
xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',  # Multi-class classification
    num_class=len(label_encoder.classes_),  # Number of unique classes # use label_encoder.classes_ to determine num_class
    eval_metric='mlogloss',  # Multi-class log loss evaluation metric
    use_label_encoder=False,  # To suppress warnings
    random_state=42
)

# Train the model
xgb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = xgb_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Print classification report
# Use label_encoder.classes_ to get the original labels for the report
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Parameters: { "use_label_encoder" } are not used.



Accuracy: 87.96%
Classification Report:
               precision    recall  f1-score   support

     anxiety       0.85      0.75      0.80       379
     bipolar       0.81      0.52      0.64       384
  depression       0.76      0.80      0.78       373
      normal       0.91      0.99      0.95      2183
        ptsd       0.90      0.79      0.84       394

    accuracy                           0.88      3713
   macro avg       0.84      0.77      0.80      3713
weighted avg       0.88      0.88      0.87      3713

Confusion Matrix:
 [[ 285   15   28   37   14]
 [   8  201   31  139    5]
 [  26   20  299   15   13]
 [   2    3    7 2168    3]
 [  16   10   28   27  313]]


In [44]:
import pickle

# Save the trained XGBoost model as a pickle file
with open('XGBmodel.pkl', 'wb') as model_file:
    pickle.dump(xgb_model, model_file)

# Save the fitted CountVectorizer as a pickle file
with open('XGBvectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

# Save the fitted LabelEncoder as a pickle file
with open('XGBlabel_encoder.pkl', 'wb') as label_encoder_file:
    pickle.dump(label_encoder, label_encoder_file)

print("XGBoost model, CountVectorizer, and LabelEncoder have been saved as 'xgb_model.pkl', 'vectorizer.pkl', and 'label_encoder.pkl' respectively.")


XGBoost model, CountVectorizer, and LabelEncoder have been saved as 'xgb_model.pkl', 'vectorizer.pkl', and 'label_encoder.pkl' respectively.
