In [8]:
import pandas as pd
import numpy as np
import re
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [9]:
# downloading NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\michp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\michp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\michp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
file_path = 'mental_health.csv'

In [11]:
reddit_depression_path = "./reddit_depression"
reddit_non_depression_path = "./reddit_non_depression"
blog_depression_path = "./blogs_depression"
blog_non_depression_path = "./blogs_non_depression"
mixed_depression_path = "./mixed_depression"
mixed_non_depression_path = "./mixed_non_depression"


## read and merge text files from folders

In [12]:

def read_files_from_directory(directory_path, label):
    texts = []
    for filename in os.listdir(directory_path):
        if filename.endswith('.txt'):
            file_path = os.path.join(directory_path, filename)
            with open(file_path, 'r', encoding='latin-1') as file:
                texts.append((file.read(), label))
    return texts

In [14]:
reddit_depression_texts = read_files_from_directory(reddit_depression_path, 1)
reddit_non_depression_texts = read_files_from_directory(reddit_non_depression_path, 0)
blog_depression_texts = read_files_from_directory(blog_depression_path, 1)
blog_non_depression_texts = read_files_from_directory(blog_non_depression_path, 0)
mixed_depression_texts = read_files_from_directory(mixed_depression_path, 1)
mixed_non_depression_texts = read_files_from_directory(mixed_non_depression_path, 0)

all_texts = reddit_depression_texts + reddit_non_depression_texts + blog_depression_texts + blog_non_depression_texts + mixed_depression_texts + mixed_non_depression_texts
texts, labels = zip(*all_texts)

In [15]:

csv_data = pd.read_csv(file_path)
csv_texts = csv_data['text'] 
csv_labels = csv_data['label']  

texts += tuple(csv_texts)
labels += tuple(csv_labels)

In [16]:
def preprocess_text(text):
    text = text.lower()  # convert to lowercase
    text = re.sub(r'[^a-z\s]', '', text)  # remove special characters and numbers
    tokens = word_tokenize(text)  # tokenize
    lemmatizer = WordNetLemmatizer()
    lemmatized_text = [lemmatizer.lemmatize(word) for word in tokens]  # lemmatize the words
    return ' '.join(lemmatized_text)

In [17]:
# preprocessing
processed_texts = [preprocess_text(text) for text in texts]

df = pd.DataFrame({
    'processed_text': processed_texts,
    'label': labels
})

In [18]:
# tf-idf feature extraction
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['processed_text'])
y = df['label']

In [19]:
# splits the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.3, random_state=42)


In [20]:
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [21]:
# evaluates the model
print("model accuracy:", accuracy_score(y_test, y_pred))
print("\nclassification report:\n", classification_report(y_test, y_pred))

model accuracy: 0.9066823667942302

classification report:
               precision    recall  f1-score   support

           0       0.89      0.93      0.91      5108
           1       0.92      0.89      0.90      5083

    accuracy                           0.91     10191
   macro avg       0.91      0.91      0.91     10191
weighted avg       0.91      0.91      0.91     10191



In [22]:
# new text
new_text = ["I'm feeling really stressed about work.", "Today is a great day, I feel so happy!", "I wanna kill myself, I'm so stressed.", "I feel like I have nothing left to live for."]

# preprocess the new tweets
processed_new_text = [preprocess_text(tweet) for tweet in new_text]

# transform the new tweets using the same TF-IDF vectorizer
X_new = vectorizer.transform(processed_new_text)

# predict using the trained model
new_predictions = model.predict(X_new)

# prints the predictions
for tweet, pred in zip(new_text, new_predictions):
    print(f"Text: {tweet}\nPredicted Category: {pred}\n")


Tweet: I'm feeling really stressed about work.
Predicted Category: 0

Tweet: Today is a great day, I feel so happy!
Predicted Category: 0

Tweet: I wanna kill myself, I'm so stressed.
Predicted Category: 1

Tweet: I feel like I have nothing left to live for.
Predicted Category: 1



### takes user input when run and evaluates text based on the trained model

In [33]:
# function to map numerical labels to descriptive categories
def label_to_category(label):
    if label == 0:
        return 'No need for mental health intervention'
    elif label == 1:
        return 'Possible need for mental health intervention'
    else:
        return 'Unknown'

# function to get a prediction for a single tweet or text
def predict_text(model, vectorizer, text):
    processed_text = preprocess_text(text)  # preprocess
    transformed_text = vectorizer.transform([processed_text])  # transform
    prediction = model.predict(transformed_text)  # predict
    return label_to_category(prediction[0])  # convert to category

# user input
user_input = input("Enter a tweet or text: ")
predicted_category = predict_text(model, vectorizer, user_input)
print(f"Predicted Category: {predicted_category}")


Predicted Category: No need for mental health intervention
