In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import csv
warnings.filterwarnings('ignore')

In [2]:
# df = pd.read_csv('samh.csv');df.head()
df = pd.read_csv('samh.csv', quoting=csv.QUOTE_NONE, on_bad_lines='skip')
df.head()

Unnamed: 0.1,Unnamed: 0,statement,status
0,0,oh my gosh,Anxiety
1,3,I've shifted my focus to something else but I'...,Anxiety
2,7,Have you ever felt nervous but didn't know why?,Anxiety
3,13,Because this worry is you.,Anxiety
4,14,Sometimes it's your own thoughts that make you...,Anxiety


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8627 entries, 0 to 8626
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  8627 non-null   int64 
 1   statement   8617 non-null   object
 2   status      8627 non-null   object
dtypes: int64(1), object(2)
memory usage: 202.3+ KB


In [4]:
df.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
statement,10
status,0


In [5]:
df.status.unique()

array(['Anxiety', 'Normal', 'Depression', 'Suicidal'], dtype=object)

In [6]:
mode_status = df['statement'].mode()[0]
df['statement'].fillna(mode_status, inplace=True)
print(df.isnull().sum())

Unnamed: 0    0
statement     0
status        0
dtype: int64


In [7]:
# NLP Processing
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
# Text preprocessing function
def clean_text(text):
    text = text.lower() # lowercase
    text = "".join([char for char in text if char not in string.punctuation]) # remove punctuation
    text = " ".join([word for word in text.split() if word not in stopwords.words('english')]) # remove stopwords
    lemmatizer = WordNetLemmatizer() # lemmatize the words
    text = " ".join([lemmatizer.lemmatize(word) for word in text.split()])
    return text

In [9]:
df['clean_statement'] = df['statement'].apply(clean_text)

In [10]:
# Example: TF-IDF Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['clean_statement'])
y = df['status']

In [11]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Models to evaluate
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'Naive Bayes': MultinomialNB()
}

In [14]:
# Hyperparameter tuning and cross-validation
best_model = None
best_accuracy = 0

for name, model in models.items():
    print(f"Evaluating {name}...")

    # Example hyperparameter grid (adjust based on the model)
    if name == 'Logistic Regression':
        param_grid = {'C': [0.1, 1, 10]}
    elif name == 'Random Forest':
        param_grid = {'n_estimators': [50, 100, 200]}
    elif name == 'SVM':
        param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
    else:  # Naive Bayes
        param_grid = {}

    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    y_pred = grid_search.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Accuracy: {accuracy}")
    print(classification_report(y_test, y_pred))


    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = grid_search.best_estimator_

# The line containing 'z' has been removed as it serves no purpose
print(f"\nBest performing model: {type(best_model).__name__} with accuracy: {best_accuracy}")

Evaluating Logistic Regression...
Accuracy: 0.779837775202781
              precision    recall  f1-score   support

     Anxiety       0.98      0.68      0.80        69
  Depression       0.68      0.57      0.62       427
      Normal       0.87      0.98      0.92       744
    Suicidal       0.68      0.68      0.68       486

    accuracy                           0.78      1726
   macro avg       0.80      0.73      0.76      1726
weighted avg       0.77      0.78      0.77      1726

Evaluating Random Forest...
Accuracy: 0.7815758980301275
              precision    recall  f1-score   support

     Anxiety       0.97      0.83      0.89        69
  Depression       0.76      0.41      0.53       427
      Normal       0.88      0.99      0.93       744
    Suicidal       0.64      0.79      0.70       486

    accuracy                           0.78      1726
   macro avg       0.81      0.75      0.76      1726
weighted avg       0.78      0.78      0.77      1726

Evaluating 

In [15]:
import joblib
# Save the best model
model_filename = 'best_model.joblib'
joblib.dump(best_model, model_filename)

['best_model.joblib']

In [16]:
# Load the saved model
loaded_model = joblib.load(model_filename)

In [17]:
# Example new sample data (replace with your actual data)
new_statements = ["This is a new statement for prediction.", "Another statement to test the model."]
new_statements_cleaned = [clean_text(text) for text in new_statements]

In [18]:
# Vectorize the new data using the same vectorizer
X_new = vectorizer.transform(new_statements_cleaned)

# Make predictions
new_predictions = loaded_model.predict(X_new)

In [19]:
print("\nPredictions for new statements:")
for statement, prediction in zip(new_statements, new_predictions):
    print(f"Statement: {statement}")
    print(f"Prediction: {prediction}")


Predictions for new statements:
Statement: This is a new statement for prediction.
Prediction: Normal
Statement: Another statement to test the model.
Prediction: Normal
