In [3]:
import pandas as pd

# Load the CSV file
data = pd.read_csv("/content/Impetus2025/Combined Data.csv")

In [4]:
print(data.isnull().sum())

Unnamed: 0      0
statement     362
status          0
dtype: int64


In [5]:
print(data['status'].value_counts())

status
Normal                  16351
Depression              15404
Suicidal                10653
Anxiety                  3888
Bipolar                  2877
Stress                   2669
Personality disorder     1201
Name: count, dtype: int64


In [11]:
import nltk
nltk.download('stopwords')
nltk.download('punkt_tab') # Download the necessary data for tokenization
nltk.download('wordnet') # Download wordnet

import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Check if text is a string before processing
    if isinstance(text, str):
        # Convert to lowercase
        text = text.lower()
        # Remove punctuation and special characters
        text = re.sub(r'[^\w\s]', '', text)
        # Tokenize
        tokens = word_tokenize(text)
        # Remove stopwords and lemmatize
        tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
        return ' '.join(tokens)
    else:
        # Handle non-string values (e.g., NaN)
        return ''  # or any other appropriate handling

data['cleaned_statement'] = data['statement'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed
X = tfidf.fit_transform(data['cleaned_statement']).toarray()
y = data['status']

In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

model = LogisticRegression()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.7595437835799793
Classification Report:
                       precision    recall  f1-score   support

             Anxiety       0.81      0.77      0.79       779
             Bipolar       0.87      0.69      0.77       580
          Depression       0.71      0.72      0.72      3100
              Normal       0.83      0.96      0.89      3327
Personality disorder       0.86      0.43      0.57       248
              Stress       0.73      0.45      0.56       557
            Suicidal       0.66      0.63      0.65      2018

            accuracy                           0.76     10609
           macro avg       0.78      0.66      0.71     10609
        weighted avg       0.76      0.76      0.75     10609



In [16]:
model = LogisticRegression(max_iter=1000)  # Increase max_iter
model.fit(X_train, y_train)

In [17]:
model = LogisticRegression(class_weight='balanced', max_iter=1000)
model.fit(X_train, y_train)

In [18]:
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7383353756244698
Classification Report:
                       precision    recall  f1-score   support

             Anxiety       0.77      0.83      0.80       779
             Bipolar       0.73      0.78      0.75       580
          Depression       0.78      0.60      0.68      3100
              Normal       0.88      0.87      0.87      3327
Personality disorder       0.39      0.69      0.50       248
              Stress       0.48      0.65      0.55       557
            Suicidal       0.63      0.71      0.67      2018

            accuracy                           0.74     10609
           macro avg       0.67      0.73      0.69     10609
        weighted avg       0.76      0.74      0.74     10609



In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
    'solver': ['liblinear', 'lbfgs', 'sag', 'saga'],
    'max_iter': [100, 500, 1000]
}

grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_