In [1]:
import pandas as pd
import numpy as np
import spacy
import string
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import gensim.downloader as api

In [3]:
review = pd.read_csv("Augdata.csv")
review = review.rename(columns={'Text data': 'data'}, inplace=False)

In [4]:
nlp = spacy.load("en_core_web_sm")
stop_words = nlp.Defaults.stop_words
punctuations = string.punctuation

In [5]:
wv = api.load('glove-twitter-100')



In [6]:
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    doc = nlp(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [word.lemma_.lower().strip() for word in doc]

    # Removing stop words and punctuations
    mytokens = [word for word in mytokens if word not in stop_words and word not in punctuations]

    # Return preprocessed list of tokens
    return mytokens

In [7]:
review['data1'] = review['data'].apply(spacy_tokenizer)

In [8]:
# Generate word vectors for each document
review['vec'] = review['data1'].apply(lambda x: np.mean([wv[token] for token in x if token in wv] or [np.zeros(wv.vector_size)], axis=0))

In [9]:
X = np.vstack(review['vec'])  # Convert list of arrays to a matrix
y = review['Label']

# **LogisticRegression**

In [10]:
from sklearn.linear_model import LogisticRegression

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=1)

In [13]:
# Define the model pipeline with Logistic Regression
model_pipeline_lr = Pipeline([
    ('lr', LogisticRegression(max_iter=1000))  # Increase max_iter to 1000 or more
])

# Train the model
model_pipeline_lr.fit(X_train, y_train)

# Predict on the test set
y_pred_lr = model_pipeline_lr.predict(X_test)


In [14]:
# Evaluate the model
accuracy_lr = accuracy_score(y_test, y_pred_lr)
precision_lr = precision_score(y_test, y_pred_lr, average='weighted')
recall_lr = recall_score(y_test, y_pred_lr, average='weighted')
f1_score_lr = f1_score(y_test, y_pred_lr, average='weighted')
classification_report_lr = classification_report(y_test, y_pred_lr)

# Print evaluation metrics
print("Evaluation Metrics for Logistic Regression Model")
print("------------------------------------------------")
print(classification_report_lr)

Evaluation Metrics for Logistic Regression Model
------------------------------------------------
                precision    recall  f1-score   support

      moderate       0.57      0.77      0.65       460
not depression       0.54      0.34      0.41       358
        severe       0.82      0.75      0.78       298

      accuracy                           0.62      1116
     macro avg       0.64      0.62      0.62      1116
  weighted avg       0.63      0.62      0.61      1116



# **RandomForest**

In [15]:
from sklearn.ensemble import RandomForestClassifier

In [16]:
# Define the model pipeline with Logistic Regression
model_pipeline_lr = Pipeline([
    ('lr', RandomForestClassifier())
])

# Train the model
model_pipeline_lr.fit(X_train, y_train)

# Predict on the test set
y_pred_lr = model_pipeline_lr.predict(X_test)

In [18]:
# Evaluate the model
accuracy_lr = accuracy_score(y_test, y_pred_lr)
precision_lr = precision_score(y_test, y_pred_lr, average='weighted')
recall_lr = recall_score(y_test, y_pred_lr, average='weighted')
f1_score_lr = f1_score(y_test, y_pred_lr, average='weighted')
classification_report_lr = classification_report(y_test, y_pred_lr)

# Print evaluation metrics
print("Evaluation Metrics for RandomForest")
print("------------------------------------------------")
print(classification_report_lr)

Evaluation Metrics for RandomForest
------------------------------------------------
                precision    recall  f1-score   support

      moderate       0.59      0.69      0.64       460
not depression       0.59      0.54      0.56       358
        severe       0.91      0.76      0.83       298

      accuracy                           0.66      1116
     macro avg       0.70      0.66      0.68      1116
  weighted avg       0.68      0.66      0.67      1116



# **DecisionTree**

In [19]:
from sklearn.tree import DecisionTreeClassifier

In [20]:
# Define the model pipeline with Logistic Regression
model_pipeline_lr = Pipeline([
    ('lr', DecisionTreeClassifier())  # Increase max_iter to 1000 or more
])

# Train the model
model_pipeline_lr.fit(X_train, y_train)

# Predict on the test set
y_pred_lr = model_pipeline_lr.predict(X_test)

In [21]:
# Evaluate the model
accuracy_lr = accuracy_score(y_test, y_pred_lr)
precision_lr = precision_score(y_test, y_pred_lr, average='weighted')
recall_lr = recall_score(y_test, y_pred_lr, average='weighted')
f1_score_lr = f1_score(y_test, y_pred_lr, average='weighted')
classification_report_lr = classification_report(y_test, y_pred_lr)

# Print evaluation metrics
print("Evaluation Metrics for DecisionTree")
print("------------------------------------------------")
print(classification_report_lr)

Evaluation Metrics for DecisionTree
------------------------------------------------
                precision    recall  f1-score   support

      moderate       0.55      0.50      0.53       460
not depression       0.45      0.48      0.47       358
        severe       0.68      0.72      0.70       298

      accuracy                           0.55      1116
     macro avg       0.56      0.57      0.56      1116
  weighted avg       0.55      0.55      0.55      1116



# **SVM**

In [22]:
from sklearn.svm import SVC

In [23]:
# Define the model pipeline with Logistic Regression
model_pipeline_lr = Pipeline([
    ('lr', SVC())  # Increase max_iter to 1000 or more
])

# Train the model
model_pipeline_lr.fit(X_train, y_train)

# Predict on the test set
y_pred_lr = model_pipeline_lr.predict(X_test)

In [25]:
# Evaluate the model
accuracy_lr = accuracy_score(y_test, y_pred_lr)
precision_lr = precision_score(y_test, y_pred_lr, average='weighted')
recall_lr = recall_score(y_test, y_pred_lr, average='weighted')
f1_score_lr = f1_score(y_test, y_pred_lr, average='weighted')
classification_report_lr = classification_report(y_test, y_pred_lr)

# Print evaluation metrics
print("Evaluation Metrics for SVM")
print("------------------------------------------------")
print(classification_report_lr)

Evaluation Metrics for SVM
------------------------------------------------
                precision    recall  f1-score   support

      moderate       0.57      0.87      0.69       460
not depression       0.70      0.34      0.46       358
        severe       0.89      0.72      0.80       298

      accuracy                           0.66      1116
     macro avg       0.72      0.64      0.65      1116
  weighted avg       0.70      0.66      0.64      1116

