In [1]:
import pandas as pd
import numpy as np
import spacy
import string
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import gensim.downloader as api

In [2]:
# Read Augdata.csv and expose its 'Text data' column under the shorter name 'data'.
review = pd.read_csv("Augdata.csv").rename(columns={'Text data': 'data'})

In [3]:
# Resources shared by the tokenizer defined later in the notebook:
#   punctuations - the ASCII punctuation characters from the standard library
#   nlp          - the spaCy pipeline used to tokenise and lemmatise text
#   stop_words   - the stop-word collection attached to that pipeline's defaults
punctuations = string.punctuation
nlp = spacy.load("en_core_web_sm")
stop_words = nlp.Defaults.stop_words

In [5]:
# Pretrained word-embedding lookup obtained through gensim's downloader API;
# later cells index it by token and read `wv.vector_size`.
wv = api.load("word2vec-google-news-300")



In [6]:
def spacy_tokenizer(sentence):
    """Turn one raw document into a list of cleaned, lemmatised tokens.

    Parameters
    ----------
    sentence : str
        Raw text of a single document. ``None`` and float NaN (the value
        pandas uses for an empty CSV cell) are treated as an empty document.

    Returns
    -------
    list of str
        Lower-cased, whitespace-stripped lemmas in document order, excluding
        stop words, empty strings and tokens made up only of punctuation
        characters.
    """
    # A missing cell is not text; return an empty token list so the caller can
    # fall back to its "no known tokens" handling instead of failing in spaCy.
    if sentence is None or (isinstance(sentence, float) and sentence != sentence):
        return []

    # Parse the text so that every token carries its lemma.
    doc = nlp(sentence)

    # Normalise each lemma: lower-case it and trim surrounding whitespace.
    lemmas = (token.lemma_.lower().strip() for token in doc)

    # `punctuations` is a plain string, so a bare `lemma in punctuations` would
    # be a substring test and let tokens such as "..." or "--" slip through.
    # Checking character by character removes every punctuation-only token;
    # an empty lemma also satisfies all(...) and is therefore dropped.
    return [
        lemma
        for lemma in lemmas
        if lemma not in stop_words
        and not all(char in punctuations for char in lemma)
    ]

In [7]:
# Tokenise every document once and keep the token lists next to the raw text.
tokenised_docs = review['data'].apply(spacy_tokenizer)
review['data1'] = tokenised_docs

In [8]:
def _mean_embedding(tokens):
    """Average the embeddings of the tokens that `wv` knows about.

    Tokens missing from `wv` are skipped. When none of the tokens is known
    (or the list is empty) the result is an all-zero vector of length
    `wv.vector_size`.
    """
    known_vectors = [wv[tok] for tok in tokens if tok in wv]
    if not known_vectors:
        known_vectors = [np.zeros(wv.vector_size)]
    return np.mean(known_vectors, axis=0)


# One fixed-length vector per document, stored alongside its token list.
review['vec'] = review['data1'].apply(_mean_embedding)

In [9]:
# Stack the per-document vectors row-wise into a single 2-D feature matrix
# (one row per document) and use the 'Label' column as the target.
X = np.vstack(review['vec'])
y = review['Label']

In [13]:
# Keep 80% of the rows for training and the remainder for testing;
# the fixed random_state makes the shuffle repeatable.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=1, train_size=0.8)

# **Logistic Regression**

In [11]:
from sklearn.linear_model import LogisticRegression

In [14]:
# Single-step pipeline around logistic regression, fitted on the training split.
# max_iter is set to 1000, well above scikit-learn's default of 100.
model_pipeline_lr = Pipeline(steps=[
    ('lr', LogisticRegression(max_iter=1000)),
]).fit(X_train, y_train)

# Class predictions for the held-out rows.
y_pred_lr = model_pipeline_lr.predict(X_test)

In [15]:
# Overall accuracy, then precision / recall / F1 averaged with
# average='weighted', and finally the per-class text report.
accuracy_lr = accuracy_score(y_test, y_pred_lr)
precision_lr, recall_lr, f1_score_lr = (
    metric(y_test, y_pred_lr, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)
classification_report_lr = classification_report(y_test, y_pred_lr)

# Show the report under a heading and a rule.
print(
    "Evaluation Metrics for Logistic Regression Model",
    "------------------------------------------------",
    classification_report_lr,
    sep="\n",
)

Evaluation Metrics for Logistic Regression Model
------------------------------------------------
                precision    recall  f1-score   support

      moderate       0.59      0.79      0.68       460
not depression       0.59      0.40      0.47       358
        severe       0.87      0.76      0.81       298

      accuracy                           0.66      1116
     macro avg       0.68      0.65      0.65      1116
  weighted avg       0.66      0.66      0.65      1116



# **Decision Tree**

In [16]:
from sklearn.tree import DecisionTreeClassifier

In [17]:
# Single-step pipeline around a decision tree, fitted on the training split.
# random_state is pinned (to the same seed the train/test split uses) because
# tree construction is randomised; without it the reported scores change from
# run to run.
# NOTE: the `_lr` suffixes and the 'lr' step label are carried over from the
# logistic-regression cell; they are kept because the evaluation code that
# follows reads `y_pred_lr`.
model_pipeline_lr = Pipeline([
    ('lr', DecisionTreeClassifier(random_state=1))
])

# Train the model
model_pipeline_lr.fit(X_train, y_train)

# Predict on the test set
y_pred_lr = model_pipeline_lr.predict(X_test)

In [18]:
# Overall accuracy, then precision / recall / F1 averaged with
# average='weighted', and finally the per-class text report.
accuracy_lr = accuracy_score(y_test, y_pred_lr)
precision_lr, recall_lr, f1_score_lr = (
    metric(y_test, y_pred_lr, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)
classification_report_lr = classification_report(y_test, y_pred_lr)

# Show the report under a heading and a rule.
print(
    "Evaluation Metrics DecisionTree",
    "------------------------------------------------",
    classification_report_lr,
    sep="\n",
)

Evaluation Metrics DecisionTree
------------------------------------------------
                precision    recall  f1-score   support

      moderate       0.56      0.55      0.56       460
not depression       0.47      0.47      0.47       358
        severe       0.69      0.69      0.69       298

      accuracy                           0.57      1116
     macro avg       0.57      0.57      0.57      1116
  weighted avg       0.57      0.57      0.57      1116



# **Random Forest**

In [19]:
from sklearn.ensemble import RandomForestClassifier

In [20]:
# Single-step pipeline around a random forest, fitted on the training split.
# random_state is pinned (to the same seed the train/test split uses) because
# the forest is built with randomisation; without it the reported scores
# change from run to run.
# NOTE: the `_lr` suffixes and the 'lr' step label are carried over from the
# logistic-regression cell; they are kept because the evaluation code that
# follows reads `y_pred_lr`.
model_pipeline_lr = Pipeline([
    ('lr', RandomForestClassifier(random_state=1))
])

# Train the model
model_pipeline_lr.fit(X_train, y_train)

# Predict on the test set
y_pred_lr = model_pipeline_lr.predict(X_test)

In [21]:
# Overall accuracy, then precision / recall / F1 averaged with
# average='weighted', and finally the per-class text report.
accuracy_lr = accuracy_score(y_test, y_pred_lr)
precision_lr, recall_lr, f1_score_lr = (
    metric(y_test, y_pred_lr, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)
classification_report_lr = classification_report(y_test, y_pred_lr)

# Show the report under a heading and a rule.
print(
    "Evaluation Metrics for RandomForest",
    "------------------------------------------------",
    classification_report_lr,
    sep="\n",
)

Evaluation Metrics for RandomForest
------------------------------------------------
                precision    recall  f1-score   support

      moderate       0.63      0.72      0.67       460
not depression       0.62      0.57      0.59       358
        severe       0.94      0.80      0.86       298

      accuracy                           0.70      1116
     macro avg       0.73      0.70      0.71      1116
  weighted avg       0.71      0.70      0.70      1116



# **SVM**

In [22]:
from sklearn.svm import SVC

In [23]:
# Single-step pipeline around a support-vector classifier with default
# settings, fitted on the training split. The `_lr` names are the ones the
# evaluation code that follows reads.
model_pipeline_lr = Pipeline(steps=[
    ('lr', SVC()),
]).fit(X_train, y_train)

# Class predictions for the held-out rows.
y_pred_lr = model_pipeline_lr.predict(X_test)

In [24]:
# Overall accuracy, then precision / recall / F1 averaged with
# average='weighted', and finally the per-class text report.
accuracy_lr = accuracy_score(y_test, y_pred_lr)
precision_lr, recall_lr, f1_score_lr = (
    metric(y_test, y_pred_lr, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)
classification_report_lr = classification_report(y_test, y_pred_lr)

# Show the report under a heading and a rule.
print(
    "Evaluation Metrics for SVM",
    "------------------------------------------------",
    classification_report_lr,
    sep="\n",
)

Evaluation Metrics for SVM
------------------------------------------------
                precision    recall  f1-score   support

      moderate       0.60      0.84      0.70       460
not depression       0.70      0.44      0.54       358
        severe       0.92      0.77      0.84       298

      accuracy                           0.69      1116
     macro avg       0.74      0.68      0.69      1116
  weighted avg       0.72      0.69      0.68      1116

