In [None]:
import pandas as pd

In [None]:
# Load the dataset with read_csv's default (comma) separator and give the
# 'Text data' column the shorter name 'data'
review = pd.read_csv("Augdata.csv").rename(columns={'Text data': 'data'})
review.head()

Unnamed: 0,PID,data,Label
0,dev_pid_1,"I enjoyed today, and I still am! Tomorrows dep...",moderate
1,dev_pid_2,I sorta tried to kill myself : I had a total b...,moderate
2,dev_pid_3,Best suicide method? : I like it quick and eas...,moderate
3,dev_pid_4,a story : I remember the time I'd get on my 3D...,moderate
4,dev_pid_5,The world only cares about beautiful people : ...,moderate


In [None]:
# Number of rows carrying each label
count = review.Label.value_counts()
print(count)

Label
moderate          2306
not depression    1830
severe            1440
Name: count, dtype: int64


In [None]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
nltk.download('punkt')       # Punkt tokenizer data used by word_tokenize
nltk.download('punkt_tab')   # newer NLTK releases load 'punkt_tab' instead of 'punkt'
nltk.download('stopwords')   # stop-word lists


# English stop words, kept in a set so membership tests are cheap
stop_words = set(stopwords.words('english'))

def tokenize_and_remove_stopwords(text):
    """Tokenise ``text`` and drop stop words.

    The text is split with NLTK's ``word_tokenize``. A token is discarded when
    its lower-cased form is found in the notebook-level ``stop_words`` set;
    every other token (punctuation included) is kept with its original casing
    and in its original order.

    Parameters
    ----------
    text : str
        Raw text to tokenise.

    Returns
    -------
    list of str
        The tokens that are not stop words.
    """
    kept = []
    for token in word_tokenize(text):
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return kept

# Tokenise every entry of the 'data' column and store the result as 'tokens'
review['tokens'] = review['data'].map(tokenize_and_remove_stopwords)

# Show the first rows, now including the new 'tokens' column
print(review.head())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


         PID                                               data     Label  \
0  dev_pid_1  I enjoyed today, and I still am! Tomorrows dep...  moderate   
1  dev_pid_2  I sorta tried to kill myself : I had a total b...  moderate   
2  dev_pid_3  Best suicide method? : I like it quick and eas...  moderate   
3  dev_pid_4  a story : I remember the time I'd get on my 3D...  moderate   
4  dev_pid_5  The world only cares about beautiful people : ...  moderate   

                                              tokens  
0  [enjoyed, today, ,, still, !, Tomorrows, depre...  
1  [sorta, tried, kill, :, total, breakdown, fuck...  
2  [Best, suicide, method, ?, :, like, quick, eas...  
3  [story, :, remember, time, 'd, get, 3DS, play,...  
4  [world, cares, beautiful, people, :, 'm, born,...  


In [None]:
# Turn each token list into one space-separated string for the TF-IDF vectorizer.
# astype(str) would store the Python list repr ("['a', 'b']") instead, so brackets,
# quotes and escape sequences would end up in the text being vectorized.
# Values that are already strings (cell re-run) are left untouched.
review["tokens"] = review["tokens"].apply(
    lambda toks: toks if isinstance(toks, str) else " ".join(toks)
)

In [None]:
# Preview the first five rows after the 'tokens' column was converted to strings
review.head(n=5)

Unnamed: 0,PID,data,Label,tokens
0,dev_pid_1,"I enjoyed today, and I still am! Tomorrows dep...",moderate,"['enjoyed', 'today', ',', 'still', '!', 'Tomor..."
1,dev_pid_2,I sorta tried to kill myself : I had a total b...,moderate,"['sorta', 'tried', 'kill', ':', 'total', 'brea..."
2,dev_pid_3,Best suicide method? : I like it quick and eas...,moderate,"['Best', 'suicide', 'method', '?', ':', 'like'..."
3,dev_pid_4,a story : I remember the time I'd get on my 3D...,moderate,"['story', ':', 'remember', 'time', ""'d"", 'get'..."
4,dev_pid_5,The world only cares about beautiful people : ...,moderate,"['world', 'cares', 'beautiful', 'people', ':',..."


In [None]:
from sklearn.model_selection import train_test_split
# Inputs are the 'tokens' column, targets are the 'Label' column
x, y = review["tokens"], review["Label"]

# 80% of the rows go to training and the rest to testing;
# the fixed random_state keeps the split the same on every run
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, random_state=1
)

# **Logistic Regression**

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Logistic-regression classifier on top of TF-IDF features
model_pipeline_lr = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('lr', LogisticRegression()),
])

# Fit on the training split, then predict the held-out split
y_pred_lr = model_pipeline_lr.fit(x_train, y_train).predict(x_test)

# Overall accuracy plus support-weighted precision / recall / F1
accuracy_lr = accuracy_score(y_test, y_pred_lr)
precision_lr, recall_lr, f1_score_lr = (
    metric(y_test, y_pred_lr, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)
classification_report_lr = classification_report(y_test, y_pred_lr)

# Report: title, rule, then the per-class table
print(
    "Evaluation Metrics for Logistic Regression Model",
    "------------------------------------------------",
    classification_report_lr,
    sep="\n",
)

Evaluation Metrics for Logistic Regression Model
------------------------------------------------
                precision    recall  f1-score   support

      moderate       0.62      0.71      0.66       460
not depression       0.61      0.53      0.57       358
        severe       0.87      0.83      0.85       298

      accuracy                           0.68      1116
     macro avg       0.70      0.69      0.69      1116
  weighted avg       0.69      0.68      0.68      1116



# **DecisionTree**

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Define the model pipeline with a decision tree.
# Names carry a _dt suffix so this cell no longer overwrites the logistic-regression
# objects (model_pipeline_lr, y_pred_lr, classification_report_lr, ...).
model_pipeline_dt = Pipeline([
    ('tfidf', TfidfVectorizer()),
    # Fixed seed: tree construction is randomised, so pin it for repeatable results
    ('dt', DecisionTreeClassifier(random_state=1))
])

# Train the model
model_pipeline_dt.fit(x_train, y_train)

# Predict on the test set
y_pred_dt = model_pipeline_dt.predict(x_test)

# Evaluate the model (weighted = per-class scores averaged by class support)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
precision_dt = precision_score(y_test, y_pred_dt, average='weighted')
recall_dt = recall_score(y_test, y_pred_dt, average='weighted')
f1_score_dt = f1_score(y_test, y_pred_dt, average='weighted')
classification_report_dt = classification_report(y_test, y_pred_dt)

# Print in the same cell that computes the report, so the text shown always
# belongs to the model fitted above rather than to whichever cell ran last.
print("Evaluation Metrics for DecisionTree")
print("------------------------------------------------")
print(classification_report_dt)


Evaluation Metrics for DecisionTree
------------------------------------------------
                precision    recall  f1-score   support

      moderate       0.65      0.78      0.71       460
not depression       0.64      0.51      0.57       358
        severe       0.96      0.90      0.93       298

      accuracy                           0.72      1116
     macro avg       0.75      0.73      0.73      1116
  weighted avg       0.73      0.72      0.72      1116



# **SVM**

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Define the model pipeline with SVM.
# Names carry a _svm suffix so this cell no longer overwrites the logistic-regression
# objects (model_pipeline_lr, y_pred_lr, classification_report_lr, ...).
model_pipeline_svm = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svm', SVC())
])

# Train the model
model_pipeline_svm.fit(x_train, y_train)

# Predict on the test set
y_pred_svm = model_pipeline_svm.predict(x_test)

# Evaluate the model (weighted = per-class scores averaged by class support)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm, average='weighted')
recall_svm = recall_score(y_test, y_pred_svm, average='weighted')
f1_score_svm = f1_score(y_test, y_pred_svm, average='weighted')
classification_report_svm = classification_report(y_test, y_pred_svm)

# Print in the same cell that computes the report, so the text shown always
# belongs to the model fitted above rather than to whichever cell ran last.
# NOTE(review): re-run this cell and refresh the saved output; a report printed
# from the shared *_lr variable may have belonged to a different model.
print("Evaluation Metrics for SVM")
print("------------------------------------------------")
print(classification_report_svm)

Evaluation Metrics for SVM
------------------------------------------------
                precision    recall  f1-score   support

      moderate       0.65      0.78      0.71       460
not depression       0.64      0.51      0.57       358
        severe       0.96      0.90      0.93       298

      accuracy                           0.72      1116
     macro avg       0.75      0.73      0.73      1116
  weighted avg       0.73      0.72      0.72      1116



# **RandomForestClassifier**

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Define the model pipeline with a random forest.
# Names carry a _rf suffix so this cell no longer overwrites the logistic-regression
# objects (model_pipeline_lr, y_pred_lr, classification_report_lr, ...).
model_pipeline_rf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    # Fixed seed: the forest is built with random sampling, so pin it for repeatable results
    ('rf', RandomForestClassifier(random_state=1))
])

# Train the model
model_pipeline_rf.fit(x_train, y_train)

# Predict on the test set
y_pred_rf = model_pipeline_rf.predict(x_test)

# Evaluate the model (weighted = per-class scores averaged by class support)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf, average='weighted')
recall_rf = recall_score(y_test, y_pred_rf, average='weighted')
f1_score_rf = f1_score(y_test, y_pred_rf, average='weighted')
classification_report_rf = classification_report(y_test, y_pred_rf)

# Print in the same cell that computes the report, so the text shown always
# belongs to the model fitted above rather than to whichever cell ran last.
print("Evaluation Metrics for RandomForest")
print("------------------------------------------------")
print(classification_report_rf)

Evaluation Metrics for RandomForest
------------------------------------------------
                precision    recall  f1-score   support

      moderate       0.67      0.77      0.71       460
not depression       0.62      0.52      0.57       358
        severe       0.99      0.94      0.96       298

      accuracy                           0.74      1116
     macro avg       0.76      0.74      0.75      1116
  weighted avg       0.74      0.74      0.73      1116

