In [1]:
import pandas as pd

In [2]:
# Load the augmented dataset (pd.read_csv is called with its default comma
# separator) and expose the 'Text data' column under the shorter name 'data'.
review = pd.read_csv("Augdata.csv").rename(columns={'Text data': 'data'})
review.head()

Unnamed: 0,PID,data,Label
0,dev_pid_1,"I enjoyed today, and I still am! Tomorrows dep...",moderate
1,dev_pid_2,I sorta tried to kill myself : I had a total b...,moderate
2,dev_pid_3,Best suicide method? : I like it quick and eas...,moderate
3,dev_pid_4,a story : I remember the time I'd get on my 3D...,moderate
4,dev_pid_5,The world only cares about beautiful people : ...,moderate


In [7]:
!pip install nltk



In [8]:
import nltk
# Fetch every NLTK resource this cell relies on so it also runs on a fresh
# environment (previously the stop-word corpus download was commented out,
# which raises LookupError when the corpus is not already installed).
nltk.download('punkt')      # tokenizer models used by nltk.word_tokenize
# NOTE(review): newer NLTK releases load tokenizer data from 'punkt_tab'
# instead of 'punkt' - confirm against the installed version.
nltk.download('punkt_tab')
nltk.download('stopwords')  # corpus read by nltk.corpus.stopwords below

# English stop words plus the punctuation tokens that should be dropped.
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(['.', ',', '!', '?', ';', ':'])

def remove_stopwords(text):
    """Return ``text`` with stop-word tokens removed.

    The text is split with ``nltk.word_tokenize``; a token is dropped when its
    lower-cased form is found in the module-level ``stopwords`` list. The
    remaining tokens keep their original casing and are joined with single
    spaces.
    """
    kept = []
    for word in nltk.word_tokenize(text):
        if word.lower() in stopwords:
            continue
        kept.append(word)
    return ' '.join(kept)

# Run every document in the 'data' column through remove_stopwords, write the
# result back into the same column, then preview the frame.
cleaned_text = review['data'].apply(remove_stopwords)
review['data'] = cleaned_text
review.head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unnamed: 0,PID,data,Label
0,dev_pid_1,enjoyed today still Tomorrows depression wait ...,moderate
1,dev_pid_2,sorta tried kill total breakdown fucking car p...,moderate
2,dev_pid_3,Best suicide method like quick easy deformitie...,moderate
3,dev_pid_4,story remember time 'd get 3DS play Nintendogs...,moderate
4,dev_pid_5,world cares beautiful people 'm born ugly 've ...,moderate


In [9]:
from sklearn import preprocessing
# Encode the class labels as integers and overwrite the 'Label' column with
# the codes; the fitted encoder stays available as `le`.
# NOTE(review): the column is replaced in place, so re-running this cell
# refits the encoder on the integer codes instead of the original label names.
le = preprocessing.LabelEncoder()
encoded_labels = le.fit_transform(review["Label"])
review["Label"] = encoded_labels
review.head()

Unnamed: 0,PID,data,Label
0,dev_pid_1,enjoyed today still Tomorrows depression wait ...,0
1,dev_pid_2,sorta tried kill total breakdown fucking car p...,0
2,dev_pid_3,Best suicide method like quick easy deformitie...,0
3,dev_pid_4,story remember time 'd get 3DS play Nintendogs...,0
4,dev_pid_5,world cares beautiful people 'm born ugly 've ...,0


In [10]:
from sklearn.model_selection import train_test_split
# Features are the cleaned text column, targets are the encoded labels.
X, y = review["data"], review["Label"]

# 80% of the rows go to training and the remainder to the test set; the fixed
# random_state makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=1
)

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Text-classification pipeline: TF-IDF features feeding a support vector
# classifier (SVC with default hyperparameters).
# NOTE: the estimator is an SVC, not logistic regression; the step key 'lr'
# and the `_lr` variable suffix are kept only so existing references still work.
model_pipeline_lr = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('lr', SVC()),
])


In [12]:
# Train the model: fit the TF-IDF vectorizer and the SVC step of the pipeline
# on the training split. The call is the cell's last expression, so its return
# value is what the notebook displays.
model_pipeline_lr.fit(X_train, y_train)

In [13]:
# Predict on the test set: run the held-out documents through the fitted
# pipeline and keep the predicted (integer-encoded) labels for evaluation.
y_pred_lr = model_pipeline_lr.predict(X_test)

In [14]:
# Score the test-set predictions. Precision, recall and F1 are all computed
# with average='weighted'; accuracy needs no averaging option.
accuracy_lr = accuracy_score(y_test, y_pred_lr)
precision_lr, recall_lr, f1_score_lr = (
    metric(y_test, y_pred_lr, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)
# Per-class text report, printed in a later cell.
classification_report_lr = classification_report(y_test, y_pred_lr)

In [15]:
# Print evaluation metrics.
# The pipeline's estimator is sklearn.svm.SVC, so the heading names the SVM
# model (it previously said "Logistic Regression", which mislabels the results).
report_title = "Evaluation Metrics for SVM (SVC) Model"
print(report_title)
print("-" * len(report_title))  # underline sized to the heading
print(classification_report_lr)

Evaluation Metrics for Logistic Regression Model
------------------------------------------------
              precision    recall  f1-score   support

           0       0.65      0.78      0.71       460
           1       0.64      0.51      0.57       358
           2       0.96      0.90      0.93       298

    accuracy                           0.72      1116
   macro avg       0.75      0.73      0.73      1116
weighted avg       0.73      0.72      0.72      1116



In [16]:
from joblib import dump
# Persist the fitted pipeline to disk. The call is the cell's last expression,
# so the list of written file names it returns is displayed.
# NOTE(review): the target file name has no extension - loaders must use this
# exact name.
dump(model_pipeline_lr, 'classification_depression')

['classification_depression']