In [16]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [1]:
!kaggle datasets download -d neelghoshal/reddit-mental-health-data

Dataset URL: https://www.kaggle.com/datasets/neelghoshal/reddit-mental-health-data
License(s): unknown
reddit-mental-health-data.zip: Skipping, found more recently modified local copy (use --force to force download)


In [2]:
!unzip /content/reddit-mental-health-data.zip

Archive:  /content/reddit-mental-health-data.zip
replace data_to_be_cleansed.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [3]:
import pandas as pd
import re
import string
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.base import BaseEstimator, TransformerMixin
import nltk
from sklearn.model_selection import train_test_split
import torch

In [4]:
# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
df = pd.read_csv('data_to_be_cleansed.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,text,title,target
0,0,Welcome to /r/depression's check-in post - a p...,"Regular check-in post, with information about ...",1
1,1,We understand that most people who reply immed...,Our most-broken and least-understood rules is ...,1
2,2,Anyone else just miss physical touch? I crave ...,"I haven’t been touched, or even hugged, in so ...",1
3,3,I’m just so ashamed. Everyone and everything f...,Being Depressed is Embarrassing,1
4,4,I really need a friend. I don't even have a si...,I'm desperate for a friend and to feel loved b...,1



Targets given have the following mappings:
0 = Stress
1 = Depression
2 = Bipolar disorder
3 = Personality disorder
4 = Anxiety

In [6]:
# Replace NaN or missing values with an empty string
df['text'] = df['text'].fillna('')

# Convert all entries to strings
df['text'] = df['text'].astype(str)

In [7]:
def clean_text(text):
    text = text.lower()  # Lowercase text
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = ' '.join(text.split())  # Remove extra whitespaces
    return text

# Function for lemmatization (removing stopwords and reducing words to their base form)
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    words = text.split()
    lemmatized = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(lemmatized)

# Custom transformer class to apply the functions on a dataset
class PreprocessText(BaseEstimator, TransformerMixin):
    def __init__(self, clean_func=clean_text, lemmatize_func=lemmatize_text):
        self.clean_func = clean_func
        self.lemmatize_func = lemmatize_func

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return [self.lemmatize_func(self.clean_func(text)) for text in X]

preprocessing_pipeline = Pipeline([
    ('preprocessor', PreprocessText())  # Apply text cleaning and lemmatization
])

In [8]:
# Apply the preprocessing pipeline to the 'text' column
df['cleaned_text'] = preprocessing_pipeline.fit_transform(df['text'])

# Apply the pipeline to a user's response (for chatbot input)
def preprocess_user_input(user_input):
    return preprocessing_pipeline.transform([user_input])[0]

# Example of preprocessed user input
user_input = "I'm feeling really down today, I just can't seem to shake this sadness off."
processed_input = preprocess_user_input(user_input)
print(f"Processed user input: {processed_input}")

Processed user input: im feeling really today cant seem shake sadness




In [9]:
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

In [10]:
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [11]:
test_df['target'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 1788 entries, 0 to 1787
Series name: target
Non-Null Count  Dtype
--------------  -----
1788 non-null   int64
dtypes: int64(1)
memory usage: 14.1 KB


In [12]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [13]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=5)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
train_encodings = tokenizer(
    train_df['text'].tolist(),
    truncation=True,
    padding=True,
    max_length=512,  # Optional: limit the maximum sequence length
)
test_encodings = tokenizer(
    test_df['text'].tolist(),
    truncation=True,
    padding=True,
    max_length=512,
)

In [15]:
from datasets import Dataset
# Dataset object for training and testing
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': train_df['target'].tolist()
})

test_dataset = Dataset.from_dict({
    'input_ids': test_encodings['input_ids'],
    'attention_mask': test_encodings['attention_mask'],
    'labels': test_df['target'].tolist()
})

In [16]:
print(len(train_dataset), len(train_encodings['input_ids']))  # Ensure these match
print(len(test_dataset), len(test_encodings['input_ids']))

print(train_dataset[2])  # Inspect a sample from your dataset
print(set(train_df['target']))  # Check unique labels

print(len(train_encodings['input_ids']), len(train_df['target']))
print(len(test_encodings['input_ids']), len(test_df['target']))


4169 4169
1788 1788
{'input_ids': [101, 2428, 2245, 1045, 2001, 3773, 2070, 5082, 2197, 2733, 2021, 3984, 2025, 1012, 2074, 8239, 5458, 1997, 2673, 1998, 3110, 2066, 4485, 1012, 6719, 2053, 2028, 2941, 14977, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [18]:
# Define Training Arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model, args=training_args, train_dataset=train_dataset, eval_dataset=test_dataset
)
trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
500,0.7944


TrainOutput(global_step=783, training_loss=0.6354883810265006, metrics={'train_runtime': 560.5394, 'train_samples_per_second': 22.312, 'train_steps_per_second': 1.397, 'total_flos': 1656858393400320.0, 'train_loss': 0.6354883810265006, 'epoch': 3.0})

In [21]:
torch.save(model.state_dict(), 'model.pth')