# Importing libraries, loading and transforming data

In [1]:
!pip install -U -q mlflow datasets>=2.14.5 nlp 2>/dev/null

In [2]:
!pip install tweepy

Collecting tweepy
  Downloading tweepy-4.14.0-py3-none-any.whl (98 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tweepy
Successfully installed tweepy-4.14.0


In [3]:
# Import necessary libraries
import pandas as pd  # For data manipulation and analysis
import gc  # For garbage collection to manage memory
import re  # For regular expressions
import numpy as np  # For numerical operations and arrays

import warnings  # For handling warnings
warnings.filterwarnings("ignore")  # Ignore warning messages
# import tweepy
import torch  # PyTorch library for deep learning
from transformers import AutoModel, AutoTokenizer  # Transformers library for natural language processing
from transformers import TextDataset, LineByLineTextDataset, DataCollatorForLanguageModeling, \
pipeline, Trainer, TrainingArguments, DataCollatorWithPadding  # Transformers components for text processing
from transformers import AutoModelForSequenceClassification  # Transformer model for sequence classification

from nlp import Dataset  # Import custom 'Dataset' class for natural language processing tasks
from imblearn.over_sampling import RandomOverSampler  # For oversampling to handle class imbalance
import datasets  # Import datasets library
from datasets import Dataset, Image, ClassLabel  # Import custom 'Dataset', 'ClassLabel', and 'Image' classes
from transformers import pipeline  # Transformers library for pipelines
from bs4 import BeautifulSoup  # For parsing HTML content
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt  # For data visualization
import itertools  # For working with iterators
from sklearn.metrics import (  # Import various metrics from scikit-learn
    accuracy_score,  # For calculating accuracy
    roc_auc_score,  # For ROC AUC score
    confusion_matrix,  # For confusion matrix
    classification_report,  # For classification report
    f1_score  # For F1 score
)

from datasets import load_metric  # Import load_metric function to load evaluation metrics

from tqdm import tqdm  # For displaying progress bars
tqdm.pandas()  # Enable progress bars for pandas operations

In [4]:
# Fraction of the dataset used for training, the rest will be used for validation
train_fraction = 0.8

# Number of training epochs
num_train_epochs = 20

# Learning rate
learning_rate = 5e-7

# Batch size for training
train_batch_size = 8

# Batch size for validation
eval_batch_size = 64

# Number of warm-up steps during training
warmup_steps = 50

# Weight decay to control regularization during training
weight_decay = 0.02

# Pre-trained BERT model to be used
BERT_MODEL = "distilbert-base-cased"

# Directory where the model output will be saved
output_dir = "depressed-tweet-detection-distilbert"

In [5]:
%%time
# Read the CSV file into a DataFrame
df = pd.read_csv("/kaggle/input/sentimental-analysis-for-tweets/sentiment_tweets3.csv", encoding='latin-1', index_col='Index')

item0 = df.shape[0]  # Store the initial number of items in the DataFrame
df = df.drop_duplicates()  # Remove duplicate rows from the DataFrame
item1 = df.shape[0]  # Store the number of items in the DataFrame after removing duplicates
print(f"There are {item0-item1} duplicates found in the dataset")  # Print the number of duplicates removed

df = df.rename(columns={'label (depression result)': 'label', 'message to examine': 'title'}) # Rename the columns to standard ones

# update https://stackoverflow.com/a/54206513
URL_REGEX = re.compile("http[s]?://\S+")
MENTION_REGEX = re.compile('@\w+')

def clean_tweet(tweet):
    # remove mentions, the pound sign, and replace urls with URL token
    tweet = re.sub(URL_REGEX, 'url', tweet)  # replace urls with url. Assumes that the mention of a url is significant
    tweet = re.sub(MENTION_REGEX, '', tweet)  # remove mentions entirely
    tweet = tweet.replace('#', '')  # remove pound signs
    
    return tweet.strip()

df['title'] = df['title'].apply(clean_tweet)

def change_label(x):
    if x:
        return 'Depressed'
    else:
        return 'Not Depressed'
df['label'] = df['label'].apply(change_label)



df = df[['label', 'title']]  # Select only the 'label' and 'title' columns
df = df[~df['title'].isnull()]  # Remove rows where 'title' is null
df = df[~df['label'].isnull()]  # Remove rows where 'label' is null

print(df.shape)  # Print the shape of the DataFrame after data preprocessing
df.sample(5).T  # Display a random sample of 5 rows from the DataFrame

There are 31 duplicates found in the dataset
(10283, 2)
CPU times: user 82.2 ms, sys: 9.11 ms, total: 91.3 ms
Wall time: 131 ms


Index,313245,441006,218079,49376,600211
label,Not Depressed,Not Depressed,Not Depressed,Not Depressed,Not Depressed
title,so when can we see it??,SOrry i was having issues with my computer Te...,the sun loves me.,i start my step up internship on may 4th!! see...,got my AP with Blink on the coverr so excited...


In [6]:
# Import the necessary library to compute class weights.
from sklearn.utils.class_weight import compute_class_weight

# Identify the unique classes in the training data.
classes = np.unique(df[['label']])

print(classes)

# Calculate class weights using the 'balanced' option, which automatically adjusts for class imbalance.
weights = compute_class_weight(class_weight='balanced', classes=classes, y=df['label'])

# Create a dictionary mapping each class to its respective class weight.
class_weights = dict(zip(classes, weights))

# Print the computed class weights to the console.
print(class_weights)

['Depressed' 'Not Depressed']
{'Depressed': 2.24814167031045, 'Not Depressed': 0.6430090045022512}


In [7]:
# Create a list of unique labels
labels_list = list(df['label'].unique())

# Initialize empty dictionaries to map labels to IDs and vice versa
label2id, id2label = dict(), dict()

# Iterate over the unique labels and assign each label an ID, and vice versa
for i, label in enumerate(labels_list):
    label2id[label] = i  # Map the label to its corresponding ID
    id2label[i] = label  # Map the ID to its corresponding label

# Print the resulting dictionaries for reference
print("Mapping of IDs to Labels:", id2label, '\n')
print("Mapping of Labels to IDs:", label2id)

Mapping of IDs to Labels: {0: 'Not Depressed', 1: 'Depressed'} 

Mapping of Labels to IDs: {'Not Depressed': 0, 'Depressed': 1}


In [8]:
ordered_weigths = [class_weights[x] for x in id2label.values()]
ordered_weigths

[0.6430090045022512, 2.24814167031045]

In [9]:
# Create a dataset from the Pandas DataFrame 'df'
dataset = Dataset.from_pandas(df)

In [10]:
from sklearn.model_selection import train_test_split
# Creating classlabels to match labels to IDs
ClassLabels = ClassLabel(num_classes=len(labels_list), names=labels_list)

# Mapping labels to IDs
def map_label2id(example):
    example['label'] = ClassLabels.str2int(example['label'])
    return example

dataset = dataset.map(map_label2id, batched=True)

# Casting label column to ClassLabel Object
dataset = dataset.cast_column('label', ClassLabels)

# Splitting the dataset into training and testing sets using the predefined train/test split ratio.
dataset = dataset.train_test_split(test_size=1-train_fraction, shuffle=True, stratify_by_column="label")

# Extracting the training data from the split dataset.
df_train = dataset['train']

# Extracting the testing data from the split dataset.
df_test = dataset['test']

Map:   0%|          | 0/10283 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/10283 [00:00<?, ? examples/s]

In [11]:
# Deleting the DataFrame 'df'
del df

# Performing garbage collection to free up memory
gc.collect()

220

In [12]:
# Create a tokenizer instance for the specified BERT model.
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL, use_fast=True, low_cpu_mem_usage=False)

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [13]:
# Importantly, this is a simple function for preprocessing data before training a natural language processing model.
# It takes a dataset of examples as input.

def preprocess_function(examples):
    return tokenizer(examples["title"], truncation=True)

# The code below applies the preprocess_function to two dataframes, df_train and df_test.

df_train = df_train.map(preprocess_function, batched=True)
df_test = df_test.map(preprocess_function, batched=True)

Map:   0%|          | 0/8226 [00:00<?, ? examples/s]

Map:   0%|          | 0/2057 [00:00<?, ? examples/s]

In [14]:
# Remove the 'title' column from the training dataset.
df_train = df_train.remove_columns(['title'])

# Remove the 'title' column from the testing dataset.
df_test = df_test.remove_columns(['title'])

In [15]:
df_train

Dataset({
    features: ['label', 'Index', 'input_ids', 'attention_mask'],
    num_rows: 8226
})

In [16]:
df_test

Dataset({
    features: ['label', 'Index', 'input_ids', 'attention_mask'],
    num_rows: 2057
})

In [17]:
# DataCollatorWithPadding creates batch of data. It also dynamically pads text to the 
#  It's possible to pad your text in the tokenizer function with padding=True, dynamic padding is more efficient.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [18]:
# Retrieve the 'input_ids' from the first row of the DataFrame 'df_train'
tokenizer.decode(df_train[0]['input_ids'])

'[CLS] is embarking on a five day weekend! Beaches, ferries, and fun! Life is good! [SEP]'

# Loading and training model

In [19]:
# Load a pre-trained BERT-based model for sequence classification.
model = AutoModelForSequenceClassification.from_pretrained(
    BERT_MODEL, num_labels=len(labels_list),
    output_attentions=False,  # Set to False: Model will not return attention weights.
    output_hidden_states=False  # Set to False: Model will not return all hidden-states.
)

# Configure the mapping of class labels to their corresponding indices for later reference.
model.config.id2label = id2label  # Mapping from label indices to class labels.
model.config.label2id = label2id  # Mapping from class labels to label indices.

# Calculate and print the number of trainable parameters in millions for the model.
print(model.num_parameters(only_trainable=True) / 1e6)

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


65.783042


In [20]:
# Import the 'load_metric' function from the Hugging Face datasets library to load a metric.
metric = load_metric("accuracy")

# Define a custom 'compute_metrics' function that will be used for evaluating model performance.
# This function takes 'eval_pred' as input, which is a tuple containing predicted logits and true labels.
def compute_metrics(eval_pred):
    # Unpack the 'eval_pred' tuple into 'logits' (predicted logits) and 'labels' (true labels).
    logits, labels = eval_pred
    
    # Calculate the model's predictions by selecting the class with the highest logit value.
    predictions = np.argmax(logits, axis=-1)
    
    # Use the imported metric to compute the accuracy of the model's predictions.
    accuracy = metric.compute(predictions=predictions, references=labels)
    
    # Return the computed accuracy as the evaluation metric.
    return accuracy

Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [21]:
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # compute custom loss (suppose one has labels with different weights)
        loss_fct = torch.nn.CrossEntropyLoss(weight=torch.tensor(ordered_weigths, device=model.device).float())
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [22]:
# Create TrainingArguments to configure the training process
training_args = TrainingArguments(
    output_dir=output_dir,  # Directory to save the model checkpoints and logs
    logging_dir='./logs',  # Directory to store training logs
    num_train_epochs=num_train_epochs,  # Number of training epochs
    per_device_train_batch_size=train_batch_size,  # Batch size for training data
    per_device_eval_batch_size=eval_batch_size,  # Batch size for evaluation data
    logging_strategy='steps',  # Logging frequency during training (steps or epoch)
    logging_first_step=True,  # Log the first training step
    load_best_model_at_end=True,  # Load the best model at the end of training
    logging_steps=1,  # Log every training step (useful for debugging)
    learning_rate=learning_rate, # Set the learning rate for the optimizer.
    evaluation_strategy='epoch',  # Evaluation frequency (epoch or steps)
    warmup_steps=warmup_steps,  # Number of warmup steps for the learning rate
    weight_decay=weight_decay,  # Weight decay for regularization
    eval_steps=1,  # Evaluate every training step (useful for debugging)
    save_strategy='epoch',  # Save model checkpoints every epoch
    save_total_limit=1,  # Limit the number of saved checkpoints to save space
    report_to="mlflow",  # Log training metrics to MLflow
)

# Define the trainer:
# Instantiate the trainer class and configure its settings
trainer = WeightedTrainer(
    model=model,  # The pretrained or custom model to be trained
    args=training_args,  # TrainingArguments for configuring training
    compute_metrics=compute_metrics,  # Function for computing evaluation metrics
    train_dataset=df_train,  # Training dataset
    eval_dataset=df_test,  # Evaluation dataset
    data_collator=data_collator  # Data collator for batching and preprocessing
)

In [23]:
# Get initial metrics
trainer.evaluate()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 0.6991604566574097,
 'eval_accuracy': 0.22216820612542537,
 'eval_runtime': 3.5517,
 'eval_samples_per_second': 579.154,
 'eval_steps_per_second': 9.291}

In [24]:
# Start training the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2804,0.256196,0.969373
2,0.0143,0.045942,0.99368
3,0.0045,0.02326,0.997083
4,0.0021,0.01691,0.997569
5,0.0018,0.015574,0.997569
6,0.0008,0.015796,0.997569
7,0.0007,0.017737,0.997083
8,0.0007,0.016683,0.997083
9,0.0004,0.012521,0.998055
10,0.0003,0.02374,0.998055


TrainOutput(global_step=20580, training_loss=0.05439883889622376, metrics={'train_runtime': 956.6965, 'train_samples_per_second': 171.967, 'train_steps_per_second': 21.512, 'total_flos': 2145263985565632.0, 'train_loss': 0.05439883889622376, 'epoch': 20.0})

In [25]:
# Final model evaluation
trainer.evaluate()

{'eval_loss': 0.010881882160902023,
 'eval_accuracy': 0.9985415653864852,
 'eval_runtime': 2.9298,
 'eval_samples_per_second': 702.097,
 'eval_steps_per_second': 11.264,
 'epoch': 20.0}

In [26]:
# Use the trained 'trainer' to make predictions on the 'df_test'.
outputs = trainer.predict(df_test)

# Print the metrics obtained from the prediction outputs.
print(outputs.metrics)

{'test_loss': 0.010881882160902023, 'test_accuracy': 0.9985415653864852, 'test_runtime': 2.9734, 'test_samples_per_second': 691.799, 'test_steps_per_second': 11.098}


In [27]:
# Extract the true labels from the model outputs
y_true = outputs.label_ids

# Predict the labels by selecting the class with the highest probability
y_pred = outputs.predictions.argmax(1)

# Calculate accuracy and F1 score
accuracy = accuracy_score(y_true, y_pred)

# Display accuracy and F1 score
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.9985


# Saving the model and checking its performance with a sample input

In [28]:
# Directory where the model output will be saved

output_dir = "depressed-tweet-detection-distilbert"

In [29]:
trainer.save_model()
tokenizer.save_vocabulary(save_directory=f"./{output_dir}")

('./depressed-tweet-detection-distilbert/vocab.txt',)

# Prediction Using saved model

In [30]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

def preprocess_tweet(tweet):
    # Convert to lowercase
    tweet = tweet.lower()
    
    # Remove URLs
    tweet = re.sub(r"http\S+|www\S+|https\S+", "", tweet, flags=re.MULTILINE)
    
    # Remove mentions and hashtags
    tweet = re.sub(r'\@\w+|\#\w+', '', tweet)
    
    # Remove punctuations
    tweet = tweet.translate(str.maketrans("", "", string.punctuation))
    
    # Remove numbers
    tweet = re.sub(r'\d+', '', tweet)
    
    # Tokenize
    tokens = word_tokenize(tweet)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Join tokens back into a sentence
    preprocessed_tweet = ' '.join(tokens)
    
    return preprocessed_tweet

# Example usage
tweets = [
    "Just had a great workout at the gym! Feeling energized and ready to conquer the day. 💪"
"Rainy days are my favorite. Cozying up with a good book and a hot cup of tea. ☔📖"
"Excited about the weekend plans! Going hiking with friends. #naturelovers"
"Feeling a bit stressed with work, but looking forward to the weekend. #FridayFeeling"
"Spent the day with loved ones. Grateful for the laughter and good times. ❤️"
"Finally got around to watching that movie everyone's been talking about. Such a great story!"
"Productive day at work! Accomplished all my tasks and feeling accomplished. #success"
"Cooked a delicious meal from scratch. Nothing beats homemade comfort food! 🍲"
"Feeling a bit tired, but overall, life is good. Gratitude for the little things."
"Planning a mini road trip with friends. Can't wait for the adventure! 🚗✨"
]

preprocessed_tweets = [preprocess_tweet(tweet) for tweet in tweets]

print(preprocessed_tweets)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
['great workout gym feeling energized ready conquer day 💪rainy days favorite cozying good book hot cup tea ☔📖excited weekend plans going hiking friends bit stressed work looking forward weekend day loved ones grateful laughter good times ❤️finally got around watching movie everyones talking great storyproductive day work accomplished tasks feeling accomplished delicious meal scratch nothing beats homemade comfort food 🍲feeling bit tired overall life good gratitude little thingsplanning mini road trip friends cant wait adventure 🚗✨']


In [31]:
output_dir = "/kaggle/input/depdata"
# Pre-trained BERT model to be used
BERT_MODEL = "distilbert-base-cased"
# Make a classification pipeline and test with the sample input
pipe = pipeline("text-classification", output_dir, tokenizer=BERT_MODEL)
prediction_results = pipe(preprocessed_tweets, top_k=10)
# Extract the label and score for each prediction
for result in prediction_results:
    for prediction in result:
        label = prediction['label']
        score = prediction['score']
        print(f"Label: {label}, Score: {score}")

Label: Not Depressed, Score: 0.9939814209938049
Label: Depressed, Score: 0.006018530111759901


In [32]:
# Choose the top prediction based on the highest score
top_prediction = max(result, key=lambda x: x['score'])
print(f"\nTop Prediction: {top_prediction['label']} \nScore: {top_prediction['score']}")


Top Prediction: Not Depressed 
Score: 0.9939814209938049


# Code to collect twitter user tweets by using username and detect depression

In [33]:
import tweepy
# Set up Twitter API credentials
consumer_key = ""
consumer_secret = ""
access_token = ""
access_token_secret = ""

# Authenticate with Twitter API
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

In [34]:
# username= input("Enter Twitter User Name: ")

In [35]:
# # Function to get tweets from a user
# def get_tweets(username, count=10):
#     # Fetch tweets from the user's timeline
#     tweets = api.user_timeline(screen_name=username, count=count, tweet_mode="extended")

#     # Extract and return the full text of each tweet
#     return [tweet.full_text for tweet in tweets]

In [36]:
# user_tweets = get_tweets(username)
# user_text = " ".join(user_tweets)

In [37]:

# # Text preprocessing

# preprocessed_tweets = preprocess_tweet(user_text)
# # Make a classification pipeline and test with the sample input
# pipe = pipeline("text-classification", output_dir, tokenizer=BERT_MODEL)
# prediction_results = pipe(preprocessed_tweets, top_k=10)
# # Extract the label and score for each prediction
# for result in prediction_results:
#     for prediction in result:
#         label = prediction['label']
#         score = prediction['score']
#         print(f"Label: {label}, Score: {score}")

# # Choose the top prediction based on the highest score
# top_prediction = max(result, key=lambda x: x['score'])
# print(f"\nTop Prediction: {top_prediction['label']} \nScore: {top_prediction['score']}")