In [1]:
from transformers import AutoModelForSequenceClassification,AutoTokenizer
from transformers import pipeline

model_id = "distilbert/distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=5, # number of  possible outcomes
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
from datasets import load_dataset
data_files={
    "train":"dataset/train.jsonl",
    "validation":"dataset/val.jsonl",
    "test":"dataset/test.jsonl"
}


dataset = load_dataset("json", data_files=data_files)


In [3]:
dataset["test"] = dataset["test"].filter(
    lambda example: isinstance(example["text"], str) and example["text"].strip() != ""
)

In [4]:
label_map = {
    "Normal": 0,
    "Bipolar": 1,
    "Anxiety": 2,
    "Suicidal": 3,
    "Depression": 4
}

def encode_labels(example):
    if example.get("class") in label_map:
        example["label"] = label_map[example["class"]]
    else:
        example["label"] = -1  # Optional: handle unknown labels
    return example

for split in ["train", "validation", "test"]:
    dataset[split] = dataset[split].map(encode_labels)


# Get rid of invalid labels

In [5]:
def filter_invalid(example):
    return example["label"] != -1

for split in ["train", "validation", "test"]:
    dataset[split] = dataset[split].filter(filter_invalid)


In [6]:
print(dataset)
print(dataset["test"].features)

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'class', 'label'],
        num_rows: 15935
    })
    validation: Dataset({
        features: ['id', 'text', 'class', 'label'],
        num_rows: 1997
    })
    test: Dataset({
        features: ['id', 'text', 'class', 'label'],
        num_rows: 1994
    })
})
{'id': Value(dtype='string', id=None), 'text': Value(dtype='string', id=None), 'class': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None)}


In [7]:
def is_not_empty(example):
    text = example.get("text", "")  # Safely get "text", defaulting to empty string if it's missing or None
    return isinstance(text, str) and text.strip() != ""

for split in ["train", "validation", "test"]:
    dataset[split] = dataset[split].filter(is_not_empty)




In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_id)

def tokenize_function(examples):
    # Initialize a list to store tokenized results
    tokenized_results = []
    
    for text in examples['text']:
        try:
            
            tokenized_results.append(tokenizer(text, padding='max_length', truncation=True,max_length=32))
        except Exception as e:
            # Print the problematic text and continue with the next one
            print(f"Error tokenizing: {text} - Error: {e}")
            tokenized_results.append(None)  # Append None for problematic cases

    # Return the tokenized results
    return {"input_ids": [result['input_ids'] if result is not None else [] for result in tokenized_results],
            "attention_mask": [result['attention_mask'] if result is not None else [] for result in tokenized_results]}


tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Filter out any examples that were returned as empty dictionaries 
tokenized_datasets = tokenized_datasets.filter(lambda example: example != {})

tokenized_datasets = tokenized_datasets.rename_columns({"label": "labels"})  # Ensure the label column is named "labels"


In [9]:
from transformers import Trainer, TrainingArguments

# Set training arguments
training_args = TrainingArguments(
    output_dir="./results",          # output directory for model checkpoints
    # evaluation_strategy="epoch",     # evaluation strategy to use
    learning_rate=1e-5,              # learning rate
    per_device_train_batch_size=4,   # batch size per device during training
    per_device_eval_batch_size=8,   # batch size for evaluation
    num_train_epochs=1,              # number of training epochs
    weight_decay=0.1,               # strength of weight decay
    save_strategy ="no",
    logging_dir="./logs",            
    
)

# Define the Trainer
trainer = Trainer(
    model=model,                          
    args=training_args,                  
    train_dataset=tokenized_datasets["train"],  
    eval_dataset=tokenized_datasets["validation"],  
)


In [10]:
trainer.train()

Step,Training Loss
500,0.8857
1000,0.6534
1500,0.5741
2000,0.538
2500,0.5415
3000,0.5155
3500,0.4947


TrainOutput(global_step=3982, training_loss=0.589631107930021, metrics={'train_runtime': 196.2545, 'train_samples_per_second': 81.16, 'train_steps_per_second': 20.29, 'total_flos': 131878350574080.0, 'train_loss': 0.589631107930021, 'epoch': 1.0})

In [11]:
trainer.evaluate(tokenized_datasets["validation"])

{'eval_loss': 0.5229761600494385,
 'eval_runtime': 2.2342,
 'eval_samples_per_second': 893.397,
 'eval_steps_per_second': 111.898,
 'epoch': 1.0}

In [12]:
trainer.evaluate(tokenized_datasets["test"])

{'eval_loss': 0.5061888694763184,
 'eval_runtime': 2.2004,
 'eval_samples_per_second': 906.188,
 'eval_steps_per_second': 113.614,
 'epoch': 1.0}

In [13]:
trainer.save_model("mental_health_bud")
tokenizer.save_pretrained("mental_health_bud")


('mental_health_bud/tokenizer_config.json',
 'mental_health_bud/special_tokens_map.json',
 'mental_health_bud/vocab.txt',
 'mental_health_bud/added_tokens.json',
 'mental_health_bud/tokenizer.json')

In [14]:
from transformers import pipeline

# Updated label map
label_map = {
    0: "normal",
    1: "bipolar",
    2: "anxiety",
    3: "suicidal",
    4: "depression"
}

# Load the model pipeline
pipe_budy = pipeline("text-classification", model="mental_health_bud", tokenizer=tokenizer)

# Function to interpret output
def interpret_output(output):
    label_str = output[0]['label']  # e.g., "LABEL_4"
    label_index = int(label_str.replace("LABEL_", ""))  # safely extract the index
    readable_label = label_map.get(label_index, "Unknown")
    return {
        "label": readable_label,
        "label_index": label_index,
        "score": round(output[0]['score'], 4)  # optional rounding
    }
# Take input
prompt = input("How are you feeling: ")
print(prompt)

# Predict
prediction = pipe_budy(prompt)

# Interpret
interpreted_prediction = interpret_output(prediction)

# Show result
print(interpreted_prediction)
# print(prediction)


Device set to use cuda:0


i dont know how am feeling , it could be bad tho
{'label': 'normal', 'label_index': 0, 'score': 0.5374}
