In [2]:
from transformers import AutoModelForSequenceClassification,AutoTokenizer
from transformers import pipeline

# Checkpoint identifier; the tokenizer is loaded from this same id later on.
model_id = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"

# Sequence classifier with two output classes.
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)

In [3]:
from datasets import load_dataset
# One JSON-lines file per split, relative to the notebook's working directory.
data_files = dict(
    train="datasets/train.jsonl",
    validation="datasets/val.jsonl",
    test="datasets/test.jsonl",
)

# Build a DatasetDict keyed by the split names above.
dataset = load_dataset("json", data_files=data_files)


In [4]:
# Keep only test rows whose "text" is a string containing at least one
# non-whitespace character.
dataset["test"] = dataset["test"].filter(
    lambda row: isinstance(row["text"], str) and len(row["text"].strip()) > 0
)


# Encode Class Names as Integer Labels

In [14]:
# Class name -> integer id used as the training target.
label_map = {"non-suicide": 0, "suicide": 1}

def encode_labels(example):
    """Add an integer "label" field derived from the example's "class" field.

    The example is modified in place and also returned. A class that is
    missing or not present in ``label_map`` is marked with the sentinel -1.
    """
    example["label"] = label_map.get(example.get("class"), -1)
    return example

# Encode every split, then drop rows whose class could not be mapped.
# encode_labels marks such rows with the sentinel -1, which is not a valid
# class index for the classification loss, so they must not reach training
# or evaluation.
for split in ["train", "validation", "test"]:
    encoded = dataset[split].map(encode_labels)
    labelled = encoded.filter(lambda example: example["label"] != -1)
    dropped = encoded.num_rows - labelled.num_rows
    if dropped:
        # Surface the data problem instead of silently shrinking the split.
        print(f"{split}: dropped {dropped} examples with an unknown class")
    dataset[split] = labelled


Map:   0%|          | 0/7102 [00:00<?, ? examples/s]

Map:   0%|          | 0/887 [00:00<?, ? examples/s]

Map:   0%|          | 0/889 [00:00<?, ? examples/s]

In [15]:
# Show the split summary, then the column schema of the test split.
print(dataset, dataset["test"].features, sep="\n")

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'class', 'label'],
        num_rows: 7102
    })
    validation: Dataset({
        features: ['id', 'text', 'class', 'label'],
        num_rows: 887
    })
    test: Dataset({
        features: ['id', 'text', 'class', 'label'],
        num_rows: 889
    })
})
{'id': Value(dtype='string', id=None), 'text': Value(dtype='string', id=None), 'class': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None)}


# Remove Rows with Empty or Missing Text

In [30]:
def is_not_empty(example):
    """Return True when the example's "text" is a non-blank string.

    A missing "text" key falls back to "" and is rejected. A value that is
    present but not a string (including None) is rejected by the type check.
    """
    candidate = example.get("text", "")
    if not isinstance(candidate, str):
        return False
    return candidate.strip() != ""

# Apply the non-empty-text check to each split, replacing the split in place.
for split_name in ("train", "validation", "test"):
    split_rows = dataset[split_name]
    dataset[split_name] = split_rows.filter(is_not_empty)




Filter:   0%|          | 0/7102 [00:00<?, ? examples/s]

Filter:   0%|          | 0/887 [00:00<?, ? examples/s]

Filter:   0%|          | 0/889 [00:00<?, ? examples/s]

In [31]:
# Load the tokenizer for the same checkpoint id the model was loaded from.
tokenizer = AutoTokenizer.from_pretrained(model_id)

def tokenize_function(examples):
    """Tokenize a batch of examples, one text at a time.

    Each entry of ``examples['text']`` is padded/truncated to 32 tokens.
    If the tokenizer raises for a text, the error is printed and that
    position is filled with empty lists so the output stays aligned with
    the input batch.

    Returns a dict with "input_ids" and "attention_mask", each a list with
    one entry per input text.
    """
    input_ids = []
    attention_mask = []

    for text in examples['text']:
        try:
            encoding = tokenizer(text, padding='max_length', truncation=True, max_length=32)
        except Exception as e:
            # Report the offending text and keep going with a placeholder.
            print(f"Error tokenizing: {text} - Error: {e}")
            input_ids.append([])
            attention_mask.append([])
            continue
        input_ids.append(encoding['input_ids'])
        attention_mask.append(encoding['attention_mask'])

    return {"input_ids": input_ids, "attention_mask": attention_mask}


# Tokenize every split in batches; adds "input_ids" and "attention_mask" columns.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# tokenize_function records a failed tokenization as empty lists (a row is
# never an empty dict), so drop rows whose input_ids are empty. Left in place,
# they would make the batches ragged and break collation during training.
tokenized_datasets = tokenized_datasets.filter(
    lambda example: len(example["input_ids"]) > 0
)

# Rename the target column from "label" to "labels" for training.
tokenized_datasets = tokenized_datasets.rename_columns({"label": "labels"})


Map:   0%|          | 0/7100 [00:00<?, ? examples/s]

Map:   0%|          | 0/887 [00:00<?, ? examples/s]

Map:   0%|          | 0/889 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7100 [00:00<?, ? examples/s]

Filter:   0%|          | 0/887 [00:00<?, ? examples/s]

Filter:   0%|          | 0/889 [00:00<?, ? examples/s]

In [32]:
from transformers import Trainer, TrainingArguments

# Hyperparameters for a single fine-tuning epoch. No evaluation strategy is
# configured here, so evaluation only happens when it is requested explicitly.
training_kwargs = dict(
    output_dir="./results",           # where model checkpoints are written
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs",
)
training_args = TrainingArguments(**training_kwargs)

# Wire the model, the training arguments and the tokenized splits together.
train_split = tokenized_datasets["train"]
validation_split = tokenized_datasets["validation"]

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=validation_split,
)


In [33]:
# Fine-tune the model using the arguments and training split given to the Trainer.
trainer.train()

Step,Training Loss
500,0.3876
1000,0.3046
1500,0.2844


TrainOutput(global_step=1775, training_loss=0.31353918800891284, metrics={'train_runtime': 106.6185, 'train_samples_per_second': 66.593, 'train_steps_per_second': 16.648, 'total_flos': 58782408153600.0, 'train_loss': 0.31353918800891284, 'epoch': 1.0})

In [34]:
# Evaluate on the validation split. The Trainer was built without a
# compute_metrics function, so only loss and runtime statistics are reported.
trainer.evaluate(tokenized_datasets["validation"])

{'eval_loss': 0.3800152838230133,
 'eval_runtime': 0.9903,
 'eval_samples_per_second': 895.692,
 'eval_steps_per_second': 112.088,
 'epoch': 1.0}

In [35]:
# Evaluate on the held-out test split. The Trainer was built without a
# compute_metrics function, so only loss and runtime statistics are reported.
trainer.evaluate(tokenized_datasets["test"])

{'eval_loss': 0.3205277621746063,
 'eval_runtime': 1.0402,
 'eval_samples_per_second': 854.614,
 'eval_steps_per_second': 107.668,
 'epoch': 1.0}

In [36]:
# Write the fine-tuned model and its tokenizer into the same directory so the
# pair can be reloaded together.
save_dir = "my_suicide_buddy"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)


('my_suicide_buddy/tokenizer_config.json',
 'my_suicide_buddy/special_tokens_map.json',
 'my_suicide_buddy/vocab.txt',
 'my_suicide_buddy/added_tokens.json',
 'my_suicide_buddy/tokenizer.json')

# Map Model Output Labels Back to the Original Class Names

In [43]:
from transformers import pipeline

# Translate the label strings emitted by the pipeline ("NEGATIVE"/"POSITIVE")
# into the dataset's own class names.
label_map = dict(NEGATIVE="non-suicide", POSITIVE="suicide")

# Text-classification pipeline over the saved model directory, reusing the
# tokenizer object that is already in memory.
pipe_budy = pipeline(task="text-classification", model="my_suicide_buddy", tokenizer=tokenizer)

def interpret_output(output):
    """Convert the first pipeline result into the dataset's class names.

    ``output`` is indexed as a sequence of dicts carrying 'label' and 'score';
    only the first element is used. A label that is not in ``label_map`` is
    reported as "Unknown". The score is passed through unchanged.
    """
    top_result = output[0]
    readable_label = label_map.get(top_result['label'], "Unknown")
    return {"label": readable_label, "score": top_result['score']}

# Ask the user for free-form text; input() blocks until a line is entered.
prompt = input("How are you feeling:")

# Echo the text so it stays visible in the cell output next to the prediction.
print(prompt)

# Classify the text. The input length is unbounded, so ask the tokenizer to
# truncate it to the model's maximum sequence length; without truncation an
# overly long message fails inside the model instead of yielding a prediction.
prediction = pipe_budy(prompt, truncation=True)

# Translate the raw pipeline label into the dataset's class names.
interpreted_prediction = interpret_output(prediction)

# Show the mapped label together with its score.
print(interpreted_prediction)


Device set to use cuda:0


i am feelign really stresesd i might do something to my self
{'label': 'suicide', 'score': 0.9840261936187744}
