**This notebook contains the steps to load a pretrained BERT model and fine-tune it on our own questions dataset**




## Step 1 : Importing the necessary libraries


In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification
import evaluate
from transformers import DataCollatorWithPadding
import transformers
import pandas as pd
from datasets import DatasetDict, Dataset

## Step 2 : Referencing the Bert Model and Declaring the Categories of classification

In [None]:

# Hugging Face Hub id of the pretrained checkpoint that is fine-tuned below
model_path = "google-bert/bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_path)

# Class index <-> class name mappings that are stored in the model config.
# label2id must be the inverse of id2label (name -> index), so it is derived
# from id2label instead of being typed out a second time.
id2label = {0: "Insights", 1: "Data"}
label2id = {label: idx for idx, label in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)



## Step 3: Getting the Dataset Ready to train the Model

In [None]:
# Load the labelled questions. The code below relies on a `label` column
# (0 = Insights, 1 = Data) plus exactly one other column holding the text.
df = pd.read_csv("./weather-questions.csv")

# Cast the label to int *before* filtering so the cast applies to the rows
# that end up in the two class subsets.
df['label'] = df['label'].astype(int)

df_insight = df[df['label'] == 0]
df_data = df[df['label'] == 1]

# Shuffle each class reproducibly. Every row is kept (n == len(subset)),
# so neither class is down-sampled here.
df_insight_sample = df_insight.sample(len(df_insight), random_state=42)
df_data_sample = df_data.sample(len(df_data), random_state=42)

# Replace the numeric `label` column with a boolean `isdata` flag
df_insight_sample = df_insight_sample.drop('label', axis=1).assign(isdata=False)
df_data_sample = df_data_sample.drop('label', axis=1).assign(isdata=True)

# Concatenate both classes into a single frame
balanced_df = pd.concat([df_insight_sample, df_data_sample])
# Positional rename: assumes exactly two columns remain (text, flag)
balanced_df.columns = ['text', 'labels']

# Convert the boolean flag to the 0/1 integer ids the model is trained on
balanced_df['labels'] = balanced_df['labels'].astype(int)

# Shuffle the combined rows so the classes are interleaved, then reindex
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)
balanced_ds = Dataset.from_pandas(balanced_df)


# Split into train, validation, and test sets (70% / 15% / 15%)
train_frac = 0.7
valid_frac = 0.15
test_frac = 0.15  # informational: the test split is whatever remains after train + validation

# Size the splits from the frame that is actually sliced. Rows of `df` whose
# label is neither 0 nor 1 never reach `balanced_df`, so len(df) could
# overstate the row count and shrink (or empty) the test split.
n_rows = len(balanced_df)
train_size = int(train_frac * n_rows)
valid_size = int(valid_frac * n_rows)

# Consecutive, non-overlapping slices of the already shuffled frame
train_df = balanced_df[:train_size]
valid_df = balanced_df[train_size:train_size + valid_size]
test_df = balanced_df[train_size + valid_size:]

# Convert the pandas DataFrames back to Hugging Face Datasets
train_ds = Dataset.from_pandas(train_df)
valid_ds = Dataset.from_pandas(valid_df)
test_ds = Dataset.from_pandas(test_df)

# Combine into a DatasetDict
dataset_dict = DatasetDict({
    'train': train_ds,
    'validation': valid_ds,
    'test': test_ds
})

# Preview the held-out split instead of dumping the whole frame
test_df.head()


## Step 4: Freeze the unnecessary parameters to make training much lighter / Mapping and tokenizing the training dataset

In [None]:
# Single pass over the encoder weights: only parameters whose name contains
# "pooler" stay trainable, every other base-model parameter is frozen.
# Parameters outside `model.base_model` are not touched by this loop.
for param_name, weight in model.base_model.named_parameters():
    weight.requires_grad = "pooler" in param_name

def preprocess_function(examples):
    """Tokenize the ``"text"`` field of a batch with truncation enabled.

    No padding is requested here; the tokenizer output is returned as-is.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded


# Collator that pads the examples of a batch when the batch is assembled
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Apply the tokenizer to every split of the DatasetDict, one batch at a time
tokenized_data = dataset_dict.map(
    preprocess_function,
    batched=True,
)

## Step4 : Defining the compute-metrics function and loading the accuracy and ROC AUC metrics to analyse the output of the model

In [5]:
# Metric objects from the `evaluate` library, used inside compute_metrics
auc_score = evaluate.load("roc_auc")
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy and ROC AUC for one evaluation pass.

    Args:
        eval_pred: pair ``(predictions, labels)``. ``predictions`` is indexed
            as a 2-D array of raw logits with one column per class (column 1
            is treated as the positive class); ``labels`` holds the reference
            class ids.

    Returns:
        dict with the keys ``"Accuracy"`` and ``"AUC"``, each rounded to
        three decimals.
    """
    predictions, labels = eval_pred

    # Softmax over the class axis. Subtracting the row-wise maximum first
    # keeps np.exp from overflowing to inf (and the ratio from becoming NaN)
    # on large logits; the resulting probabilities are mathematically the same.
    shifted = predictions - np.max(predictions, axis=-1, keepdims=True)
    exp_scores = np.exp(shifted)
    probabilities = exp_scores / exp_scores.sum(axis=-1, keepdims=True)

    # ROC AUC is computed from the probability of the positive class
    positive_class_probs = probabilities[:, 1]
    auc = np.round(
        auc_score.compute(prediction_scores=positive_class_probs, references=labels)['roc_auc'],
        3,
    )

    # Accuracy is computed from the most probable class per row
    predicted_classes = np.argmax(predictions, axis=1)
    acc = np.round(
        accuracy.compute(predictions=predicted_classes, references=labels)['accuracy'],
        3,
    )

    return {"Accuracy": acc, "AUC": auc}

## Step5 : Declaring the training parameters and saving the trained model

In [6]:
# Fine-tuning hyper-parameters: learning rate, per-device batch size, epochs
lr, batch_size, num_epochs = 2e-4, 8, 10

# Log, evaluate and checkpoint once per epoch; the best checkpoint is
# reloaded when training finishes.
training_args = TrainingArguments(
    output_dir="bert-intent-classifier_cbase",
    num_train_epochs=num_epochs,
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)
    

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    # Per-epoch evaluation (and therefore best-checkpoint selection) runs on
    # the validation split. The test split stays unseen during training so it
    # can still be used for an unbiased final evaluation.
    eval_dataset=tokenized_data["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

## Step 7 : Testing the trained model with questions 

In [None]:
# Sample text inputs for a quick smoke test of the fine-tuned classifier
test_texts = ["Can you analyze why user engagement dropped after 14th of Feb 2025 ?", "What are all the factors which were influencing the temprature on the city with postal code 02047 ?"]

# Wrap the samples in a Dataset with the same "text" column used for training.
# The frame gets its own name so the held-out `test_df` split is not overwritten.
sample_df = pd.DataFrame({"text": test_texts})
test_dataset = Dataset.from_pandas(sample_df)

# Reuse the training-time `preprocess_function` instead of redefining it.
# Redefining it here would silently change the training tokenization if an
# earlier cell were re-run. Padding is applied per batch by the data collator
# the trainer was built with, so padding every sample to 512 tokens is not needed.
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

predictions = trainer.predict(tokenized_test_dataset)
logits = predictions.predictions

# Get class predictions
predicted_classes = np.argmax(logits, axis=1)

# Map class ids back to names using the mapping stored in the model config,
# so this always matches the labels the model was configured with.
predicted_labels = [model.config.id2label[int(class_id)] for class_id in predicted_classes]

# Print results
for text, label in zip(test_texts, predicted_labels):
    print(f"Text: {text}\nPredicted Label: {label}\n")



### Step 8: After testing the model, we can now push it to the Hugging Face Hub

In [None]:
# Upload the fine-tuned weights and config to the Hub repository
model.push_to_hub('Azar-J/question-classifier')
# Upload the tokenizer files to the same repository; without them the
# published checkpoint cannot be loaded end to end from the repo alone.
tokenizer.push_to_hub('Azar-J/question-classifier')

### The model has now been pushed and is available for use and for hosting in the Snowflake Model Registry