## Models

### Importing packages

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.optimizers import Adam
from transformers import BertTokenizer, TFBertForSequenceClassification

### Importing cleaned dataset

In [2]:
# Read the pre-cleaned resume CSV into `df` and show it as the cell output
# for a quick visual check of the available columns.
dataset_path = 'data/clean_resume_dataset.csv'
df = pd.read_csv(dataset_path)
df

Unnamed: 0,Category,Resume,clean_text
0,Accountant,education omba executive leadership university...,education omba executive leadership university...
1,Accountant,howard gerrard accountant deyjobcom birmingham...,howard gerrard accountant deyjobcom birmingham...
2,Accountant,kevin frank senior accountant inforesumekraftc...,kevin frank senior accountant inforesumekraftc...
3,Accountant,place birth nationality olivia ogilvy accounta...,place birth nationality olivia ogilvy accounta...
4,Accountant,stephen greet cpa senior accountant 9 year exp...,stephen greet cpa senior accountant 9 year exp...
...,...,...,...
12246,Testing,Computer Skills: â¢ Proficient in MS office (...,Computer Skills Proficient in MS office Word B...
12247,Testing,â Willingness to accept the challenges. â ...,Willingness to accept the challenges Positive ...
12248,Testing,"PERSONAL SKILLS â¢ Quick learner, â¢ Eagerne...",PERSONAL SKILLS Quick learner Eagerness to lea...
12249,Testing,COMPUTER SKILLS & SOFTWARE KNOWLEDGE MS-Power ...,COMPUTER SKILLS SOFTWARE KNOWLEDGE MS Power Po...


In [None]:
# --- Step 4: Encode Labels and Train-Test Split (First Split) ---
# The CSV only carries the string `Category` column, so the integer targets the
# model trains on are built here. `label_encoder` and `num_labels` are reused by
# the cross-validation and test-evaluation cells further down.
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['Category'])
num_labels = len(label_encoder.classes_)

# The cleaned text column is named `clean_text` in the CSV (not `cleaned_text`).
# An empty string written to CSV is read back as NaN, which the tokenizer
# rejects, so missing values are normalised to '' and everything is cast to str.
texts = df['clean_text'].fillna('').astype(str)

X_train, X_test, y_train, y_test = train_test_split(
    texts,
    df['label'],
    test_size=0.1,  # 10% held out for the final test evaluation
    random_state=42,
    stratify=df['label']  # keep class proportions equal in both splits
)

# --- Step 5: Load the BERT tokenizer (tokenization itself happens per fold) ---
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')



In [None]:
# --- Step 6: Cross-Validation Setup (on Training Set) ---
# Stratified folds keep the class proportions of `y_train` in every fold, which
# is consistent with the stratified hold-out split used to create the test set.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

MAX_SEQ_LEN = 128  # token budget per document; longer texts are truncated

accuracies = []
precisions = []
recalls = []
f1_scores = []

# Every class id, passed explicitly to classification_report so the report has
# one row per entry of `label_encoder.classes_` even when a class is absent
# from a fold's validation labels and predictions (otherwise sklearn raises on
# the labels/target_names length mismatch).
all_label_ids = np.arange(len(label_encoder.classes_))


def encode_texts(texts):
    """Tokenize an iterable of strings into TensorFlow tensors for BERT.

    Texts are truncated to at most MAX_SEQ_LEN tokens and padded to the longest
    sequence in the call. Returns the tokenizer output; this cell uses its
    'input_ids' and 'attention_mask' entries.
    """
    return tokenizer(
        list(texts),
        truncation=True,
        padding=True,
        max_length=MAX_SEQ_LEN,
        return_tensors='tf'
    )


def bert_inputs(encodings):
    """Pick the two tensors fed to the model out of a tokenizer encoding."""
    return {
        'input_ids': encodings['input_ids'],
        'attention_mask': encodings['attention_mask']
    }


# --- Step 7: K-Fold Cross-Validation ---
for fold_no, (train_index, val_index) in enumerate(kfold.split(X_train, y_train), start=1):
    print(f"\nTraining fold {fold_no}...")

    # Reset Keras' global state so leftovers from earlier folds do not pile up.
    tf.keras.backend.clear_session()

    # Split the data into training and validation sets for this fold
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # Tokenize the data for BERT
    train_inputs = bert_inputs(encode_texts(X_train_fold))
    val_inputs = bert_inputs(encode_texts(X_val_fold))

    # Convert labels to TensorFlow format
    y_train_tf = tf.convert_to_tensor(y_train_fold.to_numpy())
    y_val_tf = tf.convert_to_tensor(y_val_fold.to_numpy())

    # --- Step 8: Load BERT Model ---
    # A fresh pretrained model per fold, so no weights carry over between folds.
    model = TFBertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=num_labels
    )

    # --- Step 9: Compile Model ---
    # The model outputs raw logits, hence from_logits=True.
    optimizer = Adam(learning_rate=3e-5)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

    model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

    # --- Step 10: Train the Model ---
    model.fit(
        train_inputs,
        y_train_tf,
        validation_data=(val_inputs, y_val_tf),
        epochs=2,             # increase to 3-5 for better performance
        batch_size=16
    )

    # --- Step 11: Evaluate on Validation Set ---
    pred_logits = model.predict(val_inputs).logits
    pred_labels = np.argmax(pred_logits, axis=1)

    # --- Step 12: Record Metrics ---
    accuracies.append(accuracy_score(y_val_fold, pred_labels))
    report = classification_report(
        y_val_fold,
        pred_labels,
        labels=all_label_ids,
        target_names=label_encoder.classes_,
        output_dict=True,
        zero_division=0  # classes with no predictions score 0 without warning spam
    )
    precisions.append(report['macro avg']['precision'])
    recalls.append(report['macro avg']['recall'])
    f1_scores.append(report['macro avg']['f1-score'])

    print(f"Fold {fold_no} - Accuracy: {accuracies[-1]}")

# --- Step 13: Output Final Metrics ---
print("\n--- Final K-Fold Cross-Validation Results ---")
print(f"Mean Accuracy: {np.mean(accuracies):.4f}")
print(f"Mean Precision: {np.mean(precisions):.4f}")
print(f"Mean Recall: {np.mean(recalls):.4f}")
print(f"Mean F1-Score: {np.mean(f1_scores):.4f}")



In [None]:
# --- Step 14: Final Evaluation on Test Set ---
# NOTE(review): `model` is whatever the cross-validation loop left behind, i.e.
# the model fitted during the last fold on that fold's training portion only.
# Consider refitting a fresh model on all of X_train before reporting test
# metrics, and confirm this is the intended final model.

# Tokenize the test data with the same settings used during cross-validation
test_encodings = tokenizer(
    list(X_test),
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors='tf'
)

# Convert test labels (kept for downstream use; not needed by the sklearn metrics below)
y_test_tf = tf.convert_to_tensor(y_test)

# Predict class logits for the test set and take the highest-scoring class
pred_logits_test = model.predict({
    'input_ids': test_encodings['input_ids'],
    'attention_mask': test_encodings['attention_mask']
}).logits

pred_labels_test = np.argmax(pred_logits_test, axis=1)

print("\n--- Final Test Set Evaluation ---")
print("Test Accuracy:", accuracy_score(y_test, pred_labels_test))
print("\nTest Classification Report:\n")
# Passing every class id explicitly keeps the report aligned with
# `label_encoder.classes_` even if some class is missing from both y_test and
# the predictions (sklearn otherwise raises on the length mismatch).
print(classification_report(
    y_test,
    pred_labels_test,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=label_encoder.classes_,
    zero_division=0
))