<a href="https://colab.research.google.com/github/aditya-hubli/BERT-GovPII/blob/main/Finetuned.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

# Parameters
# Total number of rows to generate; half are labelled 'Aadhaar' and half 'Not Aadhaar'.
num_samples = 400
# Digit count that the generators below treat as a "valid" number.
# NOTE(review): Aadhaar numbers are 12 digits as far as I know; 16 digits matches the
# Aadhaar Virtual ID instead — confirm which identifier this dataset is meant to model.
valid_length = 16

# Generate 16-digit Aadhaar numbers
def generate_valid_aadhaar_numbers(n, length=None):
    """Return ``n`` random decimal strings that are exactly ``length`` digits long.

    Args:
        n: How many numbers to generate.
        length: Digits per number. When omitted, the module-level
            ``valid_length`` is used, so existing one-argument calls behave
            as before.

    Returns:
        list[str]: Digit strings whose first digit is never ``0`` (the lower
        bound of the draw is ``10 ** (length - 1)``).
    """
    if length is None:
        length = valid_length
    low, high = 10 ** (length - 1), 10 ** length
    # Request int64 explicitly: where NumPy's default integer is 32-bit, the
    # 16-digit bounds would otherwise be rejected as out of range.
    return [str(np.random.randint(low, high, dtype=np.int64)) for _ in range(n)]

# Generate non-16-digit numbers (varying lengths)
def generate_invalid_numbers(n, valid_length):
    """Return ``n`` random digit strings whose length is never ``valid_length``.

    Each string's length is drawn from 1 .. valid_length - 1 or from
    valid_length + 1 .. valid_length + 4; every character is a random
    decimal digit, so leading zeros are possible.
    """
    too_short = list(range(1, valid_length))
    too_long = list(range(valid_length + 1, valid_length + 5))
    allowed_lengths = too_short + too_long
    digit_pool = list('0123456789')

    generated = []
    for _ in range(n):
        size = np.random.choice(allowed_lengths)
        generated.append(''.join(np.random.choice(digit_pool, size)))
    return generated

# Create dataset
# Seed NumPy's global RNG so the generated numbers themselves (not only the
# shuffle further down) are identical on every run of this cell.
np.random.seed(42)
valid_aadhaar_numbers = generate_valid_aadhaar_numbers(num_samples // 2)
invalid_numbers = generate_invalid_numbers(num_samples // 2, valid_length)

# Combine into a DataFrame: valid rows first, then invalid rows, with matching labels
data = {
    'number': valid_aadhaar_numbers + invalid_numbers,
    'label': ['Aadhaar'] * (num_samples // 2) + ['Not Aadhaar'] * (num_samples // 2)
}

df = pd.DataFrame(data)

# Shuffle the DataFrame so the two classes are interleaved
df = df.sample(frac=1.0, random_state=42).reset_index(drop=True)

# Save to CSV
df.to_csv('/content/aadhaar_dataset.csv', index=False)

print("Dataset created and saved as 'aadhaar_dataset.csv'.")


Dataset created and saved as 'aadhaar_dataset.csv'.


In [None]:
import torch
import pandas as pd
from transformers import BertForSequenceClassification, BertTokenizerFast, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader, TensorDataset, RandomSampler

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load and preprocess dataset
ROOT_DIR = '/content/'
# Read 'number' as text. Letting pandas infer a numeric dtype can drop leading
# zeros or change how very long digit strings are represented, which alters
# the digit count the classifier is supposed to learn from.
df_org = pd.read_csv(ROOT_DIR + 'aadhaar_dataset.csv', encoding='utf-8', dtype={'number': str})
df_org = df_org.sample(frac=1.0, random_state=42)

# Encode labels: ids follow the order in which labels first appear after the shuffle
labels = df_org['label'].unique().tolist()
labels = [s.strip().lower() for s in labels]
NUM_LABELS = len(labels)
id2label = {i: label for i, label in enumerate(labels)}
label2id = {label: i for i, label in enumerate(labels)}

# Unknown labels map to -1 and are dropped, since -1 is not a valid class index
df_org["labels"] = df_org['label'].map(lambda x: label2id.get(x.strip().lower(), -1))
df_org = df_org[df_org['labels'] != -1]

# Tokenize the text data (padded to the longest sequence in the dataset)
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

names_list = df_org['number'].astype(str).tolist()

tokenized_data = tokenizer(
    names_list,
    padding=True,
    truncation=True,
    return_tensors="pt"
)

# Keep only the two tensors the model is fed during training
inputs_dict = {
    'input_ids': tokenized_data['input_ids'],
    'attention_mask': tokenized_data['attention_mask']
}

# Convert to TensorDataset
labels_tensor = torch.tensor(df_org['labels'].values, dtype=torch.long)
dataset = TensorDataset(inputs_dict['input_ids'], inputs_dict['attention_mask'], labels_tensor)

# Create DataLoader (random batch order each epoch)
train_dataloader = DataLoader(
    dataset,
    sampler=RandomSampler(dataset),
    batch_size=32
)

# Load model and move to device
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=NUM_LABELS, id2label=id2label, label2id=label2id)
model.to(device)

# Set up optimizer and scheduler
num_epochs = 3
optimizer = AdamW(model.parameters(), lr=2e-5)
# Derive the schedule length from num_epochs so the learning-rate decay always
# spans exactly the number of optimizer steps taken below.
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training Loop
model.train()
for epoch in range(num_epochs):
    total_loss = 0.0
    for batch in train_dataloader:
        # Move batch to device. The target tensor is named batch_labels so the
        # module-level `labels` list of class names is not overwritten.
        input_ids, attention_mask, batch_labels = [t.to(device) for t in batch]

        # Forward pass (the model computes the loss when labels are supplied)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    # Report the mean over all batches rather than only the final batch's loss
    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs} completed. Average Loss: {avg_loss}")

# Save the trained model and tokenizer
model.save_pretrained(ROOT_DIR + "trained_aadhar_model")
tokenizer.save_pretrained(ROOT_DIR + "trained_aadhar_tokenizer")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Labels dtype: torch.int64, Loss dtype: torch.float32
Labels dtype: torch.int64, Loss dtype: torch.float32
Labels dtype: torch.int64, Loss dtype: torch.float32
Labels dtype: torch.int64, Loss dtype: torch.float32
Labels dtype: torch.int64, Loss dtype: torch.float32
Labels dtype: torch.int64, Loss dtype: torch.float32
Labels dtype: torch.int64, Loss dtype: torch.float32
Labels dtype: torch.int64, Loss dtype: torch.float32
Labels dtype: torch.int64, Loss dtype: torch.float32
Labels dtype: torch.int64, Loss dtype: torch.float32
Labels dtype: torch.int64, Loss dtype: torch.float32
Labels dtype: torch.int64, Loss dtype: torch.float32
Labels dtype: torch.int64, Loss dtype: torch.float32
Epoch 1/3 completed. Loss: 0.5631991028785706
Labels dtype: torch.int64, Loss dtype: torch.float32
Labels dtype: torch.int64, Loss dtype: torch.float32
Labels dtype: torch.int64, Loss dtype: torch.float32
Labels dtype: torch.int64, Loss dtype: torch.float32
Labels dtype: torch.int64, Loss dtype: torch.float32


('/content/trained_aadhar_tokenizer/tokenizer_config.json',
 '/content/trained_aadhar_tokenizer/special_tokens_map.json',
 '/content/trained_aadhar_tokenizer/vocab.txt',
 '/content/trained_aadhar_tokenizer/added_tokens.json',
 '/content/trained_aadhar_tokenizer/tokenizer.json')

In [None]:
# Load the model and tokenizer
model = BertForSequenceClassification.from_pretrained(ROOT_DIR + "trained_aadhar_model")
tokenizer = BertTokenizerFast.from_pretrained(ROOT_DIR + "trained_aadhar_tokenizer")

# The freshly loaded model is not on `device` by default, while the inputs below
# are moved there; keep both on the same device to avoid a mismatch on GPU.
model.to(device)

# Set model to evaluation mode
model.eval()

# Test with a sample input
test_numbers = ["1234567890123456", "1234567890", "9876543210123456", "abcd1234"]
inputs = tokenizer(test_numbers, padding=True, truncation=True, return_tensors="pt")
inputs = {key: value.to(device) for key, value in inputs.items()}

# Make predictions
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    predicted_classes = torch.argmax(logits, dim=1).cpu().tolist()

# Map predicted classes to labels using the mapping stored with the saved model;
# the notebook-level id2label is reassigned by every training cell.
predicted_labels = [model.config.id2label[pred] for pred in predicted_classes]
for num, label in zip(test_numbers, predicted_labels):
    print(f"Number: {num}, Predicted Class: {label}")


Number: 1234567890123456, Predicted Class: aadhaar
Number: 1234567890, Predicted Class: not aadhaar
Number: 9876543210123456, Predicted Class: aadhaar
Number: abcd1234, Predicted Class: not aadhaar


In [None]:
import pandas as pd
import random
import string

def generate_valid_pan():
    """Build a random PAN-shaped string: 5 uppercase letters, 4 digits, 1 uppercase letter."""
    uppercase = string.ascii_uppercase
    prefix = random.choices(uppercase, k=5)
    number_part = random.choices(string.digits, k=4)
    suffix = random.choice(uppercase)
    return ''.join(prefix) + ''.join(number_part) + suffix

def generate_invalid_pan():
    """Return a random alphanumeric string that is not a well-formed PAN.

    Two kinds of negative example are produced with equal probability:

    * wrong length (8 or 12 characters), and
    * correct length (10 characters) but wrong structure, i.e. anything that
      is not 5 uppercase letters + 4 digits + 1 uppercase letter.

    The second kind matters because, with wrong-length negatives only, length
    is the sole feature separating the classes; the saved output of the PAN
    test cell shows "1234567890" being labelled "pan".

    Returns:
        str: A string of length 8, 10 or 12 drawn from ASCII letters and digits.
    """
    import re  # local import: this cell's import list does not include re

    alphabet = string.ascii_letters + string.digits
    if random.random() < 0.5:
        invalid_length = random.choice([8, 12])  # Invalid length
        return ''.join(random.choices(alphabet, k=invalid_length))

    pan_shape = re.compile(r'[A-Z]{5}[0-9]{4}[A-Z]')
    while True:
        candidate = ''.join(random.choices(alphabet, k=10))
        # Reject the (very rare) draw that happens to be a well-formed PAN
        if not pan_shape.fullmatch(candidate):
            return candidate

# Seed Python's RNG so the generated PAN strings (not only the shuffle below)
# are identical on every run of this cell.
random.seed(42)

# Create valid PAN numbers
num_valid = 200
valid_pans = [generate_valid_pan() for _ in range(num_valid)]

# Create invalid PAN numbers
num_invalid = 200
invalid_pans = [generate_invalid_pan() for _ in range(num_invalid)]

# Create DataFrame: one frame per class, then stack them
df_valid = pd.DataFrame({'number': valid_pans, 'label': ['PAN'] * num_valid})
df_invalid = pd.DataFrame({'number': invalid_pans, 'label': ['Not PAN'] * num_invalid})

df_pans = pd.concat([df_valid, df_invalid], ignore_index=True)
df_pans = df_pans.sample(frac=1.0, random_state=42)  # Shuffle the dataset

# Save to CSV
df_pans.to_csv('/content/pan_dataset.csv', index=False)


In [None]:
import torch
import pandas as pd
from transformers import BertForSequenceClassification, BertTokenizerFast, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader, TensorDataset, RandomSampler

# Use the GPU when one is visible, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Read the PAN dataset and shuffle it with a fixed seed
ROOT_DIR = '/content/'
df_org = pd.read_csv(ROOT_DIR + 'pan_dataset.csv', encoding='utf-8').sample(frac=1.0, random_state=42)

# Label vocabulary (stripped, lowercased) in first-seen order, plus both lookup tables
labels = [raw.strip().lower() for raw in df_org['label'].unique().tolist()]
NUM_LABELS = len(labels)
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in enumerate(labels)}

# Attach integer class ids; anything outside the vocabulary becomes -1 and is dropped
df_org["labels"] = df_org['label'].map(lambda raw: label2id.get(raw.strip().lower(), -1))
df_org = df_org[df_org['labels'] != -1]

# Tokenize every candidate string, padding to the longest one
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
names_list = df_org['number'].astype(str).tolist()
tokenized_data = tokenizer(names_list, padding=True, truncation=True, return_tensors="pt")

# Keep only the two tensors the model is fed during training
inputs_dict = {key: tokenized_data[key] for key in ('input_ids', 'attention_mask')}

# Bundle inputs and targets into a TensorDataset
labels_tensor = torch.tensor(df_org['labels'].values, dtype=torch.long)
dataset = TensorDataset(inputs_dict['input_ids'], inputs_dict['attention_mask'], labels_tensor)

# Batches of 32 in random order
train_dataloader = DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=32)

# Load model and move to device
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=NUM_LABELS, id2label=id2label, label2id=label2id)
model.to(device)

# Set up optimizer and scheduler
num_epochs = 3
optimizer = AdamW(model.parameters(), lr=2e-5)
# Derive the schedule length from num_epochs so the learning-rate decay always
# spans exactly the number of optimizer steps taken below.
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training Loop
model.train()
for epoch in range(num_epochs):
    total_loss = 0
    for batch in train_dataloader:
        # Move batch to device. The target tensor is named batch_labels so the
        # module-level `labels` list of class names is not overwritten.
        input_ids, attention_mask, batch_labels = [t.to(device) for t in batch]

        # Forward pass (the model computes the loss when labels are supplied)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    # Mean loss over all batches of this epoch
    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs} completed. Average Loss: {avg_loss}")

# Save the trained model and tokenizer
model.save_pretrained(ROOT_DIR + "trained_pan_model")
tokenizer.save_pretrained(ROOT_DIR + "trained_pan_tokenizer")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 completed. Average Loss: 0.48486852187376756
Epoch 2/3 completed. Average Loss: 0.2175389161476722
Epoch 3/3 completed. Average Loss: 0.13110008205358797


('/content/trained_pan_tokenizer/tokenizer_config.json',
 '/content/trained_pan_tokenizer/special_tokens_map.json',
 '/content/trained_pan_tokenizer/vocab.txt',
 '/content/trained_pan_tokenizer/added_tokens.json',
 '/content/trained_pan_tokenizer/tokenizer.json')

In [None]:
# Load the model and tokenizer
model = BertForSequenceClassification.from_pretrained(ROOT_DIR + "trained_pan_model")
tokenizer = BertTokenizerFast.from_pretrained(ROOT_DIR + "trained_pan_tokenizer")

# The freshly loaded model is not on `device` by default, while the inputs below
# are moved there; keep both on the same device to avoid a mismatch on GPU.
model.to(device)

# Set model to evaluation mode
model.eval()

# Test with sample inputs
test_numbers = ["ABCDE1234F", "1234567890", "PQRST6789L", "abcd1234", "A1B2C3D4E5"]
inputs = tokenizer(test_numbers, padding=True, truncation=True, return_tensors="pt")
inputs = {key: value.to(device) for key, value in inputs.items()}

# Make predictions
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    predicted_classes = torch.argmax(logits, dim=1).cpu().tolist()

# Map predicted classes to labels using the mapping stored with the saved model;
# the notebook-level id2label is reassigned by every training cell.
predicted_labels = [model.config.id2label[pred] for pred in predicted_classes]
for num, label in zip(test_numbers, predicted_labels):
    print(f"Number: {num}, Predicted Class: {label}")


Number: ABCDE1234F, Predicted Class: pan
Number: 1234567890, Predicted Class: pan
Number: PQRST6789L, Predicted Class: pan
Number: abcd1234, Predicted Class: pan
Number: A1B2C3D4E5, Predicted Class: not pan


In [None]:
pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
pip install docx

Collecting docx
  Downloading docx-0.2.4.tar.gz (54 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/54.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.9/54.9 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: docx
  Building wheel for docx (setup.py) ... [?25l[?25hdone
  Created wheel for docx: filename=docx-0.2.4-py3-none-any.whl size=53892 sha256=9ae8366be051d54b0e49582983e03c30aa94f1c472707746bbcdbdad5ba9d930
  Stored in directory: /root/.cache/pip/wheels/81/f5/1d/e09ba2c1907a43a4146d1189ae4733ca1a3bfe27ee39507767
Successfully built docx
Installing collected packages: docx
Successfully installed docx-0.2.4


In [None]:
pip install python-docx

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m235.5/244.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.1.2


In [None]:
import torch
import re
import pandas as pd
import PyPDF2
import csv
import docx
from transformers import BertTokenizerFast, BertForSequenceClassification

# Shared compute device for every detector loaded in this cell
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Name detector
# NOTE(review): no visible cell saves '/content/trained_name_model'; only
# 'trained_name_model_new' is written — confirm where these weights come from.
name_model = BertForSequenceClassification.from_pretrained('/content/trained_name_model')
name_tokenizer = BertTokenizerFast.from_pretrained('/content/trained_name_tokenizer')
name_model = name_model.to(device)

# Aadhaar detector
aadhaar_model = BertForSequenceClassification.from_pretrained('/content/trained_aadhar_model')
aadhaar_tokenizer = BertTokenizerFast.from_pretrained('/content/trained_aadhar_tokenizer')
aadhaar_model = aadhaar_model.to(device)

# PAN detector
pan_model = BertForSequenceClassification.from_pretrained('/content/trained_pan_model')
pan_tokenizer = BertTokenizerFast.from_pretrained('/content/trained_pan_tokenizer')
pan_model = pan_model.to(device)

def extract_text_from_pdf(pdf_file):
    """Concatenate the extracted text of every page of a PDF.

    Args:
        pdf_file: Path of the PDF to read.

    Returns:
        str: The page texts joined in page order. If the file cannot be opened
        or parsed, a message is printed and whatever was collected so far
        (possibly the empty string) is returned.
    """
    text = ""
    try:
        with open(pdf_file, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                # Guard against a page yielding None/empty (e.g. no text layer):
                # without it the += raises and the remaining pages are lost.
                text += page.extract_text() or ""
    except Exception as e:
        print(f"Failed to extract text from PDF: {e}")
    return text

def extract_text_from_csv(csv_file):
    """Flatten a CSV file into text: one line per row, fields separated by single spaces.

    On any error a message is printed and the rows collected up to that point
    (possibly none) are returned.
    """
    collected = []
    try:
        with open(csv_file, newline='', encoding='utf-8') as handle:
            for fields in csv.reader(handle):
                collected.append(' '.join(fields) + '\n')
    except Exception as err:
        print(f"Failed to extract text from CSV: {err}")
    return ''.join(collected)

def extract_text_from_txt(txt_file):
    """Read a UTF-8 text file and return its full contents.

    If the file cannot be read, a message is printed and "" is returned.
    """
    content = ""
    try:
        with open(txt_file, mode='r', encoding='utf-8') as handle:
            content = handle.read()
    except Exception as err:
        print(f"Failed to extract text from TXT: {err}")
    return content

def extract_text_from_docx(docx_file):
    """Return the text of every paragraph of a .docx file, one paragraph per line.

    On any error a message is printed and the paragraphs collected up to that
    point (possibly none) are returned.
    """
    pieces = []
    try:
        for para in docx.Document(docx_file).paragraphs:
            pieces.append(para.text + "\n")
    except Exception as err:
        print(f"Failed to extract text from DOCX: {err}")
    return "".join(pieces)

def extract_text(file_path):
    """Extract text from a file, choosing the reader by file extension.

    The extension is matched case-insensitively, so "REPORT.PDF" is handled
    the same way as "report.pdf".

    Args:
        file_path: Path ending in .pdf, .csv, .txt or .docx.

    Returns:
        str: Whatever the matching ``extract_text_from_*`` helper returns.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    lowered = file_path.lower()
    if lowered.endswith('.pdf'):
        return extract_text_from_pdf(file_path)
    if lowered.endswith('.csv'):
        return extract_text_from_csv(file_path)
    if lowered.endswith('.txt'):
        return extract_text_from_txt(file_path)
    if lowered.endswith('.docx'):
        return extract_text_from_docx(file_path)
    raise ValueError("Unsupported file type")

def predict_with_model(model, tokenizer, text):
    """Classify ``text`` with ``model`` and return the index of the highest-scoring class.

    The text is tokenized (truncated and padded), moved to the module-level
    ``device``, and scored with gradients disabled.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
    model.eval()
    with torch.no_grad():
        scores = model(**on_device).logits
        best = torch.argmax(scores, dim=1)
    return best.item()

def _is_positive_class(model, class_id, positive_label):
    """Tell whether ``class_id`` is the class named ``positive_label`` for ``model``.

    The id-to-label mapping stored in ``model.config`` is consulted when it
    contains ``positive_label`` (compared stripped and lowercased). When it
    does not, the original convention "class 0 is the positive class" is kept.
    """
    mapping = getattr(getattr(model, "config", None), "id2label", None) or {}
    names = {}
    for key, value in mapping.items():
        try:
            names[int(key)] = str(value).strip().lower()
        except (TypeError, ValueError):
            continue
    if positive_label in names.values():
        return names.get(class_id) == positive_label
    return class_id == 0


def detect_pii(text):
    """Run the name, Aadhaar and PAN classifiers on ``text`` and summarise the results.

    Class ids are interpreted through each model's own label mapping rather
    than by assuming id 0 is the positive class: the training cells number the
    labels in first-seen order after a shuffle, so id 0 can be either label.

    Args:
        text: The text to classify. Each model receives the whole string as a
            single input.

    Returns:
        dict: Keys "Detected Names", "Aadhaar Prediction" and "PAN Prediction",
        each mapped to a human-readable verdict string.
    """
    # Predict with name model
    name_prediction = predict_with_model(name_model, name_tokenizer, text)

    # Predict with Aadhaar model
    aadhaar_prediction = predict_with_model(aadhaar_model, aadhaar_tokenizer, text)

    # Predict with PAN model
    pan_prediction = predict_with_model(pan_model, pan_tokenizer, text)

    # Map predictions to human-readable labels
    pii_data = {
        "Detected Names": "Indian" if _is_positive_class(name_model, name_prediction, "indian") else "Non Indian",
        "Aadhaar Prediction": "Aadhaar" if _is_positive_class(aadhaar_model, aadhaar_prediction, "aadhaar") else "Non Aadhaar",
        "PAN Prediction": "PAN" if _is_positive_class(pan_model, pan_prediction, "pan") else "Non PAN"
    }

    return pii_data

# Example usage
# In a notebook kernel __name__ is "__main__", so this guard is true and the
# body runs every time the cell is executed.
if __name__ == "__main__":
    # Extract text from a document
    # NOTE(review): the path is at the filesystem root, not under /content — confirm it is intended.
    file_path = '/testingdocs.docx'
    text = extract_text(file_path)

    # Detect PII: the whole document text is passed to each classifier as one input
    pii_data = detect_pii(text)
    print(pii_data)


{'Detected Names': 'Indian', 'Aadhaar Prediction': 'Non Aadhaar', 'PAN Prediction': 'PAN'}


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so files can be copied to it from this runtime.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import shutil
import os

# Saved model/tokenizer directories under /content to back up to Drive
folders = [
    'trained_aadhar_model',
    'trained_aadhar_tokenizer',
    'trained_name_model',
    'trained_name_tokenizer',
    'trained_pan_model',
    'trained_pan_tokenizer'
]

# Root of the Drive destination
dest_base = '/content/drive/My Drive/'

# Mirror each directory into Drive under the same name
for folder in folders:
    src_dir, dest_dir = (os.path.join(base, folder) for base in ('/content', dest_base))

    # dirs_exist_ok=True lets a re-run copy into a destination that already exists
    shutil.copytree(src_dir, dest_dir, dirs_exist_ok=True)


In [None]:
import random
import csv

# Lists of Indian and Non-Indian male and female names
# Each list holds ten first names; generate_dataset() samples from them with replacement,
# so the dataset contains only these 40 distinct names.
indian_male_names = ["Rajesh", "Arjun", "Vikram", "Anil", "Suresh", "Ravi", "Sanjay", "Amit", "Manoj", "Rakesh"]
indian_female_names = ["Sita", "Lakshmi", "Priya", "Meera", "Anjali", "Nandini", "Sunita", "Asha", "Kavita", "Rekha"]
non_indian_male_names = ["John", "David", "Michael", "James", "Robert", "William", "Joseph", "Charles", "Thomas", "Daniel"]
non_indian_female_names = ["Emma", "Olivia", "Sophia", "Isabella", "Mia", "Ava", "Emily", "Charlotte", "Amelia", "Abigail"]

# List of common non-name words
# These become the "false" (not a name) rows of the dataset, written with an empty race field.
non_names = ["do", "from", "which", "the", "and", "but", "if", "then", "on", "in"]

# Function to generate the dataset
def generate_dataset(file_name, num_records):
    """Write a shuffled name / non-name CSV with columns name, truthvalue, race.

    Every known name and every filler word is included once; random extra rows
    are then appended until the dataset holds at least ``num_records`` rows
    (it is never trimmed below the initial rows). The rows are shuffled and
    written to ``file_name``.
    """
    indian_pool = indian_male_names + indian_female_names
    foreign_pool = non_indian_male_names + non_indian_female_names

    # Initial rows: each name and each filler word exactly once
    dataset = [[name, "true", "Indian"] for name in indian_pool]
    dataset.extend([name, "true", "Non-Indian"] for name in foreign_pool)
    dataset.extend([word, "false", ""] for word in non_names)

    # category -> (list to draw from, truthvalue, race)
    sources = {
        "indian_male": (indian_male_names, "true", "Indian"),
        "indian_female": (indian_female_names, "true", "Indian"),
        "non_indian_male": (non_indian_male_names, "true", "Non-Indian"),
        "non_indian_female": (non_indian_female_names, "true", "Non-Indian"),
        "non_name": (non_names, "false", ""),
    }
    categories = ["indian_male", "indian_female", "non_indian_male", "non_indian_female", "non_name"]

    # Top up with random draws: first a category, then an entry from its list
    while len(dataset) < num_records:
        pool, truth, race = sources[random.choice(categories)]
        dataset.append([random.choice(pool), truth, race])

    # Mix the initial rows and the random rows together
    random.shuffle(dataset)

    # Header row followed by the data rows
    with open(file_name, 'w', newline='', encoding='utf-8') as file:
        out = csv.writer(file)
        out.writerow(["name", "truthvalue", "race"])
        out.writerows(dataset)

    print(f"Dataset saved to {file_name}")

# Generate a dataset with 400 records
# NOTE(review): the file is written relative to the current working directory, while the
# training cell reads ROOT_DIR + 'name_non_name_dataset.csv' ('/content/'); the two only
# coincide when the working directory is /content — confirm.
generate_dataset("name_non_name_dataset.csv", 400)


Dataset saved to name_non_name_dataset.csv


In [None]:
import torch
import pandas as pd
from transformers import BertForSequenceClassification, BertTokenizerFast, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset, RandomSampler

# Use the GPU when one is visible, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Read the name dataset and shuffle it with a fixed seed
ROOT_DIR = '/content/'
df_org = pd.read_csv(ROOT_DIR + 'name_non_name_dataset.csv', encoding='utf-8').sample(frac=1.0, random_state=42)

# Turn both attributes into 0/1 flags.
# NOTE(review): the truthiness test treats any non-empty string as 1, so this relies on
# read_csv having parsed the 'true'/'false' column into booleans — confirm.
df_org['truthvalue'] = df_org['truthvalue'].apply(lambda flag: 1 if flag else 0)
df_org['race'] = df_org['race'].fillna('').apply(lambda value: 1 if value.strip().lower() == 'indian' else 0)

# Single 4-way label = 2 * truthvalue + race:
#   0 -> non-name, non-indian
#   1 -> non-name, indian
#   2 -> name, non-indian
#   3 -> name, indian
df_org["labels"] = df_org['truthvalue'] * 2 + df_org['race']

NUM_LABELS = 4
id2label = dict(enumerate(['non-name, non-indian', 'non-name, indian', 'name, non-indian', 'name, indian']))
label2id = {label_name: label_id for label_id, label_name in id2label.items()}

# Tokenize every entry, padding to the longest one
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

names_list = df_org['name'].astype(str).tolist()

tokenized_data = tokenizer(names_list, padding=True, truncation=True, return_tensors="pt")

# Bundle inputs and targets into a TensorDataset
labels_tensor = torch.tensor(df_org['labels'].values, dtype=torch.long)
dataset = TensorDataset(tokenized_data['input_ids'], tokenized_data['attention_mask'], labels_tensor)

# Batches of 32 in random order
train_dataloader = DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=32)

# Load model and move to device
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=NUM_LABELS, id2label=id2label, label2id=label2id)
model.to(device)

# Set up optimizer and scheduler
num_epochs = 3
optimizer = AdamW(model.parameters(), lr=2e-5)
# Derive the schedule length from num_epochs so the learning-rate decay always
# spans exactly the number of optimizer steps taken below.
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training Loop
model.train()
for epoch in range(num_epochs):
    total_loss = 0.0
    for batch in train_dataloader:
        # Move batch to device. The target tensor is named batch_labels so it
        # cannot be confused with any notebook-level `labels` variable.
        input_ids, attention_mask, batch_labels = [t.to(device) for t in batch]

        # Forward pass (the model computes the loss when labels are supplied)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    # Report the mean over all batches rather than only the final batch's loss
    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs} completed. Average Loss: {avg_loss}")

# Save the trained model and tokenizer
model.save_pretrained(ROOT_DIR + "trained_name_model_new")
tokenizer.save_pretrained(ROOT_DIR + "trained_name_tokenizer_new")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Labels dtype: torch.int64, Loss dtype: torch.float32
Labels dtype: torch.int64, Loss dtype: torch.float32
Labels dtype: torch.int64, Loss dtype: torch.float32
Labels dtype: torch.int64, Loss dtype: torch.float32
Labels dtype: torch.int64, Loss dtype: torch.float32
Labels dtype: torch.int64, Loss dtype: torch.float32
Labels dtype: torch.int64, Loss dtype: torch.float32
Labels dtype: torch.int64, Loss dtype: torch.float32
Labels dtype: torch.int64, Loss dtype: torch.float32
Labels dtype: torch.int64, Loss dtype: torch.float32
Labels dtype: torch.int64, Loss dtype: torch.float32
Labels dtype: torch.int64, Loss dtype: torch.float32
Labels dtype: torch.int64, Loss dtype: torch.float32
Epoch 1/3 completed. Loss: 0.8035909533500671
Labels dtype: torch.int64, Loss dtype: torch.float32
Labels dtype: torch.int64, Loss dtype: torch.float32
Labels dtype: torch.int64, Loss dtype: torch.float32
Labels dtype: torch.int64, Loss dtype: torch.float32
Labels dtype: torch.int64, Loss dtype: torch.float32


('/content/trained_name_tokenizer_new/tokenizer_config.json',
 '/content/trained_name_tokenizer_new/special_tokens_map.json',
 '/content/trained_name_tokenizer_new/vocab.txt',
 '/content/trained_name_tokenizer_new/added_tokens.json',
 '/content/trained_name_tokenizer_new/tokenizer.json')

In [None]:
def _load_name_classifier():
    """Load the saved name classifier and tokenizer once and reuse them afterwards.

    The pair is stored on the function object, so re-running this cell (which
    redefines the function) discards the cache and picks up newly saved weights.
    """
    cached = getattr(_load_name_classifier, "_cached", None)
    if cached is None:
        # The label mapping and class count come from the config saved with the
        # model, not from notebook globals that other cells reassign.
        model = BertForSequenceClassification.from_pretrained('/content/trained_name_model_new')
        tokenizer = BertTokenizerFast.from_pretrained('/content/trained_name_tokenizer_new')
        model.to(device)
        model.eval()
        cached = (model, tokenizer)
        _load_name_classifier._cached = cached
    return cached


def predict_race(name):
    """Classify ``name`` with the saved name model and return its label string.

    Args:
        name: The text to classify.

    Returns:
        str: The label that the saved model's config associates with the
        highest-scoring class.
    """
    model, tokenizer = _load_name_classifier()

    inputs = tokenizer(name, return_tensors="pt", truncation=True, padding=True)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits
        predicted_class = torch.argmax(logits, dim=1).item()

    return model.config.id2label[predicted_class]

# Test with a new name
# The input here is a two-word full name, whereas the dataset generated in this notebook
# contains single first names and filler words only.
new_name = "Michael ross"
predicted_race = predict_race(new_name)
print(f"The predicted race for {new_name} is {predicted_race}.")

The predicted race for Michael ross is name, non-indian.


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so files can be copied to it from this runtime.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import shutil
import os

# Saved name-model directories under /content to back up to Drive
folders = [
    'trained_name_model_new',
    'trained_name_tokenizer_new',
]

# Root of the Drive destination
dest_base = '/content/drive/My Drive/'

# Mirror each directory into Drive under the same name
for folder in folders:
    src_dir, dest_dir = (os.path.join(base, folder) for base in ('/content', dest_base))

    # dirs_exist_ok=True lets a re-run copy into a destination that already exists
    shutil.copytree(src_dir, dest_dir, dirs_exist_ok=True)