In [2]:
# Step 1: Data Cleaning and Preprocessing
import pandas as pd
from sklearn.preprocessing import LabelEncoder
# Load data
df = pd.read_excel('project.xlsx')

# Drop rows with missing values
df.dropna(inplace=True)

# Encode 'Domain' into numerical labels
label_encoder = LabelEncoder()
df['Domain'] = label_encoder.fit_transform(df['Domain'])



In [3]:
# Concatenate text data
df['text'] = df['Name'] + ' ' + df['Description'] + ' ' + df['Column Names'] + ' ' + df['Keywords'] 

# Convert to lowercase
df['processed'] = df['text']


In [4]:
# Step 3: Model Training
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df['processed'], df['Domain'], test_size=0.2, random_state=42)

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

# Tokenize input data
X_train_tokens = tokenizer(list(X_train), padding=True, truncation=True, max_length=256, return_tensors='pt')
X_test_tokens = tokenizer(list(X_test), padding=True, truncation=True, max_length=256, return_tensors='pt')

# Convert labels to tensor
y_train_tensor = torch.tensor(list(y_train))
y_test_tensor = torch.tensor(list(y_test))

# Fine-tuning BERT model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-05)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Train the model
model.train()

# Define batch size
batch_size = 8

for epoch in range(3):  # Adjust the number of epochs as needed
    optimizer.zero_grad()
    for i in range(0, len(X_train_tokens['input_ids']), batch_size):
        batch_input_ids = X_train_tokens['input_ids'][i:i+batch_size].to(device)
        batch_attention_mask = X_train_tokens['attention_mask'][i:i+batch_size].to(device)
        batch_labels = y_train_tensor[i:i+batch_size].unsqueeze(1).to(device)

        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    print(f'Epoch {epoch + 1}, Loss: {loss.item()}')

Epoch 1, Loss: 0.013412313535809517
Epoch 2, Loss: 0.004588530398905277
Epoch 3, Loss: 0.02595432661473751


In [6]:
# Step 4: Model Evaluation
from sklearn.metrics import accuracy_score, classification_report
# Evaluation mode
model.eval()

# Predictions
with torch.no_grad():
    predictions = []
    for i in range(0, len(X_test_tokens['input_ids']), batch_size):
        batch_input_ids = X_test_tokens['input_ids'][i:i+batch_size].to(device)
        batch_attention_mask = X_test_tokens['attention_mask'][i:i+batch_size].to(device)

        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
        batch_predictions = torch.argmax(outputs.logits, dim=1).cpu().numpy()
        predictions.extend(batch_predictions)

# Calculate metrics
accuracy = accuracy_score(y_test_tensor.cpu(), predictions)
print("Accuracy:", accuracy)
print(classification_report(y_test_tensor.cpu(), predictions, target_names=label_encoder.classes_))

Accuracy: 0.9883333333333333
              precision    recall  f1-score   support

    Business       1.00      0.99      1.00       120
   Economics       1.00      1.00      1.00       113
   Education       1.00      0.95      0.97       112
      Health       0.95      1.00      0.97       124
      Sports       1.00      1.00      1.00       131

    accuracy                           0.99       600
   macro avg       0.99      0.99      0.99       600
weighted avg       0.99      0.99      0.99       600



In [11]:
# Predict domain for new dataset info
def predict_domain(dataset_info):
    
    # Tokenize input text
    inputs = tokenizer(dataset_info, padding=True, truncation=True, max_length=256, return_tensors="pt")

    # Forward pass through the model
    with torch.no_grad():
        outputs = model(**inputs)

    # Get predicted label
    predicted_label_id = torch.argmax(outputs.logits).item()

    # Decode predicted label
    predicted_domain = label_encoder.inverse_transform([predicted_label_id])

    return predicted_domain[0]


In [12]:
# Example usage
new_dataset_info = "Economic indicators from various countries, including GDP, inflation rate, and unemployment rate., Economics, global indicators, Country, Year, GDP, Inflation Rate, Unemployment Rate, Trade Balance, Currency, Global Economic Indicators"
predicted_domain = predict_domain(new_dataset_info)
print("Predicted Domain:", predicted_domain)

Predicted Domain: Economics


In [None]:
import os

# Save the model
model.save_pretrained("bert_model")

# Create a directory for tokenizer files
os.makedirs("bert_model_tokenizer", exist_ok=True)

# Save the tokenizer
tokenizer.save_pretrained("bert_model_tokenizer")

In [14]:
import joblib
joblib.dump(label_encoder, 'label_encoder.pkl')

['label_encoder.pkl']