In [45]:
!pip install google-cloud-storage pandas



In [59]:
import os
import pandas as pd
from google.cloud import storage

# Set the path to your service account JSON key file
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/content/finsight-ai-442823-74c796148fec.json"

# Set environment variables for GCP project and bucket
os.environ["PROJECT_ID"] = "finsight-ai-442823"
os.environ["BUCKET_NAME"] = "finsight-ai-bucket"
project_id = "finsight-ai-442823"
bucket_name = "finsight-ai-bucket"

# # Function to load CSV file directly from GCS
# def load_csv_from_gcs(bucket_name, file_name):
#     """Loads CSV file from GCS bucket."""
#     client = storage.Client()
#     bucket = client.get_bucket(bucket_name)

#     # Read the CSV file directly into pandas using the gs:// URI
#     df = pd.read_csv(f"gs://{bucket_name}/{file_name}")
#     return df

# # Define the file name
# file_name = "reuters_headlines.csv"

# Load the CSV data from GCS
df = pd.read_csv("/content/reuters_headlines.csv")

# Display the first few rows to verify the data
df

Unnamed: 0,Headlines,Time,Description
0,TikTok considers London and other locations fo...,Jul 18 2020,TikTok has been in discussions with the UK gov...
1,Disney cuts ad spending on Facebook amid growi...,Jul 18 2020,Walt Disney has become the latest company to ...
2,Trail of missing Wirecard executive leads to B...,Jul 18 2020,Former Wirecard chief operating officer Jan M...
3,Twitter says attackers downloaded data from up...,Jul 18 2020,Twitter Inc said on Saturday that hackers were...
4,U.S. Republicans seek liability protections as...,Jul 17 2020,A battle in the U.S. Congress over a new coron...
...,...,...,...
32765,Malaysia says never hired British data firm at...,Mar 20 2018,The Malaysian government and the ruling party ...
32766,Prosecutors search Volkswagen headquarters in ...,Mar 20 2018,German prosecutors said on Tuesday they had se...
32767,McDonald's sets greenhouse gas reduction targets,Mar 20 2018,McDonald's Corp on Tuesday announced an approv...
32768,Pratt & Whitney to deliver spare A320neo engin...,Mar 20 2018,Pratt & Whitney will soon begin deliveries of ...




In [60]:
# Data Preprocessing

# Check for missing values in the dataset
print(df.isnull().sum())

# Drop rows with missing headlines or descriptions
df.dropna(subset=["Headlines", "Description"], inplace=True)

# Basic text preprocessing function
def preprocess_text(text):
    text = text.lower()  # Convert text to lowercase
    # Remove punctuation and non-alphanumeric characters
    text = ''.join([char for char in text if char.isalnum() or char.isspace()])
    return text

# Apply preprocessing to both Headlines and Descriptions
df["Headlines"] = df["Headlines"].apply(preprocess_text)
df["Description"] = df["Description"].apply(preprocess_text)

# Display the first few rows to check
df.head()


Headlines      0
Time           0
Description    0
dtype: int64


Unnamed: 0,Headlines,Time,Description
0,tiktok considers london and other locations fo...,Jul 18 2020,tiktok has been in discussions with the uk gov...
1,disney cuts ad spending on facebook amid growi...,Jul 18 2020,walt disney has become the latest company to ...
2,trail of missing wirecard executive leads to b...,Jul 18 2020,former wirecard chief operating officer jan m...
3,twitter says attackers downloaded data from up...,Jul 18 2020,twitter inc said on saturday that hackers were...
4,us republicans seek liability protections as c...,Jul 17 2020,a battle in the us congress over a new coronav...


In [61]:
# Categories for classification
categories = {
    "market": ["stock", "market", "indices", "trading"],
    "corporate": ["company", "merger", "acquisition", "earnings", "CEO"],
    "economy": ["GDP", "inflation", "recession", "economy", "policy"],
    "banking": ["bank", "finance", "loans", "interest", "credit"],
    "investment": ["invest", "fund", "portfolio", "mutual", "ETF"],
    "technology": ["fintech", "blockchain", "crypto", "bitcoin", "technology"],
    "commodities": ["oil", "gold", "commodity", "silver"],
    "global": ["trade", "tariff", "agreement", "global", "export"],
    "sustainability": ["ESG", "sustainable", "green", "climate", "environment"],
}

# Function to assign labels based on description content
def assign_label(description):
    for label, keywords in categories.items():
        if pd.notnull(description) and any(keyword.lower() in description.lower() for keyword in keywords):
            return label
    return "other"  # Default label if no keywords match

# Assign labels to the descriptions
df["Label"] = df["Description"].apply(assign_label)

# Display the labeled data
df.head()


Unnamed: 0,Headlines,Time,Description,Label
0,tiktok considers london and other locations fo...,Jul 18 2020,tiktok has been in discussions with the uk gov...,other
1,disney cuts ad spending on facebook amid growi...,Jul 18 2020,walt disney has become the latest company to ...,corporate
2,trail of missing wirecard executive leads to b...,Jul 18 2020,former wirecard chief operating officer jan m...,other
3,twitter says attackers downloaded data from up...,Jul 18 2020,twitter inc said on saturday that hackers were...,other
4,us republicans seek liability protections as c...,Jul 17 2020,a battle in the us congress over a new coronav...,other


In [62]:
!pip install transformers torch




In [1]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.metrics import accuracy_score
from torch.cuda.amp import GradScaler, autocast

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load DistilBERT tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)  # Adjust num_labels as needed
model.to(device)

# Tokenize the dataset
def tokenize_texts(texts, max_length=32):
    return tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt")

# Example: Sample dataset (replace with your DataFrame `df`)
import pandas as pd
data = {
    "Headlines": ["This is great!", "I dislike this.", "Fantastic work!", "Not good."],
    "Label": ["Positive", "Negative", "Positive", "Negative"]
}
df = pd.DataFrame(data)

# Map labels to numeric values
label_map = {label: idx for idx, label in enumerate(df["Label"].unique())}
df["Label_Num"] = df["Label"].map(label_map)

# Tokenize the text data
tokenized_inputs = tokenize_texts(df["Headlines"].tolist())

# Create TensorDataset
dataset = TensorDataset(
    tokenized_inputs["input_ids"],
    tokenized_inputs["attention_mask"],
    torch.tensor(df["Label_Num"].values)
)

# Split into train and validation datasets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# DataLoaders for batching
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, num_workers=2)

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

# Mixed Precision Training setup
scaler = GradScaler()

# Training loop
model.train()
for epoch in range(2):  # Set epochs to 2 for demonstration
    running_loss = 0.0
    for i, batch in enumerate(train_loader):
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()

        # Mixed precision training
        with autocast():
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        running_loss += loss.item()

        # Print loss periodically
        if (i + 1) % 10 == 0:
            print(f"Epoch [{epoch+1}/2], Batch [{i+1}/{len(train_loader)}], Loss: {loss.item()}")

    print(f"Epoch {epoch+1} finished. Average loss: {running_loss / len(train_loader):.4f}")

# Evaluation mode
model.eval()
all_preds = []
all_labels = []

# Evaluate on validation set
with torch.no_grad():
    for batch in val_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, axis=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.numpy())

# Calculate accuracy
accuracy = accuracy_score(all_labels, all_preds)
print(f"Validation Accuracy: {accuracy * 100:.2f}%")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()
  with autocast():


Epoch 1 finished. Average loss: 0.7365
Epoch 2 finished. Average loss: 0.7453
Validation Accuracy: 100.00%


In [2]:
# Save the model and tokenizer locally
model_save_path = "./bert_text_classifier"
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"Model saved to {model_save_path}")


Model saved to ./bert_text_classifier
