# 1. General Setup

In [None]:
# Install required libraries
!pip install google-cloud-aiplatform PyPDF2 transformers datasets

# Authenticate with Google Cloud
from google.colab import auth
auth.authenticate_user()


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━

In [None]:
from google.cloud import aiplatform
# Project and region handed to the Vertex AI SDK below.
PROJECT_ID = 'lawgpt-423703'  # Replace with your actual project ID
REGION = 'us-central1'  # Replace with your actual region
# Initialise the SDK with the project/region defined above.
aiplatform.init(project=PROJECT_ID, location=REGION)


# Example 5.2.1: Running a custom training job to fine-tune a pre-trained BERT model on a new text classification task

In [None]:
# Required Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (
    BertTokenizerFast,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    set_seed,
)

# Seed every source of randomness (train/eval split, classifier-head initialisation,
# data shuffling) so that "Restart & Run All" reproduces the same numbers.
SEED = 42
set_seed(SEED)

# Sample Data (texts and labels)
texts = [
    "Legal contract review is essential.",
    "Court decisions are often complex.",
    "Judicial interpretations can vary.",
    "Contract law requires careful consideration.",
    "AI can assist in legal research.",
    "Lawyers often rely on precedent.",
    "The legal process can be slow.",
    "Understanding case law is important for legal practice.",
    "Legislative changes affect legal outcomes.",
    "Legal principles guide judicial rulings."
]
labels = [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]  # Example binary labels

# 1. Load BERT Tokenizer and Model
# num_labels is derived from the data so the classification head matches the label set.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(set(labels)))

# 2. Tokenization Function
def tokenize_function(examples):
    """Tokenize a batch of rows; `examples['text']` is the list of raw strings."""
    return tokenizer(examples['text'], padding="max_length", truncation=True)

# 3. Split Dataset into Train and Eval
# random_state makes the split repeatable; stratify keeps both classes in the
# two-example evaluation set instead of leaving that to chance.
train_texts, eval_texts, train_labels, eval_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=SEED, stratify=labels
)

# 4. Convert Train and Eval Data to DataFrames
train_df = pd.DataFrame({'text': train_texts, 'label': train_labels})
eval_df = pd.DataFrame({'text': eval_texts, 'label': eval_labels})

# 5. Convert DataFrames to Hugging Face Dataset Objects
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(eval_df)

# 6. Tokenize Both Train and Eval Datasets
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_eval = eval_dataset.map(tokenize_function, batched=True)

# 7. Set Training Arguments
# NOTE(review): newer transformers releases spell `evaluation_strategy` as
# `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',  # Evaluate at each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    seed=SEED,
)

# 8. Initialize Trainer with Train and Eval Datasets
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,  # Include eval dataset
)

# 9. Train the Model
trainer.train()

# 10. Optional: Evaluate the Model
results = trainer.evaluate()
print("Evaluation results:", results)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,No log,0.686733
2,No log,0.68256
3,No log,0.68309


Evaluation results: {'eval_loss': 0.6830898523330688, 'eval_runtime': 0.0915, 'eval_samples_per_second': 21.864, 'eval_steps_per_second': 10.932, 'epoch': 3.0}


# Example 5.2.2: Fine-tuning a ResNet model from TensorFlow Hub for a custom image classification task.

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.datasets import cifar10

# Load CIFAR-10 dataset (10 classes); pixels arrive as uint8 RGB in [0, 255].
(x_train, y_train), (x_test, y_test) = cifar10.load_data()

# Apply the preprocessing that the ImageNet ResNet50 weights were trained with
# (RGB->BGR plus per-channel mean subtraction on 0-255 values). Scaling to [0, 1]
# instead feeds the frozen backbone inputs far outside the range it was trained on.
x_train = tf.keras.applications.resnet50.preprocess_input(x_train.astype('float32'))
x_test = tf.keras.applications.resnet50.preprocess_input(x_test.astype('float32'))

# Define constants
IMG_SIZE = 32  # CIFAR-10 images are 32x32
NUM_CLASSES = 10  # CIFAR-10 has 10 classes

# 1. Create the ResNet50 model (excluding the top classification layers)
base_model = tf.keras.applications.ResNet50(
    weights='imagenet',
    include_top=False,
    input_shape=(IMG_SIZE, IMG_SIZE, 3)
)

# 2. Freeze the base model so only the new head is trained
base_model.trainable = False

# 3. Build the model using Functional API
inputs = layers.Input(shape=(IMG_SIZE, IMG_SIZE, 3))
# training=False keeps the backbone's BatchNorm layers in inference mode, which
# also stays correct if the base model is unfrozen later for fine-tuning.
x = base_model(inputs, training=False)  # Apply the pre-trained ResNet50 model
x = layers.GlobalAveragePooling2D()(x)  # Global Average Pooling
x = layers.Dense(256, activation='relu')(x)  # Dense layer
x = layers.Dropout(0.5)(x)  # Dropout for regularization
outputs = layers.Dense(NUM_CLASSES, activation='softmax')(x)  # Output layer for multi-class

# Final model
model = models.Model(inputs, outputs)

# 4. Compile the model
# Labels are integer class ids, hence the sparse categorical loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# 5. Train the model (20% of the training images are held out for validation)
model.fit(x_train, y_train, epochs=5, validation_split=0.2, batch_size=64)

# 6. Evaluate the model
loss, accuracy = model.evaluate(x_test, y_test)
print(f'Test accuracy: {accuracy:.2f}')

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
[1m170498071/170498071[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 0us/step
Epoch 1/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 21ms/step - accuracy: 0.1373 - loss: 2.3451 - val_accuracy: 0.2388 - val_loss: 2.0984
Epoch 2/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 11ms/step - accuracy: 0.1743 - loss: 2.1657 - val_accuracy: 0.2178 - val_loss: 2.0909
Epoch 3/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 14ms/step - accuracy: 0.1817 - loss: 2.1404 - val_accuracy: 0.2528 - val_loss: 2.0518
Epoch 4/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 11ms/step - accuracy: 0.1933 - loss: 2.115

# Example 5.4.3 Steps for Creating an Instruction Dataset on GCP

In [None]:
#1 Download the Data from GCP
# Fetches the raw ticket export from the bucket into the notebook's working directory.
# Initialize a client for Google Cloud Storage
from google.cloud import storage
client = storage.Client()
# Bucket that holds the raw data, and the object to fetch from it
bucket = client.get_bucket('my-legal-data-bucket')
blob = bucket.blob('customer_support_tickets.csv')
# Download the data to a local file with the same name
blob.download_to_filename('customer_support_tickets.csv')

In [None]:
#2 Data Preprocessing
import pandas as pd

# Load the data, drop incomplete rows FIRST, then lowercase the descriptions.
# Lowercasing before dropna() raised AttributeError on the first missing description
# (NaN is a float and has no .lower()). `.assign` builds a new frame, so re-running
# the cell is idempotent and no SettingWithCopyWarning is emitted.
df = (
    pd.read_csv('customer_support_tickets.csv')
    .dropna()
    .assign(ticket_description=lambda d: d['ticket_description'].astype(str).str.lower())
)

# Persist the cleaned data for the upload step
df.to_csv('processed_tickets.csv', index=False)

In [None]:
#3 Transfer the file to your GCS bucket
# Uploads the locally written processed_tickets.csv to the bucket under the same name.
from google.cloud import storage

# Initialize a client for Google Cloud Storage
client = storage.Client()

# Define the bucket name and the file to upload
bucket_name = 'my-legal-data-bucket'
source_file = 'processed_tickets.csv'  # Local file
destination_blob_name = 'processed_tickets.csv'  # File name in GCS bucket

# Get the bucket
bucket = client.get_bucket(bucket_name)

# Create a new blob (file in GCS)
blob = bucket.blob(destination_blob_name)

# Upload the file to GCS
blob.upload_from_filename(source_file)

# Confirmation message naming the source file and its destination
print(f"File {source_file} uploaded to {bucket_name}/{destination_blob_name}.")

File processed_tickets.csv uploaded to my-legal-data-bucket/processed_tickets.csv.


In [None]:
#4 Annotation
# Registers the processed CSV in GCS as a Vertex AI tabular dataset.
from google.cloud import aiplatform

# Initialize AI Platform
aiplatform.init(project='lawgpt-423703', location='us-central1')

# Create a dataset backed by the processed tickets file uploaded to the bucket
dataset = aiplatform.TabularDataset.create(
    display_name='Customer Support Dataset',
    gcs_source='gs://my-legal-data-bucket/processed_tickets.csv'
)

print(f"Dataset {dataset.display_name} created successfully!")


INFO:google.cloud.aiplatform.datasets.dataset:Creating TabularDataset
INFO:google.cloud.aiplatform.datasets.dataset:Create TabularDataset backing LRO: projects/387426257385/locations/us-central1/datasets/5556756394675273728/operations/6438780396229361664
INFO:google.cloud.aiplatform.datasets.dataset:TabularDataset created. Resource name: projects/387426257385/locations/us-central1/datasets/5556756394675273728
INFO:google.cloud.aiplatform.datasets.dataset:To use this TabularDataset in another session:
INFO:google.cloud.aiplatform.datasets.dataset:ds = aiplatform.TabularDataset('projects/387426257385/locations/us-central1/datasets/5556756394675273728')


Dataset Customer Support Dataset created successfully!


In [None]:
def clean_data_quality(df):
    """Return a cleaned copy of a support-ticket DataFrame.

    The input frame is never modified. Rows are removed when they fail any of
    the ten quality checks below, applied in order.

    Required columns: ``ticket_description``, ``customer_name``, ``ticket_id``,
    ``ticket_status``, ``priority``, ``created_date``, ``assigned_agent``.
    ``resolved_date`` is optional.

    Args:
        df: pandas DataFrame of tickets.

    Returns:
        A new DataFrame with the surviving rows. ``ticket_description`` is
        whitespace-trimmed, ``created_date`` (and ``resolved_date`` when
        present) are converted to datetimes.

    Raises:
        KeyError: if a required column is missing.
    """
    # Checks 1-3: drop rows missing a description or customer name, then keep
    # only the first row for each ticket id.
    cleaned = (
        df.dropna(subset=['ticket_description', 'customer_name'])
          .drop_duplicates(subset=['ticket_id'])
    )

    # Check 4: trim descriptions and drop the ones that are empty afterwards.
    # astype(str) lets non-string values (e.g. a purely numeric description) be
    # trimmed instead of raising; `.assign` returns a new frame, so neither the
    # caller's data nor a filtered view is written to.
    cleaned = cleaned.assign(
        ticket_description=cleaned['ticket_description'].astype(str).str.strip()
    )
    cleaned = cleaned[cleaned['ticket_description'] != '']

    # Check 5: Remove rows with invalid ticket status values
    valid_statuses = ['Open', 'Closed', 'In Progress']
    cleaned = cleaned[cleaned['ticket_status'].isin(valid_statuses)]

    # Check 6: Remove rows with invalid priority values
    valid_priorities = ['High', 'Medium', 'Low']
    cleaned = cleaned[cleaned['priority'].isin(valid_priorities)]

    # Check 7: parse creation dates; errors='coerce' turns anything that is not
    # YYYY-MM-DD into NaT, and those rows are dropped. No try/except: a missing
    # column raises KeyError like every other required column instead of being
    # printed and then reported as a successful clean.
    cleaned = cleaned.assign(
        created_date=pd.to_datetime(cleaned['created_date'], format='%Y-%m-%d', errors='coerce')
    )
    cleaned = cleaned.dropna(subset=['created_date'])

    # Check 8: when a resolved date exists it must not precede the creation date;
    # rows with a missing/unparseable resolved date are kept.
    if 'resolved_date' in cleaned.columns:
        resolved = pd.to_datetime(cleaned['resolved_date'], errors='coerce', format='%Y-%m-%d')
        cleaned = cleaned.assign(resolved_date=resolved)
        cleaned = cleaned[resolved.isna() | (resolved >= cleaned['created_date'])]

    # Check 9: Remove rows with missing assigned agent values
    cleaned = cleaned.dropna(subset=['assigned_agent'])

    # Check 10: Remove rows where ticket descriptions are too short (less than 10 characters)
    cleaned = cleaned[cleaned['ticket_description'].str.len() >= 10]

    print("Data cleaned successfully.")
    return cleaned

# Usage Example
# NOTE(review): `df` is whatever ticket DataFrame is currently bound in the kernel --
# presumably the one loaded in the "#2 Data Preprocessing" cell; confirm run order.
cleaned_df = clean_data_quality(df)


Data cleaned successfully.


In [None]:
#6 Dataset Splitting
# Produces a 70% / 15% / 15% train / validation / test split of `cleaned_df`.
# Both splits use random_state=42, so re-running the cell yields the same partitions.
from sklearn.model_selection import train_test_split

# Split the data into 70% training and 30% temporary set (which will be further split into validation and test sets)
train_df, temp_df = train_test_split(cleaned_df, test_size=0.3, random_state=42)

# Split the temporary set into 50% validation and 50% test sets (15% of total data for each)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Save the splits as CSV files (index omitted so it is not written as an extra column)
train_df.to_csv('train_data.csv', index=False)
val_df.to_csv('val_data.csv', index=False)
test_df.to_csv('test_data.csv', index=False)

print("Data has been split and saved into train_data.csv, val_data.csv, and test_data.csv")

Data has been split and saved into train_data.csv, val_data.csv, and test_data.csv


In [None]:
#7 Copy the data to your GCS Bucket
# Uploads the three split files to the bucket root, each under its local file name.
from google.cloud import storage

# Initialize the client
client = storage.Client()

# Define your bucket name
bucket_name = 'my-legal-data-bucket'  # Replace with your bucket name
bucket = client.bucket(bucket_name)

# List of files to upload
file_paths = ['train_data.csv', 'val_data.csv', 'test_data.csv']

# Upload files to GCS
for file_path in file_paths:
    # Create a new blob (file) in the bucket; the object name equals the local name
    blob = bucket.blob(file_path)

    # Upload the file
    blob.upload_from_filename(file_path)
    print(f"{file_path} uploaded to {bucket_name}.")

train_data.csv uploaded to my-legal-data-bucket.
val_data.csv uploaded to my-legal-data-bucket.
test_data.csv uploaded to my-legal-data-bucket.


In [None]:
#8 Execute the training Job
# Trains an AutoML tabular classification model on the uploaded training split.
from google.cloud import aiplatform

# Initialize Vertex AI
aiplatform.init(project='lawgpt-423703', location='us-central1')

# Define the AutoML Tabular training job
job = aiplatform.AutoMLTabularTrainingJob(
    display_name='customer-support-classification',
    optimization_prediction_type='classification',  # Or 'regression' if needed
    optimization_objective='minimize-log-loss',     # Set objective based on your problem
)

# Create a NEW Vertex AI tabular dataset from train_data.csv in GCS.
# (This does not reuse the dataset created in the "#4 Annotation" cell.)
dataset = aiplatform.TabularDataset.create(
    display_name='customer-support-dataset',
    gcs_source=['gs://my-legal-data-bucket/train_data.csv']
)

# Run the training job.
# NOTE(review): the 0.7/0.15/0.15 fractions are applied to train_data.csv itself, so
# the separately uploaded val_data.csv / test_data.csv are not used here -- confirm
# that this second split is intended.
model = job.run(
    dataset=dataset,
    target_column='target_column_name',  # Placeholder: replace with the actual target column name
    model_display_name='customer-support-model',
    training_fraction_split=0.7,
    validation_fraction_split=0.15,
    test_fraction_split=0.15,
    sync=True  # Wait for the job to finish
)

print(f"Model {model.display_name} has been successfully trained!")

In [None]:
#9 Monitoring and Iteration
from google.cloud import aiplatform

# Example: Fetch model performance metrics (simulated for demonstration purposes)
def get_model_metrics(job_id):
    """Return simulated evaluation metrics for demonstration purposes.

    Args:
        job_id: identifier of the job/model; accepted for interface parity with
            a real lookup but not used by this placeholder.

    Returns:
        dict mapping metric name to a fixed float value (accuracy, precision,
        recall, f1_score, log_loss).
    """
    # A real implementation would query Vertex AI instead, e.g.:
    # metrics = aiplatform.Model(job_id).list_evaluations()
    simulated_metrics = dict(
        accuracy=0.85,
        precision=0.8,
        recall=0.75,
        f1_score=0.77,
        log_loss=0.45,
    )
    return simulated_metrics

# Example: Function to update the dataset based on model performance
def update_dataset_based_on_metrics(metrics, accuracy_target=0.90, max_log_loss=0.5):
    """Print the model metrics and flag which follow-up actions they suggest.

    Args:
        metrics: dict of metric name -> value; must contain 'accuracy' and
            'log_loss'.
        accuracy_target: accuracy below this value triggers the
            "augment / clean the dataset" advice (default 0.90).
        max_log_loss: log loss above this value triggers the
            "improve features / architecture" advice (default 0.5).

    Returns:
        None. All output is printed.

    Raises:
        KeyError: if 'accuracy' or 'log_loss' is missing from ``metrics``.
    """
    print("Current Model Metrics:")
    for metric, value in metrics.items():
        print(f"{metric}: {value}")

    # If accuracy is below the target, the dataset itself is the first suspect.
    if metrics['accuracy'] < accuracy_target:
        print("Accuracy is below the target threshold, considering augmenting dataset or cleaning low-quality data...")
        # Logic to augment dataset goes here
        # For example: clean data, add more training examples, improve feature engineering, etc.

    if metrics['log_loss'] > max_log_loss:
        print("Log loss is too high, considering improving feature selection or model architecture...")
        # Logic to modify dataset based on log loss
        # For example: remove irrelevant features, or explore feature scaling, etc.

    # You can add more checks based on other metrics like precision, recall, F1 score, etc.

# Example usage
# The id is only a placeholder: get_model_metrics() returns fixed demo values.
job_id = 'your-job-id'  # Replace with your actual job ID
metrics = get_model_metrics(job_id)
update_dataset_based_on_metrics(metrics)


Current Model Metrics:
accuracy: 0.85
precision: 0.8
recall: 0.75
f1_score: 0.77
log_loss: 0.45
Accuracy is below the target threshold, considering augmenting dataset or cleaning low-quality data...


# Example 5.6: Fine-Tuning Example

In [None]:
#1 Install Dependencies
!pip install kaggle google-cloud-storage transformers datasets tensorflow

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.8 MB/s[0m eta [36m0:00:0

In [None]:
#2 Set up your Kaggle credentials
import os
from getpass import getpass

# Never store a Kaggle username/API key in the notebook: anyone with access to the
# file (or its history) can use them. Values already present in the environment are
# reused; otherwise they are prompted for without being echoed or saved in the
# notebook. Any key that was ever committed in a notebook should be revoked and
# regenerated from the Kaggle account settings.
for _env_var, _prompt in (
    ('KAGGLE_USERNAME', 'Kaggle username: '),
    ('KAGGLE_KEY', 'Kaggle API key: '),
):
    if not os.environ.get(_env_var):
        os.environ[_env_var] = getpass(_prompt)

In [None]:
#3 Download Dataset from Kaggle
import kaggle
# Authenticate using the Kaggle API (reads KAGGLE_USERNAME / KAGGLE_KEY from the environment)
kaggle.api.authenticate()
# Download and unzip the dataset into /content/sample_data
try:
    kaggle.api.dataset_download_files('kazanova/sentiment140', path='/content/sample_data', unzip=True)
except Exception as e:
    # Report the problem, then re-raise: every later cell needs the downloaded file,
    # so continuing would only fail further down with a less helpful error.
    print("An error occurred:", e)
    raise
else:
    print("Dataset downloaded successfully!")


Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
Dataset downloaded successfully!


In [None]:
#4 Upload Dataset to GCS
# Copies the raw Kaggle CSV from the Colab filesystem to the bucket root.
# Step 1: Install Google Cloud Storage package
#!pip install --upgrade google-cloud-storage

# Step 2: Authenticate Google Cloud
#from google.colab import auth
#auth.authenticate_user()

# Step 3: Upload Dataset to GCS
from google.cloud import storage

# Initialize a client for Google Cloud Storage
client = storage.Client()

# Specify your bucket name
bucket_name = 'my-legal-data-bucket'  # Replace with your actual bucket name
bucket = client.get_bucket(bucket_name)

# Upload dataset to GCS
# NOTE(review): assumes the unzipped Kaggle download contains a file named exactly
# 'sentiment140.csv' in /content/sample_data -- TODO confirm the extracted file name.
blob = bucket.blob('sentiment140.csv')
blob.upload_from_filename('/content/sample_data/sentiment140.csv')
print("Dataset uploaded successfully!")

Dataset uploaded successfully!


In [None]:
#5 Load and Preprocess Data
import pandas as pd

# Load dataset (the file has no header row, so column names are supplied here)
df = pd.read_csv('sample_data/sentiment140.csv', encoding='latin1', header=None, names=['sentiment', 'id', 'date', 'query', 'user', 'text'])

# Keep the relevant columns, lowercase the text and map the numeric sentiment codes
# to string labels. A single chained expression builds a new frame, which avoids the
# SettingWithCopyWarning raised by assigning into a column-subset of `df`, and the
# vectorised `.str.lower()` replaces a per-row Python lambda.
df = (
    df[['sentiment', 'text']]
    .assign(
        text=lambda d: d['text'].str.lower(),  # Convert to lowercase
        label=lambda d: d['sentiment'].map({0: 'negative', 4: 'positive'}),
    )
    .loc[:, ['text', 'label']]
)

# Save processed data
df.to_csv('processed_sentiment140.csv', index=False)

# Upload processed data to GCS
# NOTE(review): `bucket` is not created in this cell; it is presumably the bucket
# handle left in the kernel by the "#4 Upload Dataset to GCS" cell.
processed_blob = bucket.blob('processed_sentiment140.csv')
processed_blob.upload_from_filename('processed_sentiment140.csv')


In [None]:
#6 Convert CSV to TFRecord
import tensorflow as tf

# Function to create TFRecord example
def create_tf_example(text, label):
    """Serialize one (text, label) pair as a ``tf.train.Example``.

    Args:
        text: the example text; encoded with ``str.encode()``.
        label: the label string; encoded with ``str.encode()``.

    Returns:
        The serialized Example as bytes, with byte-list features 'text' and 'label'.
    """
    def _bytes_feature(value):
        # Wrap a single encoded string in a one-element bytes-list feature.
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value.encode()]))

    features = tf.train.Features(feature={
        'text': _bytes_feature(text),
        'label': _bytes_feature(label),
    })
    return tf.train.Example(features=features).SerializeToString()

# Convert CSV to TFRecord
def csv_to_tfrecord(csv_file, tfrecord_file):
    """Convert a CSV with 'text' and 'label' columns into a TFRecord file.

    Args:
        csv_file: path of the CSV to read.
        tfrecord_file: path of the TFRecord file to write (one serialized
            Example per CSV row).
    """
    # dtype=str + keep_default_na=False: read every cell as the literal string in
    # the file. With pandas' default NA parsing, texts such as "null", "nan" or
    # "n/a" come back as float NaN and `.encode()` in create_tf_example raises.
    df = pd.read_csv(csv_file, dtype=str, keep_default_na=False)
    with tf.io.TFRecordWriter(tfrecord_file) as writer:
        # zip over the two columns instead of iterrows(): no per-row Series is
        # built, which matters for a file with many rows.
        for text, label in zip(df['text'], df['label']):
            writer.write(create_tf_example(text, label))

# Convert the CSV to TFRecord (written to the current working directory)
csv_to_tfrecord('processed_sentiment140.csv', 'sentiment140.tfrecord')

# Upload TFRecord to GCS
# NOTE(review): `bucket` is not created in this cell; it is presumably the handle
# left in the kernel by an earlier upload cell.
tfrecord_blob = bucket.blob('sentiment140.tfrecord')
tfrecord_blob.upload_from_filename('sentiment140.tfrecord')



In [None]:
# Split the processed sentiment data 80/20 (seeded) and upload both parts to GCS.
import pandas as pd
from sklearn.model_selection import train_test_split
from google.cloud import storage

# Load the main dataset
df = pd.read_csv('/content/processed_sentiment140.csv')

# Split the data into training and validation sets (80% train, 20% validation)
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Save the split datasets locally
train_df.to_csv('train_data.csv', index=False)
val_df.to_csv('val_data.csv', index=False)

# Upload the split datasets to Google Cloud Storage
# NOTE(review): these object names ('train_data.csv', 'val_data.csv') are the same
# ones the customer-support example uploaded to this bucket, so those objects are
# replaced here -- confirm that is intended.
client = storage.Client()
bucket_name = 'my-legal-data-bucket'  # Replace with your bucket name
bucket = client.bucket(bucket_name)

# Upload the training dataset
train_blob = bucket.blob('train_data.csv')
train_blob.upload_from_filename('train_data.csv')

# Upload the validation dataset
val_blob = bucket.blob('val_data.csv')
val_blob.upload_from_filename('val_data.csv')

print("Train and validation data uploaded to GCS successfully.")

Train and validation data uploaded to GCS successfully.


In [None]:
#7 Write the Training Script and upload to GCS bucket
from google.cloud import storage

# Step 1: Create the 'train_bert.py' script
# Fixes relative to the first draft of the script:
#   * string labels ('negative'/'positive') are mapped to integer class ids --
#     BertForSequenceClassification cannot train on string labels;
#   * gs:// URIs are rewritten to the /gcs/<bucket> mount, because
#     save_pretrained() writes to the local filesystem only;
#   * optional --learning_rate / --batch_size flags (defaults = previous constants)
#     so the same script can serve as a hyperparameter-tuning trial;
#   * the tokenizer is saved next to the model so the output directory is loadable.
script_content = """
import argparse

from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# The processed CSV stores the sentiment as text; the classifier needs integer ids.
LABEL2ID = {'negative': 0, 'positive': 1}


def to_local_path(path):
    # Vertex AI custom training exposes Cloud Storage buckets under /gcs/<bucket>
    # (Cloud Storage FUSE), so gs:// URIs are rewritten to plain file paths.
    if path.startswith('gs://'):
        return '/gcs/' + path[len('gs://'):]
    return path


def main(args):
    # Load dataset
    dataset = load_dataset(
        'csv',
        data_files={
            'train': to_local_path(args.train_data),
            'validation': to_local_path(args.val_data),
        },
    )

    # Drop rows without usable text or with a label outside LABEL2ID
    dataset = dataset.filter(
        lambda row: row['text'] is not None and row['label'] in LABEL2ID
    )

    # Load tokenizer and model
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(LABEL2ID))

    # Preprocess function: tokenize the text and attach integer labels
    def preprocess_function(examples):
        encoded = tokenizer(examples['text'], padding='max_length', truncation=True)
        encoded['labels'] = [LABEL2ID[label] for label in examples['label']]
        return encoded

    # The raw string columns are removed so only tensors reach the data collator
    tokenized_dataset = dataset.map(
        preprocess_function, batched=True, remove_columns=['text', 'label']
    )

    # Training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        evaluation_strategy='epoch',
        learning_rate=args.learning_rate,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        num_train_epochs=3,
        weight_decay=0.01
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset['train'],
        eval_dataset=tokenized_dataset['validation']
    )

    # Fine-tune the model
    trainer.train()

    # Save the model and tokenizer
    output_dir = to_local_path(args.model_output)
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    # When run as a hyperparameter-tuning trial, report the last validation loss.
    eval_losses = [entry['eval_loss'] for entry in trainer.state.log_history if 'eval_loss' in entry]
    if eval_losses:
        try:
            import hypertune
        except ImportError:
            hypertune = None
        if hypertune is not None:
            hypertune.HyperTune().report_hyperparameter_tuning_metric(
                hyperparameter_metric_tag='eval_loss',
                metric_value=eval_losses[-1],
                global_step=len(eval_losses),
            )

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--train_data", type=str, required=True)
    parser.add_argument("--val_data", type=str, required=True)
    parser.add_argument("--model_output", type=str, required=True)
    parser.add_argument("--learning_rate", type=float, default=2e-5)
    parser.add_argument("--batch_size", type=int, default=8)
    args = parser.parse_args()
    main(args)
"""

# Write the script to a file named 'train_bert.py'
with open("/content/sample_data/train_bert.py", "w") as script_file:
    script_file.write(script_content)

# Initialize the GCS client and specify the bucket
client = storage.Client()
bucket_name = 'my-legal-data-bucket'  # Replace with your bucket name
bucket = client.bucket(bucket_name)

# Upload the script to GCS
blob = bucket.blob('train_bert.py')  # You can specify the directory in GCS
blob.upload_from_filename('/content/sample_data/train_bert.py')

print("train_bert.py uploaded to GCS successfully.")


train_bert.py uploaded to GCS successfully.


In [None]:
from google.cloud import aiplatform

# Where the training script is asked to write the fine-tuned model
MODEL_OUTPUT_URI = 'gs://my-legal-data-bucket/models/bert-finetuned'

# Initialize Vertex AI with the staging bucket
aiplatform.init(
    project='lawgpt-423703',
    location='us-central1',
    staging_bucket='gs://my-legal-data-bucket'  # Replace with your GCS bucket
)

# Create and run a custom training job
job = aiplatform.CustomTrainingJob(
    display_name='bert-finetuning',
    # script_path must be a LOCAL file: the SDK packages it and stages it in the
    # staging bucket itself. A gs:// URI is not accepted here.
    script_path='/content/sample_data/train_bert.py',
    container_uri='us-docker.pkg.dev/vertex-ai/training/tf-gpu.2-6:latest',  # TensorFlow GPU container
    # transformers.Trainer runs on PyTorch, so torch is installed explicitly
    # (presumably absent from the TensorFlow image -- harmless if already present).
    requirements=['transformers', 'datasets', 'torch']  # Specify additional Python dependencies
)

# Run the training job on the train/validation splits uploaded earlier.
# (The previous arguments trained on the full processed file and pointed --val_data
# at 'processed_sentiment140_val.csv', which no cell ever creates or uploads.)
model = job.run(
    args=[
        "--train_data", 'gs://my-legal-data-bucket/train_data.csv',
        "--val_data", 'gs://my-legal-data-bucket/val_data.csv',
        "--model_output", MODEL_OUTPUT_URI
    ],
    replica_count=1,
    machine_type='n1-standard-4',
    accelerator_type='NVIDIA_TESLA_T4',  # Using a T4 GPU
    accelerator_count=1
)

# No serving container was configured on the job, so run() does not register a
# Vertex AI Model and returns None; report the artifact location instead of
# dereferencing `model.display_name`.
print(f"Training job finished. Model artifacts written to: {MODEL_OUTPUT_URI}")


In [None]:
# Evaluate the fine-tuned model
# NOTE(review): `trainer` is not created in this example's cells. The only Trainer
# visible in the notebook is the one built on the ten-sentence toy dataset in
# Example 5.2.1, so these numbers presumably describe that model, not the Vertex AI
# training job above -- confirm.
results = trainer.evaluate()

# Print evaluation results with a more structured output
# (each value is formatted with four decimals, so every value must be numeric)
print("Evaluation Results:")
for key, value in results.items():
    print(f"{key}: {value:.4f}")


In [None]:
# Save the fine-tuned model locally
from pathlib import Path

from google.cloud import storage

model_output_dir = 'fine_tuned_bert'
# By this point `model` has been rebound by later cells (Keras model, AutoML model,
# custom-job result) and has no save_pretrained(); save the model held by the
# Trainer evaluated in the previous cell, plus its tokenizer (which provides vocab.txt).
trainer.save_model(model_output_dir)
tokenizer.save_pretrained(model_output_dir)

# Print a success message after saving the model
print(f"Model saved locally at {model_output_dir}")

# Upload the saved model to GCS
# Initialize the Google Cloud Storage client
client = storage.Client()

# Define your GCS bucket and destination path for the model
bucket_name = 'my-legal-data-bucket'  # Replace with your GCS bucket name
bucket = client.bucket(bucket_name)

# Upload every file that was actually written. The weight file name depends on the
# transformers version (pytorch_model.bin vs model.safetensors), so a hard-coded
# list fails with FileNotFoundError on whichever file is absent.
for path in sorted(Path(model_output_dir).iterdir()):
    if not path.is_file():
        continue
    filename = path.name
    blob = bucket.blob(f'bert_finetuned/{filename}')  # Specify the GCS folder
    blob.upload_from_filename(str(path))
    print(f"Uploaded {filename} to gs://{bucket_name}/bert_finetuned/{filename}")

print(f"Model successfully uploaded to GCS at gs://{bucket_name}/bert_finetuned/")


In [None]:
# Register the uploaded artifacts as a Vertex AI model, deploy it and send a test request.
from google.cloud import aiplatform

# Initialize Vertex AI
aiplatform.init(project='lawgpt-423703', location='us-central1')

# Define the model upload from the GCS path where the fine-tuned model is stored
# NOTE(review): the serving image is a TensorFlow 2.6 prediction container, while
# the artifacts under bert_finetuned/ were written by Hugging Face save_pretrained --
# confirm that this container can load that format.
model = aiplatform.Model.upload(
    display_name='sentiment140-bert-model',  # Name of the model in Vertex AI
    artifact_uri='gs://my-legal-data-bucket/bert_finetuned',  # Path to the fine-tuned model in GCS
    serving_container_image_uri='us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-6:latest'  # TensorFlow serving image
)

# Deploy the model to an endpoint
endpoint = model.deploy(
    machine_type='n1-standard-4',  # Machine type for deployment
    accelerator_type=None,  # No GPU for the deployment (can change if you need GPU)
    sync=True  # Wait for deployment to complete
)

print(f"Model deployed successfully at endpoint: {endpoint.display_name}")

# Test the deployed model
# NOTE(review): raw strings are sent as instances; this assumes the served model
# accepts untokenized text -- TODO confirm the expected instance format.
test_texts = ['I love this movie!', 'This movie is terrible.']
predictions = endpoint.predict(instances=test_texts)

# Output predictions
print(f"Predictions: {predictions}")


# Example 5.7.3 Implementing Evaluation and Validation on GCP

In [None]:
#1 Prepare Evaluation Datasets
# Splits the processed sentiment data into train / validation / test and uploads all
# three files to the bucket root.
import pandas as pd
from sklearn.model_selection import train_test_split
from google.cloud import storage

# Initialize Google Cloud Storage client
client = storage.Client()
bucket_name = 'my-legal-data-bucket'  # Replace with your GCS bucket name
bucket = client.bucket(bucket_name)

# Load the dataset from GCS
df = pd.read_csv('gs://my-legal-data-bucket/processed_sentiment140.csv')

# Split the data into training, validation, and test sets.
# 20% is held out for test, then 20% of the remainder for validation,
# i.e. 64% / 16% / 20% of the full data. Both splits are seeded.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

# Save the splits locally
train_df.to_csv('train_data.csv', index=False)
val_df.to_csv('val_data.csv', index=False)
test_df.to_csv('test_data.csv', index=False)

# Upload the training dataset to GCS
train_blob = bucket.blob('train_data.csv')
train_blob.upload_from_filename('train_data.csv')

# Upload the validation dataset to GCS
val_blob = bucket.blob('val_data.csv')
val_blob.upload_from_filename('val_data.csv')

# Upload the test dataset to GCS
test_blob = bucket.blob('test_data.csv')
test_blob.upload_from_filename('test_data.csv')

print("Training, validation, and test datasets uploaded to GCS successfully.")


Training, validation, and test datasets uploaded to GCS successfully.


In [None]:
#2 Evaluate Model Performance
from google.cloud import aiplatform
import pandas as pd
from google.cloud import storage
from sklearn.metrics import accuracy_score, classification_report

# Number of texts sent per online-prediction request. Online prediction requests are
# size-limited, so the test set cannot be sent in a single call.
PREDICT_BATCH_SIZE = 100

# Step 1: Initialize Vertex AI
aiplatform.init(project='lawgpt-423703', location='us-central1')

# Step 2: Look up the uploaded model by its resource name
model = aiplatform.Model(model_name='projects/123456789/locations/us-central1/models/1234567890987654321')  # Replace with your model's resource name

# Step 3: Deploy the model to an endpoint
endpoint = model.deploy(machine_type='n1-standard-4')

# Step 4: Load test data from GCS
client = storage.Client()
bucket_name = 'my-legal-data-bucket'  # Replace with your GCS bucket name
bucket = client.bucket(bucket_name)

# Download test data from GCS to local file
test_blob = bucket.blob('test_data.csv')
test_blob.download_to_filename('test_data.csv')

# Load the test data into a DataFrame
test_data = pd.read_csv('test_data.csv')

# Steps 5-6: Get predictions from the deployed model in batches and collect the labels.
# Assuming 'text' is the feature and the model expects a list of strings for prediction.
eval_inputs = test_data['text'].tolist()
predicted_labels = []
for start in range(0, len(eval_inputs), PREDICT_BATCH_SIZE):
    predictions = endpoint.predict(instances=eval_inputs[start:start + PREDICT_BATCH_SIZE])
    # Adjust the key based on the actual prediction format returned by Vertex AI
    predicted_labels.extend(prediction['predicted_label'] for prediction in predictions.predictions)

# Step 7: Evaluate model performance
true_labels = test_data['label'].tolist()
accuracy = accuracy_score(true_labels, predicted_labels)
report = classification_report(true_labels, predicted_labels)

# Step 8: Print evaluation results
print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)


In [None]:
#3 Use Vertex AI for hyperparameter tuning
from google.cloud import aiplatform
from google.cloud.aiplatform import hyperparameter_tuning as hpt

# Initialize Vertex AI (a staging bucket is needed to package the local script)
aiplatform.init(
    project='lawgpt-423703',
    location='us-central1',
    staging_bucket='gs://my-legal-data-bucket'
)

# A tuning job wraps a CustomJob that describes ONE trial: script, container,
# machine and fixed arguments all belong here, not on HyperparameterTuningJob.
# The training script must accept --learning_rate and --batch_size and report the
# 'eval_loss' metric through the cloudml-hypertune package.
custom_job = aiplatform.CustomJob.from_local_script(
    display_name='bert-finetuning-trial',
    script_path='/content/sample_data/train_bert.py',  # Local path to the training script
    container_uri='us-docker.pkg.dev/vertex-ai/training/tf-gpu.2-6:latest',  # TensorFlow GPU container
    requirements=['transformers', 'datasets', 'torch', 'cloudml-hypertune'],
    args=[
        "--train_data", 'gs://my-legal-data-bucket/train_data.csv',
        "--val_data", 'gs://my-legal-data-bucket/val_data.csv',
        "--model_output", 'gs://my-legal-data-bucket/models/tuned_bert_model'
    ],
    replica_count=1,
    machine_type='n1-standard-4',
    accelerator_type='NVIDIA_TESLA_T4',  # Using a T4 GPU
    accelerator_count=1
)

# Define the hyperparameter tuning job
tuning_job = aiplatform.HyperparameterTuningJob(
    display_name='bert-hyperparameter-tuning-job',
    custom_job=custom_job,
    # Metric reported by each trial and the direction to optimise it in
    metric_spec={'eval_loss': 'minimize'},
    # Search space: the dict key is the name of the flag passed to the script
    parameter_spec={
        'learning_rate': hpt.DoubleParameterSpec(min=0.001, max=0.1, scale='log'),
        'batch_size': hpt.IntegerParameterSpec(min=16, max=128, scale='linear'),
    },
    max_trial_count=10,  # Number of trials for tuning
    parallel_trial_count=2,  # Number of parallel trials
)

# Run the hyperparameter tuning job (blocks until all trials have finished)
tuning_job.run()

print("Hyperparameter tuning job completed.")


In [None]:
#4 Cross Validation
import pandas as pd
from google.cloud import storage
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

# Run 5-fold cross-validation of a random forest on the processed dataset in GCS.

# Step 1: Initialize Google Cloud Storage client
client = storage.Client()
bucket_name = 'my-legal-data-bucket'  # Replace with your GCS bucket name
bucket = client.bucket(bucket_name)

# Step 2: Download dataset from GCS
blob = bucket.blob('processed_data.csv')  # Replace with the correct file in your bucket
blob.download_to_filename('processed_data.csv')

# Step 3: Load the dataset into a pandas DataFrame
df = pd.read_csv('processed_data.csv')

# Step 4: Prepare features and labels for the model
X = df[['feature1', 'feature2']]  # Replace with your actual feature column names
y = df['label']  # Replace with your actual label column name

# Step 5: Initialize the model
# A fixed seed makes the forest, and therefore the fold scores, reproducible
# across re-runs of the notebook.
model = RandomForestClassifier(random_state=42)

# Step 6: Perform 5-fold cross-validation
# Scoring is spelled out so that it visibly matches the "Mean Accuracy" label below.
scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')

# Step 7: Output the cross-validation results
print(f'Cross-Validation Scores: {scores}')
print(f'Mean Accuracy: {scores.mean()}')


In [None]:
#6 Monitor Model in Production
from google.cloud import logging
from sklearn.metrics import classification_report, accuracy_score

# Example evaluation data (replace with your actual true and predicted labels)
example_true_labels = [1, 0, 1, 1, 0]
example_predicted_labels = [1, 0, 1, 0, 0]

# Accuracy and report are both derived from the same label lists so that the
# two figures written to the log can never contradict each other.
accuracy = accuracy_score(example_true_labels, example_predicted_labels)
report = classification_report(example_true_labels, example_predicted_labels, output_dict=False)

# Step 1: Initialize Cloud Logging client for the current Google Cloud project
client = logging.Client(project='lawgpt-423703')  # Replace with your project ID
logger = client.logger('model-evaluation')  # Use a descriptive name for your logger (e.g., model-evaluation)

# Step 2: Format and log the evaluation results to Cloud Logging
log_message = f'Model Evaluation Results:\nAccuracy={accuracy}\nClassification Report:\n{report}'
logger.log_text(log_message)

# Step 3: Print a success message to confirm logging
print("Model evaluation results logged to Google Cloud Logging successfully.")


In [None]:
#7 A/B Testing
from google.cloud import aiplatform
import pandas as pd
from google.cloud import storage
from sklearn.metrics import accuracy_score

# Compare two registered model versions on the same test set. Each version is
# deployed to its own temporary endpoint; both endpoints are removed once the
# predictions have been collected, whether or not the comparison succeeded.

# Step 1: Initialize Vertex AI
aiplatform.init(project='lawgpt-423703', location='us-central1')

# Step 2: Load the two versions of the model using their respective model IDs
model_v1 = aiplatform.Model(model_name='projects/123456789/locations/us-central1/models/1111111111111111111')  # Replace with actual model ID
model_v2 = aiplatform.Model(model_name='projects/123456789/locations/us-central1/models/2222222222222222222')  # Replace with actual model ID

# Step 3: Load test data from GCS
# Done before deploying so that a wrong bucket/blob name fails fast, before any
# endpoint has been created.
client = storage.Client()
bucket_name = 'my-legal-data-bucket'  # Replace with your GCS bucket name
bucket = client.bucket(bucket_name)

# Download test data from GCS to local file
test_blob = bucket.blob('test_data.csv')
test_blob.download_to_filename('test_data.csv')

# Load the test data into a DataFrame
test_data = pd.read_csv('test_data.csv')
test_texts = test_data['text'].tolist()

# Endpoints created by this cell; tracked so the cleanup below also covers the
# case where the second deployment or a prediction call fails.
deployed_endpoints = []
try:
    # Step 4: Deploy both models to separate endpoints
    endpoint_v1 = model_v1.deploy(machine_type='n1-standard-4', sync=True)
    deployed_endpoints.append(endpoint_v1)
    endpoint_v2 = model_v2.deploy(machine_type='n1-standard-4', sync=True)
    deployed_endpoints.append(endpoint_v2)

    print(f"Model v1 deployed at endpoint: {endpoint_v1.display_name}")
    print(f"Model v2 deployed at endpoint: {endpoint_v2.display_name}")

    # Step 5: Make predictions using both model versions
    predictions_v1 = endpoint_v1.predict(test_texts)
    predictions_v2 = endpoint_v2.predict(test_texts)
finally:
    # force=True undeploys the model before deleting each endpoint.
    for deployed_endpoint in deployed_endpoints:
        deployed_endpoint.delete(force=True)

# Step 6: Extract predicted labels
predicted_labels_v1 = [pred['predicted_label'] for pred in predictions_v1.predictions]  # Adjust based on actual prediction format
predicted_labels_v2 = [pred['predicted_label'] for pred in predictions_v2.predictions]  # Adjust based on actual prediction format

# Step 7: Get true labels from the test dataset
true_labels = test_data['label'].tolist()

# Step 8: Evaluate and compare accuracy of both models
accuracy_v1 = accuracy_score(true_labels, predicted_labels_v1)
accuracy_v2 = accuracy_score(true_labels, predicted_labels_v2)

# Step 9: Print out the accuracy results for comparison
print(f'Accuracy for Model v1: {accuracy_v1}')
print(f'Accuracy for Model v2: {accuracy_v2}')