In [None]:
!pip install accelerate -U

In [None]:
!pip install datasets --upgrade

In [None]:
!pip install -U -q PyDrive

In [None]:
# `zipfile` is part of the Python standard library, so there is nothing to
# install from PyPI; `import zipfile` works in a fresh kernel without it.

In [None]:
!pip install gdown

In [None]:
import os

# Ask Weights & Biases to run silently by setting its WANDB_SILENT variable.
os.environ.update({'WANDB_SILENT': 'true'})

In [None]:
# !gdown --id <File ID>
!gdown --id 1gvsKk1h9X9H9cVaqi6cRo4WCiu0duznr

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from transformers import pipeline
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import TrainingArguments, Trainer
import pandas as pd
from datasets import Dataset

In [None]:
# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

In [None]:
import os
import zipfile
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from oauth2client.client import GoogleCredentials

In [None]:
# Path of the service-account key file, exported through the
# GOOGLE_APPLICATION_CREDENTIALS environment variable.
service_account_key_path = '/kaggle/input/googlekey/cnn3-408602-f8776ea41ea4.json'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = service_account_key_path



In [None]:
# Create a PyDrive auth object and wrap it in a Drive client.
# NOTE(review): no authentication call is made on `gauth` in the cells shown
# here, and `drive` is not referenced by any later visible cell (the upload
# cells build their own `drive_service`) — confirm whether this cell is needed.
gauth = GoogleAuth()
drive = GoogleDrive(gauth)

In [None]:
import os
import zipfile

# ZIP archive containing the model, and the folder it is unpacked into
model_zip_path = '/kaggle/working/SummaryFlow_Run_4.zip'
model_extract_dir = '/kaggle/working/model'

# Make sure the destination folder exists (no error if it already does)
os.makedirs(model_extract_dir, exist_ok=True)

# Unpack every member of the archive into the destination folder
with zipfile.ZipFile(model_zip_path, 'r') as model_archive:
    model_archive.extractall(model_extract_dir)

In [None]:
# Load the tokenizer and the seq2seq model from the extracted folder,
# then move the model onto the selected device.
model_checkpoint = '/kaggle/working/model'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
model = model.to(device)

In [None]:
# Load the train / test / validation splits from their Excel workbooks
train_df, test_df, val_df = (
    pd.read_excel(excel_path)
    for excel_path in (
        r'/kaggle/input/exceld/TrainData.xlsx',
        r'/kaggle/input/exceld/TestData.xlsx',
        r'/kaggle/input/exceld/ValidationData.xlsx',
    )
)

In [None]:
# Data Preprocessing
def preprocess_text(text):
    """Clean a summary cell that was stored as a stringified list.

    Strips leading/trailing square brackets and removes every single and
    double quote character from the text.

    Args:
        text: The raw cell value. Normally a string; a missing cell
            (``None`` or a float NaN, which is how pandas represents empty
            cells) is accepted as well.

    Returns:
        The cleaned string, or ``""`` when the cell is missing.
    """
    # Missing cells arrive as None or float NaN (NaN is the only float that
    # is not equal to itself); calling .strip() on them would raise.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    # str() lets other non-string cells (e.g. numbers) pass through unchanged.
    # NOTE: apostrophes inside words are removed too ("don't" -> "dont").
    return str(text).strip("[]").replace("'", "").replace("\"", "")

# Clean the `summary` column of every split in place
for split_df in (train_df, test_df, val_df):
    split_df['summary'] = split_df['summary'].apply(preprocess_text)

In [None]:
# Convert each pandas DataFrame into a `datasets.Dataset`
train_dataset, test_dataset, val_dataset = (
    Dataset.from_pandas(split_df) for split_df in (train_df, test_df, val_df)
)

In [None]:
# Data Preprocessing
def get_feature(batch):
    """Tokenize a batch of source texts together with their target summaries.

    Args:
        batch: A mapping with a 'text' entry (model inputs) and a 'summary'
            entry (targets), as passed by ``Dataset.map(..., batched=True)``.

    Returns:
        A dict with the 'input_ids', 'attention_mask' and 'labels' produced
        by the module-level ``tokenizer``, truncated to at most 1024 tokens.
    """
    tokenized = tokenizer(
        batch['text'],
        text_target=batch['summary'],
        max_length=1024,
        truncation=True,
    )
    # Keep only the three fields the model consumes
    kept_keys = ('input_ids', 'attention_mask', 'labels')
    return {key: tokenized[key] for key in kept_keys}

# Tokenize every split in batches
train_dataset, test_dataset, val_dataset = (
    split.map(get_feature, batched=True)
    for split in (train_dataset, test_dataset, val_dataset)
)

In [None]:
# Expose the tokenized columns of every split as torch tensors
columns = ['input_ids', 'labels', 'attention_mask']
for split in (train_dataset, test_dataset, val_dataset):
    split.set_format(type='torch', columns=columns)

In [None]:
# Collator that batches the tokenized examples for the seq2seq model
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
# Make sure the model sits on the selected device before training
# (it was already moved there when it was loaded, so this repeats that step).
model = model.to(device)

In [None]:
# Training configuration for one further epoch of fine-tuning.
training_args = TrainingArguments(
    output_dir='/kaggle/working/SummaryFlow Run',  # Output directory for checkpoints and evaluation results
    overwrite_output_dir=False,  # Do not overwrite the output directory
    num_train_epochs=1,  # Number of training epochs to run
    per_device_train_batch_size=2,  # Batch size per GPU
    per_device_eval_batch_size=2,  # Batch size for evaluation per GPU
    warmup_steps=500,  # Number of warmup steps
    weight_decay=0.01,  # Weight decay for regularization
    logging_steps=10,  # Log every N steps
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy='steps',  # Evaluate every `eval_steps` steps
    eval_steps=500,  # Number of steps between evaluations
    save_steps=3000,  # Save checkpoint every N steps (kept a multiple of eval_steps for load_best_model_at_end)
    gradient_accumulation_steps=8,  # Number of gradient accumulation steps
    load_best_model_at_end=True,  # Load the best model from the checkpoint at the end of training
    metric_for_best_model="eval_loss",  # Metric to use for determining the best model
    greater_is_better=False,  # A lower eval_loss is better
    # `resume_from_checkpoint='latest_checkpoint'` was removed: this
    # TrainingArguments field is not consumed by Trainer, so it never resumed
    # anything. To resume from a saved checkpoint, pass
    # `resume_from_checkpoint=...` to `trainer.train()` instead.
)

In [None]:
# Build the Trainer from the model and tokenizer, the training configuration,
# the batch collator, and the train / validation splits
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)


In [None]:
import os

import wandb

# Never commit an API key to the notebook: read it from the WANDB_API_KEY
# environment variable instead. When the variable is unset, `key` is None and
# wandb falls back to its own credential lookup.
# SECURITY: the key that used to be hardcoded in this cell has been exposed
# and should be revoked and regenerated in the W&B account settings.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

In [None]:
# Train the model for a new epoch, using the settings in `training_args`
# and evaluating on `val_dataset` during training.
trainer.train()

In [None]:
# Save the trained model to its own folder; this is the folder that the
# later cells zip and upload.
# NOTE(review): this path ('SummaryFlow_Run') differs from the TrainingArguments
# output_dir ('SummaryFlow Run', with a space) — confirm that is intended.
trainer.save_model('/kaggle/working/SummaryFlow_Run')

In [None]:
# Evaluate on the test dataset and print the metrics the Trainer returns.
# NOTE: the name `results` is reassigned by the Drive file-listing cell at the
# end of the notebook.
results = trainer.evaluate(eval_dataset=test_dataset)
print(results)

In [None]:
import os
import zipfile

# Folder holding the saved model, and the archive it is packed into
folder_to_zip = '/kaggle/working/SummaryFlow_Run'
zip_file_path = '/kaggle/working/SummaryFlow_Run_5.zip'

# Walk the folder and add every file, stored under its path relative to the
# folder root so the archive has no '/kaggle/working/...' prefix
with zipfile.ZipFile(zip_file_path, 'w') as zip_file:
    for current_dir, _subdirs, names in os.walk(folder_to_zip):
        for name in names:
            source_path = os.path.join(current_dir, name)
            archive_name = os.path.relpath(source_path, folder_to_zip)
            zip_file.write(source_path, archive_name)


In [None]:
from google.oauth2.service_account import Credentials
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload

# Load the service account credentials
creds = Credentials.from_service_account_file('/kaggle/input/googlekey/cnn3-408602-f8776ea41ea4.json')

# Build the Drive v3 service
drive_service = build('drive', 'v3', credentials=creds)

# Create a media file upload object. resumable=True uploads the archive in
# chunks rather than as one request body, which is the appropriate mode for
# a large model archive.
media = MediaFileUpload(zip_file_path, mimetype='application/zip', resumable=True)

# Create a new file on Google Drive
file_metadata = {
    'name': 'SummaryFlow_Run_5.zip',
    'mimeType': 'application/zip'
}
# execute() drives the resumable upload to completion and returns the created
# file resource (only its 'id' field is requested).
file = drive_service.files().create(body=file_metadata, media_body=media, fields='id').execute()

print('Uploaded file with ID {}'.format(file.get('id')))

In [None]:
from googleapiclient.errors import HttpError


def callback(request_id, response, exception):
    """Print the outcome of one request in the batch: the error or the new permission id."""
    if not exception:
        print("Permission Id: %s" % response.get('id'))
    else:
        # Handle error
        print(exception)


# After uploading the file: grant read access to anyone.
# NOTE: this makes the uploaded archive publicly readable.
anyone_permission = {'type': 'anyone', 'role': 'reader'}

try:
    batch = drive_service.new_batch_http_request(callback=callback)
    permission_request = drive_service.permissions().create(
        fileId=file.get('id'),
        body=anyone_permission,
        fields='id',
    )
    batch.add(permission_request)
    batch.execute()
except HttpError as error:
    print(f'An error occurred: {error}')

In [None]:
from google.oauth2.service_account import Credentials
from googleapiclient.discovery import build

# Service-account credentials and a Drive v3 client built from them
creds = Credentials.from_service_account_file('/kaggle/input/googlekey/cnn3-408602-f8776ea41ea4.json')
drive_service = build('drive', 'v3', credentials=creds)

# Request one page of up to 100 files (id and name only)
list_request = drive_service.files().list(pageSize=100, fields="nextPageToken, files(id, name)")
results = list_request.execute()
items = results.get('files', [])

# Print each file as "name (id)", or a notice when the page is empty
if items:
    print('Files:')
    for item in items:
        print(f'{item["name"]} ({item["id"]})')
else:
    print('No files found.')