In [36]:
import pandas as pd

In [37]:
# Load the training split from a CSV resolved relative to the working directory.
train_csv = 'Train_12.csv'
data = pd.read_csv(train_csv)

In [None]:
# Preview the first five raw rows.
data.head(n=5)

In [None]:
# (rows, columns) of the raw training frame, before any filtering.
data.shape

In [40]:
def drop_na_text_locs(df):
    """Return the rows of ``df`` that have both a ``text`` and a ``location`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the ``text`` and ``location`` columns.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the surviving rows. The original index labels
        are kept and ``df`` itself is left untouched.

    Raises
    ------
    KeyError
        If either column is missing from ``df``.
    """
    # The explicit copy detaches the result from ``df``: callers add new
    # columns to it, which on a bare ``dropna`` result can raise
    # SettingWithCopyWarning.
    return df.dropna(subset=['text', 'location']).copy()


In [41]:
# Keep only the rows that carry both a text and a location label.
df = data.pipe(drop_na_text_locs)

In [None]:
# Number of rows left after dropping incomplete records.
df.shape[0]

In [None]:
# Prefix every text with the task instruction to form the model's source prompt.
# An alternative trailing question
# (" What are the locations mentioned in the paragraph?") is left disabled.
task_prefix = "extract locations: "
df['input_text'] = task_prefix + df['text']

In [None]:
# Preview the first five rows, now including the prompt column.
df.head(n=5)

In [None]:
# (rows, columns) of the filtered frame after the prompt column was added.
df.shape

In [81]:
import re

def clean_text(text):
    """Normalise one piece of text down to lowercase ASCII letters and single spaces.

    Missing values (anything ``pd.isna`` reports as null) become the literal
    string ``'none'``. Otherwise the text is lowercased and, in this order,
    URLs starting with ``http``, ``@mentions``, ``#hashtags`` and every
    character that is neither an ASCII letter nor whitespace are removed.
    Finally the text is trimmed and runs of whitespace collapse to one space.
    """
    if pd.isna(text):
        return 'none'

    cleaned = text.lower()
    # Order matters: URLs, mentions and hashtags must go while their marker
    # characters are still present, before the catch-all letter filter.
    for pattern in (r'http\S+', r'@\w+', r'#\w+', r'[^A-Za-z\s]'):
        cleaned = re.sub(pattern, '', cleaned)
    return re.sub(r'\s+', ' ', cleaned.strip())

In [None]:
# Normalise every prompt element-wise with clean_text.
df['clean_text'] = df['input_text'].map(clean_text)

In [None]:
# Longest cleaned prompt, counted in whitespace-separated words (not tokenizer tokens).
max_len = max(len(words) for words in df['clean_text'].str.split())
print(max_len)

In [57]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

# One checkpoint name drives both the model weights and the matching tokenizer.
model_name = "t5-small"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

In [86]:
# Token budgets for the encoder input and the decoder target, respectively.
source_max_len, target_max_len = 64, 32

In [87]:
from sklearn.model_selection import train_test_split

In [88]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
train_df, val_df = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Number of rows in the training split.
train_df.shape[0]

In [90]:
# Encode the cleaned prompts of both splits with one shared set of options so
# they cannot drift apart. The length cap comes from `source_max_len` instead of
# a repeated literal. `padding=True` pads each split to its own longest sequence.
source_encoding_kwargs = dict(
    max_length=source_max_len,
    padding=True,
    truncation=True,
    return_tensors="pt",
)
inputs_train = tokenizer(train_df['clean_text'].tolist(), **source_encoding_kwargs)
inputs_val = tokenizer(val_df['clean_text'].tolist(), **source_encoding_kwargs)

In [None]:
def _encode_targets(locations):
    """Tokenize target location strings and mask their padding for the loss.

    Parameters
    ----------
    locations : list[str]
        Target strings, one per example.

    Returns
    -------
    BatchEncoding
        ``input_ids`` holds the label ids with every padded position replaced
        by -100; ``attention_mask`` still marks the real tokens.
    """
    # T5 uses the same vocabulary for sources and targets, so a plain tokenizer
    # call replaces the deprecated `as_target_tokenizer()` context manager.
    encoded = tokenizer(
        locations,
        max_length=target_max_len,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    # The loss ignores label id -100. Leaving the pad id in place would train
    # the model on padding positions as well.
    encoded['input_ids'] = encoded['input_ids'].masked_fill(
        encoded['attention_mask'] == 0, -100
    )
    return encoded


labels_train = _encode_targets(train_df['location'].tolist())
labels_val = _encode_targets(val_df['location'].tolist())

In [93]:
# Pull the id and mask tensors out of the encodings, one split at a time.
input_inputs_train, input_masks_train = inputs_train['input_ids'], inputs_train['attention_mask']
output_labels_train, output_masks_train = labels_train['input_ids'], labels_train['attention_mask']

input_inputs_val, input_masks_val = inputs_val['input_ids'], inputs_val['attention_mask']
output_labels_val, output_masks_val = labels_val['input_ids'], labels_val['attention_mask']

In [94]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.optim as optim

In [98]:
batch_size = 16

# Training batches: (input ids, input mask, label ids, label mask), reshuffled
# on every pass over the data.
train_data_t = TensorDataset(input_inputs_train, input_masks_train, output_labels_train, output_masks_train)
train_sampler = RandomSampler(train_data_t)
train_dataloader = DataLoader(train_data_t, sampler=train_sampler, batch_size=batch_size)

# Validation batches use the same tuple layout. They are read in a fixed order:
# shuffling adds nothing to evaluation and makes per-batch results
# non-repeatable between runs.
val_data_t = TensorDataset(input_inputs_val, input_masks_val, output_labels_val, output_masks_val)
val_sampler = SequentialSampler(val_data_t)
val_dataloader = DataLoader(val_data_t, sampler=val_sampler, batch_size=batch_size)

In [None]:
# Read the test split via a relative path, like the other CSV reads in this
# notebook. An absolute path into one user's desktop only works on that machine.
test_df = pd.read_csv('Test_10.csv')
test_df.head()

In [13]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer

class QA_Dataset(Dataset):
    """Map-style dataset that tokenizes question/context/answer rows for a seq2seq model.

    For each row the source text is ``"question: {question} context: {context}"``
    and the target text is the row's answer.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``context``, ``question`` and ``answer``.
        Any index is accepted; rows are addressed by position.
    tokenizer : callable
        Called as ``tokenizer(text, max_length=..., padding='max_length',
        truncation=True, return_tensors='pt')``. It must return a mapping with
        ``input_ids`` and ``attention_mask`` tensors whose first dimension is 1.
    source_len : int
        Fixed token length of the encoded source text.
    target_len : int
        Fixed token length of the encoded target text.
    """

    def __init__(self, dataframe, tokenizer, source_len=64, target_len=32):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.target_len = target_len
        self.contexts = self.data['context']
        self.questions = self.data['question']
        self.answers = self.data['answer']

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenized example at position ``idx``.

        Returns a dict with 1-D tensors ``input_ids``, ``attention_mask`` and
        ``labels``. Padded label positions are set to -100.

        Raises ``IndexError`` when ``idx`` is out of range.
        """
        # Positional access: a DataLoader hands out 0..len-1, but the frame's
        # index labels can have gaps (for example after dropna or a split), so
        # label-based lookup would fail or pick the wrong row.
        context = self.contexts.iloc[idx]
        question = self.questions.iloc[idx]
        answer = self.answers.iloc[idx]

        # Tokenize inputs and outputs
        input_text = f"question: {question} context: {context}"
        input_encodings = self.tokenizer(input_text, max_length=self.source_len, padding='max_length', truncation=True, return_tensors='pt')
        target_encodings = self.tokenizer(answer, max_length=self.target_len, padding='max_length', truncation=True, return_tensors='pt')

        # squeeze(0) removes only the batch dimension, so the sequence
        # dimension survives even when a length of 1 is configured.
        input_ids = input_encodings['input_ids'].squeeze(0)
        attention_mask = input_encodings['attention_mask'].squeeze(0)
        target_ids = target_encodings['input_ids'].squeeze(0)
        target_mask = target_encodings['attention_mask'].squeeze(0)

        # The loss skips label id -100; without this, padding positions
        # would be trained on as well.
        target_ids = target_ids.masked_fill(target_mask == 0, -100)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': target_ids
        }

# Load the tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# QA_Dataset reads the columns 'context', 'question' and 'answer', while the
# frames used elsewhere in this notebook expose 'text' and 'location'. Passing
# the raw frame would fail on the missing columns, so a QA-shaped view is built
# from the filtered frame. The index is reset so rows are numbered 0..n-1.
# NOTE(review): the fixed question below is an assumption - confirm the wording.
qa_df = (
    df.rename(columns={'text': 'context', 'location': 'answer'})
      .assign(question="What are the locations mentioned in the paragraph?")
      .reset_index(drop=True)
)

dataset = QA_Dataset(qa_df, tokenizer)

# Create DataLoader
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)


In [110]:
import torch
from datasets import Dataset

# Tensors for the training split.
train_ids, train_mask = inputs_train['input_ids'], inputs_train['attention_mask']
train_labels = labels_train['input_ids']

# Tensors for the validation split.
val_ids, val_mask = inputs_val['input_ids'], inputs_val['attention_mask']
val_labels = labels_val['input_ids']

# Wrap each split in a dataset with the three columns the trainer consumes.
train_ds = Dataset.from_dict(
    dict(input_ids=train_ids, attention_mask=train_mask, labels=train_labels)
)
val_ds = Dataset.from_dict(
    dict(input_ids=val_ids, attention_mask=val_mask, labels=val_labels)
)

In [100]:
# Create DataLoader
# By default a `datasets.Dataset` returns each row as plain Python lists, which
# the default collate function turns into a per-position list of tensors rather
# than one (batch, seq) tensor per column. The torch-formatted view yields
# tensors per row, so batches stack correctly. `with_format` returns a new view
# and leaves `train_ds` unchanged.
train_dl = DataLoader(train_ds.with_format("torch"), batch_size=8, shuffle=True)

In [None]:
# Display the training dataset's summary repr as a sanity check.
train_ds

In [None]:
from transformers import T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

# Load the model
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Set training arguments
# NOTE(review): learning_rate=2e-3 is far above the Trainer default of 5e-5 -
# confirm this is intentional.
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    learning_rate=2e-3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
)

# Define the data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Initialize Trainer
# The validation split is passed in so the held-out loss can be measured;
# without it the split is built but never used.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Report the loss on the held-out split once training has finished.
eval_metrics = trainer.evaluate()
print(eval_metrics)

# Save the model
model.save_pretrained('./fine-tuned-t5')
tokenizer.save_pretrained('./fine-tuned-t5')


In [None]:
# Persist the model weights and tokenizer files side by side in one directory.
fine_tuned_dir = './fine-tuned-t5'
model.save_pretrained(fine_tuned_dir)
tokenizer.save_pretrained(fine_tuned_dir)

In [116]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Restore the tokenizer and the fine-tuned weights from the saved directory.
fine_tuned_dir = './fine-tuned-t5'
tokenizer = T5Tokenizer.from_pretrained(fine_tuned_dir)
model = T5ForConditionalGeneration.from_pretrained(fine_tuned_dir)


In [None]:
# Read the test split and preview its first five rows.
test_csv = 'Test_10.csv'
test_df = pd.read_csv(test_csv)
test_df.head(n=5)

In [123]:
# Build the test prompts the same way as the training prompts: task prefix
# first, then clean_text. The model was fine-tuned on prefixed inputs, so
# unprefixed text at inference would not match what it saw during training.
# Missing texts are passed through unchanged so clean_text still maps them to 'none'.
test_df['clean_text'] = test_df['text'].map(
    lambda raw: clean_text(raw if pd.isna(raw) else "extract locations: " + raw)
)

In [124]:
# Encode the test prompts with the same length cap as the training inputs,
# taken from `source_max_len` rather than a repeated literal.
inputs_test = tokenizer(
    test_df['clean_text'].tolist(),
    max_length=source_max_len,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

In [None]:

# Beam-search decoding over the whole encoded test set in one call.
# `temperature` is dropped: it only affects sampling (do_sample=True), so under
# beam search it was ignored and the generated sequences are unchanged.
output_sequences = model.generate(
    input_ids=inputs_test["input_ids"],
    attention_mask=inputs_test["attention_mask"],
    max_length=64,
    num_beams=5,
)

# Turn each generated id sequence back into text, without pad/eos markers.
decoded_outputs = tokenizer.batch_decode(output_sequences, skip_special_tokens=True)
