In [2]:
import pandas as pd


train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
validate_df = pd.read_csv('validation.csv')


In [3]:
import re
def preprocess_text(text):
    if isinstance(text, str):
        # Remove unwanted characters, links, and unnecessary spaces
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'\[[^]]*\]', '', text)  # Remove references in square brackets
        text = text.strip()
    return text

# Apply preprocessing to the DataFrames
train_df['article'] = train_df['article'].apply(preprocess_text)
train_df['abstract'] = train_df['abstract'].apply(preprocess_text)

test_df['article'] = test_df['article'].apply(preprocess_text)
test_df['abstract'] = test_df['abstract'].apply(preprocess_text)

validate_df['article'] = validate_df['article'].apply(preprocess_text)
validate_df['abstract'] = validate_df['abstract'].apply(preprocess_text)

# Check the preprocessed data
print(train_df.head())
print(test_df.head())
print(validate_df.head())

                                             article  \
0  a recent systematic analysis showed that in 20...   
1  it occurs in more than 50% of patients and may...   
2  tardive dystonia ( td ) , a rarer side effect ...   
3  lepidoptera include agricultural pests that , ...   
4  syncope is caused by transient diffuse cerebra...   

                                            abstract  
0  background : the present study was carried out...  
1  backgroundanemia in patients with cancer who a...  
2  tardive dystonia ( td ) is a serious side effe...  
3  many lepidopteran insects are agricultural pes...  
4  we present an unusual case of recurrent cough ...  
                                             article  \
0  anxiety affects quality of life in those livin...   
1  small non - coding rnas are transcribed into m...   
2  ohss is a serious complication of ovulation in...   
3  congenital adrenal hyperplasia ( cah ) refers ...   
4  type 1 diabetes ( t1d ) results from the destr... 

In [10]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the pre-trained model and tokenizer from Hugging Face
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
import torch

# Function to tokenize the dataset
def tokenize_data(df):
    tokenized_inputs = []
    tokenized_targets = []
    
    # Drop rows with NaN values in 'article' or 'abstract'
    df = df.dropna(subset=['article', 'abstract'])
    
    for article, abstract in zip(df['article'], df['abstract']):
        # Tokenize input text
        input_ids = tokenizer.encode("summarize: " + article, max_length=512, truncation=True, padding='max_length', return_tensors="pt")
        tokenized_inputs.append(input_ids)
    
        # Tokenize target text (abstract)
        target_ids = tokenizer.encode(abstract, max_length=150, truncation=True, padding='max_length', return_tensors="pt")
        tokenized_targets.append(target_ids)
    
    dataset = {
        'input_ids': torch.cat(tokenized_inputs, dim=0),
        'labels': torch.cat(tokenized_targets, dim=0),
    }
    
    return dataset

# Example usage with your DataFrame
train_data = tokenize_data(train_df)
validate_data = tokenize_data(validate_df)

# Function to summarize text
def summarize_text(article_text):
    inputs = tokenizer.encode("summarize: " + article_text, return_tensors="pt", max_length=512, truncation=True)
    summary_ids = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary


In [9]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the fine-tuned model and tokenizer
tokenizer = T5Tokenizer.from_pretrained("C:\\Users\\HP\\archive\\app.py")
model = T5ForConditionalGeneration.from_pretrained("C:\\Users\\HP\\archive\\app.py")

def summarize_text(article_text):
    inputs = tokenizer("summarize: " + article_text, return_tensors="pt", max_length=512, truncation=True)
    summary_ids = model.generate(inputs['input_ids'], max_length=150, num_beams=2, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary


RuntimeError: Internal: could not parse ModelProto from C:\Users\HP\archive\app.py

In [8]:
pip install torch torchvision torchaudio


Collecting typing-extensions>=4.8.0
  Using cached typing_extensions-4.12.2-py3-none-any.whl (37 kB)
Collecting mkl<=2021.4.0,>=2021.1.1
  Using cached mkl-2021.4.0-py2.py3-none-win_amd64.whl (228.5 MB)
Collecting intel-openmp==2021.*
  Using cached intel_openmp-2021.4.0-py2.py3-none-win_amd64.whl (3.5 MB)
Collecting tbb==2021.*
  Using cached tbb-2021.13.0-py3-none-win_amd64.whl (286 kB)
Installing collected packages: tbb, intel-openmp, typing-extensions, mkl
  Attempting uninstall: tbb
    Found existing installation: TBB 0.2
Note: you may need to restart the kernel to use updated packages.


ERROR: Cannot uninstall 'TBB'. It is a distutils installed project and thus we cannot accurately determine which files belong to it which would lead to only a partial uninstall.


In [10]:

from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

# Load the tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Tokenize the dataset
def tokenize_data(example):
    input_text = "summarize: " + example['article']
    target_text = example['abstract']
    inputs = tokenizer(input_text, max_length=512, truncation=True, padding='max_length')
    targets = tokenizer(target_text, max_length=150, truncation=True, padding='max_length')
    return {'input_ids': inputs['input_ids'], 'attention_mask': inputs['attention_mask'], 'labels': targets['input_ids']}

tokenized_dataset = dataset.map(tokenize_data, batched=True)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation']
)

# Train the model
trainer.train()


SyntaxError: invalid syntax (2400926375.py, line 1)