### DEEP LEARNING

# **TEXT SUMMARIZATION MODEL**

**IMPORTS**

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
import ast
from sklearn.model_selection import train_test_split
import re

torch.manual_seed(42)
np.random.seed(42)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

**DATA LOADING AND PREPROCESSING**

In [None]:
train_df = pd.read_csv('../data/train.csv', header=None)
test_df = pd.read_csv('../data/test.csv', header=None)

columns = ['id', 'article', 'summary']
train_df.columns = columns
test_df.columns = columns

def preprocess_text(text):
    text = text.replace('\n', ' ')
    text = ' '.join(text.split())
    return text

def clean_article_heading(article):
    pattern = r'By\s*\.\s*.*?\s*\.\s*PUBLISHED:\s*\.\s*\d+:\d+\s*EST,\s*\d+\s*[A-Za-z]+\s*\d+\s*\.\s*\|\s*\.\s*UPDATED:\s*\.\s*\d+:\d+\s*EST,\s*\d+\s*[A-Za-z]+\s*\d+\s*\.'
    cleaned_text = re.sub(pattern, '', article)
    return cleaned_text.strip()

train_df['article'] = train_df['article'].apply(preprocess_text)
train_df['article'] = train_df['article'].apply(clean_article_heading)
train_df['summary'] = train_df['summary'].apply(preprocess_text)
test_df['article'] = test_df['article'].apply(preprocess_text)
test_df['article'] = test_df['article'].apply(clean_article_heading)
test_df['summary'] = test_df['summary'].apply(preprocess_text)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

print(f"Training dataframe shape: {train_df.shape}")
print(f"Test dataframe shape: {test_df.shape}")

In [23]:
print(train_df['id'].iloc[1])
print(train_df['article'].iloc[136740])
print(train_df['summary'].iloc[136740])
print(train_df.shape)
print(test_df.shape)

0001d1afc246a7964130f43ae940af6bc6c57f01
Arkansas police have arrested a neighbor in the killing of a six-year-old girl whose body was found two days before Thanksgiving. Zachary Dewayne Holly, 28, of Bentonville, Arkansas, is being charged with capital murder, kidnapping and residential burglary and is being held in the local county jail. Little Jersey Bridgeman was reported missing the morning of November 20, and her body was found minutes after a search began in an abandoned house two doors from her home. Jersey, whose . father and stepmother were jailed last year for chaining her to a . dresser, disappeared from her bedroom at her home in Bentonville on . Monday night. Scroll down for video . Murdered: Police are questioning sex offenders in the death of 6-year-old girl Jersey Bridgeman whose body was found after she disappeared from her home on Monday night . Tragic: Jersey Bridgeman was chained to a dresser last year by her father and stepmom before moving to live with her biolog

**CUSTOM DATASET**

In [21]:
class NewsDataset(Dataset):
    def __init__(self, articles, summaries, tokenizer, max_length=512):
        self.articles = articles
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.articles)
    
    def __getitem__(self, idx):
        article = str(self.articles[idx])
        summary = str(self.summaries[idx])
        
        article_encoding = self.tokenizer(
            article,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        summary_encoding = self.tokenizer(
            summary,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'article_input_ids': article_encoding['input_ids'].flatten(),
            'article_attention_mask': article_encoding['attention_mask'].flatten(),
            'summary_input_ids': summary_encoding['input_ids'].flatten(),
            'summary_attention_mask': summary_encoding['attention_mask'].flatten()
        }
    
train_dataset = NewsDataset(
    train_df['article'].values,
    train_df['summary'].values,
    tokenizer
)

test_dataset = NewsDataset(
    test_df['article'].values,
    test_df['summary'].values,
    tokenizer
)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

print(f"Training samples: {len(train_dataset)}")
print(f"Testing samples: {len(test_dataset)}")

Training samples: 287114
Testing samples: 11491
