## Import libraries

In [1]:
!pip install pytorch-ignite
!pip install torcheval
!pip install optuna
!pip install plotly



In [2]:
# Set to True when running on Google Colab: the next cell then mounts Google
# Drive and adds the project's source folders to sys.path.
COLAB = False

In [3]:
import os
import sys
from pathlib import Path

if COLAB:
    from google.colab import drive

    # Mount Google Drive so the project folder is reachable from the Colab VM.
    drive.mount('/content/drive')

    root_dir = '/content/drive/MyDrive/text_summarization'
else:
    # Local run: the project root is taken to be the parent of the working directory.
    root_dir = Path.cwd().parent

# Make the project modules importable in both environments. The membership
# check keeps sys.path free of duplicates when this cell is re-run.
for module_dir in (os.path.join(root_dir, 'src'), os.path.join(root_dir, 'src', 'utils')):
    if module_dir not in sys.path:
        sys.path.append(module_dir)
root_dir

PosixPath('/home/jovyan/text_summarization')

In [4]:
import os
import re

import nltk
import numpy as np
import pandas as pd

# Download the NLTK stop-word corpus if it is not already available locally.
nltk.download('stopwords')
from nltk.corpus import stopwords as nltk_stopwords

# The corpus reader is imported under its own name so that `stopwords` only
# ever refers to the set of English stop words passed to the preprocessing step.
stopwords = set(nltk_stopwords.words('english'))

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
from utils.preprocessing import preprocessing_pipeline, get_data_distribution
from utils.processing import processing_pipeline
from train_model import main, tuning
import utils.inference as inference
from optuna.visualization import (plot_optimization_history,
                                  plot_param_importances, plot_slice)

2025-06-01 13:06:43.912103: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-01 13:06:43.944327: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-06-01 13:06:43.944340: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-06-01 13:06:43.945140: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-06-01 13:06:43.950051: I tensorflow/core/platform/cpu_feature_guar

In [6]:
# Stage switches: each flag gates one of the steps further down.
# PROCESSING gates the call to processing_pipeline.
PROCESSING = True
# PREPROCESSING gates the call to preprocessing_pipeline.
PREPROCESSING = True
# HYP_TUNING gates the hyperparameter search (tuning) cell.
HYP_TUNING = True

In [7]:
# Dataset name: used as the sub-folder name under raw_data/, data/ and figures/,
# and passed on to the training, inference and tuning entry points.
name = "WikiHow"

In [8]:
# Per-dataset folders: <root>/raw_data/<name>, <root>/data/<name>, <root>/figures/<name>.
raw_dir, dataset_dir, figures_dir = (
    os.path.join(root_dir, top_level, name)
    for top_level in ("raw_data", "data", "figures")
)

# Only the output folders are created here; the raw-data folder is left untouched.
os.makedirs(dataset_dir, exist_ok=True)
os.makedirs(figures_dir, exist_ok=True)

## Get the data

In [9]:
# Load the raw WikiHow CSV from the raw-data folder.
dataset_df = pd.read_csv(os.path.join(raw_dir, "wikihowSep.csv"))

In [10]:
# Base name (without extension) of the preprocessed CSV; it is passed to the
# preprocessing/processing pipelines and read back below as f'{csv_name}.csv'.
csv_name = "wikihow_data"

## Preprocess the data

In [11]:
# Optional preprocessing stage, applied to the raw frame loaded above.
if PREPROCESSING:
    preprocessing_pipeline(
        dataset_df,
        stopwords,
        dataset_dir,
        csv_name,
        subset_size=0.5,
        start_token="SOS ",
        end_token=" EOS",
    )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_name] = start_token + df[column_name] + end_token
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'length_{column_name}'] = df[column_name].apply(lambda x: len(x.split()))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_name] = start_token + df[column_name] + end_token
A value

In [12]:
# Read the preprocessed CSV back from disk and summarise its columns.
# Note that this replaces the raw frame previously bound to `dataset_df`.
preprocessed_csv = os.path.join(root_dir, "data", name, f'{csv_name}.csv')
dataset_df = pd.read_csv(preprocessed_csv)
dataset_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 672194 entries, 0 to 672193
Data columns (total 4 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   summary         672194 non-null  object
 1   text            672194 non-null  object
 2   length_summary  672194 non-null  int64 
 3   length_text     672194 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 20.5+ MB


In [13]:
# Preview the first rows of the preprocessed frame.
dataset_df.head()

Unnamed: 0,summary,text,length_summary,length_text
0,SOS put yourself out there EOS,SOS its simpleif you dont put yourself into ne...,6,40
1,SOS hold your finger down to open closing opti...,SOS on some devices running dolphin you can al...,10,31
2,SOS combine beeswax and turpentine to create a...,SOS mix 1 part beeswax with 3 parts turpentine...,10,22
3,SOS let up on the clutch while pushing down on...,SOS in order to get moving lift your left foot...,13,172
4,SOS locate the song you would like to rate EOS,SOS using the scroll wheel locate the song tha...,10,15


## Get distribution of the data

In [14]:
# Inspect the distribution of the preprocessed data.
# NOTE(review): presumably writes its figures into figures_dir, using "wikihow"
# as a label/prefix — confirm in utils.preprocessing.get_data_distribution.
get_data_distribution(dataset_df, figures_dir, "wikihow")

## Process the data

In [15]:
# Forwarded to processing_pipeline as its `load_tokenizer` argument.
# NOTE(review): presumably False means a new tokenizer is built rather than
# loaded from disk — confirm in utils.processing.
load_tokenizer = False

In [16]:
# Optional processing stage, run on the preprocessed CSV in dataset_dir.
if PROCESSING:
    processing_pipeline(
        dataset_dir,
        csv_name,
        load_tokenizer=load_tokenizer,
    )

Max length of text: 241
Max length of summary: 30
SOS token index: 1
EOS token index: 2
UNK token index: 3
PAD token index: 0
Number of Samples in X_train: 470535
Number of Samples in X_val: 120995
Number of Samples in X_test: 80664
Number of Samples in y_train: 470535
Number of Samples in y_val: 120995
Number of Samples in y_test: 80664
Vocabulary size: 566287
x_train.pt, shape: (470535, 241)
x_val.pt, shape: (120995, 241)
x_test.pt, shape: (80664, 241)
y_train.pt, shape: (470535, 30)
y_val.pt, shape: (120995, 30)
y_test.pt, shape: (80664, 30)


### Test the processing

In [17]:
# Enables the decoding smoke test in the next cell.
test_decoding = True

In [18]:
if test_decoding:
    import torch
    import pickle
    import random
    
    def decode_data(text_ids, index2word, EOS_token):
        """
        Convert a sequence of token ids into a space-separated string.

        Args:
            text_ids: Token ids, either a tensor of any shape (flattened in
                row-major order) or a plain sequence of ints.
            index2word: Mapping from token id to word; ids missing from the
                mapping are rendered as 'UNK'.
            EOS_token: Id that marks the end of the sequence. Decoding stops
                at its first occurrence, which is rendered as 'EOS'.

        Returns:
            The decoded words joined by single spaces.
        """
        # as_tensor accepts lists as well as tensors, and reshape (unlike
        # view) also works on non-contiguous tensors. tolist() converts all
        # ids to Python scalars in one call instead of one .item() per id.
        ids = torch.as_tensor(text_ids).reshape(-1).tolist()

        decoded_words = []
        for idx in ids:
            if idx == EOS_token:
                decoded_words.append('EOS')
                break
            decoded_words.append(index2word.get(idx, 'UNK'))

        return " ".join(decoded_words)
    
    # Load the training tensors written by the processing step.
    X_train = torch.load(os.path.join(dataset_dir, "x_train.pt"))
    y_train = torch.load(os.path.join(dataset_dir, "y_train.pt"))
    train_dataset = torch.utils.data.TensorDataset(X_train, y_train)

    # NOTE: pickle.load can execute arbitrary code contained in the file; only
    # load tokenizer pickles that were produced by this project.
    with open(os.path.join(dataset_dir, 'feature_tokenizer.pickle'), 'rb') as handle:
        feature_tokenizer = pickle.load(handle)
    EOS_token = feature_tokenizer.word2index.get("EOS", 2)

    nb_decoding_test = 10
    # Draw the sample indices up front and index the dataset directly instead
    # of walking a batch_size=1 DataLoader over the whole training set.
    # min() avoids a ValueError when fewer samples than requested exist, and
    # sorting keeps the ascending print order of the previous implementation.
    sample_indices = sorted(
        random.sample(range(len(train_dataset)), min(nb_decoding_test, len(train_dataset)))
    )
    for sample_idx in sample_indices:
        input_tensor, target_tensor = train_dataset[sample_idx]
        print('Input: {}'.format(decode_data(input_tensor, feature_tokenizer.index2word, EOS_token)))
        print('Target: {}'.format(decode_data(target_tensor, feature_tokenizer.index2word, EOS_token)))
        print('-----------------------------------')

Input: SOS your spam folder will likely take care of obvious junk mail but that doesnt mean you should open emails from services websites or even people whom you dont know similarly dont open unnecessary emails even if you do trust the recipient eg best buy or tumblr EOS
Target: SOS avoid opening emails from recipients you dont recognize EOS
-----------------------------------
Input: SOS your students may be young children in a school where you are already a teacher or they may be adults who report to you in a marketing firm where you are a manager EOS
Target: SOS identify the age group of your storytelling class EOS
-----------------------------------
Input: SOS find a friend with a more neutral american accent and ask him or her for help plan to meet them in a comfortable place where you can talk about a variety of topics like a coffee shop or on a shopping trip and explain your plan to practice speaking with less of an accent decide with your friend how you will practice you might a

## Train the model

In [22]:
# Model settings (also reused by the inference cell).
hidden_size, max_length = 256, 100

# Optimisation settings.
lr, weight_decay = 0.001, 1e-6
batch_size, num_workers = 128, 2
n_epochs = 100
early_stopping_patience = 5

# Run-control options passed directly to the training entry point.
print_example_every = 10
load_checkpoint = False

# Bundles handed to the training entry point.
optimizer_hyperparams = dict(
    learning_rate=lr,
    weight_decay=weight_decay,
    n_epochs=n_epochs,
    batch_size=batch_size,
    num_workers=num_workers,
    early_stopping_patience=early_stopping_patience,
)

model_hyperparams = dict(hidden_size=hidden_size, max_length=max_length)

In [23]:
# Launch training through train_model.main.
# NOTE(review): the recorded output of this cell is
#   TypeError: main() got an unexpected keyword argument 'model_hyperparams'
# so the keyword names used here do not match main's current signature —
# check train_model.main and align the call.
# NOTE(review): `tuning` is the function imported from train_model, so
# `tuning = tuning` passes that function object; if main expects a flag here,
# this should presumably be a boolean — confirm against main's signature.
main(root_dir = root_dir,
    model_hyperparams=model_hyperparams,
    tuning = tuning, 
    optimizer_hyperparams=optimizer_hyperparams,
    print_examples_every=print_example_every,
    load_checkpoint=load_checkpoint,
    name=name
    )

TypeError: main() got an unexpected keyword argument 'model_hyperparams'

### Making inference with the model (on a CPU)

In [None]:
# File name of the checkpoint handed to inference.main in the next cell.
checkpoint_name = 'best_checkpoint.tar'

In [None]:
# Interactive summarisation loop: every entered text is passed to
# inference.main until the user types 'exit'.
# Example input: "Don't forget to water your plants, they need it to survive."
while True:
    input_text = input("Enter the text to summarize (or type 'exit' to quit): ").strip()
    if input_text.lower() == 'exit':
        break
    if not input_text:
        # Nothing to summarise (empty or whitespace-only input): ask again.
        continue
    inference.main(root_dir, name, checkpoint_name, hidden_size, max_length, input_text)

### Hyperparameter tuning

In [None]:
# Passed to tuning() as its second argument (number of trials for the search).
num_trials = 10

In [None]:
if HYP_TUNING:
    # Run the hyperparameter search; the returned study feeds the Optuna plots below.
    study = tuning(root_dir, num_trials, name)

    # Folder that receives the study figures.
    study_dir = os.path.join(root_dir, 'parameters_tuning', name, 'study_results')
    os.makedirs(study_dir, exist_ok=True)

    # Expression values inside an `if` block are not rendered by Jupyter, so
    # each figure is written to study_dir and shown explicitly.
    study_plots = {
        'optimization_history': plot_optimization_history,
        'param_importances': plot_param_importances,
        'slice': plot_slice,
    }
    for plot_name, plot_fn in study_plots.items():
        fig = plot_fn(study)
        # Standalone HTML export; needs no image-export backend.
        fig.write_html(os.path.join(study_dir, f'{plot_name}.html'))
        fig.show()

### Check training information with tensorboard

In [None]:
# %load_ext tensorboard
# !tensorboard --logdir='tensorboard_logs'