In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install datasets -q
!pip install transformers -q
!pip install sentencepiece -q

[?25l[K     |█▉                              | 10 kB 22.3 MB/s eta 0:00:01[K     |███▊                            | 20 kB 14.1 MB/s eta 0:00:01[K     |█████▋                          | 30 kB 10.1 MB/s eta 0:00:01[K     |███████▌                        | 40 kB 4.4 MB/s eta 0:00:01[K     |█████████▍                      | 51 kB 4.4 MB/s eta 0:00:01[K     |███████████▏                    | 61 kB 5.1 MB/s eta 0:00:01[K     |█████████████                   | 71 kB 5.6 MB/s eta 0:00:01[K     |███████████████                 | 81 kB 5.3 MB/s eta 0:00:01[K     |████████████████▉               | 92 kB 5.9 MB/s eta 0:00:01[K     |██████████████████▊             | 102 kB 5.0 MB/s eta 0:00:01[K     |████████████████████▌           | 112 kB 5.0 MB/s eta 0:00:01[K     |██████████████████████▍         | 122 kB 5.0 MB/s eta 0:00:01[K     |████████████████████████▎       | 133 kB 5.0 MB/s eta 0:00:01[K     |██████████████████████████▏     | 143 kB 5.0 MB/s eta 0:00:01[K  

## Importing the Required Libraries

In [35]:
import numpy as np
import pandas as pd
import datasets
from tqdm.notebook import tqdm
import nltk
import os
import json
import re
import torch
from transformers import PegasusConfig, PegasusTokenizer, PegasusForConditionalGeneration 
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

In [26]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Loading the Dataset

In [8]:
dataset_folder_path = '/content/drive/My Drive/Colab Notebooks/nlp/dataset/'
dataset_path = dataset_folder_path + 'dataset_austin_sentences.csv'
model_folder_path = '/content/drive/My Drive/Colab Notebooks/nlp/EMNLP_folder_4/headline_model'

In [9]:
df_raw = pd.read_csv(dataset_path)

In [10]:
print(df_raw.columns)
df_raw.head(1)

Index(['id', 'text', 'district', 'year'], dtype='object')


Unnamed: 0,id,text,district,year
0,1,"Dissatisfied traffic and with traffic, timing ...",7,2016


# Explore dataset

In [11]:
df_raw[21:28]

Unnamed: 0,id,text,district,year
21,22,(1) You cannot continue to fund city activitie...,10,2016
22,23,2) You cannon expect people in the suburbs to ...,10,2016
23,24,1,8,2016
24,25,Massive road expansion.,8,2016
25,26,Too much traffic.,8,2016
26,27,2,8,2016
27,28,Lower energy costs.,8,2016


# Leave only necessary cols

In [12]:
cols_to_leave = ['text']
df = df_raw[cols_to_leave]
df.head(1)

Unnamed: 0,text
0,"Dissatisfied traffic and with traffic, timing ..."


In [15]:
df["arg_topic"] = df['text']
df["key_point"] = df['text'] # dummy column

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [16]:
df

Unnamed: 0,text,arg_topic,key_point
0,"Dissatisfied traffic and with traffic, timing ...","Dissatisfied traffic and with traffic, timing ...","Dissatisfied traffic and with traffic, timing ..."
1,EXTREMELY dissatisfied with cit govt.,EXTREMELY dissatisfied with cit govt.,EXTREMELY dissatisfied with cit govt.
2,"interfering in local businesses (Uber/Lyft, in...","interfering in local businesses (Uber/Lyft, in...","interfering in local businesses (Uber/Lyft, in..."
3,"Also, extremely dissatisfied with all the free...","Also, extremely dissatisfied with all the free...","Also, extremely dissatisfied with all the free..."
4,I'm very dissatisfied with the liberal leaning...,I'm very dissatisfied with the liberal leaning...,I'm very dissatisfied with the liberal leaning...
...,...,...,...
6269,You to need to pay teachers better.,You to need to pay teachers better.,You to need to pay teachers better.
6270,This city is too expensive to live in on the s...,This city is too expensive to live in on the s...,This city is too expensive to live in on the s...
6271,Austin Electric company is a monopoly who trea...,Austin Electric company is a monopoly who trea...,Austin Electric company is a monopoly who trea...
6272,My $200 deposit is being held hostage by the c...,My $200 deposit is being held hostage by the c...,My $200 deposit is being held hostage by the c...


In [17]:
df.to_csv(dataset_folder_path + 'gen.csv')

# Load necessary models

In [18]:
config = PegasusConfig.from_pretrained('google/pegasus-xsum')
tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-xsum')
model = PegasusForConditionalGeneration.from_pretrained(model_folder_path,local_files_only=True)
print('Models loaded')

Downloading:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

Models loaded


# Load dataset into dataset class

In [21]:
# Get the column names for input/target.
dataset_columns = ('arg_topic', 'key_point')
arg_topic_column = dataset_columns[0]
key_point_column = dataset_columns[1]

prefix=""

# Temporarily set max_target_length for training.
max_source_length = 64
max_target_length = 64
padding = "max_length" 

device='cuda'

ds = datasets.load_dataset('csv', data_files = dataset_folder_path + 'gen.csv')['train']

Using custom data configuration default-ac539246b65c2e27
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-ac539246b65c2e27/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

# Map words to ints

In [22]:
def preprocess_function(examples):
    inputs = examples[arg_topic_column]
    targets = examples[key_point_column]
    inputs = [prefix + inp for inp in inputs]
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True)

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [23]:
ds_preprocessed = ds.map(
            preprocess_function,
            batched=True,
            num_proc=1,
            # remove_columns=column_names,
            load_from_cache_file=False,
        )

  0%|          | 0/7 [00:00<?, ?ba/s]

In [27]:
label_pad_token_id = -100 
# data_collator = default_data_collator

In [28]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

    return preds, labels

# Use pretrained model (Dummy trainer)

In [29]:
training_args = Seq2SeqTrainingArguments(
    output_dir='/content/drive/My Drive/Colab Notebooks/nlp/EMNLP_folder_4/headline_model',          # output directory
    num_train_epochs=3,              # total # of training epochs
    per_device_train_batch_size=16,   # batch size per device during training
    per_device_eval_batch_size=16,    # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='/content/drive/My Drive/Colab Notebooks/nlp/EMNLP_folder_4/headline_model_logs',            # directory for storing logs
    save_steps = 4518, #in order to store the last model at the end of 3 epochs
    predict_with_generate=True
)

# Initialize our Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    # train_dataset=ds_preprocessed,
    tokenizer=tokenizer,
    # data_collator=data_collator,
    # compute_metrics=compute_metrics
)

In [30]:
test_results = trainer.predict(
      ds_preprocessed,
      # ds_preprocessed.select(range(10)), # debug
      max_length=max_target_length,
      num_beams=6
)

The following columns in the test set don't have a corresponding argument in `PegasusForConditionalGeneration.forward` and have been ignored: Unnamed: 0, key_point, text, arg_topic. If Unnamed: 0, key_point, text, arg_topic are not expected by `PegasusForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 10
  Batch size = 16


In [33]:
preds = tokenizer.batch_decode(
          test_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True
      )
preds[0:10]

["Traffic is causing a strain on the cities' resources",
 'Citizens have a right not to vote',
 'Hiring a private hire company is financially beneficial',
 'Government intervention has the risk of inserting bias/harming objectivity',
 'People should choose for themselves whether or not to vote',
 "Urbanization is causing a strain on the cities' resources",
 "Urbanization is causing a strain on the cities' resources",
 'People should be allowed to do whatever they want to their own bodies',
 'Cities offer more opportunities',
 'Restrictions on migration would benefit people in the rural areas economically/socially']

In [34]:
df['key_point_pred'] = preds

ValueError: ignored

In [71]:
df_final = df[['text','key_point_pred']]
df_final.head(10)

Unnamed: 0,text,key_point_pred
0,"Dissatisfied traffic and with traffic, timing ...",Traffic is causing a strain on the cities' res...
1,EXTREMELY dissatisfied with cit govt.,Citizens have a right not to vote
2,"interfering in local businesses (Uber/Lyft, in...",Hiring a private hire company is financially b...
3,"Also, extremely dissatisfied with all the free...",Government intervention has the risk of insert...
4,I'm very dissatisfied with the liberal leaning...,People should choose for themselves whether or...
5,Maintenance of city facilities needs to be equ...,Urbanization is causing a strain on the cities...
6,We need to think long-term; Austin can't susta...,Urbanization is causing a strain on the cities...
7,What are we going to do when the cool beautifu...,People should be allowed to do whatever they w...
8,Long after the current boom goes bust (and it ...,Cities offer more opportunities
9,It really is time to stop sacrificing the qual...,Restrictions on migration would benefit people...


In [73]:
df_raw.head(10)

Unnamed: 0,id,text,district,year
0,1,"Dissatisfied traffic and with traffic, timing ...",7,2016
1,2,EXTREMELY dissatisfied with cit govt.,7,2016
2,3,"interfering in local businesses (Uber/Lyft, in...",7,2016
3,4,"Also, extremely dissatisfied with all the free...",7,2016
4,5,I'm very dissatisfied with the liberal leaning...,7,2016
5,6,Maintenance of city facilities needs to be equ...,9,2016
6,7,We need to think long-term; Austin can't susta...,9,2016
7,8,What are we going to do when the cool beautifu...,9,2016
8,9,Long after the current boom goes bust (and it ...,9,2016
9,10,It really is time to stop sacrificing the qual...,9,2016


In [75]:
df_final['district'] = df_raw['district']
df_final['year'] = df_raw['year']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [76]:
df_final.head(10)

Unnamed: 0,text,key_point_pred,district,year
0,"Dissatisfied traffic and with traffic, timing ...",Traffic is causing a strain on the cities' res...,7,2016
1,EXTREMELY dissatisfied with cit govt.,Citizens have a right not to vote,7,2016
2,"interfering in local businesses (Uber/Lyft, in...",Hiring a private hire company is financially b...,7,2016
3,"Also, extremely dissatisfied with all the free...",Government intervention has the risk of insert...,7,2016
4,I'm very dissatisfied with the liberal leaning...,People should choose for themselves whether or...,7,2016
5,Maintenance of city facilities needs to be equ...,Urbanization is causing a strain on the cities...,9,2016
6,We need to think long-term; Austin can't susta...,Urbanization is causing a strain on the cities...,9,2016
7,What are we going to do when the cool beautifu...,People should be allowed to do whatever they w...,9,2016
8,Long after the current boom goes bust (and it ...,Cities offer more opportunities,9,2016
9,It really is time to stop sacrificing the qual...,Restrictions on migration would benefit people...,9,2016


In [78]:
#check
(df_final['text'] == df_raw['text']).all()
(df_final['year'] == df_raw['year']).all()

True

## Saving the predictions of model

In [79]:
df_final.to_csv(dataset_folder_path + 'dataset_austin_sentences_enigma_predictions.csv')

In [81]:
 df_final.head(10)

Unnamed: 0,text,key_point_pred,district,year
0,"Dissatisfied traffic and with traffic, timing ...",Traffic is causing a strain on the cities' res...,7,2016
1,EXTREMELY dissatisfied with cit govt.,Citizens have a right not to vote,7,2016
2,"interfering in local businesses (Uber/Lyft, in...",Hiring a private hire company is financially b...,7,2016
3,"Also, extremely dissatisfied with all the free...",Government intervention has the risk of insert...,7,2016
4,I'm very dissatisfied with the liberal leaning...,People should choose for themselves whether or...,7,2016
5,Maintenance of city facilities needs to be equ...,Urbanization is causing a strain on the cities...,9,2016
6,We need to think long-term; Austin can't susta...,Urbanization is causing a strain on the cities...,9,2016
7,What are we going to do when the cool beautifu...,People should be allowed to do whatever they w...,9,2016
8,Long after the current boom goes bust (and it ...,Cities offer more opportunities,9,2016
9,It really is time to stop sacrificing the qual...,Restrictions on migration would benefit people...,9,2016
