In [1]:
from scripts.make_dataset import generate_full_sentence_dataset, sample_sentence_dataset, t5_apply_tokenization, crf_preprocess
from scripts.model import t5_train_model, crf_train_model, crf_evaluate_canonical_predictions, naive_evaluate_canonical_predictions
import torch

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [2]:
# Report the PyTorch / CUDA environment this notebook runs in.
# The device-name query is guarded: torch.cuda.get_device_name(0) raises when no
# CUDA device is present, which would abort a "Restart & Run All" on CPU-only machines.
cuda_available = torch.cuda.is_available()
print(cuda_available)
print(torch.__version__)
print(torch.version.cuda)
print(torch.backends.cudnn.version())
if cuda_available:
    print(torch.cuda.get_device_name(0))
else:
    print("No CUDA device available; running on CPU.")

True
2.6.0+cu124
12.4
90100
NVIDIA GeForce RTX 4060 Laptop GPU


In [None]:
# Station catalogue: official station name -> list of lowercase alternative
# names and misspellings for that station.
# NOTE(review): the first "Sala Daeng" alias is just the official name lowercased;
# confirm that it is intended to be listed as an alias.
stations = {
    "National Stadium": [
        "nat stadium",
        "national sport stadium",
        "national stadiam",
        "national stadum",
        "national studiam",
        "national statium",
        "ntl stadium",
        "sanam keela",
        "sports stadium",
        "stadium bts",
        "stadium national",
        "the national stadium",
    ],
    "Siam": [
        "sayam",
        "siaam",
        "siamm",
        "siyam",
        "syam",
    ],
    "Ratchadamri": [
        "rachadamri",
        "rachadumri",
        "radchadamri",
        "rajadamri",
        "ratchadamli",
        "ratchadamree",
        "ratchadumri",
        "ratchdamri",
        "ratjadamri",
    ],
    "Sala Daeng": [
        "sala daeng",
        "sala dange",
        "sala deng",
        "saladaeng",
        "saladang",
        "salah daeng",
    ],
    "Chong Nonsi": [
        "chong non si",
        "chong nonsie",
        "chong nonsii",
        "chong nonsri",
        "chong nonsy",
        "chongnonsee",
        "chongnonsi",
    ],
    "Saint Louis": [
        "saint luis",
        "saint louiz",
        "st louie",
        "st louis",
        "st luis",
        "st. lewis",
        "st. louis",
    ],
    "Surasak": [
        "suracak",
        "surasack",
        "surasek",
        "surassak",
    ],
    "Saphan Taksin": [
        "sapan taksin",
        "saphan taksen",
        "saphan takshin",
        "saphan taksine",
        "saphan takzin",
        "saphan taxin",
        "saphantaksin",
        "taksin bridge",
    ],
}


# Sentence templates for model training that contain exactly one station
# placeholder, "[STATION]".
# Every word, including "?", must be separated by a space because a simple
# whitespace tokenizer is used downstream.
base_templates = [
    "Can I reach [STATION] from Chulalongkorn University ?",
    "Can I take MRT to get to [STATION] ?",
    "Can I walk from [STATION] to MBK ?",
    "Can I walk to [STATION] ?",
    "How do I get to [STATION] ?",
    "How far is [STATION] from Pratunam ?",
    "How to go from the airport to [STATION] ?",
    "Is [STATION] close to the city center ?",
    "Is [STATION] open at night ?",
    "Is [STATION] open now ?",
    "What BTS line is [STATION] on ?",
    "Where is [STATION] ?",
    "Which exit should I use at [STATION] ?",
    "Which line is [STATION] on ?",
    "How can I get to [STATION] ?",
    "Is it possible to walk to [STATION] from here ?",
    "Is [STATION] served by the BTS ?",
    "Which BTS stop is closest to [STATION] ?",
    "What is the best way to reach [STATION] ?",
    "Do I need to transfer to get to [STATION] ?",
    "Is [STATION] on the Silom or Sukhumvit line ?",
    "Can I travel to [STATION] using the Skytrain ?",
    "How far is [STATION] from central Bangkok ?",
    "What is the location of [STATION] ?",
    "Where can I find [STATION] ?",
    "Is [STATION] near popular attractions ?",
    "Does BTS stop at [STATION] ?",
    "What is the fastest way to reach [STATION] ?",
    "Can I access [STATION] via BTS ?"
]


def check_single_station_templates(templates):
    """Validate one-placeholder templates against the constraints stated above.

    Args:
        templates: iterable of template strings.

    Raises:
        ValueError: if a template does not contain exactly one standalone
            "[STATION]" token, or if a "?" is attached to another word.
    """
    for template in templates:
        tokens = template.split()
        if tokens.count("[STATION]") != 1:
            raise ValueError(
                f"Template must contain exactly one space-separated [STATION] token: {template!r}"
            )
        if any("?" in token and token != "?" for token in tokens):
            raise ValueError(
                f"'?' must be separated from other words by a space: {template!r}"
            )


# Fail fast here if a template breaks the rules, instead of letting a malformed
# template flow into dataset generation unnoticed.
check_single_station_templates(base_templates)


# Sentence templates for model training that contain two station placeholders,
# "[STATION_1]" and "[STATION_2]".
# Every word, including "?", must be separated by a space because a simple
# whitespace tokenizer is used downstream.
# "[STATION_1]" must appear before "[STATION_2]".
two_station_templates = [
    "How to go from [STATION_1] to [STATION_2] ?",
    "Can I ride BTS from [STATION_1] to [STATION_2] directly ?",
    "Is there a transfer between [STATION_1] and [STATION_2] ?",
    "Which station do I change at when going from [STATION_1] to [STATION_2] ?",
    "How many stops between [STATION_1] and [STATION_2] ?",
    "How long does it take to go from [STATION_1] to [STATION_2] ?",
    "Does [STATION_1] connect to [STATION_2] ?",
    "What is the BTS route from [STATION_1] to [STATION_2] ?",
    "Is [STATION_1] on the same line as [STATION_2] ?",
    "How many minutes between [STATION_1] and [STATION_2] by BTS ?",
    "Which line should I take from [STATION_1] to reach [STATION_2] ?",
    "Do I need to change lines from [STATION_1] to [STATION_2] ?",
    "Is there a direct BTS line connecting [STATION_1] and [STATION_2] ?",
    "Can I go from [STATION_1] to [STATION_2] without switching ?",
    "What is the fastest way to get from [STATION_1] to [STATION_2] by Skytrain ?",
    "Is it possible to travel from [STATION_1] to [STATION_2] without transfer ?",
    "Which interchange connects [STATION_1] to [STATION_2] ?",
    "How many stations are between [STATION_1] and [STATION_2] ?",
    "Where do I need to transfer to go from [STATION_1] to [STATION_2] ?",
    "How do I commute from [STATION_1] to [STATION_2] using BTS ?"
]


def check_two_station_templates(templates):
    """Validate two-placeholder templates against the constraints stated above.

    Args:
        templates: iterable of template strings.

    Raises:
        ValueError: if a template does not contain each of "[STATION_1]" and
            "[STATION_2]" exactly once as a standalone token, if "[STATION_2]"
            precedes "[STATION_1]", or if a "?" is attached to another word.
    """
    for template in templates:
        tokens = template.split()
        for placeholder in ("[STATION_1]", "[STATION_2]"):
            if tokens.count(placeholder) != 1:
                raise ValueError(
                    f"Template must contain exactly one space-separated {placeholder} token: {template!r}"
                )
        if tokens.index("[STATION_1]") > tokens.index("[STATION_2]"):
            raise ValueError(
                f"[STATION_1] must come before [STATION_2]: {template!r}"
            )
        if any("?" in token and token != "?" for token in tokens):
            raise ValueError(
                f"'?' must be separated from other words by a space: {template!r}"
            )


# Fail fast here if a template breaks the rules, instead of letting a malformed
# template flow into dataset generation unnoticed.
check_two_station_templates(two_station_templates)


# Optional text placed before / after a station name.
# The empty string stands for "no prefix" / "no suffix". Non-empty prefixes
# carry a trailing space and non-empty suffixes carry a leading space.
prefixes = [
    "",
    "BTS ",
    "Skytrain ",
]
suffixes = [
    "",
    " station",
    " BTS",
    " BTS stop",
]


# Keyboard adjacency map used when generating corrupted station names:
# each lowercase letter maps to a string of its neighbouring keys
# (the listed neighbours match a QWERTY layout).
keyboard_neighbors = dict(
    a='qwsz',
    b='vghn',
    c='xdfv',
    d='serfcx',
    e='wsdr',
    f='drtgvc',
    g='ftyhbv',
    h='gyujnb',
    i='ujko',
    j='huikmn',
    k='jiolm',
    l='kop',
    m='njk',
    n='bhjm',
    o='iklp',
    p='ol',
    q='wa',
    r='edft',
    s='awedxz',
    t='rfgy',
    u='yhji',
    v='cfgb',
    w='qase',
    x='zsdc',
    y='tghu',
    z='asx',
)


# Phonetic substitution table used when generating corrupted station names:
# each character sequence maps to a similar-sounding replacement.
phonetic_rules = dict(
    ph="f",
    ch="sh",
    sh="ch",
    r="l",
    l="r",
    d="t",
    t="d",
    k="g",
    g="k",
)

In [4]:
# A. Get dataset
print("\nA. Get dataset")

# Step A1. Build the full sentence dataset from the station catalogue, the
# sentence templates and the name-corruption tables defined above.
print("\nStep A1. Generate full sentence dataset")
(
    df_train,
    df_test,
    df_train_bio,
    df_test_bio,
) = generate_full_sentence_dataset(
    stations,
    base_templates,
    two_station_templates,
    prefixes,
    suffixes,
    keyboard_neighbors,
    phonetic_rules,
)

# Step A2. Shuffle and randomly sample the full dataset down to smaller
# train / validation / test sets of the sizes given here.
print("\nStep A2. Shuffle and randomly sample the full sentence dataset to get train, validation, and test set")
train_sample_size, val_sample_size, test_sample_size = 8000, 1000, 1000

(
    df_train_small,
    df_val_small,
    df_test_small,
    df_train_bio_small,
    df_test_bio_small,
) = sample_sentence_dataset(
    df_train,
    df_test,
    df_train_bio,
    df_test_bio,
    train_sample_size,
    val_sample_size,
    test_sample_size,
)



A. Get dataset

Step A1. Generate full sentence dataset
Saved 6003714 training and 1501092 test set of full sentence dataset to data\processed folder.

Step A2. Shuffle and randomly sample the full sentence dataset to get train, validation, and test set
Saved smaller version of train, validation, and test set to data\outputs folder.


In [None]:
# Uncomment the lines below to load the sampled data from the saved CSV outputs instead of regenerating it with the cells above
# import pandas as pd
# df_train_small = pd.read_csv("data/outputs/bts_train_data_small.csv")
# df_val_small = pd.read_csv("data/outputs/bts_val_data_small.csv")
# df_test_small = pd.read_csv("data/outputs/bts_test_data_small.csv")
# df_train_bio_small = pd.read_csv("data/outputs/bts_train_data_bio_small.csv")
# df_test_bio_small = pd.read_csv("data/outputs/bts_test_data_bio_small.csv")

In [6]:
# Step A3. T5: tokenize the sampled train / validation / test frames into the
# datasets consumed by T5 training.
print("\nStep A3. T5: preprocess train data")
(
    train_ds,
    eval_ds,
    test_ds,
) = t5_apply_tokenization(
    df_train_small,
    df_val_small,
    df_test_small,
)


Step A3. T5: preprocess train data


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Data is preprocessed and ready for T5 model training.


In [7]:
# Step A4. CRF: turn the sampled BIO-tagged train / test frames into the feature
# and label sequences used for CRF training and evaluation.
print("\nStep A4. CRF: preprocess train data and build features")
(
    X_train,
    X_test,
    X_test_sentence,
    y_train,
    y_test,
) = crf_preprocess(
    df_train_bio_small,
    df_test_bio_small,
)


Step A4. CRF: preprocess train data and build features
Data is preprocessed and ready for CRF model training.


In [8]:
# B. Train T5 and CRF models
print("\nB. Train T5 and CRF models")

# Step B1: Train the T5 model on the tokenized datasets; the call returns the
# model, its tokenizer and the test-set precision / recall / F1 / accuracy.
print("\nStep B1: Train T5 model")
(
    t5_model,
    t5_tokenizer,
    t5_test_precision,
    t5_test_recall,
    t5_test_f1,
    t5_test_accuracy,
) = t5_train_model(
    train_ds,
    eval_ds,
    test_ds,
)


B. Train T5 and CRF models

Step B1: Train T5 model


  trainer = Seq2SeqTrainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,1.4238,1.394394,1.0,1.0,1.0,1.0
2,1.411,1.387383,1.0,1.0,1.0,1.0
3,1.409,1.384885,1.0,1.0,1.0,1.0


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].



Evaluation on Validation Set: 
Precision: 1.0000
Recall:    1.0000
F1 Score:  1.0000
Accuracy:  1.0000

Evaluation on Test Set: 
Precision: 0.9990
Recall:    1.0000
F1 Score:  0.9993
Accuracy:  0.9980


In [9]:
# Step B2: Train CRF model
print("\nStep B2: Train CRF model")

# Hyperparameter grid for cross-validation: 6 values of c1 x 6 values of c2,
# i.e. 36 candidate combinations.
# NOTE(review): c1 / c2 are presumably the CRF's L1 / L2 regularisation
# coefficients — confirm against crf_train_model.
param_grid = dict(
    c1=[0.001, 0.01, 0.1, 1, 10, 100],
    c2=[0.001, 0.01, 0.1, 1, 10, 100],
)

# Fit the CRF using the grid above and collect its test-set metrics.
(
    crf_model,
    crf_test_precision,
    crf_test_recall,
    crf_test_f1,
    crf_test_accuracy,
) = crf_train_model(
    X_train,
    X_test,
    y_train,
    y_test,
    param_grid,
)

# Evaluate the trained CRF's canonical station predictions on the test set.
(
    crf_canon_test_precision,
    crf_canon_test_recall,
    crf_canon_test_f1,
    crf_canon_test_accuracy,
) = crf_evaluate_canonical_predictions(
    crf_model,
    X_test,
    X_test_sentence,
    y_test,
)



Step B2: Train CRF model
Fitting 3 folds for each of 36 candidates, totalling 108 fits
[CV] END .................................c1=0.001, c2=0.001; total time=   5.9s
[CV] END .................................c1=0.001, c2=0.001; total time=   5.7s
[CV] END .................................c1=0.001, c2=0.001; total time=   5.8s
[CV] END ..................................c1=0.001, c2=0.01; total time=   5.9s
[CV] END ..................................c1=0.001, c2=0.01; total time=   5.7s
[CV] END ..................................c1=0.001, c2=0.01; total time=   5.9s
[CV] END ...................................c1=0.001, c2=0.1; total time=   6.0s
[CV] END ...................................c1=0.001, c2=0.1; total time=   5.8s
[CV] END ...................................c1=0.001, c2=0.1; total time=   5.6s
[CV] END .....................................c1=0.001, c2=1; total time=   5.6s
[CV] END .....................................c1=0.001, c2=1; total time=   5.7s
[CV] END ............

In [10]:
# Step B3: Evaluate the naive baseline, which takes the station catalogue and
# the sampled test frame, and collect its test-set metrics.
print("\nStep B3: Evaluate naive model")
(
    naive_test_precision,
    naive_test_recall,
    naive_test_f1,
    naive_test_accuracy,
) = naive_evaluate_canonical_predictions(
    stations,
    df_test_small,
)


Step B3: Evaluate naive model

Evaluation on Test Set: Exact Canonical Match

Classification Report
                                precision    recall  f1-score   support

                                     0.00      0.00      0.00         0
                   chong nonsi       0.00      0.00      0.00         1
  chong nonsi;national stadium       1.00      0.47      0.64        53
       chong nonsi;ratchadamri       1.00      0.53      0.69        36
       chong nonsi;saint louis       1.00      0.49      0.65        37
        chong nonsi;sala daeng       1.00      0.55      0.71        33
     chong nonsi;saphan taksin       1.00      0.59      0.74        29
              chong nonsi;siam       1.00      0.52      0.69        21
           chong nonsi;surasak       1.00      0.36      0.53        22
              national stadium       0.00      0.00      0.00         0
  national stadium;ratchadamri       1.00      0.62      0.77        69
  national stadium;saint louis    