In [None]:
# Import necessary modules
import os
import pandas as pd
import pickle
import torch
from transformers import AutoTokenizer

Cloning into 'byt5-geotagging'...
remote: Enumerating objects: 226, done.[K
remote: Counting objects: 100% (80/80), done.[K
remote: Compressing objects: 100% (70/70), done.[K
remote: Total 226 (delta 46), reused 11 (delta 8), pack-reused 146[K
Receiving objects: 100% (226/226), 13.02 MiB | 18.02 MiB/s, done.
Resolving deltas: 100% (92/92), done.


In [None]:
cd /content

/content


In [None]:

# Clone the repository for geotagging with BYT5
!git clone https://github.com/Yachay-AI/byt5-geotagging

# Install required packages
!pip install transformers==4.29.1 tqdm==4.63.2 pandas==1.4.4 wandb

# Install gdown for Google Drive downloads
!pip install gdown

# Download the dataset
!gdown https://drive.google.com/u/2/uc?id=1thkE-hgT3sDtZqILZH17Hyayy0hkk_jh&export=download

# Unzip the downloaded dataset
!tar xvf challenge_1.tar.gz > /dev/null

# Read sample data
!head data_sample_lc/c_46.json


In [None]:

# Initialize an empty list to hold dataframes
df_list = []

# Loop through each JSON file and append it to df_list
for fn in os.listdir("data_sample_lc"):
  df_list.append(pd.read_json(f"data_sample_lc/{fn}", lines=True))

# Concatenate all dataframes in df_list
df = pd.concat(df_list)

# Extract latitude and longitude from 'coordinates' column
df['lat'] = [x[1] for x in df['coordinates']]
df['lon'] = [x[0] for x in df['coordinates']]

# Drop the 'coordinates' column
df.drop('coordinates', axis=1, inplace=True)

# Shuffle the dataset
df = df.sample(frac=1.0)

# Split dataset into training and testing sets
df.iloc[:len(df)*9//10].to_csv('train.csv')
df.iloc[len(df)*9//10:].to_csv('test.csv')

# Count the number of lines in train.csv and test.csv
!wc -l train.csv
!wc -l test.csv

In [None]:

# Read the cluster table that ships with the cloned repository.
cluster_df = pd.read_csv('byt5-geotagging/cluster_df.csv')

# Serialize a (cluster_df, []) tuple to clustering.pkl in the current directory.
# NOTE(review): the empty list looks like a placeholder for a second element
# the training script expects — confirm against train_model.py.
with open('clustering.pkl', 'wb') as pkl_file:
  pickle.dump((cluster_df, []), pkl_file)


In [None]:

# Run the training script
# Both --do_train and --do_test are set to true, so this one invocation is
# asked to train and then evaluate, using train.csv / test.csv written earlier.
# --load_clustering ./ points at the current directory, where clustering.pkl
# was saved — presumably the script looks for that file there; verify against
# train_model.py.
# The parameters here are chosen to show a small training run on a small subset of data
# (--max_train 96000 / --max_test 640 presumably cap the rows used, and
# --keep_layer_count 4 presumably truncates the encoder — TODO confirm in the script).
!python byt5-geotagging/train_model.py --train_input_file train.csv --test_input_file test.csv --do_train true --do_test true --load_clustering ./ --device cuda --batch_size 64 --keep_layer_count 4 --max_train 96000 --max_test 640


INFO:root:start
start
INFO:root:finish reading test file
finish reading test file
INFO:root:finish reading train file
finish reading train file
INFO:root:Index(['Unnamed: 0', 'text', 'lat', 'lon', 'coordinates'], dtype='object')
Index(['Unnamed: 0', 'text', 'lat', 'lon', 'coordinates'], dtype='object')
Some weights of the model checkpoint at google/byt5-small were not used when initializing T5EncoderModel: ['decoder.block.0.layer.0.SelfAttention.q.weight', 'decoder.block.2.layer.1.EncDecAttention.o.weight', 'decoder.block.3.layer.1.EncDecAttention.v.weight', 'decoder.block.3.layer.0.SelfAttention.o.weight', 'decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight', 'decoder.block.1.layer.1.EncDecAttention.v.weight', 'decoder.block.1.layer.1.EncDecAttention.k.weight', 'decoder.block.3.layer.2.DenseReluDense.wi_0.weight', 'decoder.block.0.layer.0.SelfAttention.k.weight', 'decoder.block.0.layer.1.EncDecAttention.k.weight', 'decoder.block.3.layer.1.EncDecAttention.q.weight', '

In [None]:

# Run the testing script
# Only --do_test is passed here (no --do_train), and --load_model_dir points at
# models/byt5-class-0 — presumably the checkpoint written by the training run;
# verify the path against train_model.py.
# For demo purpose, a small subset of validation set is used
# (--max_test 640 presumably limits the number of test rows — TODO confirm).
!python byt5-geotagging/train_model.py --train_input_file train.csv --test_input_file test.csv --do_test true --load_clustering ./ --load_model_dir models/byt5-class-0 --device cuda --batch_size 32  --max_train 96000 --max_test 640


INFO:root:start
start
INFO:root:finish reading test file
finish reading test file
INFO:root:finish reading train file
finish reading train file
INFO:root:Index(['Unnamed: 0', 'text', 'lat', 'lon', 'coordinates'], dtype='object')
Index(['Unnamed: 0', 'text', 'lat', 'lon', 'coordinates'], dtype='object')
INFO:root:model loaded
model loaded
100% 20/20 [00:05<00:00,  4.00it/s]
INFO:root:Epoch 3 eval loss 5.908099246025086  accuracy 0.028125 true distance avg 5734.2490234375 true distance median 3978.244140625
Epoch 3 eval loss 5.908099246025086  accuracy 0.028125 true distance avg 5734.2490234375 true distance median 3978.244140625
INFO:root:threshold 0 MAE 5734.249003141335 Median 3978.2442709348024 percentage 1.0 acc 0.028125 precision 0.0984375 recall 1.0 f1@500 0.17923186344238975
threshold 0 MAE 5734.249003141335 Median 3978.2442709348024 percentage 1.0 acc 0.028125 precision 0.0984375 recall 1.0 f1@500 0.17923186344238975
  part_true_distance_ls = pd.Series(part_true_distance_ls)
  f

In [None]:
cd byt5-geotagging

/content/byt5-geotagging


In [None]:

# Load trained model and tokenizer.
# Use the GPU when one is available and fall back to CPU otherwise, so the
# cell does not crash on a CPU-only runtime.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# NOTE(review): torch.load unpickles arbitrary Python objects — only load
# checkpoints you produced yourself. Recent PyTorch releases default to
# weights_only=True, which rejects fully pickled models; if loading fails for
# that reason, pass weights_only=False explicitly.
# map_location lets a checkpoint saved on GPU be opened on a CPU-only machine.
byt5 = torch.load('../models/byt5-class-0', map_location=device)
byt5.to(device)
# Switch to inference mode: disables dropout so repeated runs give the same
# prediction and confidence.
byt5.eval()
byt5_tokenizer = AutoTokenizer.from_pretrained('google/byt5-small')

# Make a prediction
text = 'I live in New York'
# The extra leading dimension added by unsqueeze(0) is kept exactly as before.
inputs = byt5_tokenizer(text, return_tensors='pt')['input_ids'].unsqueeze(0)
# No gradients are needed for inference; skipping autograd saves memory.
with torch.no_grad():
  logits = byt5(inputs.to(device))
# Highest-scoring output index; used below as a positional row lookup in cluster_df.
predicted_cluster = logits.argmax()
# Softmax probability of the top-scoring output, used as the confidence.
confidence = torch.nn.functional.softmax(logits, dim=-1).max().item()
# cluster_df is the cluster table loaded earlier in the notebook.
predicted_location = cluster_df.iloc[predicted_cluster.item()]

# Output predicted location and confidence
print(predicted_location['lat'], predicted_location['lng'], confidence)

41.275721 -96.053431 0.018365390598773956
