In [None]:
# connect Google drive
# Colab-specific: relies on the google.colab package and mounts Drive at /content/drive,
# which is where the training spreadsheet is read from later in the notebook.
from google.colab import drive
drive.mount("/content/drive")

In [None]:
# Select which python3 the system alternatives point at.
# NOTE(review): `update-alternatives --config` normally waits for an interactive choice,
# which would stall "Run All" - confirm this cell is still needed.
!sudo update-alternatives --config python3

In [None]:
# download packages
# %pip (instead of !pip) installs into the environment of the running kernel,
# so the packages are importable from the cells below.
%pip install -q transformers pandas datasets numpy
# %pip install -q torch

In [None]:
# create working dir (-p keeps the cell re-runnable when the directory already exists)
!mkdir -p ./data

In [None]:
# import packages
import transformers
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, Trainer, TrainingArguments
import numpy as np
import torch
import sklearn
# import torch.utils.data as Data

In [None]:
# Report whether PyTorch can see a CUDA device; the flag is informational only in this cell.
cuda_available = torch.cuda.is_available()
print("Using CUDA:", cuda_available)

In [None]:
# regression
class JointScaler:
	"""Centre every column on its own mean, then divide all columns by one shared std.

	The shared standard deviation is computed over the whole centred array, so the
	relative scale between columns is preserved.
	"""

	def __init__(self):
		# Both statistics stay None until fit_transform has been called.
		self.means = self.stddev = None

	def fit_transform(self, data):
		"""Learn the column means and the joint std from `data` and return it scaled."""
		column_means = np.mean(data, axis=0)
		shifted = data - column_means
		joint_std = np.std(shifted)
		self.means, self.stddev = column_means, joint_std
		return shifted / joint_std

	def transform(self, data):
		"""Scale `data` with the statistics learned by fit_transform."""
		shifted = data - self.means
		return shifted / self.stddev

	def inverse_transform(self, data):
		"""Map scaled values back to the original units."""
		rescaled = data * self.stddev
		return rescaled + self.means

class YNormalizer():
	"""Optional scaling of the regression targets.

	`settings` selects the strategy:
	  "indscale"   -> sklearn's StandardScaler
	  "jointscale" -> JointScaler (per-column centring, one shared std)
	  anything else -> no scaling; data passes through unchanged
	"""

	def __init__(self, settings):
		print("  Output normalizer settings:", settings)
		if settings == "indscale":
			# A bare `import sklearn` does not reliably load the preprocessing
			# submodule, so import the scaler explicitly where it is needed.
			from sklearn.preprocessing import StandardScaler
			self.scale = True
			self.scaler = StandardScaler()
		elif settings == "jointscale":
			self.scale = True
			self.scaler = JointScaler()
		else:
			self.scale = False
			self.scaler = None

	def fit_transform(self, data):
		"""Fit the selected scaler on `data` and return the scaled values (or `data` itself)."""
		if self.scale:
			data = self.scaler.fit_transform(data)
		return data

	def transform(self, data):
		"""Scale `data` with the already fitted scaler (or return `data` itself)."""
		if self.scale:
			data = self.scaler.transform(data)
		return data

	def inverse_transform(self, data):
		"""Undo the scaling (or return `data` itself)."""
		if self.scale:
			data = self.scaler.inverse_transform(data)
		return data

# define globally to use it in custom loss function
normalizer = None

# Sphere radius used to turn the central angle into a distance when scale_km is set.
R = 6371
def evaluate(c1, c2, scale_km=True):
	"""Row-wise haversine distance between two coordinate arrays.

	c1, c2: array-likes indexed as [:, 0] and [:, 1], in degrees; column 0 feeds
	        the cosine (latitude) term and column 1 the second angular difference.
	scale_km: when True the central angle is multiplied by R, otherwise the
	          angle in radians is returned.
	"""
	c1, c2 = np.asarray(c1), np.asarray(c2)
	delta = np.radians(c2 - c1)
	sin_half_lat = np.sin(delta[:, 0] / 2)
	sin_half_lon = np.sin(delta[:, 1] / 2)
	a = sin_half_lat ** 2 + np.cos(np.radians(c1[:, 0])) * np.cos(np.radians(c2[:, 0])) * sin_half_lon ** 2
	# Mathematically a lies in [0, 1]; floating-point rounding (notably with float32
	# model outputs) can push it marginally outside and turn the square roots into NaN.
	a = np.clip(a, 0.0, 1.0)
	angle = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
	if scale_km:
		return R * angle
	return angle

def median_dist(a, b):
	"""Median distance between two sets of scaled coordinates.

	Both inputs are mapped back to original units with the notebook-level
	`normalizer` before `evaluate` measures the row-wise distances.
	"""
	restored_a, restored_b = (normalizer.inverse_transform(coords) for coords in (a, b))
	return np.median(evaluate(restored_a, restored_b))

def mean_dist(a, b):
	"""Mean distance between two sets of scaled coordinates.

	Both inputs are mapped back to original units with the notebook-level
	`normalizer` before `evaluate` measures the row-wise distances.
	"""
	restored_a, restored_b = (normalizer.inverse_transform(coords) for coords in (a, b))
	return np.mean(evaluate(restored_a, restored_b))

In [None]:
# normalize labels
def get_normalizer(args):
  """Build a YNormalizer from `args`, publish it as the notebook-level `normalizer`, and return it."""
  global normalizer
  created = YNormalizer(args)
  normalizer = created
  return created


In [None]:
# pd data pre load
import pandas as pd
import os
import numpy as np

class DatasetLoader():
	"""Read a source table once, cache it as `<output_dir>/data.csv`, and load it back on demand."""

	def __init__(self, filepath, filetype, sheet_name=None, usecols=None, output_dir="./data", drop_subset_duplicates=None):
		"""Load `filepath` and write the (optionally de-duplicated) table to `output_dir`/data.csv.

		Args:
			filepath: path of the source file.
			filetype: "xlsx" or "csv".
			sheet_name: Excel sheet to read; None selects the first sheet.
			usecols: column selection forwarded to `pd.read_excel` (xlsx only).
			output_dir: directory for the cached CSV; created if missing.
			drop_subset_duplicates: optional column list; rows duplicated on these
				columns are dropped, keeping the first occurrence.

		Raises:
			ValueError: `filepath` does not exist or `filetype` is unsupported.
			RuntimeError: the loaded table is empty.
		"""
		if not os.path.exists(filepath):
			raise ValueError('file does not exist!')
		if filetype == "xlsx":
			# sheet_name=None would make read_excel return a dict of every sheet,
			# so fall back to the first sheet. Read errors propagate to the caller
			# instead of being printed and followed by a failure on a None table.
			data = pd.read_excel(filepath, sheet_name=0 if sheet_name is None else sheet_name, usecols=usecols)
		elif filetype == "csv":
			data = pd.read_csv(filepath)
		else:
			raise ValueError('file type not supported: {!r}'.format(filetype))
		if data.empty:
			raise RuntimeError('data empty!')
		if drop_subset_duplicates:
			data = data.drop_duplicates(subset=drop_subset_duplicates, keep="first")
		# makedirs handles nested paths and an already existing directory.
		os.makedirs(output_dir, exist_ok=True)
		data.to_csv(os.path.join(output_dir, "data.csv"), index=False)

	def get_dataset(self, file_path="./data/data.csv", objtype="df", split=0.8, filter_subset=None):
		"""Load the cached file.

		Args:
			file_path: file to read.
			objtype: "df" returns a pandas DataFrame, "ds" returns the object produced
				by `datasets.load_dataset`; any other value returns None.
			split: accepted for backward compatibility; not used.
			filter_subset: optional column list; rows with a missing value in any of
				these columns are removed.

		Raises:
			ValueError: `file_path` does not exist, or (for "ds") its extension is
				neither csv nor json.
		"""
		if not os.path.exists(file_path):
			raise ValueError('file does not exist!')
		data = None
		if objtype == "df":
			data = pd.read_csv(file_path)
			if filter_subset:
				data = data.dropna(subset=filter_subset)
		elif objtype == "ds":
			# splitext keeps the leading dot (".csv"), so strip it before comparing.
			file_format = os.path.splitext(file_path)[-1].lstrip(".").lower()
			if file_format not in ("csv", "json"):
				raise ValueError('file format not supported')
			data = load_dataset(file_format, data_files=file_path)
			if filter_subset:
				# Mirrors the DataFrame branch: keep rows where every listed column is present.
				data = data.filter(lambda row: all(row[f] is not None for f in filter_subset))
		else:
			# return list
			pass
		return data

# Source spreadsheet on the mounted Drive and the local directory for the cached CSV.
file_path = "/content/drive/MyDrive/data/RM_Wyoming_georef_training_2022-07-11.xlsx"
data_dir = "/content/data"

# Read the selected sheet/columns, drop rows duplicated on the listed columns,
# and cache the result as <data_dir>/data.csv.
obj_df = DatasetLoader(
    filepath = file_path,
    filetype = "xlsx",
    sheet_name = "RM_Woming_georef_training",
    usecols = [0, 8, 9, 16, 17, 18, 19, 23, 26, 27],
    output_dir = data_dir,
    drop_subset_duplicates = ['Country','State', 'County', 'Locality', 'GeorefLatitude', 'GeorefLongitude']
)

# Reload the cached CSV as a DataFrame, discarding rows without text or coordinates.
data_df = obj_df.get_dataset(file_path = os.path.join(data_dir, "data.csv"), objtype = "df", split = 0.8, filter_subset = ["Locality", 'GeorefLatitude', 'GeorefLongitude'])
print(data_df)

In [None]:
# normalize labels
# get_normalizer also stores the instance in the notebook-level `normalizer`,
# which the label scaling and the distance metrics read.
normalizer = get_normalizer("jointscale")
# Pretrained checkpoint used for both tokenizer and model in the experiments below.
checkpoint="roberta-base"

# tokenizer process
class MyTokenzier():
  """Bundle tokenizer, data collator, train/dev splits, scaled labels and model for one experiment.

  Relies on the notebook-level `normalizer`, which must be set (see get_normalizer)
  before an instance is created.
  """

  def __init__(self, filepath, filetype, usecols=None, filter_fc=None, checkpoint="roberta-base", classifier=False):
    """Load the data, split it 80/20 (seed 42), scale the labels and load the model.

    Args:
      filepath: data file forwarded to `load_dataset` as `data_files`.
      filetype: loader name forwarded to `load_dataset` (e.g. "csv").
      usecols: accepted but not used.
      filter_fc: optional row predicate applied with the dataset's `filter`.
      checkpoint: pretrained checkpoint for tokenizer and model.
      classifier: when True a `class_id` column is derived from Family and
        ScientificName and appended to the label vector.
    """
    self.checkpoint = checkpoint
    self.tokenizer = AutoTokenizer.from_pretrained(self.checkpoint)
    self.data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)
    self.dataset = load_dataset(filetype, data_files=filepath)
    self.class_dics = {}
    # clean & filter data
    if filter_fc:
      self.dataset = self.dataset.filter(filter_fc)
    # convert Family and ScientificName to class ID
    if classifier:
      self.dataset = self.convert_classifier_id()
    # split data into train data and dev data
    self.dataset = self.dataset["train"].train_test_split(train_size=0.8, seed=42)
    self.dataset["dev"] = self.dataset.pop("test")
    # process labels
    self.train_y_sc, self.dev_y_sc = self.set_labels(classifier)
    print("  {} training examples with {} output features".format(len(self.train_y_sc), self.train_y_sc.shape[1]))
    # import pretrained model; one output per label column
    self.model = AutoModelForSequenceClassification.from_pretrained(self.checkpoint, num_labels = self.train_y_sc.shape[1])

  def convert_classifier_id(self):
    """Add a `class_id` column built from the Family / ScientificName pair of each row.

    `self.class_dics` maps family -> [family base id, {scientific name -> class id}].
    A new family gets base id (number of families seen so far) * 100; a new name
    within a family gets base id + (names already in that family) + 1.
    """
    def convert_dics(data):
      family = data["Family"]
      name = data["ScientificName"]
      family_index = len(self.class_dics)
      if family not in self.class_dics:
        self.class_dics[family] = [family_index*100, {}]
      family_id, family_ls = self.class_dics[family]
      if name not in family_ls:
        class_id = family_id + len(family_ls) + 1
        family_ls[name] = class_id
      else:
        class_id = family_ls[name]
      return {"class_id": class_id }
    return self.dataset.map(convert_dics)

  def set_labels(self, classifier=None):
    """Return (train, dev) float32 label arrays: scaled latitude/longitude, plus class_id if requested.

    The normalizer is fitted on the training split only and then applied to dev.

    Raises:
      RuntimeError: the notebook-level `normalizer` has not been created yet.
    """
    if normalizer is None:
      raise RuntimeError("normalizer is not set; call get_normalizer(...) before building MyTokenzier")
    train_y, dev_y = self.dataset["train"], self.dataset["dev"]
    train_y_sc = normalizer.fit_transform(np.array(np.c_[train_y["GeorefLatitude"], train_y["GeorefLongitude"]], dtype=np.float32))
    dev_y_sc = normalizer.transform(np.array(np.c_[dev_y["GeorefLatitude"], dev_y["GeorefLongitude"]], dtype=np.float32))
    if classifier:
      # Append the class id as one extra column in a single vectorised step
      # rather than fetching and extending one dataset row at a time.
      train_y_sc = np.c_[train_y_sc, np.asarray(train_y["class_id"], dtype=np.float32)].astype(np.float32)
      dev_y_sc = np.c_[dev_y_sc, np.asarray(dev_y["class_id"], dtype=np.float32)].astype(np.float32)
    return train_y_sc, dev_y_sc

  def tokenization(self, cols=None):
    """Tokenize one text column, or a pair of text columns, for both splits.

    Args:
      cols: one or two column names; defaults to ["Locality"]. With two names the
        tokenizer receives them as a (first, second) pair.

    Returns:
      (train_ds, dev_ds) with the tokenizer outputs and a "labels" column; the
      text columns are removed.

    Raises:
      ValueError: `cols` does not contain one or two names.
    """
    cols = ["Locality"] if cols is None else list(cols)
    if len(cols) not in (1, 2):
      raise ValueError("cols must contain one or two column names, got {}".format(cols))

    def build_frame(split, labels):
      # One row per example: the text column(s) followed by the label vector.
      frame = pd.DataFrame(zip(*(split[c] for c in cols), labels))
      frame.columns = cols + ["labels"]
      return frame

    # tokenizer
    def tokenize_function(data):
      return self.tokenizer(data[cols[0]], truncation=True) if len(cols) == 1 else self.tokenizer(data[cols[0]], data[cols[1]], truncation=True)

    train_df = build_frame(self.dataset["train"], self.train_y_sc)
    dev_df = build_frame(self.dataset["dev"], self.dev_y_sc)
    train_ds = Dataset.from_pandas(train_df)
    train_ds = train_ds.map(tokenize_function, batched=True, remove_columns=cols)
    dev_ds = Dataset.from_pandas(dev_df)
    dev_ds = dev_ds.map(tokenize_function, batched=True, remove_columns=cols)
    return train_ds, dev_ds


In [None]:
# Specify the arguments for the trainer
# Evaluation and checkpointing both happen once per epoch; only one checkpoint is
# retained, and the checkpoint with the best 'loss' is reloaded when training ends.
training_args = TrainingArguments(
    output_dir ='./results',
    num_train_epochs = 20,
    per_device_train_batch_size = 32,
    per_device_eval_batch_size = 20,
    weight_decay = 0.01,
    learning_rate = 2e-5,
    logging_dir = './logs',
    metric_for_best_model = 'loss',
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    prediction_loss_only = False,  # keep predictions so compute_metrics can run
    save_total_limit = 1,
    load_best_model_at_end=True,
) 

In [None]:
# trainer
class RoBertaTrainer(Trainer):
  """Trainer whose loss is a configurable torch criterion applied to the model logits."""

  def __init__(self, loss_fct="MSELoss", *args, **kwargs):
    """
    Args:
      loss_fct: "MAELoss" (torch L1Loss), "MSELoss", "CrossEntropyLoss", or an
        already constructed callable criterion.
      *args, **kwargs: forwarded to `Trainer`.

    Raises:
      ValueError: `loss_fct` is neither a known name nor callable.
    """
    super().__init__(*args, **kwargs)
    # Mean absolute error is torch's L1Loss; each name maps to its own criterion.
    criteria = {
      "MAELoss": torch.nn.L1Loss,
      "MSELoss": torch.nn.MSELoss,
      "CrossEntropyLoss": torch.nn.CrossEntropyLoss,
    }
    if callable(loss_fct):
      self.loss_fct = loss_fct
    elif loss_fct in criteria:
      self.loss_fct = criteria[loss_fct]()
    else:
      raise ValueError("unknown loss_fct {!r}; expected one of {}".format(loss_fct, sorted(criteria)))

  def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
    """Run the forward pass and score the logits against the labels with `self.loss_fct`.

    `num_items_in_batch` is accepted because newer Trainer versions pass it; it is not used.
    """
    labels = inputs.get("labels")
    # forward pass
    outputs = model(**inputs)
    logits = outputs.get("logits")
    # compute custom loss on (batch, num_labels) shaped tensors
    loss = self.loss_fct(logits.view(-1, model.num_labels), labels.view(-1, model.num_labels))
    return (loss, outputs) if return_outputs else loss


In [None]:
# Compute Metrics
def compute_metrics_for_regression(eval_pred):
    """Turn a (predictions, labels) pair into median and mean distance metrics."""
    predictions, targets = eval_pred
    return {
        "Median(km)": median_dist(predictions, targets),
        "Mean(km)": mean_dist(predictions, targets),
    }

In [None]:
# Experiment 1
# [Locality] -> [lat, lon], MAE/MSE
# Rows must have Locality, SiteDescription and both coordinates present, even though
# only Locality is tokenized in this experiment.
tokens = MyTokenzier("./data/data.csv", "csv", checkpoint=checkpoint, filter_fc=(lambda x: x["Locality"] is not None and x["SiteDescription"] is not None and x["GeorefLatitude"] is not None and x["GeorefLongitude"] is not None))
train_ds, dev_ds = tokens.tokenization(cols=["Locality"])
print(train_ds)
print('train_df_data', train_ds[:10])
print('train_df features', train_ds.features)

# Loss name handed to RoBertaTrainer; switch by swapping the commented line.
loss_fct = "MAELoss"
# loss_fct = "MSELoss"

# Call the Trainer
trainer = RoBertaTrainer(
    model = tokens.model,                         
    args = training_args,                  
    train_dataset = train_ds,         
    eval_dataset = dev_ds,          
    compute_metrics = compute_metrics_for_regression,    
    tokenizer=tokens.tokenizer,
    data_collator=tokens.data_collator, 
    loss_fct=loss_fct
)

# Train the model
trainer.train()
# Evaluate on the dev split passed as eval_dataset
trainer.evaluate()

In [None]:
# Experiment 2
# [Locality, SiteDescription] -> [lat, lon], MAE/MSE
# Same row filter as Experiment 1; here both text columns are tokenized as a pair.
tokens = MyTokenzier("./data/data.csv", "csv", checkpoint=checkpoint, filter_fc=(lambda x: x["Locality"] is not None and x["SiteDescription"] is not None and x["GeorefLatitude"] is not None and x["GeorefLongitude"] is not None))
train_ds, dev_ds = tokens.tokenization(cols=["Locality", "SiteDescription"])
print(train_ds)
print('train_df_data', train_ds[:10])
print('train_df features', train_ds.features)

# Loss name handed to RoBertaTrainer; switch by swapping the commented line.
# loss_fct = "MAELoss"
loss_fct = "MSELoss"

# Call the Trainer
trainer = RoBertaTrainer(
    model = tokens.model,                         
    args = training_args,                  
    train_dataset = train_ds,         
    eval_dataset = dev_ds,          
    compute_metrics = compute_metrics_for_regression,    
    tokenizer=tokens.tokenizer,
    data_collator=tokens.data_collator, 
    loss_fct=loss_fct
)

# Train the model
trainer.train()
# Evaluate on the dev split passed as eval_dataset
trainer.evaluate()

In [None]:
# trainer
class RoBertaClassifierTrainer(Trainer):
  """Trainer whose loss is a regression term plus a cross-entropy term over split logits."""

  def __init__(self, loss_fct="MSELoss", classifier="CrossEntropyLoss", *args, **kwargs):
    """
    Args:
      loss_fct: regression criterion: "MAELoss" (torch L1Loss), "MSELoss", or an
        already constructed callable criterion.
      classifier: accepted for interface compatibility; the classification term
        is always torch's CrossEntropyLoss.
      *args, **kwargs: forwarded to `Trainer`.

    Raises:
      ValueError: `loss_fct` is neither a known name nor callable.
    """
    super().__init__(*args, **kwargs)
    # Mean absolute error is torch's L1Loss; each name maps to its own criterion.
    criteria = {
      "MAELoss": torch.nn.L1Loss,
      "MSELoss": torch.nn.MSELoss,
    }
    if callable(loss_fct):
      self.loss_fct = loss_fct
    elif loss_fct in criteria:
      self.loss_fct = criteria[loss_fct]()
    else:
      raise ValueError("unknown loss_fct {!r}; expected one of {}".format(loss_fct, sorted(criteria)))
    self.classifier = torch.nn.CrossEntropyLoss()

  def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
    """Split logits and labels into a regression part and a classification part and sum both losses.

    `num_items_in_batch` is accepted because newer Trainer versions pass it; it is not used.
    """
    labels = inputs.get("labels")
    # forward pass
    outputs = model(**inputs)
    logits = outputs.get("logits")
    # tensor_split into 2 sections along dim 1: leading columns -> regression, remainder -> classification
    reg_logits, classf_logits = torch.tensor_split(logits, 2, dim=1)
    reg_labels, classf_labels = torch.tensor_split(labels, 2, dim=1)
    # compute custom loss
    reg_loss = self.loss_fct(reg_logits.view(-1, model.num_labels-1), reg_labels.view(-1, model.num_labels-1))
    # NOTE(review): the classification term is viewed as num_labels-2 columns; with the
    # three-column labels built for classifier runs that is a single column - confirm
    # that a one-column cross-entropy is what is intended here.
    classf_loss = self.classifier(classf_logits.view(-1, model.num_labels-2), classf_labels.view(-1, model.num_labels-2))
    loss = reg_loss + classf_loss
    return (loss, outputs) if return_outputs else loss


# Compute Metrics
def compute_metrics(eval_pred):
    """Distance metrics for the classifier runs: only the first two columns of each row are scored."""
    predictions, targets = eval_pred
    # Keep the first two entries of every row and drop any extra columns.
    coord_pred = np.array([[row[0], row[1]] for row in predictions], dtype=np.float32)
    coord_true = np.array([[row[0], row[1]] for row in targets], dtype=np.float32)
    print(coord_pred.shape, coord_true.shape)
    return {
        "Median(km)": median_dist(coord_pred, coord_true),
        "Mean(km)": mean_dist(coord_pred, coord_true),
    }

In [None]:
# Experiment 3
# [Locality, Family, ScientificName] -> [lat, lon], MAE+Classifier(CrossEntropyLoss) / MSE+Classifier(CrossEntropyLoss)
# Build a fresh classifier-enabled dataset and model for this experiment; this statement
# must stay on its own line so the cell does not fall back on `tokens` from an earlier cell.
tokens = MyTokenzier("./data/data.csv", "csv", checkpoint=checkpoint, classifier=True, filter_fc=(lambda x: x["Family"] is not None and x["ScientificName"] is not None and x["Locality"] is not None and x["SiteDescription"] is not None and x["GeorefLatitude"] is not None and x["GeorefLongitude"] is not None))
train_ds, dev_ds = tokens.tokenization(cols=["Locality"])
print(train_ds)
print('train_df_data', train_ds[:10])
print('train_df features', train_ds.features)

# Call the Trainer
classifier = "CrossEntropyLoss"
# loss_fct = "MAELoss"
loss_fct = "MSELoss"

# Build the trainer with the regression loss selected above
trainer = RoBertaClassifierTrainer(
    model = tokens.model,
    args = training_args,
    train_dataset = train_ds,
    eval_dataset = dev_ds,
    compute_metrics = compute_metrics,
    tokenizer=tokens.tokenizer,
    data_collator=tokens.data_collator,
    loss_fct=loss_fct,
    classifier=classifier
)

# Train the model
trainer.train()
# Evaluate on the dev split passed as eval_dataset
trainer.evaluate()

In [None]:
# Experiment 4
# [Locality, SiteDescription, Family, ScientificName] -> [lat, lon], MAE+Classifier(CrossEntropyLoss) / MSE+Classifier(CrossEntropyLoss)
# classifier=True adds the class_id derived from Family/ScientificName to the labels;
# Locality and SiteDescription are tokenized as a pair.
tokens = MyTokenzier("./data/data.csv", "csv", checkpoint=checkpoint, classifier=True, filter_fc=(lambda x: x["Family"] is not None and x["ScientificName"] is not None and x["Locality"] is not None and x["SiteDescription"] is not None and x["GeorefLatitude"] is not None and x["GeorefLongitude"] is not None))
train_ds, dev_ds = tokens.tokenization(cols=["Locality", "SiteDescription"])
print(train_ds)
print('train_df_data', train_ds[:10])
print('train_df features', train_ds.features)

# Call the Trainer
classifier = "CrossEntropyLoss"
# loss_fct = "MAELoss"
loss_fct = "MSELoss"

# Build the trainer with the regression loss selected above
trainer = RoBertaClassifierTrainer(
    model = tokens.model,                         
    args = training_args,                  
    train_dataset = train_ds,         
    eval_dataset = dev_ds,          
    compute_metrics = compute_metrics,    
    tokenizer=tokens.tokenizer,
    data_collator=tokens.data_collator, 
    loss_fct=loss_fct,
    classifier=classifier
)

# Train the model
trainer.train()
# Evaluate on the dev split passed as eval_dataset
trainer.evaluate()

In [None]:
# test & predict
