In [None]:
!pip install transformers
!pip install evaluate
!pip install datasets

In [None]:
import csv
import json

import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

In [None]:
# Load the tokenizer that belongs to the pretrained checkpoint. It is shared by
# tokenize_function (training data) and classify_tweet (inference).
tokenizer = AutoTokenizer.from_pretrained("hackathon-pln-es/twitter_sexismo-finetuned-robertuito-exist2021")

In [None]:
# Read the train and dev CSV files into DataFrames.
train_data_es, dev_data_es = (
    pd.read_csv(csv_path)
    for csv_path in ("labeled_train_es.csv", "labeled_dev_es.csv")
)

In [None]:
def tokenize_function(examples):
    """Tokenize the "translated_tweet" field of a batch of examples.

    Every sequence is truncated and padded to exactly 130 tokens so that all
    examples share the same length.

    Args:
        examples: Mapping that provides the texts under "translated_tweet".

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    texts = examples["translated_tweet"]
    return tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=130,
    )

In [None]:
# Wrap both DataFrames as `datasets.Dataset` objects.
train_dataset, dev_dataset = map(Dataset.from_pandas, (train_data_es, dev_data_es))

In [None]:
# Apply tokenize_function to each split in batched mode.
tokenized_train_data, tokenized_dev_data = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, dev_dataset)
)

In [None]:
# Load the same checkpoint the tokenizer came from, with a
# sequence-classification head configured for two labels.
model = AutoModelForSequenceClassification.from_pretrained("hackathon-pln-es/twitter_sexismo-finetuned-robertuito-exist2021",
                                                           num_labels=2)

In [None]:
# Extra metric used by compute_metrics, which calls it with beta=2 to report an
# F-beta score alongside the scikit-learn figures.
metric = evaluate.load("leslyarun/fbeta_score")

In [None]:
def compute_metrics(eval_pred):
    """Compute the evaluation metrics reported by the Trainer.

    Args:
        eval_pred: Pair ``(logits, labels)``. The predicted class of each
            example is the argmax of its logits over the last axis.

    Returns:
        dict with "accuracy" and the macro-averaged "f1", "precision" and
        "recall" taken from scikit-learn's classification report, plus "f2"
        computed by the module-level ``metric`` with ``beta=2``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    report = classification_report(labels, predictions, output_dict=True)
    macro_avg = report["macro avg"]

    f2 = metric.compute(predictions=predictions, references=labels, beta=2)
    # `evaluate` metrics return their result wrapped in a dict. Unwrap a
    # single-entry result so "f2" is a plain scalar like the other metrics
    # instead of a nested dict that loggers cannot record as a number.
    if isinstance(f2, dict) and len(f2) == 1:
        (f2,) = f2.values()

    return {
        "accuracy": report["accuracy"],
        "f1": macro_avg["f1-score"],
        "precision": macro_avg["precision"],
        "recall": macro_avg["recall"],
        "f2": f2,
    }

In [None]:
# Training configuration: outputs go to "test_trainer", the evaluation strategy
# is "epoch", and training runs for a single epoch.
# NOTE(review): newer transformers releases name this argument `eval_strategy`;
# confirm that `evaluation_strategy` is still accepted by the installed version.
training_args = TrainingArguments(output_dir="test_trainer",
                                  evaluation_strategy="epoch",
                                  num_train_epochs=1)

In [None]:
# Bundle the model, the training configuration, the tokenized train/dev splits
# and the metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_dev_data,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model on tokenized_train_data with the settings in training_args.
trainer.train()

In [None]:
# Run evaluation with the trainer; as the last expression of the cell, the
# returned value is displayed as the cell output.
trainer.evaluate()

In [None]:
# Persist the model to the "output" directory; it is reloaded from there for inference.
trainer.save_model("output")

In [None]:
# Reload the model that was saved to the "output" directory.
model = AutoModelForSequenceClassification.from_pretrained("output")

# Wrap the reloaded model in a bare Trainer; the inference helpers reach the
# model (and its device) through `trainer.model`.
trainer = Trainer(model=model)

# Inference

In [None]:
# Classify a single tweet and return the result in the desired format
def classify_tweet(tweet, max_length=128):
    """Classify one tweet with ``trainer.model``.

    Args:
        tweet: Text of the tweet to classify.
        max_length: Maximum number of tokens passed to the model. Defaults to
            128, the cut-off this function has always applied.

    Returns:
        dict with two entries:
            "hard_label": "YES" if class 1 has the highest probability,
                otherwise "NO".
            "soft_label": dict mapping "YES" to the softmax probability of
                class 1 and "NO" to that of class 0.
    """
    # Let the tokenizer do the truncation rather than slicing the encoded
    # tensors afterwards: slicing cuts off whatever the tokenizer appended at
    # the end of a long sequence, which differs from how the training data was
    # truncated in tokenize_function.
    encoding = tokenizer(
        tweet,
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
    )
    device = trainer.model.device
    model_inputs = {name: tensor.to(device) for name, tensor in encoding.items()}

    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        logits = trainer.model(**model_inputs).logits

    probas = logits.softmax(dim=-1)
    predicted_label = probas.argmax().item()
    soft_label = {"YES": probas[0, 1].item(), "NO": probas[0, 0].item()}
    hard_label = "YES" if predicted_label == 1 else "NO"
    return {"hard_label": hard_label, "soft_label": soft_label}


# Classify all tweets in a csv file and return results in desired format
def classify_csv(csv_file):
    """Classify every tweet listed in a CSV file.

    The first row is treated as a header and skipped. In each remaining row the
    first column is used as the tweet id and the third column as the text to
    classify; any further columns (such as a label) are ignored, so labeled and
    unlabeled files can both be read. Blank rows are skipped.

    Args:
        csv_file: Path of the UTF-8 encoded CSV file.

    Returns:
        dict mapping each tweet id to the result of ``classify_tweet`` for its
        text. If an id occurs more than once, the last text wins.

    Raises:
        IndexError: If a non-blank row has fewer than three columns.
    """
    tweets = {}
    # newline="" is what the csv module requires so that line breaks inside
    # quoted fields (tweets may contain them) are parsed correctly.
    with open(csv_file, "r", encoding="utf-8", newline="") as f:
        reader = csv.reader(f)
        next(reader, None)  # skip header row; an empty file yields no results
        for row in reader:
            if not row:
                continue
            tweets[row[0]] = row[2]
    return {
        tweet_id: classify_tweet(translated_tweet)
        for tweet_id, translated_tweet in tweets.items()
    }


# Classify the train and dev data, then save each result set as a json file
train_results = classify_csv("labeled_train_es.csv")
print('train completed')
dev_results = classify_csv("labeled_dev_es.csv")
print('dev completed')

for json_path, payload in (
    ("train_results.json", train_results),
    ("dev_results.json", dev_results),
):
    with open(json_path, "w") as f:
        json.dump(payload, f)


In [None]:
# Classify a single tweet and return the result in the desired format
def classify_tweet(tweet, max_length=128):
    """Classify one tweet with ``trainer.model``.

    Args:
        tweet: Text of the tweet to classify.
        max_length: Maximum number of tokens passed to the model. Defaults to
            128, the cut-off this function has always applied.

    Returns:
        dict with two entries:
            "hard_label": "YES" if class 1 has the highest probability,
                otherwise "NO".
            "soft_label": dict mapping "YES" to the softmax probability of
                class 1 and "NO" to that of class 0.
    """
    # Let the tokenizer do the truncation rather than slicing the encoded
    # tensors afterwards: slicing cuts off whatever the tokenizer appended at
    # the end of a long sequence, which differs from how the training data was
    # truncated in tokenize_function.
    encoding = tokenizer(
        tweet,
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
    )
    device = trainer.model.device
    model_inputs = {name: tensor.to(device) for name, tensor in encoding.items()}

    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        logits = trainer.model(**model_inputs).logits

    probas = logits.softmax(dim=-1)
    predicted_label = probas.argmax().item()
    soft_label = {"YES": probas[0, 1].item(), "NO": probas[0, 0].item()}
    hard_label = "YES" if predicted_label == 1 else "NO"
    return {"hard_label": hard_label, "soft_label": soft_label}


# Classify all tweets in a csv file and return results in desired format
def classify_csv(csv_file):
    """Classify every tweet listed in a CSV file.

    The first row is treated as a header and skipped. In each remaining row the
    first column is used as the tweet id and the third column as the text to
    classify; any further columns (such as a label) are ignored, so labeled and
    unlabeled files can both be read. Blank rows are skipped.

    Args:
        csv_file: Path of the UTF-8 encoded CSV file.

    Returns:
        dict mapping each tweet id to the result of ``classify_tweet`` for its
        text. If an id occurs more than once, the last text wins.

    Raises:
        IndexError: If a non-blank row has fewer than three columns.
    """
    tweets = {}
    # newline="" is what the csv module requires so that line breaks inside
    # quoted fields (tweets may contain them) are parsed correctly.
    with open(csv_file, "r", encoding="utf-8", newline="") as f:
        reader = csv.reader(f)
        next(reader, None)  # skip header row; an empty file yields no results
        for row in reader:
            if not row:
                continue
            tweets[row[0]] = row[2]
    return {
        tweet_id: classify_tweet(translated_tweet)
        for tweet_id, translated_tweet in tweets.items()
    }
    

# Classify the tweets in the test CSV and write the predictions to test_results.json.
test_results = classify_csv("translated_test_es.csv")

with open("test_results.json", "w") as f:
    json.dump(test_results, f)