In [1]:
# Imports
!pip install -q transformers[torch] datasets emoji accelerate evaluate
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from tokenizers import BertWordPieceTokenizer
import emoji
import torch
import torch.nn.functional as F
import re

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/542.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/542.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/431.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m431.4/431.4 kB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90

In [2]:
# Eliminar saltos de línea y espacios repetidos
def delete_spaces(comment):
    spaces_pattern = r'[\n\r]+|\s+'
    return re.sub(spaces_pattern, ' ', comment)

# Convertir todo a minúsculas
def lower_text(comment):
    return comment.lower()

# Eliminar URL
def delete_urls(comment):
    url_pattern = r'http[s]?://\S+'
    return re.sub(url_pattern, '', comment)

# Eliminar consonantes repetidas y puntos suspensivos
def delete_repeated_consonants(comment):
    repeated_consonant_pattern = r'([^aeiou\s\r\n0-9])\1{1,}'
    def replace(match):
        char = match.group(1)
        if char in 'rcnl':
            return char * 2
        else:
            return char

    return re.sub(repeated_consonant_pattern, replace, comment, flags=re.IGNORECASE)

# Dejar como máximo 2 vocales iguales contiguas
def delete_repeated_vowels(comment):
    repeated_vowels_pattern = r'([aeiouAEIOU])\1{2,}'
    return re.sub(repeated_vowels_pattern, r'\1\1', comment, flags=re.IGNORECASE)

# Eliminar acentos no empleados en Español
def delete_accents (comment):
    comment = re.sub(r"[àâãäå]", "a", comment)
    comment = re.sub(r"ç", "c", comment)
    comment = re.sub(r"[èêë]", "e", comment)
    comment = re.sub(r"[ìîï]", "i", comment)
    comment = re.sub(r"[òôõö]", "o", comment)
    comment = re.sub(r"[ùû]", "u", comment)
    comment = re.sub(r"[ýÿ]", "y", comment)
    return comment

# Eliminar caracteres inusuales
def delete_characters(comment):
    special_characters = r'[ºª|·~¬\^`[\]¨´#\\\'\(\)*\<>_]'
    return re.sub(special_characters, '', comment)

# Eliminar otros caracteres inusuales
def delete_specific_characters(comment):
    specific_chars_pattern = r'[«»✈ºø♀♂€🇮±°ª‘’“”🇱]'
    return re.sub(specific_chars_pattern, '', comment)

# Eliminar emoticonos
def delete_emoticons(comment):
    return emoji.replace_emoji(comment, replace='')

# Eliminar caracteres árabes
def delete_arabic_letters(text):
    arabic_pattern = r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF]'
    return re.sub(arabic_pattern, '', text)

# Unificar las distintas formas de expresar la risa
def unify_laughs (comment):
    laugh_pattern = r"\b(a*ha+h[ha]*|o?l+o+l+[ol]*|x+d+[x*d*]*|a*ja+[j+a+]+|j+e+j+[ej]*|j+s+j+[sj]*|j+i+j+[ij]*)\b"
    return re.sub(laugh_pattern, 'jaja', comment, flags=re.IGNORECASE)

def delete_extra (comment):
  return re.sub(r'\bmikeldi\b', '', comment, flags=re.IGNORECASE)

# Función para preprocesar el texto
def preprocess_comment(comment):
    comment = delete_spaces(comment)
    comment = lower_text(comment)
    comment = delete_urls(comment)
    comment = delete_repeated_consonants(comment)
    comment = delete_repeated_vowels(comment)
    comment = delete_accents(comment)
    comment = delete_characters(comment)
    comment = delete_specific_characters(comment)
    comment = delete_emoticons(comment)
    comment = delete_arabic_letters(comment)
    comment = unify_laughs(comment)
    comment = delete_extra(comment)
    return comment

In [3]:
def compute_metrics(pred):
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    acc = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro')
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [4]:
# Cargar el dataset
database = "amaiaruvi/news_racist_comments_spanish"
dataset = load_dataset(database)
dataset

Downloading readme:   0%|          | 0.00/623 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/406k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/68.1k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/121k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3005 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/438 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/851 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['link', 'title', 'comment', 'racist'],
        num_rows: 3005
    })
    validation: Dataset({
        features: ['link', 'title', 'comment', 'racist'],
        num_rows: 438
    })
    test: Dataset({
        features: ['link', 'title', 'comment', 'racist'],
        num_rows: 851
    })
})

In [5]:
# Cargar el modelo
modelo = "dccuchile/bert-base-spanish-wwm-uncased"
model = AutoModelForSequenceClassification.from_pretrained(modelo, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(modelo)

config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/310 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/486k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

In [6]:
model.config

BertConfig {
  "_name_or_path": "dccuchile/bert-base-spanish-wwm-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.41.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 31002
}

In [7]:
tokenizer

BertTokenizerFast(name_or_path='dccuchile/bert-base-spanish-wwm-uncased', vocab_size=31002, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	5: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [8]:
tokenizer.special_tokens_map

{'unk_token': '[UNK]',
 'sep_token': '[SEP]',
 'pad_token': '[PAD]',
 'cls_token': '[CLS]',
 'mask_token': '[MASK]'}

In [9]:
tokenizer.vocab_size

31002

In [10]:
tokenizer.model_max_length

512

In [11]:
tokenizer.get_vocab()

{'búsqueda': 5701,
 'estad': 3750,
 'indebido': 21069,
 'alabado': 28776,
 'ren': 2526,
 'quienes': 4103,
 'sancho': 21390,
 'níger': 21855,
 'campeonatos': 16841,
 'tejido': 11440,
 '##campe': 17626,
 'criados': 26503,
 '##essa': 19213,
 'envenen': 12036,
 'visu': 16608,
 'boxeo': 19821,
 '##cam': 9875,
 'traerme': 21033,
 'religioso': 11277,
 'distribuidos': 22037,
 'fundaron': 26400,
 'frito': 24293,
 'bruto': 11978,
 'adulto': 12765,
 'permanezca': 26162,
 'sabor': 10708,
 '##érmelo': 22486,
 'intervenido': 30211,
 'creencias': 12441,
 'peti': 5611,
 'impe': 6713,
 'del': 1081,
 'quédense': 17633,
 'terminemos': 23014,
 'disf': 3612,
 'realizaba': 24219,
 'declarando': 29566,
 'conten': 3012,
 'criterios': 6744,
 'exclusiva': 12630,
 'ángulo': 13491,
 'cazando': 27734,
 'abogado': 4682,
 'pechos': 18609,
 '##ián': 15062,
 'puse': 9410,
 'incendio': 10186,
 'accesorios': 17013,
 'votantes': 17766,
 'mada': 15616,
 'contando': 12024,
 'restau': 5669,
 '##bie': 9084,
 'obligaron': 247

In [12]:
# Entrenar un tokenizador nuevo con los comentarios del conjunto de entrenamiento
new_tokenizer = BertWordPieceTokenizer(lowercase=True)
texts = [preprocess_comment(ex["comment"]) for ex in dataset['train']]
new_tokenizer.train_from_iterator(
    texts, min_frequency=10
)

In [13]:
new_tokenizer

Tokenizer(vocabulary_size=2460, model=BertWordPiece, unk_token=[UNK], sep_token=[SEP], cls_token=[CLS], pad_token=[PAD], mask_token=[MASK], clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True, wordpieces_prefix=##)

In [14]:
# Calcular cuántos tokens nuevos obtenemos
old_tokens = set(tokenizer.get_vocab())
missing_tokens = [tok for tok in new_tokenizer.get_vocab() if tok not in old_tokens]
len(missing_tokens)

262

In [15]:
# Estos son los tokens obtenidos
for i, tok in enumerate(sorted(missing_tokens)):
    print(f"{i+1:<4} {tok}")

1    ##acion
2    ##adron
3    ##aj
4    ##aja
5    ##ajaj
6    ##ajajajaj
7    ##alicion
8    ##amb
9    ##ascis
10   ##asion
11   ##baj
12   ##cacion
13   ##capaci
14   ##ccion
15   ##cto
16   ##cuen
17   ##dian
18   ##dran
19   ##ectivo
20   ##ecuen
21   ##esion
22   ##estar
23   ##fier
24   ##fob
25   ##guenza
26   ##guir
27   ##icacion
28   ##iculo
29   ##inci
30   ##iria
31   ##isimo
32   ##isticas
33   ##itantes
34   ##izacion
35   ##kaldo
36   ##lacion
37   ##mig
38   ##nacion
39   ##nif
40   ##obre
41   ##oci
42   ##ocia
43   ##oll
44   ##omb
45   ##omp
46   ##osi
47   ##osicion
48   ##ote
49   ##otros
50   ##oz
51   ##paci
52   ##patr
53   ##puer
54   ##rab
55   ##racion
56   ##ramente
57   ##resp
58   ##rop
59   ##tacion
60   ##tb
61   ##tler
62   ##tracion
63   ##tumb
64   ##tuza
65   ##tx
66   ##tza
67   ##uecos
68   ##uen
69   ##uena
70   ##uente
71   ##uerdo
72   ##uestra
73   ##uir
74   ##upar
75   ##ustr
76   ##usval
77   ##utacion
78   ##vidu
79   ##xu
80   ##xuales
8

In [16]:
# Algunos tokens no nos interesan
tokens_to_delete = [
    "##aj",
    "##aja",
    "##ajaj",
    "##ajajajaj",
    "–",
    "—",
    "…"
]
add_tokens = [t for t in missing_tokens if t not in tokens_to_delete and t.isdigit() == False]
len(add_tokens)

249

In [17]:
# Añadimos los tokens al tokenizador. Aparecen como added token
tokenizer.add_tokens(add_tokens)
tokenizer

BertTokenizerFast(name_or_path='dccuchile/bert-base-spanish-wwm-uncased', vocab_size=31002, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	5: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	31002: AddedToken("##ecuen", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	3100

In [18]:
# Reemplazamos el tokenizador del modelo
model.resize_token_embeddings(len(tokenizer))

Embedding(31251, 768)

Ahora hay que reentrenar el modelo:

In [19]:
print("Preprocessing data...")
preprocessed_data = dataset.map(lambda ex: {
    "comment": preprocess_comment(ex["comment"]),
    "title": preprocess_comment(ex["title"]),
    "label": ex["racist"]
})

Preprocessing data...


Map:   0%|          | 0/3005 [00:00<?, ? examples/s]

Map:   0%|          | 0/438 [00:00<?, ? examples/s]

Map:   0%|          | 0/851 [00:00<?, ? examples/s]

In [20]:
def custom_tokenizer(examples):
    return tokenizer(
        examples["comment"],
        examples["title"],
        padding=True,
        truncation=True,
        max_length=tokenizer.model_max_length,
    )

In [21]:
preprocessed_data['test'][1]

{'link': 'https://okdiario.com/espana/vox-empapela-gerona-carteles-arabe-estas-espana-hombres-mujeres-tienen-mismos-derechos-12797483',
 'title': 'vox empapela gerona con carteles en árabe: estás en españa, hombres y mujeres tienen los mismos derechos',
 'comment': 'mira quien habla, los de los tiros en la nuca.',
 'racist': 0,
 'label': 0}

In [22]:
tokenized = custom_tokenizer(preprocessed_data['test'][1])
tokens_strings = tokenizer.convert_ids_to_tokens(tokenized['input_ids'])

print("Texto a tokenizar:", preprocessed_data['test'][1]['comment'], ' + ', preprocessed_data['test'][1]['title'])
print("Tokens:", tokens_strings)
print("\n\ninput_ids:", tokenized['input_ids'])
print("token_type_ids:", tokenized['token_type_ids'])
print("attention_mask:", tokenized['attention_mask'])

Texto a tokenizar: mira quien habla, los de los tiros en la nuca.  +  vox empapela gerona con carteles en árabe: estás en españa, hombres y mujeres tienen los mismos derechos
Tokens: ['[CLS]', 'mira', 'quien', 'habla', ',', 'los', 'de', 'los', 'tiros', 'en', 'la', 'nuca', '.', '[SEP]', 'vox', 'empa', '##pel', '##a', 'ger', '##ona', 'con', 'carteles', 'en', 'árabe', ':', 'estás', 'en', 'españa', ',', 'homb', 'res', 'y', 'mujeres', 'tie', 'ne', '##n', 'los', 'mismos', 'dere', 'cho', '##s', '[SEP]']


input_ids: [4, 2065, 1925, 2892, 1019, 1067, 1009, 1067, 15737, 1035, 1032, 27509, 1008, 5, 31192, 7431, 13722, 30956, 4601, 1791, 1048, 19949, 1035, 6338, 995, 1499, 1035, 2942, 1019, 31200, 1196, 1040, 2209, 31041, 1504, 30959, 1067, 4549, 31103, 5414, 30958, 5]
token_type_ids: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
attention_mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [23]:
print("Tokenizing data...")
encoded_data = preprocessed_data.map(custom_tokenizer, batched=True)
encoded_data = encoded_data.remove_columns(['link', 'title', 'comment', 'racist'])

Tokenizing data...


Map:   0%|          | 0/3005 [00:00<?, ? examples/s]

Map:   0%|          | 0/438 [00:00<?, ? examples/s]

Map:   0%|          | 0/851 [00:00<?, ? examples/s]

In [24]:
# Aquí cambiaríamos los hiperparámetros
epochs = 8
batch_size = 8
learning_rate = 2.5e-5
weight_decay = 0.3

warmup_proportion = 0.1
total_steps = (epochs * len(dataset['train'])) / batch_size
warmup_steps = int(warmup_proportion * total_steps)

training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    evaluation_strategy="epoch",
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    warmup_steps=warmup_steps,
    eval_accumulation_steps=1,
    logging_steps=500,
    save_steps=1000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    train_dataset=encoded_data['train'],
    eval_dataset=encoded_data['validation'],
    args=training_args,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")

)



In [25]:
# Entrenamiento
print("Training the model...")
trainer.train()

Training the model...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.475019,0.826484,0.687671,0.783814,0.659454
2,0.426000,0.523399,0.835616,0.758048,0.764294,0.752491
3,0.245000,0.745979,0.851598,0.773679,0.79306,0.759154
4,0.067400,1.043266,0.847032,0.749853,0.799634,0.723529
5,0.067400,1.139504,0.8379,0.748648,0.772769,0.732173
6,0.011500,1.170753,0.853881,0.783276,0.79272,0.77515
7,0.003900,1.223369,0.842466,0.761692,0.77722,0.74964
8,0.000100,1.241985,0.842466,0.761692,0.77722,0.74964


TrainOutput(global_step=3008, training_loss=0.12530298601974751, metrics={'train_runtime': 1525.728, 'train_samples_per_second': 15.756, 'train_steps_per_second': 1.972, 'total_flos': 3943435525528920.0, 'train_loss': 0.12530298601974751, 'epoch': 8.0})

In [26]:
# Evaluar el modelo con el conjunto de validación: se queda con el mejor valor
print("Evaluating with validation set.")
trainer.evaluate()

Evaluating with validation set.


{'eval_loss': 1.2419854402542114,
 'eval_accuracy': 0.8424657534246576,
 'eval_f1': 0.7616918600525158,
 'eval_precision': 0.7772204632561239,
 'eval_recall': 0.7496398559423769,
 'eval_runtime': 6.4267,
 'eval_samples_per_second': 68.153,
 'eval_steps_per_second': 8.558,
 'epoch': 8.0}

In [27]:
# Predicciones
print("Predictions:")
test_predictions = trainer.predict(encoded_data["test"])
y_true = test_predictions.label_ids

logits = test_predictions.predictions
# Convertir los logits a un tensor de PyTorch
logits_tensor = torch.tensor(logits)
# Aplicar la función softmax a los logits para obtener probabilidades
probabilities = F.softmax(logits_tensor, dim=1)
# Obtener las clases predichas (índice de la probabilidad más alta)
y_pred = torch.argmax(probabilities, dim=1)
reporte = classification_report(y_true, y_pred, output_dict=False)
print(reporte)

Predictions:


              precision    recall  f1-score   support

           0       0.89      0.93      0.91       654
           1       0.73      0.63      0.68       197

    accuracy                           0.86       851
   macro avg       0.81      0.78      0.79       851
weighted avg       0.86      0.86      0.86       851



In [28]:
new_model_name = "beto-finetuned-racist-news-comments-spanish"
new_model_path = f"./models/{new_model_name}"

model.save_pretrained(new_model_path)
tokenizer.save_pretrained(new_model_path)

('./models/beto-finetuned-racist-news-comments-spanish/tokenizer_config.json',
 './models/beto-finetuned-racist-news-comments-spanish/special_tokens_map.json',
 './models/beto-finetuned-racist-news-comments-spanish/vocab.txt',
 './models/beto-finetuned-racist-news-comments-spanish/added_tokens.json',
 './models/beto-finetuned-racist-news-comments-spanish/tokenizer.json')

In [29]:
model.push_to_hub("amaiaruvi/beto-finetuned-racist-news-comments-spanish")
tokenizer.push_to_hub("amaiaruvi/beto-finetuned-racist-news-comments-spanish")

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/amaiaruvi/beto-finetuned-racist-news-comments-spanish/commit/7eb75b39d1d81317e40c076c1a4261986ae1683d', commit_message='Upload tokenizer', commit_description='', oid='7eb75b39d1d81317e40c076c1a4261986ae1683d', pr_url=None, pr_revision=None, pr_num=None)