# Data Exploration

## Setup

In [45]:
import numpy as np
import pandas as pd
import sklearn
import sklearn.neighbors
import sklearn.ensemble
import sklearn.cluster
import sklearn.feature_selection
import matplotlib.pyplot as plt
import ast
import datetime
import torch
from torch.utils.data import Dataset
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import json

In [2]:
train_file = open('data/train_set.json')
train_json = json.load(train_file)
train_json

[{'id': 0,
  'text': "Remains dating to the 5th century were found in tomb in Hwangnam-dong . Show a man 's bones on top of a woman 's who was buried with jewellery . Experts believe Silla Dynasty-era tomb was built for a noblewoman and her lover or bodyguard was sacrificed and buried on top of her . There 's a suggestion that the set-up may have been designed to show two people having sex - and the Silla were known for their explicit pottery .",
  'label': 1},
 {'id': 1,
  'text': "Professional Identification is a type of social identification and is the sense of oneness individuals have with a profession (e.g. law, medicine) and the degree to which individuals define themselves as profession members. Professional identity consists of the individual's alignment of roles, responsibilities, values, and ethical standards to be consistent with practices accepted by their specific profession. Sources of professional identification Researchers have found that a desire for quality (rather th

In [3]:
type(train_json), len(train_json)

(list, 4000)

In [29]:
tokenizer = get_tokenizer("spacy", language="en_core_web_sm")
tokens = tokenizer(train_json[0]['text'])
tokens

['Professional',
 'Identification',
 'is',
 'a',
 'type',
 'of',
 'social',
 'identification',
 'and',
 'is',
 'the',
 'sense',
 'of',
 'oneness',
 'individuals',
 'have',
 'with',
 'a',
 'profession',
 '(',
 'e.g.',
 'law',
 ',',
 'medicine',
 ')',
 'and',
 'the',
 'degree',
 'to',
 'which',
 'individuals',
 'define',
 'themselves',
 'as',
 'profession',
 'members',
 '.',
 'Professional',
 'identity',
 'consists',
 'of',
 'the',
 'individual',
 "'s",
 'alignment',
 'of',
 'roles',
 ',',
 'responsibilities',
 ',',
 'values',
 ',',
 'and',
 'ethical',
 'standards',
 'to',
 'be',
 'consistent',
 'with',
 'practices',
 'accepted',
 'by',
 'their',
 'specific',
 'profession',
 '.',
 'Sources',
 'of',
 'professional',
 'identification',
 'Researchers',
 'have',
 'found',
 'that',
 'a',
 'desire',
 'for',
 'quality',
 '(',
 'rather',
 'than',
 'profits',
 ')',
 'is',
 'associated',
 'with',
 'professional',
 'identification',
 '.',
 'Organizations',
 'tend',
 'to',
 'be',
 'concerned',
 'wit

In [62]:
y = []
X = []
for sample in train_json:
    X.append({'id': sample['id'], 'text': sample['text']})
    y.append(sample['label'])

X_train, X_val, y_train, y_val = sklearn.model_selection.train_test_split(X, y, test_size=0.1, random_state=0)

In [35]:
def yield_tokens(json):
    for sample in json:
        yield tokenizer(sample['text'])
vocab = build_vocab_from_iterator(yield_tokens(X_train), specials=["<bos>", "<eos>", "<unk>", "<pad>"])
vocab.set_default_index(2)

In [36]:
len(vocab)

35848

In [37]:
vocab.lookup_indices(tokenizer(X_val[0]['text']))

[8763,
 7114,
 8,
 3409,
 3917,
 832,
 9,
 10,
 33486,
 365,
 16,
 2,
 4,
 268,
 72,
 1355,
 7114,
 8,
 3917,
 539,
 2052,
 9,
 10,
 29360,
 603,
 6590,
 4,
 14,
 365,
 5727,
 17,
 10,
 2,
 25,
 6,
 17781,
 9,
 6,
 6483,
 4]

# Quick HuggingFace baseline

In [38]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [61]:
from transformers import AutoTokenizer

bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

loading configuration file config.json from cache at C:\Users\caioj/.cache\huggingface\hub\models--bert-base-cased\snapshots\5532cc56f74641d4bb33641f5c76a55d11f846e0\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading file vocab.txt from cache at C:\Users\caioj/.cache\huggingface\hub\models--bert-base-cased\snapshots\5532cc56f74641d4bb33641f5c76a55d11f846e0\vocab.txt
loa

In [72]:
{**bert_tokenizer(X_train[0]['text']), 'label': 1}

{'input_ids': [101,
  170,
  1643,
  13217,
  1367,
  117,
  2481,
  117,
  22572,
  1813,
  7174,
  172,
  23403,
  9945,
  3276,
  1108,
  17493,
  1120,
  22572,
  4578,
  2758,
  3828,
  2371,
  119,
  171,
  12809,
  2448,
  1162,
  19858,
  1103,
  1285,
  1119,
  1108,
  17493,
  119,
  102],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'label': 1}

In [79]:
from transformers import TrainingArguments, Trainer, ProgressCallback
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

training_args = TrainingArguments(output_dir="lightning_logs", evaluation_strategy="epoch", per_device_train_batch_size=1, per_device_eval_batch_size=1)

callback = ProgressCallback()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [76]:
class TextDataset(Dataset):
    def __init__(self, X, y, vocab):
        self.X = X
        self.y = y
        self.vocab = vocab

    def __getitem__(self, item):
        return {**bert_tokenizer(self.X[item]['text'][:512]), 'label': self.y[item]}

    def __len__(self):
        return len(self.X)

In [80]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=TextDataset(X_train, y_train, vocab),
    eval_dataset=TextDataset(X_val, y_val, vocab),
    compute_metrics=compute_metrics,
    callbacks=[callback]
)

In [81]:
trainer.train()

***** Running training *****
  Num examples = 3600
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 10800
  Number of trainable parameters = 108311810


  0%|          | 0/10800 [00:00<?, ?it/s]

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [49]:
model.forward('Text')

AttributeError: 'str' object has no attribute 'size'

# Make submission

In [None]:
make_submission = False

if make_submission:
    test_ids = read_train_df['TweetID'].iloc[val_indexes]
    # model = xgboost.XGBRegressor(verbosity=1, max_depth=10)
    # model.fit(full_train_X, train_y)
    #
    # test_predictions = model.predict(full_test_X)

    submission_df = pd.DataFrame(data={'TweetID': test_ids, 'retweets_count': val_results})
    submission_df.to_csv('data/val_predictions.csv', index=False)

    print('Saved csv')