In [1]:
import os
# GPU SELECTION: COMMA-SEPARATED DEVICE IDS MADE VISIBLE TO CUDA.
# KAGGLE ONLY HAS 1 GPU ("0"); OFFLINE YOU CAN LIST MORE, E.G. "0,1,2,3" FOR FOUR.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# HUGGINGFACE IDENTIFIER OF THE BACKBONE MODEL
MODEL_NAME = 'allenai/longformer-base-4096'

# VERSION TAG FOR SAVING/LOADING MODEL WEIGHTS.
# KEEP IT IN SYNC WITH THE MODEL REFERENCED BY LOAD_MODEL_FROM.
VER = 14

# TOKENS: None -> NOTEBOOK COMPUTES THEM, A PATH -> NOTEBOOK LOADS THEM FROM THERE
LOAD_TOKENS_FROM = '../input/tf-longformer-v12'

# MODEL: None -> TRAIN A NEW MODEL, A PATH -> LOAD A PREVIOUSLY TRAINED ONE
# (E.G. '../input/tflongformerv14')
LOAD_MODEL_FROM = None

# HUGGINGFACE FILES: None -> USE INTERNET AND DOWNLOAD CONFIG, TOKENIZER AND MODEL
# INTO THE LOCAL 'model' DIRECTORY, A PATH -> USE THE FILES ALREADY STORED THERE
DOWNLOADED_MODEL_PATH = '../input/tf-longformer-v12'
DOWNLOADED_MODEL_PATH = (
    'model' if DOWNLOADED_MODEL_PATH is None else DOWNLOADED_MODEL_PATH
)

In [2]:
from datasets import get_dataset_config_names, load_dataset

In [3]:
# Load the "electronics" configuration of the "subjqa" dataset with
# `datasets.load_dataset`. The result is indexed by split name below
# (e.g. subjqa['train']).
subjqa = load_dataset("subjqa", name="electronics")

Downloading:   0%|          | 0.00/2.65k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

Downloading and preparing dataset subjqa/electronics (download: 10.86 MiB, generated: 3.01 MiB, post-processed: Unknown size, total: 13.86 MiB) to /root/.cache/huggingface/datasets/subjqa/electronics/1.1.0/e5588f9298ff2d70686a00cc377e4bdccf4e32287459e3c6baf2dc5ab57fe7fd...


Downloading: 0.00B [00:00, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset subjqa downloaded and prepared to /root/.cache/huggingface/datasets/subjqa/electronics/1.1.0/e5588f9298ff2d70686a00cc377e4bdccf4e32287459e3c6baf2dc5ab57fe7fd. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
# Switch the dataset's output format to pandas so that indexing a split
# returns pandas objects (the next cells call DataFrame methods on the result).
subjqa.set_format(type='pandas')

In [5]:
# Take every row of the train split with a full slice.
qadf = subjqa['train'][:]

In [6]:
# Preview the first three rows of the train split.
qadf.head(3)

Unnamed: 0,domain,nn_mod,nn_asp,query_mod,query_asp,q_reviews_id,question_subj_level,ques_subj_score,is_ques_subjective,review_id,id,title,context,question,answers
0,electronics,great,bass response,excellent,bass,0514ee34b672623dff659334a25b599b,5,0.5,False,882b1e2745a4779c8f17b3d4406b91c7,2543d296da9766d8d17d040ecc781699,B00001P4ZH,"I have had Koss headphones in the past, Pro 4A...",How is the bass?,"{'text': [], 'answer_start': [], 'answer_subj_..."
1,electronics,harsh,high,not strong,bass,7c46670208f7bf5497480fbdbb44561a,1,0.5,False,ce76793f036494eabe07b33a9a67288a,d476830bf9282e2b9033e2bb44bbb995,B00001P4ZH,To anyone who hasn't tried all the various typ...,Is this music song have a goo bass?,"{'text': ['Bass is weak as expected', 'Bass is..."
2,electronics,neutral,sound,present,bass,8fbf26792c438aa83178c2d507af5d77,1,0.5,False,d040f2713caa2aff0ce95affb40e12c2,455575557886d6dfeea5aa19577e5de4,B00001P4ZH,I have had many sub-$100 headphones from $5 Pa...,How is the bass?,{'text': ['The only fault in the sound is the ...


In [7]:
# Flatten nested columns (e.g. "answers" -> "answers.text") and convert every
# split to its own DataFrame, keyed by split name.
flattened = subjqa.flatten()
dfs = {name: flattened[name].to_pandas() for name in flattened}

# Report how many distinct question ids each split contains.
for split, df in dfs.items():
    n_questions = df['id'].nunique()
    print(f"Number of questions in {split}: {n_questions}")

Number of questions in train: 1295
Number of questions in test: 358
Number of questions in validation: 255


In [8]:
# Columns relevant to extractive QA in the flattened SubjQA frame.
qa_cols = [
    "title",
    "question",
    "answers.text",
    "answers.answer_start",
    "context",
]
# Draw two reproducible example rows (fixed random_state) and display them.
sample_df = dfs["train"].loc[:, qa_cols].sample(n=2, random_state=7)
sample_df

Unnamed: 0,title,question,answers.text,answers.answer_start,context
791,B005DKZTMG,Does the keyboard lightweight?,[this keyboard is compact],[215],I really like this keyboard. I give it 4 star...
1159,B00AAIPT76,How is the battery?,[],[],I bought this after the first spare gopro batt...


In [9]:
# Inspect the answer texts stored for train row 5; the displayed value is an
# array, so a single question can carry more than one answer span.
dfs['train']['answers.text'][5]

array(['that had terrible bass',
       'This is my first review for the Sony MDR'], dtype=object)

# How To Submit TensorFlow Without Internet
Many people ask me how to submit TensorFlow models without internet access. With HuggingFace Transformers, it's easy: download three things — (1) the model weights, (2) the tokenizer files, and (3) the config file — and upload them to a Kaggle dataset. The code below shows how to get these files from HuggingFace for AllenAI's `longformer-base` model, but the same code can download any transformer, for example `roberta-base`.

In [10]:
# Runs only when no pre-downloaded copy was configured, i.e. DOWNLOADED_MODEL_PATH
# fell back to 'model'. Fetches tokenizer, config and TF weights for MODEL_NAME
# from HuggingFace and stores all three in the local 'model' directory.
if DOWNLOADED_MODEL_PATH == 'model':
    # Imported locally: this cell runs before the notebook's
    # `from transformers import *` cell, so on a fresh kernel these names
    # would otherwise be undefined.
    from transformers import AutoConfig, AutoTokenizer, TFAutoModel

    # exist_ok=True keeps the cell re-runnable once the directory exists.
    os.makedirs('model', exist_ok=True)

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    tokenizer.save_pretrained('model')

    config = AutoConfig.from_pretrained(MODEL_NAME)
    config.save_pretrained('model')

    backbone = TFAutoModel.from_pretrained(MODEL_NAME, config=config)
    backbone.save_pretrained('model')

The code above saves the following files:
* TOKENIZER FILES - merges.txt, tokenizer_config.json, special_tokens_map.json, tokenizer.json, vocab.json
* CONFIG FILE - config.json
* MODEL WEIGHT FILE - tf_model.h5

Then upload all of these files to a Kaggle dataset, as I did [here][1]. Load them from that dataset in your own notebook, the same way this notebook does, and you can turn the internet off!

[1]: https://www.kaggle.com/cdeotte/tf-longformer-v12

# Load Libraries

In [11]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
# Set the TensorFlow native log level BEFORE importing TensorFlow: a value set
# after the import cannot filter messages emitted while the library loads.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
# NOTE(review): the wildcard import is kept because later cells rely on names
# such as AutoTokenizer; consider importing the needed names explicitly.
from transformers import *
print('TF version',tf.__version__)

TF version 2.6.2


In [12]:
# PICK THE DISTRIBUTION STRATEGY FROM THE NUMBER OF VISIBLE GPUS:
# A COMMA IN CUDA_VISIBLE_DEVICES MEANS MORE THAN ONE DEVICE ID WAS LISTED.
visible_gpus = os.environ["CUDA_VISIBLE_DEVICES"]
if ',' in visible_gpus:
    strategy = tf.distribute.MirroredStrategy()
    print('multiple strategy')
else:
    strategy = tf.distribute.get_strategy()
    print('single strategy')

single strategy


In [13]:
# Turn on the TensorFlow graph optimizer's "auto_mixed_precision" option.
optimizer_options = {"auto_mixed_precision": True}
tf.config.optimizer.set_experimental_options(optimizer_options)
print('Mixed precision enabled')

Mixed precision enabled


# Load Train

In [14]:
# Read the competition's discourse annotations, report (rows, columns),
# then preview the first rows.
train_csv_path = '../input/feedback-prize-2021/train.csv'
train = pd.read_csv(train_csv_path)
print(train.shape)
train.head(n=5)

(144293, 8)


Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,423A1CA112E2,1622628000000.0,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1622628000000.0,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,423A1CA112E2,1622628000000.0,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
3,423A1CA112E2,1622628000000.0,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
4,423A1CA112E2,1622628000000.0,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...


In [15]:
# Column layout of the QA-style training table; it starts out with no rows.
qa_cols = [
    "id",
    "question",
    "answers.text",
    "answers.answer_start",
    "answers.answer_end",
    "context",
]
train_df = pd.DataFrame(columns=qa_cols)

In [16]:
# Display the QA frame to confirm its column layout.
train_df

Unnamed: 0,id,question,answers.text,answers.answer_start,answers.answer_end,context


In [17]:
# Discourse types present in train.discourse_type; each one is used as the
# "question" when the data is recast as QA.
labels = [
    'Lead',
    'Position',
    'Evidence',
    'Claim',
    'Concluding Statement',
    'Counterclaim',
    'Rebuttal',
]

In [18]:
# Distinct essay ids, in order of first appearance in train.
IDS = train['id'].unique()
print(f'There are {len(IDS)} train texts.')

There are 15594 train texts.


In [19]:
# Sanity check: annotations of the first essay, restricted to the first label.
first_essay_mask = train['id'] == IDS[0]
df = train.loc[first_essay_mask]
df.loc[df['discourse_type'] == labels[0]]

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,423A1CA112E2,1622628000000.0,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...


In [20]:
# MAXIMUM NUMBER OF TOKENS KEPT PER ESSAY
MAX_LEN = 512

# TOKENIZER OF THE CHECKPOINT NAMED BELOW
# (ALTERNATIVE LEFT BY THE AUTHOR: AutoTokenizer.from_pretrained(DOWNLOADED_MODEL_PATH))
model_ckpt = "deepset/minilm-uncased-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# THE TOKENS AND ATTENTION ARRAYS: ONE ROW PER ESSAY, MAX_LEN COLUMNS
token_array_shape = (len(IDS), MAX_LEN)
train_tokens = np.zeros(token_array_shape, dtype=np.int32)
train_attention = np.zeros(token_array_shape, dtype=np.int32)

# DISCOURSE TYPE -> INTEGER CLASS INDEX
target_map = {
    'Lead': 0,
    'Position': 1,
    'Evidence': 2,
    'Claim': 3,
    'Concluding Statement': 4,
    'Counterclaim': 5,
    'Rebuttal': 6,
}

Downloading:   0%|          | 0.00/107 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/477 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [21]:
# BUILD THE TOKEN ARRAYS AND THE QA-STYLE TRAINING FRAME.
# FOR EVERY ESSAY: TOKENIZE ITS TEXT INTO train_tokens / train_attention, THEN
# EMIT ONE ROW PER ANNOTATED DISCOURSE ELEMENT WITH THE DISCOURSE TYPE AS THE
# "QUESTION", THE FULL ESSAY AS THE "CONTEXT" AND THE ELEMENT'S TEXT AND
# discourse_start / discourse_end VALUES AS THE "ANSWER".
qa_cols = ["id", "question", "answers.text","answers.answer_start", "answers.answer_end", "context"]

# PER-(ESSAY, LABEL) FRAMES ARE COLLECTED HERE AND CONCATENATED ONCE AFTER THE
# LOOP; CONCATENATING INSIDE THE LOOP RE-COPIES THE WHOLE FRAME EVERY ITERATION.
qa_frames = []

# FOR LOOP THROUGH EACH TRAIN TEXT
for id_num in range(len(IDS)):
    #if LOAD_TOKENS_FROM: break
    if id_num % 500 == 0: print(id_num,', ',end='')

    # READ TRAIN TEXT, TOKENIZE, AND SAVE IN TOKEN ARRAYS
    n = IDS[id_num]
    name = f'../input/feedback-prize-2021/train/{n}.txt'
    # CONTEXT MANAGER SO THE FILE HANDLE IS CLOSED AFTER EACH ESSAY
    with open(name, 'r') as essay_file:
        txt = essay_file.read()
    tokens = tokenizer.encode_plus(txt, max_length=MAX_LEN, padding='max_length',
                                   truncation=True, return_offsets_mapping=True)
    train_tokens[id_num,] = tokens['input_ids']
    train_attention[id_num,] = tokens['attention_mask']

    # ONE SMALL FRAME PER DISCOURSE TYPE; THE SCALAR id / question / context
    # VALUES ARE BROADCAST OVER THE ANSWER LISTS. A TYPE WITH NO ANNOTATION IN
    # THIS ESSAY YIELDS EMPTY LISTS AND THEREFORE CONTRIBUTES NO ROWS.
    df = train.loc[train.id==n]
    for label in labels:
        label_df = df[df.discourse_type == label]
        item = {'id': n, 'question': label,
                "answers.text": label_df.discourse_text.tolist(),
                "answers.answer_start": label_df.discourse_start.tolist(),
                "answers.answer_end": label_df.discourse_end.tolist(),
                "context": txt}
        qa_frames.append(pd.DataFrame.from_dict(item))

# START FROM A FRESH EMPTY FRAME (NOT THE EXISTING GLOBAL train_df) SO THAT
# RE-RUNNING THIS CELL REBUILDS THE TABLE INSTEAD OF APPENDING DUPLICATE ROWS.
# THE EMPTY FRAME ALSO FIXES THE COLUMN LAYOUT WHEN THERE ARE NO ESSAYS.
train_df = pd.concat([pd.DataFrame(columns=qa_cols)] + qa_frames)

0 , 500 , 1000 , 1500 , 2000 , 2500 , 3000 , 3500 , 4000 , 4500 , 5000 , 5500 , 6000 , 6500 , 7000 , 7500 , 8000 , 8500 , 9000 , 9500 , 10000 , 10500 , 11000 , 11500 , 12000 , 12500 , 13000 , 13500 , 14000 , 14500 , 15000 , 15500 , 

In [22]:
# Preview the first rows of the QA-formatted training frame built above.
train_df.head()

Unnamed: 0,id,question,answers.text,answers.answer_start,answers.answer_end,context
0,423A1CA112E2,Lead,Modern humans today are always on their phone....,8.0,229.0,Phones\n\nModern humans today are always on th...
0,423A1CA112E2,Position,They are some really bad consequences when stu...,230.0,312.0,Phones\n\nModern humans today are always on th...
0,423A1CA112E2,Evidence,Some certain areas in the United States ban ph...,313.0,401.0,Phones\n\nModern humans today are always on th...
1,423A1CA112E2,Evidence,"When people have phones, they know about certa...",402.0,758.0,Phones\n\nModern humans today are always on th...
2,423A1CA112E2,Evidence,That's why there's a thing that's called no te...,887.0,1150.0,Phones\n\nModern humans today are always on th...


In [23]:
# Persist the QA-formatted frame to the working directory; index=False leaves
# the DataFrame index out of the CSV.
train_df.to_csv("qatrain_df.csv", index = False)