In [5]:
import numpy as np 

import pandas as pd 
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', 1000)

import matplotlib.pyplot as plt

from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint

import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer


from transformers import *
from tokenizers import BertWordPieceTokenizer
from nltk.tokenize import word_tokenize

import os
import re
import string
import random
import time
import nltk, string
from tqdm import tqdm

# Print the full path of every file available under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))



/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [6]:
# Load the competition's training and test sets.
# (The sample submission is read later, right before prediction.)
train_df = pd.read_csv('../input/nlp-getting-started/train.csv')
test_df = pd.read_csv('../input/nlp-getting-started/test.csv')

In [7]:
# Tweet ids whose label is treated as wrong; their target is forced to 0 below.
ids_with_target_error = [328,443,513,2619,3640,3900,4342,5781,6552,6554,6570,6701,6702,6729,6861,7226]

# `.loc` is the accessor meant for boolean-mask assignment; `.at` is for a
# single scalar cell and should not be given a mask.
target_error_mask = train_df['id'].isin(ids_with_target_error)
train_df.loc[target_error_mask, 'target'] = 0

# Display the corrected rows as a sanity check.
train_df[target_error_mask]

Unnamed: 0,id,keyword,location,text,target
229,328,annihilated,,Ready to get annihilated for the BUCS game,0
301,443,apocalypse,,Short Reading\n\nApocalypse 21:1023 \n\nIn the spirit the angel took me to the top of an enormous high mountain and... http://t.co/v8AfTD9zeZ,0
356,513,army,Studio,But if you build an army of 100 dogs and their leader is a lion all dogs will fight like a lion.,0
1822,2619,crashed,,My iPod crashed..... \n#WeLoveYouLouis \n#MTVHottest One Direction,0
2536,3640,desolation,"Quilmes , Arg",This desperation dislocation\nSeparation condemnation\nRevelation in temptation\nIsolation desolation\nLet it go and so to find away,0
2715,3900,devastated,PG Chillin!,Man Currensy really be talkin that talk... I'd be more devastated if he had a ghostwriter than anybody else....,0
3024,4342,dust%20storm,chicago,Going to a fest? Bring swimming goggles for the dust storm in the circle pit,0
4068,5781,forest%20fires,,Campsite recommendations \nToilets /shower \nPub \nFires \nNo kids \nPizza shop \nForest \nPretty stream \nNo midges\nNo snakes\nThanks ??,0
4609,6552,injury,Saint Paul,My prediction for the Vikings game this Sunday....dont expect a whole lot. Infact I think Zimmer goal is....injury free 1st game,0
4611,6554,injury,,Dante Exum's knee injury could stem Jazz's hoped-for surge back to ... http://t.co/8PIFutrB5U,0


In [8]:
def clean_tweets(tweet):
    """Strip non-printable characters and URLs from a tweet.

    Characters not contained in ``string.printable`` are dropped first, then
    every ``http...`` run of non-whitespace characters is removed with a
    regular expression.

    Args:
        tweet: The raw tweet text.

    Returns:
        The cleaned text.
    """
    printable_only = ''.join(filter(string.printable.__contains__, tweet))

    # Remove URLs with a regular expression.
    return re.sub(r"http\S+", "", printable_only)

In [9]:
# train_df text , test_df text
train_df['text'] = train_df['text'].apply(lambda x: clean_tweets(x))
test_df['text'] = test_df['text'].apply(lambda x: clean_tweets(x))

In [10]:
def remove_emoji(text):
    """Delete every run of characters that falls into the emoji ranges below.

    NOTE(review): the last range (U+24C2 to U+1F251) is very wide and also
    covers many non-emoji characters (e.g. CJK text) - confirm this is intended.

    Args:
        text: The input string.

    Returns:
        The string with all matching characters removed.
    """
    codepoint_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
        "\U00002702-\U000027B0",
        "\U000024C2-\U0001F251",
    )
    matcher = re.compile("[" + "".join(codepoint_ranges) + "]+", flags=re.UNICODE)
    return matcher.sub('', text)

In [11]:
# Strip emoji from the text column of both frames.
for frame in (train_df, test_df):
    frame['text'] = frame['text'].apply(remove_emoji)

In [12]:
def remove_punctuations(text):
    """Surround punctuation marks with spaces so they become separate tokens.

    Despite the name, nothing is removed: each listed punctuation character is
    padded with one space on both sides. An ellipsis is kept together as a
    single ``...`` token; when the text contains no ``...`` at all, every
    ``..`` is treated as an ellipsis as well.

    The ellipsis is handled in the same pass as the single characters. Padding
    every ``.`` first (as the previous implementation did) splits ``...`` into
    three separate dots, so the ellipsis replacement could never match.

    Args:
        text: The input string.

    Returns:
        The string with padded punctuation.
    """
    punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`"

    # Promote a two-dot "ellipsis" to three dots only when no real one exists.
    if '...' not in text:
        text = text.replace('..', '...')

    # Try the three-dot alternative first so an ellipsis stays in one piece.
    token_pattern = re.compile(r'\.\.\.|[' + re.escape(punctuations) + ']')
    return token_pattern.sub(lambda match: f' {match.group(0)} ', text)

In [13]:
# Pad punctuation in the text column of both frames.
for frame in (train_df, test_df):
    frame['text'] = frame['text'].apply(remove_punctuations)

In [14]:
# Lookup table mapping abbreviations / slang to their expanded form.
# Keys must be lowercase: convert_abbrev() lowercases each token before the
# lookup, so a key containing capitals (formerly "IG") can never match.
# NOTE(review): keys that contain punctuation (e.g. "a.m", "w/", "tl;dr") only
# match if the tokenizer emits them as a single token - confirm against the
# preprocessing applied before the lookup.
abbreviations = {
    "$" : " dollar ",
    "€" : " euro ",
    "4ao" : "for adults only",
    "a.m" : "before midday",
    "a3" : "anytime anywhere anyplace",
    "aamof" : "as a matter of fact",
    "acct" : "account",
    "adih" : "another day in hell",
    "afaic" : "as far as i am concerned",
    "afaict" : "as far as i can tell",
    "afaik" : "as far as i know",
    "afair" : "as far as i remember",
    "afk" : "away from keyboard",
    "app" : "application",
    "approx" : "approximately",
    "apps" : "applications",
    "asap" : "as soon as possible",
    "asl" : "age, sex, location",
    "atk" : "at the keyboard",
    "ave." : "avenue",
    "aymm" : "are you my mother",
    "ayor" : "at your own risk",
    "b&b" : "bed and breakfast",
    "b+b" : "bed and breakfast",
    "b.c" : "before christ",
    "b2b" : "business to business",
    "b2c" : "business to customer",
    "b4" : "before",
    "b4n" : "bye for now",
    "b@u" : "back at you",
    "bae" : "before anyone else",
    "bak" : "back at keyboard",
    "bbbg" : "bye bye be good",
    "bbc" : "british broadcasting corporation",
    "bbias" : "be back in a second",
    "bbl" : "be back later",
    "bbs" : "be back soon",
    "be4" : "before",
    "bfn" : "bye for now",
    "blvd" : "boulevard",
    "bout" : "about",
    "brb" : "be right back",
    "bros" : "brothers",
    "brt" : "be right there",
    "bsaaw" : "big smile and a wink",
    "btw" : "by the way",
    "bwl" : "bursting with laughter",
    "c/o" : "care of",
    "cet" : "central european time",
    "cf" : "compare",
    "cia" : "central intelligence agency",
    "csl" : "can not stop laughing",
    "cu" : "see you",
    "cul8r" : "see you later",
    "cv" : "curriculum vitae",
    "cwot" : "complete waste of time",
    "cya" : "see you",
    "cyt" : "see you tomorrow",
    "dae" : "does anyone else",
    "dbmib" : "do not bother me i am busy",
    "diy" : "do it yourself",
    "dm" : "direct message",
    "dwh" : "during work hours",
    "e123" : "easy as one two three",
    "eet" : "eastern european time",
    "eg" : "example",
    "embm" : "early morning business meeting",
    "encl" : "enclosed",
    "encl." : "enclosed",
    "etc" : "and so on",
    "faq" : "frequently asked questions",
    "fawc" : "for anyone who cares",
    "fb" : "facebook",
    "fc" : "fingers crossed",
    "fig" : "figure",
    "fimh" : "forever in my heart",
    "ft." : "feet",
    "ft" : "featuring",
    "ftl" : "for the loss",
    "ftw" : "for the win",
    "fwiw" : "for what it is worth",
    "fyi" : "for your information",
    "g9" : "genius",
    "gahoy" : "get a hold of yourself",
    "gal" : "get a life",
    "gcse" : "general certificate of secondary education",
    "gfn" : "gone for now",
    "gg" : "good game",
    "gl" : "good luck",
    "glhf" : "good luck have fun",
    "gmt" : "greenwich mean time",
    "gmta" : "great minds think alike",
    "gn" : "good night",
    "g.o.a.t" : "greatest of all time",
    "goat" : "greatest of all time",
    "goi" : "get over it",
    "gps" : "global positioning system",
    "gr8" : "great",
    "gratz" : "congratulations",
    "gyal" : "girl",
    "h&c" : "hot and cold",
    "hp" : "horsepower",
    "hr" : "hour",
    "hrh" : "his royal highness",
    "ht" : "height",
    "ibrb" : "i will be right back",
    "ic" : "i see",
    "icq" : "i seek you",
    "icymi" : "in case you missed it",
    "idc" : "i do not care",
    "idgadf" : "i do not give a damn fuck",
    "idgaf" : "i do not give a fuck",
    "idk" : "i do not know",
    "ie" : "that is",
    "i.e" : "that is",
    "ifyp" : "i feel your pain",
    "ig" : "instagram",
    "iirc" : "if i remember correctly",
    "ilu" : "i love you",
    "ily" : "i love you",
    "imho" : "in my humble opinion",
    "imo" : "in my opinion",
    "imu" : "i miss you",
    "iow" : "in other words",
    "irl" : "in real life",
    "j4f" : "just for fun",
    "jic" : "just in case",
    "jk" : "just kidding",
    "jsyk" : "just so you know",
    "l8r" : "later",
    "lb" : "pound",
    "lbs" : "pounds",
    "ldr" : "long distance relationship",
    "lmao" : "laugh my ass off",
    "lmfao" : "laugh my fucking ass off",
    "lol" : "laughing out loud",
    "ltd" : "limited",
    "ltns" : "long time no see",
    "m8" : "mate",
    "mf" : "motherfucker",
    "mfs" : "motherfuckers",
    "mfw" : "my face when",
    "mofo" : "motherfucker",
    "mph" : "miles per hour",
    "mr" : "mister",
    "mrw" : "my reaction when",
    "ms" : "miss",
    "mte" : "my thoughts exactly",
    "nagi" : "not a good idea",
    "nbc" : "national broadcasting company",
    "nbd" : "not big deal",
    "nfs" : "not for sale",
    "ngl" : "not going to lie",
    "nhs" : "national health service",
    "nrn" : "no reply necessary",
    "nsfl" : "not safe for life",
    "nsfw" : "not safe for work",
    "nth" : "nice to have",
    "nvr" : "never",
    "nyc" : "new york city",
    "oc" : "original content",
    "og" : "original",
    "ohp" : "overhead projector",
    "oic" : "oh i see",
    "omdb" : "over my dead body",
    "omg" : "oh my god",
    "omw" : "on my way",
    "p.a" : "per annum",
    "p.m" : "after midday",
    "pm" : "prime minister",
    "poc" : "people of color",
    "pov" : "point of view",
    "pp" : "pages",
    "ppl" : "people",
    "prw" : "parents are watching",
    "ps" : "postscript",
    "pt" : "point",
    "ptb" : "please text back",
    "pto" : "please turn over",
    "qpsa" : "what happens", #"que pasa",
    "ratchet" : "rude",
    "rbtl" : "read between the lines",
    "rlrt" : "real life retweet",
    "rofl" : "rolling on the floor laughing",
    "roflol" : "rolling on the floor laughing out loud",
    "rotflmao" : "rolling on the floor laughing my ass off",
    "rt" : "retweet",
    "ruok" : "are you ok",
    "sfw" : "safe for work",
    "sk8" : "skate",
    "smh" : "shake my head",
    "sq" : "square",
    "srsly" : "seriously",
    "ssdd" : "same stuff different day",
    "tbh" : "to be honest",
    "tbs" : "tablespoonful",
    "tbsp" : "tablespoonful",
    "tfw" : "that feeling when",
    "thks" : "thank you",
    "tho" : "though",
    "thx" : "thank you",
    "tia" : "thanks in advance",
    "til" : "today i learned",
    "tl;dr" : "too long i did not read",
    "tldr" : "too long i did not read",
    "tmb" : "tweet me back",
    "tntl" : "trying not to laugh",
    "ttyl" : "talk to you later",
    "u" : "you",
    "u2" : "you too",
    "u4e" : "yours for ever",
    "utc" : "coordinated universal time",
    "w/" : "with",
    "w/o" : "without",
    "w8" : "wait",
    "wassup" : "what is up",
    "wb" : "welcome back",
    "wtf" : "what the fuck",
    "wtg" : "way to go",
    "wtpa" : "where the party at",
    "wuf" : "where are you from",
    "wuzup" : "what is up",
    "wywh" : "wish you were here",
    "yd" : "yard",
    "ygtr" : "you got that right",
    "ynk" : "you never know",
    "zzz" : "sleeping bored and tired"
}

In [15]:
def convert_abbrev(word):
    """Expand a single token if its lowercase form is a known abbreviation.

    Args:
        word: The token to look up.

    Returns:
        The expansion from ``abbreviations`` when the lowercased token is a
        key of that table, otherwise the token unchanged.
    """
    return abbreviations.get(word.lower(), word)

In [16]:
def convert_abbrev_in_text(text):
    """Tokenize a text, expand each abbreviation, and re-join with spaces.

    Args:
        text: The input string.

    Returns:
        The tokens produced by ``word_tokenize``, each passed through
        ``convert_abbrev``, joined by single spaces.
    """
    return ' '.join(convert_abbrev(token) for token in word_tokenize(text))

In [17]:
# Expand abbreviations in the text column of both frames.
for frame in (train_df, test_df):
    frame['text'] = frame['text'].apply(convert_abbrev_in_text)

In [18]:
# Preview the first rows of the preprocessed training data.
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this # earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask . Canada,1
2,5,,,All residents asked to ' shelter in place ' are being notified by officers . No other evacuation or shelter in place orders are expected,1
3,6,,,"13,000 people receive # wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby # Alaska as smoke from # wildfires pours into a school,1


In [19]:
# Show the (rows, columns) size of the training frame.
train_df.shape

(7613, 5)

In [20]:
def encode(texts, tokenizer, chunk_size=256, maxlen=512):
    """Turn texts into token-id sequences, tokenizing them chunk by chunk.

    Truncation and padding are both enabled on ``tokenizer`` with
    ``max_length=maxlen`` before any text is encoded; this setting stays on
    the tokenizer after the call.

    Args:
        texts: Sliceable collection of strings whose slices offer
            ``.tolist()`` (the notebook passes a pandas Series).
        tokenizer: Object providing ``enable_truncation``, ``enable_padding``
            and ``encode_batch``; each encoding must expose ``.ids``.
        chunk_size: Number of texts sent to ``encode_batch`` at once.
        maxlen: Length passed to the truncation and padding settings.

    Returns:
        A NumPy array built from the list of per-text id lists.
    """
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(max_length=maxlen)

    encoded_ids = []
    for start in tqdm(range(0, len(texts), chunk_size)):
        batch = texts[start: start + chunk_size].tolist()
        encoded_ids += [encoding.ids for encoding in tokenizer.encode_batch(batch)]

    return np.array(encoded_ids)

In [39]:
def build_model(transformer, loss='binary_crossentropy', max_len=512):
    """Build and compile a binary classifier on top of a transformer layer.

    The first output of ``transformer`` is sliced at sequence position 0,
    passed through dropout (rate 0.35) and a single sigmoid unit.

    NOTE(review): the optimizer is given by name ('adam'), so it starts with
    the framework's default learning rate; only the token ids are fed to the
    transformer (no attention mask) - confirm both are intended.

    Args:
        transformer: Callable layer applied to the integer token-id input.
        loss: Loss passed to ``compile``.
        max_len: Length of the input sequences.

    Returns:
        The compiled Keras model, tracking AUC as its metric.
    """
    token_ids = Input(shape=(max_len,), dtype=tf.int32, name='input_words')
    hidden_states = transformer(token_ids)[0]

    # Representation at the first sequence position.
    first_token = hidden_states[:, 0, :]

    regularized = tf.keras.layers.Dropout(0.35)(first_token)
    probability = Dense(1, activation='sigmoid')(regularized)

    classifier = Model(inputs=token_ids, outputs=probability)
    classifier.compile(optimizer='adam', loss=loss, metrics=[tf.keras.metrics.AUC()])

    return classifier
    

In [40]:
# Let tf.data pick the prefetch buffer size automatically.
AUTO = tf.data.experimental.AUTOTUNE

# Set up the TPU: resolve the cluster, connect to it, initialize the TPU
# system, then create the distribution strategy used to build the model.
# The order of these calls matters.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
strategy = tf.distribute.experimental.TPUStrategy(tpu)

In [41]:
def metric(y_true, y_pred):
    """Compute accuracy and macro-averaged F1 for a set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        A ``(accuracy, macro_f1)`` tuple.
    """
    return (
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred, average='macro'),
    )

In [42]:
# Load the pretrained tokenizer of the 'bert-base-cased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [43]:
# Write the tokenizer files (vocab.txt etc.) to disk so the fast tokenizer can
# be built from the saved vocabulary.
# NOTE(review): the folder is named "distilbert_base_uncased" although the
# tokenizer saved here is 'bert-base-cased'; the name is kept because a later
# cell reads the vocabulary from this folder.
save_path = '/kaggle/working/distilbert_base_uncased'
# exist_ok=True makes the cell safely re-runnable without a separate check.
os.makedirs(save_path, exist_ok=True)
tokenizer.save_pretrained(save_path)

('/kaggle/working/distilbert_base_uncased/vocab.txt',
 '/kaggle/working/distilbert_base_uncased/special_tokens_map.json',
 '/kaggle/working/distilbert_base_uncased/added_tokens.json')

In [44]:
# Build the fast WordPiece tokenizer from the vocabulary saved above.
# The path is derived from save_path instead of a relative path, so it does
# not depend on the current working directory being /kaggle/working.
# lowercase=False keeps the original casing of the text.
hf_tokenizer = BertWordPieceTokenizer(os.path.join(save_path, 'vocab.txt'), lowercase=False)

In [45]:
# Hold out 20% of the labelled data for validation. The seed is fixed so the
# split (and therefore every downstream number) is reproducible across runs.
# NOTE: train_df is rebound here, so re-running this cell alone splits the
# already-reduced frame again; re-run from the data-loading cell instead.
train_df, valid_df = train_test_split(train_df, test_size=0.2, random_state=42)

In [46]:
# Length every text is truncated / padded to when it is encoded.
MAX_SEQ_LENGTH = 512
# NOTE(review): LEARNING_RATE and NUM_EPOCHS are not referenced by any other
# cell shown in this notebook (training uses its own schedule and epoch
# count) - confirm whether they should be wired in or removed.
LEARNING_RATE = 3e-5
NUM_EPOCHS = 30

In [47]:
# Encode the train, validation and test texts (in that order) to token ids.
x_train, x_valid, x_test = (
    encode(frame.text.astype(str), hf_tokenizer, maxlen=MAX_SEQ_LENGTH)
    for frame in (train_df, valid_df, test_df)
)

100%|██████████| 20/20 [00:00<00:00, 52.38it/s]
100%|██████████| 5/5 [00:00<00:00, 53.37it/s]
100%|██████████| 13/13 [00:00<00:00, 52.36it/s]


In [48]:
# Label arrays matching the encoded train / validation inputs.
y_train, y_valid = train_df.target.values, valid_df.target.values

In [49]:
# Training pipeline: shuffle, repeat indefinitely, batch, prefetch.
# shuffle() comes before repeat() so each pass over the data is shuffled on
# its own; repeating first lets the shuffle buffer mix examples from different
# passes, so some rows can be seen twice before others are seen once.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .shuffle(2048)
    .repeat()
    .batch(64)
    .prefetch(AUTO)
)

In [50]:
# Validation pipeline: fixed order, batched, cached after the first pass,
# and prefetched.
valid_dataset = (
    tf.data.Dataset.from_tensor_slices((x_valid, y_valid))
    .batch(64)
    .cache()
    .prefetch(AUTO)
)

In [51]:
# Test pipeline: inputs only (no labels), batched in order.
# The surrounding square brackets were dropped: they turned this into a
# one-element Python list holding the Dataset rather than the Dataset itself.
test_dataset = (
    tf.data.Dataset.from_tensor_slices(x_test)
    .batch(64)
)

In [52]:
from tensorflow.keras import backend as K

def focal_loss(gamma=2., alpha=.2):
    """Create a binary focal-loss function for use with ``model.compile``.

    Args:
        gamma: Exponent of the modulating factor that down-weights examples
            the model already predicts well.
        alpha: Weight of the positive-class term; the negative-class term is
            weighted by ``1 - alpha``.

    Returns:
        A function ``(y_true, y_pred) -> scalar loss``.
    """
    def focal_loss_fixed(y_true, y_pred):
        # Compare labels in the prediction dtype so tf.equal sees one dtype.
        y_true = tf.cast(y_true, y_pred.dtype)
        # Keep probabilities strictly inside (0, 1): a saturated sigmoid
        # output of exactly 0 or 1 would otherwise make a log term infinite.
        y_pred = K.clip(y_pred, K.epsilon(), 1. - K.epsilon())

        # pt_1: predicted probability where the label is 1, else 1 (term is 0).
        pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
        # pt_0: predicted probability where the label is 0, else 0 (term is 0).
        pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))
        return -K.mean(alpha * K.pow(1. - pt_1, gamma) * K.log(pt_1)) - \
            K.mean((1 - alpha) * K.pow(pt_0, gamma) * K.log(1. - pt_0))

    return focal_loss_fixed

In [53]:
def build_lrfn(lr_start=0.000001, lr_max=0.000004, 
               lr_min=0.0000001, lr_rampup_epochs=7, 
               lr_sustain_epochs=0, lr_exp_decay=.87):
    """Create an epoch -> learning-rate function with ramp-up, hold and decay.

    The peak rate is ``lr_max`` multiplied by the number of replicas of the
    global ``strategy``. The schedule rises linearly from ``lr_start`` to the
    peak over ``lr_rampup_epochs`` epochs, stays at the peak for
    ``lr_sustain_epochs`` epochs, then decays exponentially towards ``lr_min``
    by a factor of ``lr_exp_decay`` per epoch.

    Returns:
        A function taking the epoch index and returning the learning rate.
    """
    peak_lr = lr_max * strategy.num_replicas_in_sync
    sustain_end = lr_rampup_epochs + lr_sustain_epochs

    def lrfn(epoch):
        # Linear warm-up.
        if epoch < lr_rampup_epochs:
            return (peak_lr - lr_start) / lr_rampup_epochs * epoch + lr_start
        # Hold at the peak.
        if epoch < sustain_end:
            return peak_lr
        # Exponential decay towards lr_min.
        return (peak_lr - lr_min) * lr_exp_decay**(epoch - lr_rampup_epochs - lr_sustain_epochs) + lr_min

    return lrfn

In [54]:
# Build the schedule with its default settings and wrap it in a Keras callback
# that logs the learning rate chosen for each epoch.
lrfn = build_lrfn()
lr_schedule = tf.keras.callbacks.LearningRateScheduler(
    schedule=lrfn,
    verbose=1,
)

In [55]:
# Create the transformer and the classifier inside the strategy scope so their
# variables are placed on the TPU replicas.
with strategy.scope():
    transformer_layer = TFBertModel.from_pretrained('bert-base-cased')
    # Use the same sequence length the inputs were encoded with instead of a
    # second hard-coded 512 that could drift out of sync.
    model = build_model(transformer_layer, loss=focal_loss(gamma=1.5), max_len=MAX_SEQ_LENGTH)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_words (InputLayer)     [(None, 512)]             0         
_________________________________________________________________
tf_bert_model_2 (TFBertModel ((None, 512, 768), (None, 108310272 
_________________________________________________________________
tf_op_layer_strided_slice (T [(None, 768)]             0         
_________________________________________________________________
dropout_111 (Dropout)        (None, 768)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 769       
Total params: 108,311,041
Trainable params: 108,311,041
Non-trainable params: 0
_________________________________________________________________


In [56]:
# fit 
history = model.fit(train_dataset, 
                   steps_per_epoch=10,
                   validation_data=valid_dataset,
                   callbacks=[lr_schedule],
                   epochs=6
)


Epoch 00001: LearningRateScheduler reducing learning rate to 1e-06.
Epoch 1/6

Epoch 00002: LearningRateScheduler reducing learning rate to 5.428571428571429e-06.
Epoch 2/6

Epoch 00003: LearningRateScheduler reducing learning rate to 9.857142857142859e-06.
Epoch 3/6

Epoch 00004: LearningRateScheduler reducing learning rate to 1.4285714285714289e-05.
Epoch 4/6

Epoch 00005: LearningRateScheduler reducing learning rate to 1.8714285714285717e-05.
Epoch 5/6

Epoch 00006: LearningRateScheduler reducing learning rate to 2.3142857142857145e-05.
Epoch 6/6


In [60]:
# submission csv predcit 
sample = pd.read_csv('../input/nlp-getting-started/sample_submission.csv')
preds = model.predict(x_test)

In [62]:
# Inspect the first few raw predictions (sigmoid outputs).
preds[:5]

array([[0.21652666],
       [0.21379274],
       [0.20796502],
       [0.21605703],
       [0.22063842]], dtype=float32)

In [63]:
# Binarize the predictions in place at a 0.5 threshold.
is_positive = preds >= 0.5
is_negative = preds < 0.5
preds[is_positive] = 1
preds[is_negative] = 0

In [64]:
# submission 'target'
sample['target'] = preds

In [None]:
# to_csv
sample.to_csv('result.csv', index=False)