In [35]:
import operator
import pickle

import numpy as np
import pandas as pd
from tqdm import tqdm

# Register tqdm's `progress_apply` on pandas objects; later cells call it on df['text'].
tqdm.pandas()
# Load the raw training set into `df`.
# NOTE(review): the preview rendered under this cell shows `Unnamed: 0` and `split`
# columns, which look like they belong to the commented-out processed file rather
# than the raw CSV — re-run the cell to confirm which file the output came from.
df = pd.read_csv('../data/raw/nlp_with_disaster_tweets/train.csv')
# df = pd.read_csv('../data/processed/nlp_with_disaster_tweets/train_with_splits.csv')

df.head()

Unnamed: 0,id,keyword,location,split,target,text
0,1838,burned,Canada,train,0,sure take them away from fire fighting for kin...
1,9020,stretcher,Docker container,train,0,stretcher in min speaker deck http t . co fbl...
2,3940,devastated,"Chicago, IL",train,0,keegan i m devastated
3,6330,hostage,"Victorville, CA",train,0,wut a lonely lunch . i got ditched . and i m h...
4,5180,fatalities,,train,0,i wonder how cool weird it ll look to have all...


In [2]:
## https://www.kaggle.com/christofhenkel/how-to-preprocessing-for-glove-part1-eda

def check_coverage(vocab, embeddings_index):
    """
    Report how much of a vocabulary has an entry in an embedding index.

    Prints the share of distinct words found and the share of word
    occurrences found, then returns the words that were not found.

    :param vocab: dictionary of words and their count
    :param embeddings_index: object supporting ``embeddings_index[word]`` that
        raises ``KeyError`` for unknown words
    :return: list of ``(word, count)`` pairs for out-of-vocabulary words,
        ordered by descending count
    """
    oov = {}
    found_words = 0
    found_occurrences = 0
    missing_occurrences = 0
    for word in tqdm(vocab):
        count = vocab[word]
        try:
            # Only the lookup itself matters; the vector is not kept.
            embeddings_index[word]
        except KeyError:
            # Catch only "word not present"; any other failure should surface.
            oov[word] = count
            missing_occurrences += count
        else:
            found_words += 1
            found_occurrences += count

    # Guard both ratios so an empty vocabulary reports 0% instead of crashing.
    total_occurrences = found_occurrences + missing_occurrences
    vocab_ratio = found_words / len(vocab) if len(vocab) else 0.0
    text_ratio = found_occurrences / total_occurrences if total_occurrences else 0.0

    print('Found embeddings for {:.2%} of vocab'.format(vocab_ratio))
    print('Found embeddings for  {:.2%} of all text'.format(text_ratio))
    # Ascending stable sort then reversal keeps the original tie ordering.
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x

In [3]:
def load_embeddings(path):
    """
    Unpickle and return the object stored in the file at ``path``.

    NOTE: ``pickle.load`` can execute arbitrary code while loading, so this
    should only be pointed at files from a trusted source.

    :param path: location of the pickle file
    :return: whatever object the pickle contains
    """
    with open(path, mode='rb') as handle:
        return pickle.load(handle)

def build_matrix(word_index, path, embedding_dim=None):
    """
    Build an embedding matrix whose row ``i`` holds the vector of the word
    mapped to index ``i`` in ``word_index``.

    :param word_index: dictionary mapping word -> row index; the matrix has
        ``len(word_index) + 1`` rows, so indices are expected to fit that range
    :param path: pickle file passed to ``load_embeddings``
    :param embedding_dim: number of columns; when None it is taken from the
        length of one stored vector (300 if the index is empty)
    :return: tuple ``(embedding_matrix, unknown_words)`` where rows of words
        without an embedding stay all-zero and ``unknown_words`` lists them
    """
    embedding_index = load_embeddings(path)
    if embedding_dim is None:
        # A fixed width of 300 fails for any other vector size, so read the
        # width from the data and keep 300 only as the empty-index fallback.
        sample_key = next(iter(embedding_index), None)
        embedding_dim = 300 if sample_key is None else len(embedding_index[sample_key])

    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    unknown_words = []

    for word, i in word_index.items():
        try:
            embedding_matrix[i] = embedding_index[word]
        except KeyError:
            unknown_words.append(word)
    return embedding_matrix, unknown_words

In [4]:
def build_vocab(sentences, verbose =  True):
    """
    Count how often each item occurs across all sentences.

    :param sentences: list of list of words
    :param verbose: when False the tqdm progress bar is disabled
    :return: dictionary of words and their count
    """
    counts = {}
    hide_bar = not verbose
    for tokens in tqdm(sentences, disable = hide_bar):
        for token in tokens:
            # First sighting starts from zero, later ones increment.
            counts[token] = counts.get(token, 0) + 1
    return counts

In [5]:
#https://www.kaggle.com/robertyoung/glove-twitter-pickles-27b-25d-50d-100d-200d
# Pickled GloVe Twitter vectors; the file name points at the 25-dimensional variant.
GLOVE_EMBEDDING_PATH='../data/external/glove.twitter.27B.25d.pkl'
# Loaded once here and reused by every coverage check in the notebook.
glove_embeddings = load_embeddings(GLOVE_EMBEDDING_PATH)

In [6]:
# Number of entries in the loaded embedding index.
len(glove_embeddings)


1193514

In [7]:
# Baseline coverage: tweets split on whitespace only, no cleaning applied yet.
vocab = build_vocab([tweet.split() for tweet in df['text']])
oov = check_coverage(vocab, glove_embeddings)
oov[:10]

100%|██████████| 7613/7613 [00:00<00:00, 228003.72it/s]
100%|██████████| 31924/31924 [00:00<00:00, 922976.44it/s]

Found embeddings for 23.21% of vocab
Found embeddings for  55.58% of all text





[('I', 1197),
 ('The', 552),
 ('A', 290),
 ("I'm", 225),
 ('??', 214),
 ('In', 155),
 ('...', 147),
 ('2', 145),
 ("don't", 128),
 ("it's", 115)]

In [8]:
import string
# Latin-like letters (plus three apostrophe variants) that count as "word" characters.
latin_similar = "’'‘ÆÐƎƏƐƔĲŊŒẞÞǷȜæðǝəɛɣĳŋœĸſßþƿȝĄƁÇĐƊĘĦĮƘŁØƠŞȘŢȚŦŲƯY̨Ƴąɓçđɗęħįƙłøơşșţțŧųưy̨ƴÁÀÂÄǍĂĀÃÅǺĄÆǼǢƁĆĊĈČÇĎḌĐƊÐÉÈĖÊËĚĔĒĘẸƎƏƐĠĜǦĞĢƔáàâäǎăāãåǻąæǽǣɓćċĉčçďḍđɗðéèėêëěĕēęẹǝəɛġĝǧğģɣĤḤĦIÍÌİÎÏǏĬĪĨĮỊĲĴĶƘĹĻŁĽĿʼNŃN̈ŇÑŅŊÓÒÔÖǑŎŌÕŐỌØǾƠŒĥḥħıíìiîïǐĭīĩįịĳĵķƙĸĺļłľŀŉńn̈ňñņŋóòôöǒŏōõőọøǿơœŔŘŖŚŜŠŞȘṢẞŤŢṬŦÞÚÙÛÜǓŬŪŨŰŮŲỤƯẂẀŴẄǷÝỲŶŸȲỸƳŹŻŽẒŕřŗſśŝšşșṣßťţṭŧþúùûüǔŭūũűůųụưẃẁŵẅƿýỳŷÿȳỹƴźżžẓ"
# Characters that are never treated as symbols: ASCII letters, digits, the set above, and space.
white_list = string.ascii_letters + string.digits + latin_similar + ' '
# Appends the ASCII apostrophe; latin_similar already contains it, so membership is unchanged.
white_list += "'"

In [9]:
# Collect the single-character keys of the embedding index, then keep only
# those that are not whitelisted characters.
glove_chars = ''.join(token for token in tqdm(glove_embeddings) if len(token) == 1)
glove_symbols = ''.join(ch for ch in glove_chars if ch not in white_list)
glove_symbols

100%|██████████| 1193514/1193514 [00:00<00:00, 2895521.82it/s]


'.:,!"?()！。-、…/*>^？<・&♥“”_♡´،~;（･|[）]—笑ω❤～★`$و♪=｀¿+☆☺ﾟ▽%✔@؟•█｡в\\；¡＾／→∀：＼°и←⌣#я}{░д✌˘¬━–сﾉه⭕»☀あ☹＿月✅➊＞«⌒＜●∇а➌➋え♦➍♫─○ノ✋‾▀日▄؛═，➎́๑┓↘✨✈⇒┃ㅋу̀ヽ∩×≦≧ーب↙‹┌ε▒＆＠☝║☯―σ┛▿円ｗ┐£о›｣≡んي┗位ƪ▓┏｢☑►ʃ┈فⓜ╗↓っم❄♬年̷ع時╯三＝╰､※╔·は■％｜◎ˆ∠＊☞⊂✰к아お人◦╮◠€̮‿▂з╚➡と╭⊃╝ゝ◡˙º□❌└©↑☜で⊙．艸¯◇╩̩➏¼＃╦ヾ☻∧▼－回ㅠ◆╱✿いۈ이➒◔♻✧◞그＋◉˚∂일수╬╲のつ̯◟╹لｏ⛄더ิ난┊第が分²▬➐✊❗다왜내⁰∞나ا￥┻なね⁾ง®ˇ☁ⓔ泣¸┘╥ゞ͡╠ﷺ✓①˛¨−잘월か⚪⚡②◕ｰ┳щ۳私う좀［を］▇ﻻ☔▸╣³⚽̶⭐̅՞ま♩✳今❀౪◄ⓣฺु△ۆ∵♂한◾۶▅√θ†▔◯すⓞ✖⚾′俺™土것੭안☼❛③ηツ▲❁℃ㅎ✩저시ο헐또전➜♔┣◝٩にｔ彡▶歳◜金▐년や✗♛へᴗ♣◍☕だ▕ª͟し⚫¥❥네わ¤제‸뭐ㅇⓡ⇦╋ヮ本할ふ│④중❶пこ̥ⓦ♠ⓛ¦▾❷そ❸た☎‵▉ロ枚┫✴☉卍سも꼭さ분ฅг水⇢⁽ﷲ▌때ㅁ▏⇠よ✪응٣ξ❹음ك┼ρ♉와話◢نㅅدｉ❒☂汗ｪ爆☛참て₍٢∪┎⇄١의◤خू너ｖॢ⚠♕ً♏날٥１рェ가ๆ❕거м≫ザ⇨ـ§는木⌓ﺑ어건„✉⠀嵐↗ッ✽ɔ에위ㅂ火ⓕღ☃ຶ☐✂⑤͈를ʔਊ∴을◒く하るㅜ♈막∑⋆₎◀오후男◣♀┒≪３点万게▷명丿ψㅤ涙２▪е못ت̫ミ٤ืأ번ლⓨき中ν約女ⓐらжれ⛅해ةх등말ア̵원̸┉❣ム♯̃◐➫개।◥号ัンむ№∫⑥대０✦бص½イ±✡ﻭ倍장僕≖個φｑ何ⓟ巻κ۞づみ˖͒ﾝʘقι⊖ⓘ▆นぁ₃넘ﻟ夜α΄♚ち넌⬅고큰ٰ⬇야◑ʕ¢ど嘘り⇩ⓤ♧壁ر″ﻵ✏仮๏걸로н≈̴될ِ∈件☇두̾黒만มｙぇ자ｂ猫ถｐ은♍‚⑅ٍ度秒ё٪❓照첫줄حཀﾍ部∗➙全✹걍ⓒﾛ♌▃ヘａ☣求朝ԅ祝⌯☮母➬헉高☾☋赤앗⇆т곧뭘↖皿ㅡ본유名살딱ラほ회밤譲♨і二δ∋̣лَζ番◼め工他♓جｃ̪愛ڡワ및듯フч볼␞エ＂눈➑차새ひ̐⛳じメش집☚✮✘ⓑｕ一ス恋◻↔ﾞ弾된٧後٦昼ⓚ♋暇\u06dd⬛⇓⇛４⚓늘禁인도총돈５세誰代神✍ⓗ✞ご┬올怒ク님せҧ⬆ⓢวｍ¶＇니青형÷凸널̿けณハレй☄⛔ۚℓオ짱✯色小ⓝ으π➰٠ํべ妹ゆ➪╙⑦⏰온昔ドэ新著กິル夢ﾐ父\x80◓ｱอ無犬٨✝リ❖棒۔死ﯙ✫↪ろﻓ∥❝ھ̆ⓙآ┴大μ글ｘ화楽ُː➔╖몇̲白▁ㄷஇ⁼⌑✤勝ʅｴ✲ｧ☪

In [10]:
# build_vocab iterates over each element it is given; handing it raw strings
# therefore counts individual characters rather than words.
# NOTE(review): the `jigsaw_` prefix looks inherited from another kernel — these
# are characters from this notebook's `df`; confirm before renaming.
jigsaw_chars = build_vocab(df["text"].tolist())
jigsaw_symbols = ''.join(ch for ch in jigsaw_chars if ch not in white_list)
jigsaw_symbols


100%|██████████| 7613/7613 [00:00<00:00, 93259.02it/s]


'#.,=>-?!;):@/\x89\n_&([]|*¢$\x9dª+÷%~£¤}´^¨©«\\¬¼¡`{'

In [11]:
# Symbols seen in the tweets that have no single-character entry among the GloVe symbols.
symbols_to_delete = ''.join(ch for ch in jigsaw_symbols if ch not in glove_symbols)
symbols_to_delete

'\x89\n\x9d'

In [12]:
# Symbols seen in the tweets that also appear among the GloVe symbols.
symbols_to_isolate = ''.join(ch for ch in jigsaw_symbols if ch in glove_symbols)
symbols_to_isolate


'#.,=>-?!;):@/_&([]|*¢$ª+÷%~£¤}´^¨©«\\¬¼¡`{'

In [13]:
# Translation tables keyed by code point, in the form str.translate expects:
# isolated symbols are padded with a space on each side, deleted ones map to ''.
isolate_dict = {ord(ch): ' ' + ch + ' ' for ch in symbols_to_isolate}
remove_dict = dict.fromkeys(map(ord, symbols_to_delete), '')


def handle_punctuation(x):
    """
    Apply the two module-level translation tables to a string.

    Characters in ``remove_dict`` are replaced first, then characters in
    ``isolate_dict``.

    :param x: input string
    :return: translated string
    """
    without_removed = x.translate(remove_dict)
    return without_removed.translate(isolate_dict)


In [14]:
# Overwrites df['text'] with the cleaned version, so re-running this cell
# processes text that has already been processed.
df['text'] = df['text'].progress_apply(handle_punctuation)
# test['comment_text'] = test['comment_text'].progress_apply(lambda x:handle_punctuation(x))


100%|██████████| 7613/7613 [00:00<00:00, 82837.37it/s]


In [15]:
# Coverage after handle_punctuation has been applied to df['text'].
vocab = build_vocab([tweet.split() for tweet in df['text']])
oov = check_coverage(vocab, glove_embeddings)
oov[:10]

100%|██████████| 7613/7613 [00:00<00:00, 192374.23it/s]
100%|██████████| 27118/27118 [00:00<00:00, 931083.30it/s]

Found embeddings for 31.17% of vocab
Found embeddings for  74.26% of all text





[('I', 1238),
 ('The', 554),
 ('A', 310),
 ("I'm", 228),
 ('2', 215),
 ('In', 160),
 ('3', 135),
 ('Û', 133),
 ("don't", 130),
 ("it's", 115)]

In [16]:
from nltk.tokenize.treebank import TreebankWordTokenizer
# Shared tokenizer instance; handle_contractions reads this module-level name.
tokenizer = TreebankWordTokenizer()

In [17]:
def handle_contractions(x):
    """
    Tokenize a string with the module-level ``tokenizer`` and join the
    resulting tokens back together with single spaces.

    :param x: input string
    :return: space-joined tokens
    """
    return ' '.join(tokenizer.tokenize(x))

In [18]:
# Overwrites df['text'] with the re-tokenized version, so re-running this cell
# processes text that has already been processed.
df['text'] = df['text'].progress_apply(handle_contractions)
# test['comment_text'] = test['comment_text'].progress_apply(lambda x:handle_contractions(x))

100%|██████████| 7613/7613 [00:00<00:00, 9008.81it/s]


In [37]:
# Coverage after handle_contractions has been applied to df['text'].
vocab = build_vocab([tweet.split() for tweet in df['text']])
oov = check_coverage(vocab, glove_embeddings)
oov[:10]

100%|██████████| 7613/7613 [00:00<00:00, 195445.11it/s]
100%|██████████| 22131/22131 [00:00<00:00, 864759.43it/s]

Found embeddings for 64.12% of vocab
Found embeddings for  94.08% of all text





[('bioterror', 33),
 ('prebreak', 30),
 ('bioterrorism', 28),
 ('soudelor', 26),
 ('bestnaijamade', 24),
 ('disea', 19),
 ('funtenna', 17),
 ('crematoria', 15),
 ('udhampur', 13),
 ('spos', 9)]

In [23]:
# Lower-cased copy kept in its own column so df['text'] stays as it was.
# The vectorised .str accessor replaces the row-wise lambda and leaves missing
# values as missing instead of raising on them.
df['lowered_text'] = df['text'].str.lower()

In [36]:
# Same coverage check on the lower-cased tweets.
# NOTE(review): the KeyError recorded under this cell suggests `df` was re-created
# (the first cell carries a later execution count) without re-running the cell
# that adds `lowered_text` — confirm with Restart & Run All.
vocab = build_vocab([tweet.split() for tweet in df['lowered_text']])
oov = check_coverage(vocab, glove_embeddings)
oov[:10]

KeyError: 'lowered_text'

In [26]:
# Number of distinct out-of-vocabulary words from the most recent check_coverage call.
len(oov)

9760

In [27]:
# Number of distinct words in the most recently built vocabulary.
len(vocab)

22414

In [34]:
# Spot-check: look up the stored vector for a single token.
glove_embeddings['com']

array([ 1.5951  , -0.77416 ,  1.579   ,  0.1331  ,  0.82548 ,  0.08094 ,
       -0.64546 ,  0.2994  ,  0.93722 , -1.337   , -1.4796  , -2.9713  ,
       -2.5895  , -1.4452  , -0.62077 , -1.4272  , -2.2637  , -0.076898,
       -0.99475 , -0.56269 , -0.66014 ,  0.38594 ,  0.22842 ,  0.38159 ,
        0.13598 ], dtype=float32)