<a href="https://colab.research.google.com/github/aliebi/ELMO/blob/master/ELMO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#  ELMo for Text Classification in Python

In [0]:
import pandas as pd
import numpy as np
import spacy
from tqdm import tqdm
import re
import time
import pickle
# let pandas display up to 200 characters per column value instead of truncating earlier
pd.set_option('display.max_colwidth', 200)

In [2]:
# load the training and test CSV files
# (both paths point at the filesystem root, as in the original cell)
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("/train_2kmZucJ.csv", "/test_oJQbWVk.csv")
)

# show (rows, columns) of each frame as a quick sanity check
(train.shape, test.shape)

((7920, 3), (1953, 2))

In [3]:
# share of rows per label value (normalize=True yields fractions rather than counts)
train['label'].value_counts(normalize = True)

0    0.744192
1    0.255808
Name: label, dtype: float64

# Preprocess the data

In [0]:
def _strip_urls(text):
    """Return `text` with every 'http' token (up to the next whitespace) removed."""
    return re.sub(r'http\S+', '', text)


# build the 'clean_tweet' column of both splits from the raw 'tweet' column
train['clean_tweet'] = train['tweet'].apply(_strip_urls)
test['clean_tweet'] = test['tweet'].apply(_strip_urls)

In [0]:
# remove punctuation marks
# note: apostrophe, comma and period are not in this string, so they are kept
punctuation = '!"#$%&()*+-/:;<=>?@[\\]^_`{|}~'

# build the deletion table once instead of rebuilding set(punctuation) for every character
_punctuation_table = str.maketrans('', '', punctuation)

train['clean_tweet'] = train['clean_tweet'].apply(lambda x: x.translate(_punctuation_table))
test['clean_tweet'] = test['clean_tweet'].apply(lambda x: x.translate(_punctuation_table))

# convert text to lowercase
train['clean_tweet'] = train['clean_tweet'].str.lower()
test['clean_tweet'] = test['clean_tweet'].str.lower()

# remove numbers: every digit becomes a space (collapsed by the whitespace step below)
# regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal string by
# default, which would leave all digits in place without raising an error
train['clean_tweet'] = train['clean_tweet'].str.replace("[0-9]", " ", regex=True)
test['clean_tweet'] = test['clean_tweet'].str.replace("[0-9]", " ", regex=True)

# remove whitespaces: collapse runs of whitespace to one space and trim both ends
train['clean_tweet'] = train['clean_tweet'].apply(lambda x:' '.join(x.split()))
test['clean_tweet'] = test['clean_tweet'].apply(lambda x: ' '.join(x.split()))


In [0]:
# import spaCy's language model with the 'parser' and 'ner' pipeline components disabled
# NOTE(review): 'en' is a shortcut name; presumably it resolves to a small English model
# in this environment, and newer spaCy releases may no longer accept shortcuts — TODO confirm
nlp = spacy.load('en', disable=['parser', 'ner'])

# function to lemmatize text
def lemmatization(texts):
    """Lemmatize every text with the module-level spaCy pipeline `nlp`.

    Parameters
    ----------
    texts : iterable of str
        Texts to process, one document each.

    Returns
    -------
    list of str
        One string per input text: the lemmas of its tokens joined by single spaces.
    """
    return [
        ' '.join(token.lemma_ for token in nlp(text))
        for text in texts
    ]

In [0]:
# overwrite the cleaned tweets of both splits with their lemmatized form
for _frame in (train, test):
    _frame['clean_tweet'] = lemmatization(_frame['clean_tweet'])

In [8]:
# inspect 10 randomly drawn rows; no random_state is passed, so the rows differ between runs
train.sample(10)

Unnamed: 0,id,label,tweet,clean_tweet
7663,7664,0,Happy Birthday ANJI ... #birthday #friend #wow #Samsung #Galaxy #S7Edge #positive #vibe… https://www.instagram.com/p/Bcc41Z9nTKX/,happy birthday anji ... birthday friend wow samsung galaxy s edge positive vibe …
1275,1276,0,SAVE $268 Notebook by #Samsung http://www.amazon.com/gp/product/B00AWEZ11M?ie=UTF8&camp=213733&creative=393185&creativeASIN=B00AWEZ11M&linkCode=shr&tag=httpstwitter.comlindsaytweets1-20 … I #Faceb...,save notebook by samsung … i facebook computer laptop technology netbook shop xxoo m
2119,2120,0,@BOy_OwL I know right !? Who wouldn't $&@*# jk I miss you too #facetime #iphone,boyowl i know right who would not jk i miss -PRON- too facetime iphone
1636,1637,1,So my laptop upgraded me to a newer version without telling me and now I can't play League of Legends #startingto #comeonapple,so -PRON- laptop upgrade -PRON- to a new version without tell -PRON- and now i can not play league of legend startingto comeonapple
6741,6742,1,"@AppleSupport is a freaking joke, lock me out of iCloud, 13 days to reset password to access account, nothing sent, redo account recovery, now it’s 10 days. I’ve lost everything, including busines...","applesupport be a freaking joke , lock -PRON- out of icloud , day to reset password to access account , nothing send , redo account recovery , now -PRON- ’s day . -PRON- have lose everything , inc..."
2394,2395,0,Ummm yea 7% battery loss in 8hrs... can I get a $&@*# yea?! Sansung,ummm yea battery loss in hrs ... can i get a yea sansung
3814,3815,1,just lost all contacts and photos on my phone. Thank god Steve Jobs is dead or else I would go and murder him #neednumbers NOWWW!,just lose all contact and photo on -PRON- phone . thank god steve job be dead or else i would go and murder -PRON- neednumber nowww
7165,7166,0,Eat #apple #bigapple #eat #funny @ Veithani Hospitsl@ Lad Prao http://instagram.com/p/hSvQuwmVP0/,eat apple bigapple eat funny veithani hospitsl lad prao
513,514,0,Amazon Prime Day 2018: Everything you need to know https://www.cnet.com/how-to/amazon-prime-day-2018-everything-you-need-to-know/ … #Amazon #amazonprme #sale #Deals #shop #onlineshopping #discount...,amazon prime day everything -PRON- need to know … amazon amazonprme sale deal shop onlineshoppe discount free buy ad gift birthday family news tech iphone holiday christmas like like viral girl mo...
7388,7389,1,I actually hate the iOS 6. It destroyed my phone. #sofrustrating apple #smd,i actually hate the io . -PRON- destroy -PRON- phone . sofrustrate apple smd


# Build the model with ELMo

In [10]:
!pip install "tensorflow>=1.7.0"
!pip install tensorflow-hub



Now we download the pretrained ELMo module from TensorFlow Hub.

In [0]:
import tensorflow_hub as hub
import tensorflow as tf

# instantiate version 2 of the ELMo module published on TensorFlow Hub
# NOTE(review): trainable=True is requested, but no cell visible here trains the module's
# weights — confirm whether it is needed for pure feature extraction
elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)

Let's try it on a sample sentence.

In [13]:
# a single example sentence to probe the module with
x = ["Roasted ants are a popular snack in Columbia"]

# call the module's "default" signature, ask for the full output dict,
# and keep its "elmo" entry
elmo_outputs = elmo(x, signature="default", as_dict=True)
embeddings = elmo_outputs["elmo"]

# static shape of the resulting tensor
embeddings.shape

TensorShape([Dimension(1), Dimension(8), Dimension(1024)])

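The output tensor has three dimensions:

- the first is 1 because we passed a single sentence;
- the second is 8, the number of tokens in the longest sentence of the input (our only sentence has 8 words);
- the third is 1024, the length of each ELMo feature vector.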

In [0]:

def elmo_vectors(x):
  """Return one averaged ELMo vector per text in `x`.

  Parameters
  ----------
  x : object exposing ``.tolist()``
      The texts to embed; this notebook passes a pandas Series of strings.

  Returns
  -------
  The evaluated result of ``tf.reduce_mean`` over axis 1 of the module's "elmo"
  output. Presumably shape (len(x), 1024), given the (1, 8, 1024) shape printed
  for the sample sentence — not asserted here.

  NOTE(review): each call adds new ops to the graph and opens a fresh session in
  which all variables are re-initialised; confirm this is acceptable for the
  number of batches processed.
  """
  token_embeddings = elmo(x.tolist(), signature="default", as_dict=True)["elmo"]

  with tf.Session() as sess:
    # a new session starts uninitialised: set up variables and lookup tables first
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    # average over axis 1 to get a single vector per text
    sentence_vectors = tf.reduce_mean(token_embeddings, 1)
    return sess.run(sentence_vectors)

Our dataset is large, so we split it into small batches of 100 rows each before extracting the embeddings.

In [0]:
# cut each frame into consecutive chunks of at most 100 rows (the last chunk may be shorter)
list_train = [train.iloc[start:start + 100] for start in range(0, len(train), 100)]
list_test = [test.iloc[start:start + 100] for start in range(0, len(test), 100)]


In [0]:
def _embed_batches(batches):
    """Return one elmo_vectors result per batch, computed from its 'clean_tweet' column."""
    return [elmo_vectors(batch['clean_tweet']) for batch in batches]


# Extract ELMo embeddings, batch by batch, for both splits
elmo_train = _embed_batches(list_train)
elmo_test = _embed_batches(list_test)

In [0]:
# join the per-batch arrays along axis 0 into a single array per split
elmo_train_new, elmo_test_new = (
    np.concatenate(batch_vectors, axis=0)
    for batch_vectors in (elmo_train, elmo_test)
)

In [0]:
# save elmo_train_new
# `with` closes the file even when pickle.dump raises, unlike a manual open()/close() pair
with open("elmo_train_03032019.pickle", "wb") as pickle_out:
    pickle.dump(elmo_train_new, pickle_out)

# save elmo_test_new
with open("elmo_test_03032019.pickle", "wb") as pickle_out:
    pickle.dump(elmo_test_new, pickle_out)

In [0]:
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.

# load elmo_train_new
# `with` makes sure the file handle is closed after reading (it was previously left open)
with open("elmo_train_03032019.pickle", "rb") as pickle_in:
    elmo_train_new = pickle.load(pickle_in)

# load elmo_test_new
with open("elmo_test_03032019.pickle", "rb") as pickle_in:
    elmo_test_new = pickle.load(pickle_in)

# Model Building and Evaluation

In [0]:
from sklearn.model_selection import train_test_split

# hold out 20% of the rows for validation; the fixed random_state makes the split repeatable
xtrain, xvalid, ytrain, yvalid = train_test_split(
    elmo_train_new,
    train['label'],
    test_size=0.2,
    random_state=42,
)

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# logistic regression with default parameters, fitted on the training part of the split;
# fit() is the cell's last expression, so its return value is echoed as the cell output
lreg = LogisticRegression()
lreg.fit(xtrain, ytrain)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
# predicted labels for the held-out validation features
preds_valid = lreg.predict(xvalid)

In [27]:
# F1 score of the validation predictions against the true labels (f1_score defaults)
f1_score(yvalid, preds_valid)

0.7761904761904763

In [0]:
# make predictions on the test set from its ELMo features, using the fitted classifier
preds_test = lreg.predict(elmo_test_new)

In [0]:
# prepare submission dataframe: the test ids paired with their predicted labels
sub = test[['id']].assign(label=preds_test)

# write predictions to a CSV file, leaving out the row index
sub.to_csv("sub_lreg.csv", index=False)