In [8]:
import pandas as pd
import numpy as np
import spacy
from tqdm import tqdm
import re
import time
import pickle
# widen displayed column text to 200 characters so more of each tweet is visible
pd.set_option('display.max_colwidth', 200)

# Read the training and test CSV files from their hosted locations.
train_url = "https://datahack-prod.s3.amazonaws.com/train_file/train_2kmZucJ.csv"
test_url = "https://datahack-prod.s3.amazonaws.com/test_file/test_oJQbWVk.csv"
train, test = pd.read_csv(train_url), pd.read_csv(test_url)

# display (rows, columns) for both frames
(train.shape, test.shape)


((7920, 3), (1953, 2))

In [9]:
# fraction of training rows in each label class (normalize=True gives proportions, not counts)
train['label'].value_counts(normalize = True)

0    0.744192
1    0.255808
Name: label, dtype: float64

In [0]:
# preview the first rows of the training frame
train.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone
1,2,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/
2,3,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu
3,4,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/
4,5,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!


In [0]:
# Characters stripped from every tweet. The apostrophe, comma and full stop are
# not in this string, so they survive cleaning.
punctuation = '!"#$%&()*+-/:;<=>?@[\\]^_`{|}~'


def _clean_tweets(tweets):
    """Return a cleaned copy of a Series of raw tweet strings.

    Steps, applied in this order:
      1. drop URLs (anything starting with ``http`` up to the next whitespace)
      2. drop every character listed in ``punctuation``
      3. lower-case the text
      4. replace each digit with a space
      5. collapse runs of whitespace to single spaces and trim the ends

    Parameters
    ----------
    tweets : pandas.Series of str

    Returns
    -------
    pandas.Series of str, same index as ``tweets``.
    """
    # build the lookup set once instead of once per character
    strip_chars = set(punctuation)
    return (
        tweets
        .apply(lambda x: re.sub(r'http\S+', '', x))
        .apply(lambda x: ''.join(ch for ch in x if ch not in strip_chars))
        .str.lower()
        # regex=True is explicit: without it, pandas versions whose default is
        # literal matching would look for the text "[0-9]" and keep every digit
        .str.replace("[0-9]", " ", regex=True)
        .apply(lambda x: ' '.join(x.split()))
    )


# the same cleaning is applied to train and test
train['clean_tweet'] = _clean_tweets(train['tweet'])
test['clean_tweet'] = _clean_tweets(test['tweet'])

In [0]:
# load spaCy's English language model
# The 'parser' and 'ner' components are disabled; only token lemmas are read from it below.
# NOTE(review): 'en' looks like a shortcut-link model name — presumably this needs spaCy 2.x; confirm before upgrading.
nlp = spacy.load('en', disable=['parser', 'ner'])

# function to lemmatize text
def lemmatization(texts):
    """Return each text with every token replaced by its lemma.

    Each item of ``texts`` is passed to the module-level ``nlp`` callable and
    the ``lemma_`` of every resulting token is joined with single spaces.

    Parameters
    ----------
    texts : iterable of str

    Returns
    -------
    list of str, one entry per input text, in input order.
    """
    return [' '.join(tok.lemma_ for tok in nlp(text)) for text in texts]

In [0]:
# replace the cleaned text in both frames with its lemmatized form (train first, then test)
for frame in (train, test):
    frame['clean_tweet'] = lemmatization(frame['clean_tweet'])

In [0]:
# spot-check 10 random rows: raw tweet next to its cleaned text (no seed is set, so the rows differ per run)
train.sample(10)

Unnamed: 0,id,label,tweet,clean_tweet
5387,5388,1,"lock button on my phone broke trying to screen shot fitspo and workouts, ya don't know what ya got till its gone! iphone","lock button on my phone broke trying to screen shot fitspo and workouts, ya don't know what ya got till its gone iphone"
4014,4015,0,Hey Guys! Look Apple iPhone X 64GB Silver Unlocked Smartphone http://zpr.io/6EdnM #money #today #life #amazon #retweet #twitter #birthday #sale #android #windows #nokia #samsung #iphone #phone #sm...,hey guys look apple iphone x gb silver unlocked smartphone money today life amazon retweet twitter birthday sale android windows nokia samsung iphone phone smartphone motorola offers bestprice jul...
6702,6703,0,Change the background image https://goo.gl/HECCAH #android #apps #beautiful #cute #BBB16 #igers #iphone #iphone #halamadride #FCB #h,change the background image android apps beautiful cute bbb igers iphone iphone halamadride fcb h
3234,3235,0,@sethflute @gentlemenhall #sailintothesun #didyouknow? #samsung #camera #commercial 2 million plus plays #awesome,sethflute gentlemenhall sailintothesun didyouknow samsung camera commercial million plus plays awesome
4197,4198,0,Sweet Love With Sweet App 360 #idol #girl #nowplaying quote #iphone #valentineapp 563 https://itunes.apple.com/us/app/love360/id809353957?mt=8 …,sweet love with sweet app idol girl nowplaying quote iphone valentineapp …
6360,6361,0,casper #lol #look #lotd #pink #aj #cap #iphone #selfie #igers #instalike #instalove #local… https://instagram.com/p/5LiGZTOlbq/,casper lol look lotd pink aj cap iphone selfie igers instalike instalove local…
7187,7188,1,Half and hour trying to reset and $&@*# password only to be locked out,half and hour trying to reset and password only to be locked out
4405,4406,0,I beat a personal record today on the bike! #vsco #vscocam #greatoutdoors #beautiful #statenisland #nyc #ny #nj #iphonex #shotoniphone #sky #skyporn #nature #july #summer… https://www.instagram.co...,i beat a personal record today on the bike vsco vscocam greatoutdoors beautiful statenisland nyc ny nj iphonex shotoniphone sky skyporn nature july summer… …
4253,4254,1,"Warning! iPhone bug causing trouble with numbers. Deleting, restoring etc. no update available. #apple #rightnow","warning iphone bug causing trouble with numbers. deleting, restoring etc. no update available. apple rightnow"
226,227,0,I like iphone only because I get to play games lol I love 'angry birds' #iphone #gam http://www.linkati.com/q/index.php?i=666227,i like iphone only because i get to play games lol i love 'angry birds' iphone gam


In [17]:
import tensorflow_hub as hub
import tensorflow as tf

# Load the ELMo v2 module from TensorFlow Hub.
# NOTE(review): trainable=True is passed, but nothing visible in this notebook fine-tunes the module — confirm it is needed.
elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)

W0317 06:19:31.892063 140035101722496 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14
W0317 06:19:37.342249 140035101722496 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/control_flow_ops.py:3632: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


In [18]:
# Sanity check on a single example sentence (any sentence would do).
x = ["Roasted ants are a popular snack in Columbia"]

# Extract ELMo features: take the "elmo" entry of the module's output dict
embeddings = elmo(x, signature="default", as_dict=True)["elmo"]

# shape of the symbolic tensor only — no session is run in this cell
embeddings.shape

TensorShape([Dimension(1), Dimension(8), Dimension(1024)])

In [0]:
def elmo_vectors(x):
  """Return one averaged ELMo vector per input text.

  ``x`` must provide ``.tolist()`` (the notebook passes a pandas Series of
  cleaned tweets). The texts are fed to the module-level ``elmo`` module, its
  "elmo" output is averaged along axis 1, and the result is evaluated in a
  freshly opened ``tf.Session``.

  Returns whatever ``sess.run`` yields for that mean — presumably an array of
  shape (len(x), 1024), given the (1, 8, 1024) demo shape; TODO confirm.

  NOTE(review): the mean covers every position along axis 1, which presumably
  includes padding for the shorter texts in a batch — confirm.
  NOTE(review): every call invokes ``elmo(...)`` and opens a new session with
  fresh initializers; presumably this grows the default graph per batch —
  confirm.
  """
  token_embeddings = elmo(x.tolist(), signature="default", as_dict=True)["elmo"]
  # average of ELMo features over axis 1
  sentence_means = tf.reduce_mean(token_embeddings, 1)

  with tf.Session() as sess:
    # variables and lookup tables must be initialized before evaluation
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    return sess.run(sentence_means)

In [0]:
# split each frame into consecutive row slices of at most 100 rows
train_starts = range(0, len(train), 100)
test_starts = range(0, len(test), 100)
list_train = [train[start:start + 100] for start in train_starts]
list_test = [test[start:start + 100] for start in test_starts]

In [0]:
# Run every batch of cleaned tweets through ELMo: all train batches first, then all test batches.
elmo_train = []
for batch in list_train:
    elmo_train.append(elmo_vectors(batch['clean_tweet']))

elmo_test = []
for batch in list_test:
    elmo_test.append(elmo_vectors(batch['clean_tweet']))

In [0]:
# stack the per-batch results along axis 0 into one array per data set
elmo_train_new, elmo_test_new = (
    np.concatenate(batches, axis=0) for batches in (elmo_train, elmo_test)
)

In [0]:
# Persist the extracted feature arrays so the ELMo step need not be re-run.
# `with` closes each file even if pickle.dump raises.

# save elmo_train_new
with open("elmo_train_03032019.pickle", "wb") as pickle_out:
    pickle.dump(elmo_train_new, pickle_out)

# save elmo_test_new
with open("elmo_test_03032019.pickle", "wb") as pickle_out:
    pickle.dump(elmo_test_new, pickle_out)

In [0]:
# Reload the cached feature arrays. `with` closes each file after reading.
# NOTE: pickle.load can execute arbitrary code — only load pickle files you created yourself.

# load elmo_train_new
with open("elmo_train_03032019.pickle", "rb") as pickle_in:
    elmo_train_new = pickle.load(pickle_in)

# load elmo_test_new
with open("elmo_test_03032019.pickle", "rb") as pickle_in:
    elmo_test_new = pickle.load(pickle_in)

In [0]:
from sklearn.model_selection import train_test_split
# hold out 20% of the training rows for validation; the fixed seed keeps the split repeatable
xtrain, xvalid, ytrain, yvalid = train_test_split(
    elmo_train_new,
    train['label'],
    test_size=0.2,
    random_state=42,
)
                                                   
                                                  

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Fit a logistic-regression classifier on the training split using library-default settings.
# NOTE(review): the recorded repr shows solver='warn', max_iter=100 and random_state=None, so the
# effective defaults presumably depend on the installed scikit-learn version — consider pinning them.
lreg = LogisticRegression()
lreg.fit(xtrain, ytrain)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [0]:
# predict labels for the held-out validation split
preds_valid = lreg.predict(xvalid)

In [28]:
# F1 score of the validation predictions against the true validation labels
f1_score(yvalid, preds_valid)

0.7727272727272728

In [0]:
# make predictions on the test set from its ELMo features
preds_test = lreg.predict(elmo_test_new)

In [0]:
# Build the submission table: each test id paired with its predicted label.
submission_columns = {'id': test['id'], 'label': preds_test}
sub = pd.DataFrame(submission_columns)

# Save it as CSV without the index column.
sub.to_csv("sub_lreg.csv", index=False)