In [1]:
import os
import tqdm
import re

import pandas as pd
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn import decomposition
import lightgbm as lgb
import xgboost as xgb


from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import *
from keras.layers import *
from keras.callbacks import *

from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words=stopwords.words('english')

import geopandas as gpd
from geotext import GeoText

pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

import warnings
warnings.filterwarnings('ignore')

import gc
gc.collect()

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.
Using TensorFlow backend.


In [2]:
# Load the labelled training tweets and the unlabelled test tweets from CSV
# files expected in the current working directory.
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

In [3]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [5]:
train.shape,test.shape

((7613, 5), (3263, 4))

In [6]:
# Per-column count of missing values: test frame first, then train frame.
print(test.isna().sum())
print(train.isna().sum())

id             0
keyword       26
location    1105
text           0
dtype: int64
id             0
keyword       61
location    2533
text           0
target         0
dtype: int64


In [7]:
train[~train['keyword'].isna()][:10]

Unnamed: 0,id,keyword,location,text,target
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1
34,52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0
36,54,ablaze,Pretoria,@PhDSquares #mufc they've built so much hype a...,0
37,55,ablaze,World Wide!!,INEC Office in Abia Set Ablaze - http://t.co/3...,1
38,56,ablaze,,Barbados #Bridgetown JAMAICA ÛÒ Two cars set ...,1
39,57,ablaze,Paranaque City,Ablaze for you Lord :D,0
40,59,ablaze,Live On Webcam,Check these out: http://t.co/rOI2NSmEJJ http:/...,0


In [8]:
def hashtag(text):
    """Return the hashtag words found in ``text`` as one space-separated string.

    The text is lower-cased first. A hashtag is ``#`` followed by one or more
    letters ``a-z``; matching stops at the first other character (digit,
    underscore, punctuation, ...). The leading ``#`` is removed from each tag.
    An empty string is returned when no hashtag is present.
    """
    tags = re.findall(r'#[a-z]+', text.lower())
    # Every match starts with exactly one '#', so slicing it off is enough.
    return " ".join(tag[1:] for tag in tags)
    
# def cities(text):
#     text= text.lower()
# #     text = re.sub(r'#',' ',text)
# #     t = "".join(text)
#     places= GeoText(text)
#     cities = list(places.cities)+list(places.countries)
#     return cities
    
# Store the hashtag words of every training tweet in a new column.
train['hash_words'] = train['text'].apply(hashtag)
#train['cities'] = train['text'].apply(lambda x: cities(x))



In [9]:
# global hash_target_1,hash_target_0
# hash_target_1 = ' '
# for t in train[train['target']==1].hash_words:
#     hash_target_1+=t+' '
# hash_target_1 =[t for t in hash_target_1.split()]

# hash_target_0 = ' '
# for t in train[train['target']==0].hash_words:
#     hash_target_0+=t+' '
# hash_target_0 =[t for t in hash_target_0.split()]



# def all_hash_1(text):
#     text = text.lower()
#     text = [t for t in text.split()]
#     t = [t for t in text if t in hash_target_1]
#     return " ".join(t)
# def all_hash_0(text):
#     text = text.lower()
#     text = [t for t in text.split()]
#     t = [t for t in text if t in hash_target_0]
#     return " ".join(t)

# train['all_hash_1']= train['text'].apply(lambda x: all_hash_1(x))
# train['all_hash_0']= train['text'].apply(lambda x: all_hash_0(x))

In [10]:
# Build one long string of hashtag words per class. Every row contributes its
# words followed by a single space, so rows without hashtags add a bare space.
target_1 = "".join(
    words + ' ' for words in train.loc[train['target'] == 1, 'hash_words']
)
target_0 = "".join(
    words + ' ' for words in train.loc[train['target'] == 0, 'hash_words']
)

In [11]:
target_1[:1000]

'earthquake   wildfires alaska wildfires rockyfire cafire wildfires flood disaster     flooding raining flooding florida tampabay tampa flood we breaking  africanbaze  bridgetown     kurds diyala  california climate energy   nashvilletraffic santaclara bayarea traffic        truckcrash fortworth ashville traffic    manchester traffic       breaking hagerstown whag  bahrain news        horrible accident watchthevideo   kca votejkt rip binladen mlb man airport airplane aircraft aeroplane runway accident freaky    crash aircraft airplane pilot death accident carfest       omg rip airplane accident jetengine turbojet boing g      rodkiai  emsne yugvani    news til dna      reuters    worldnews worldnews                   gilbert     internetradio collegeradi     storm apocalypse   pbban   armageddon brics roberts russia   directioners      newsintweets       kisii countynews    kisii countynews   lgbt  lesbian       arsonist headlines nightbeat sanfrancisco newyork  nativehuman myreligion 

In [12]:
target_0[:1000]

'                metal rt   mufc  nsfw       nsfw         nowplaying edm   personalinjury solicitor otleyhour stlouis caraccidentlawyer        arrestpastornganga   dubstep trapmusic dnb edm dance ices  dubstep trapmusic dnb edm dance ices    growingupspoiled    dubstep trapmusic dnb edm dance ices dubstep trapmusic dnb edm dance ices dubstep trapmusic dnb edm dance ices  dubstep trapmusic dnb edm dance ices dubstep trapmusic dnb edm dance ices wisdomwed lifehacks  silverwood aftershock  book   now wdyouth biblestudy                 justsaying randomthought     ems paramedics ambulance            mets              fantasticfour fant      lgm            gilbert gilbert az wildhorses tantonationalforest   saltriverwildhorses     sciencefiction     warmbodies            etcpb    pbban pbban doublecups armageddon      love truelove romance voodoo seduction astrology rtrrt lotz apocalypse armageddon  eonlinechat   love truelove romance voodoo seduction astrology rtrrt lotz apocalypse armaged

In [13]:
def text_clean(text):
    """Normalise a tweet into a lower-cased, space-separated string of words.

    Steps, in order:
      1. lower-case the text;
      2. delete ``http://`` / ``https://`` URLs;
      3. delete punctuation and digits (see the note on the character class);
      4. drop every remaining whitespace-separated token of length <= 2.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text; an empty string if nothing survives.
    """
    text = text.lower()
    # Raw strings: the original non-raw literals contained the invalid escape
    # sequences ``\(`` and ``\)``, which emit DeprecationWarning/SyntaxWarning
    # on newer Python versions. The pattern values themselves are unchanged.
    text = re.sub(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        '',
        text,
    )
    # NOTE(review): inside this class ``)-?`` is a *range* (0x29-0x3F), so in
    # addition to the listed symbols it also removes ``+ , - . / : ; < >`` and
    # all digits 0-9. Kept as-is because downstream features rely on it;
    # confirm whether stripping digits is intended.
    text = re.sub(r"[~!@#$'%&*()-?=]+", '', text)
    kept = [token for token in text.split() if len(token) > 2]
    return " ".join(kept)

In [14]:
# Add the cleaned tweet text to both frames, then discard the raw text column.
train["text_clean"] = train['text'].apply(text_clean)
test["text_clean"] = test['text'].apply(text_clean)

train.drop(columns=['text'], inplace=True)
test.drop(columns=['text'], inplace=True)

In [15]:
train.head()

Unnamed: 0,id,keyword,location,target,hash_words,text_clean
0,1,,,1,earthquake,our deeds are the reason this earthquake may a...
1,4,,,1,,forest fire near ronge sask canada
2,5,,,1,,all residents asked shelter place are being no...
3,6,,,1,wildfires,people receive wildfires evacuation orders cal...
4,7,,,1,alaska wildfires,just got sent this photo from ruby alaska smok...


In [16]:
# Features are the cleaned tweet texts, labels the binary target column.
X = train.text_clean
y = train.target

# Stratified 90/10 hold-out split with a fixed seed.
xtrain, xvalid, ytrain, yvalid = train_test_split(X, y,
                                                  stratify=y,
                                                  random_state=42,
                                                  test_size=0.1, shuffle=True)

# TF-IDF over word uni- and bigrams, ignoring terms seen in fewer than 3 documents.
# The idf/tf switches are real booleans: the original passed the integer 1, which
# newer scikit-learn releases flag or reject during parameter validation.
tfv = TfidfVectorizer(min_df=3, max_features=None,
                      strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                      ngram_range=(1, 2), use_idf=True, smooth_idf=True, sublinear_tf=True,
                      stop_words='english')

# Plain term counts over word uni- and bigrams.
ctv = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}',
                      ngram_range=(1, 2), stop_words='english')

# NOTE(review): both vectorizers are fitted on train + validation text, so the
# validation vocabulary/IDF statistics leak into the features. Left unchanged to
# keep the reported scores comparable; consider fitting on xtrain only.

# Train and validation matrices for TF-IDF.
tfv.fit(list(xtrain) + list(xvalid))
xtrain_tfv = tfv.transform(xtrain)
xvalid_tfv = tfv.transform(xvalid)

# Train and validation matrices for the count vectorizer.
ctv.fit(list(xtrain) + list(xvalid))
xtrain_ctv = ctv.transform(xtrain)
xvalid_ctv = ctv.transform(xvalid)

In [17]:
# Baseline: logistic regression on the TF-IDF features, scored by accuracy
# on the hold-out split.
clf = LogisticRegression(C=1).fit(xtrain_tfv, ytrain)
valid_pred = clf.predict(xvalid_tfv)
score = accuracy_score(yvalid, valid_pred)
print(round(score, 3), end=",")

0.808,

In [18]:
# Same logistic-regression baseline, this time on the raw n-gram counts.
clf = LogisticRegression(C=1).fit(xtrain_ctv, ytrain)
valid_pred = clf.predict(xvalid_ctv)
score = accuracy_score(yvalid, valid_pred)
print(round(score, 3), end=",")

0.804,

In [None]:
# LightGBM classifier on the uni-/bigram count features, scored by accuracy
# on the hold-out split.
lgbm_params = dict(max_depth=7, n_estimators=1000, colsample_bytree=0.8,
                   subsample=0.8, nthread=10, learning_rate=0.1)
clf = lgb.LGBMClassifier(**lgbm_params)
clf.fit(xtrain_ctv, ytrain)
valid_pred = clf.predict(xvalid_ctv)
score = accuracy_score(yvalid, valid_pred)
print(round(score, 3), end=",")

In [None]:
# # Logistic Regression with skfold using TF-IDF ngram(1,2)
# skf = StratifiedKFold(n_splits=10)
# pred_test =np.zeros((len(test),1))
# cv_score=[]
# for train_index,test_index in skf.split(X,y):
#     xtrain,xval = X.iloc[train_index],X.iloc[test_index]
#     ytrain,yval = y.iloc[train_index],y.iloc[test_index]
    
#     tfv.fit(list(xtrain) + list(xval)+list(test.text_clean))
#     xtrain_tfv =  tfv.transform(xtrain) 
#     xval_tfv = tfv.transform(xval)
#     test_tfv = tfv.transform(test.text_clean)
    
    
#     clf = LogisticRegression(C=1)
#     clf.fit(xtrain_tfv,ytrain)
#     score = accuracy_score(yval,clf.predict(xval_tfv))
#     cv_score.append(score)
#     print(round(score,3),end=",")

#     pred_test += clf.predict_proba(test_tfv)[:,1].reshape(-1,1)
# pred_test=pred_test/10 
# print("\n",round(np.mean(cv_score),3))
# test['lr_tfv']=pred_test
# del(xtrain_tfv,xval_tfv,test_tfv,xtrain,ytrain,xval,yval)

In [None]:
# # Logistic Regression with skfold using count_vectorizer n_gram(1,2)
# skf = StratifiedKFold(n_splits=10)
# pred_test =np.zeros((len(test),1))
# cv_score=[]
# for train_index,test_index in skf.split(X,y):
#     xtrain,xval = X.iloc[train_index],X.iloc[test_index]
#     ytrain,yval = y.iloc[train_index],y.iloc[test_index]
    
#     ctv.fit(list(xtrain) + list(xval)+list(test.text_clean))
#     xtrain_ctv =  ctv.transform(xtrain) 
#     xval_ctv = ctv.transform(xval)
#     test_ctv = ctv.transform(test.text_clean)
    
    
#     clf = LogisticRegression(C=1)
#     clf.fit(xtrain_ctv,ytrain)
#     score = accuracy_score(yval,clf.predict(xval_ctv))
#     cv_score.append(score)
#     print(round(score,3),end=",")

#     pred_test += clf.predict_proba(test_ctv)[:,1].reshape(-1,1)
# pred_test=pred_test/10 
# print("\n",round(np.mean(cv_score),3))
# test['lr_ctv']=pred_test
# del(xtrain_ctv,xval_ctv,test_ctv,xtrain,ytrain,xval,yval)

In [None]:
# #Light GBM Classifier with skfold using TF-IDF ngram(1,2)
# clf = lgb.LGBMClassifier(max_depth=7, n_estimators=1000, colsample_bytree=0.8, 
#                         subsample=0.8, nthread=10, learning_rate=0.1)

# skf = StratifiedKFold(n_splits=10)
# pred_test =np.zeros((len(test),1))
# cv_score=[]
# for train_index,test_index in skf.split(X,y):
#     xtrain,xval = X.iloc[train_index],X.iloc[test_index]
#     ytrain,yval = y.iloc[train_index],y.iloc[test_index]
    
#     tfv.fit(list(xtrain) + list(xval)+list(test.text_clean))
#     xtrain_tfv =  tfv.transform(xtrain) 
#     xval_tfv = tfv.transform(xval)
#     test_tfv = tfv.transform(test.text_clean)
    
#     clf.fit(xtrain_tfv.tocsc(),ytrain)
#     score = accuracy_score(yval,clf.predict(xval_tfv.tocsc()))
#     cv_score.append(score)
#     print(round(score,3),end=",")

#     pred_test += clf.predict_proba(test_tfv.tocsc())[:,1].reshape(-1,1)
# pred_test=pred_test/10 
# print("\n",round(np.mean(cv_score),3))
# test['lgbm_tfv']=pred_test
# del(xtrain_tfv,xval_tfv,test_tfv,xtrain,ytrain,xval,yval)

In [None]:
# #Light GBM Classifier with skfold using count vectorizer ngram(1,2)
# clf = lgb.LGBMClassifier(max_depth=7, n_estimators=1000, colsample_bytree=0.8, 
#                         subsample=0.8, nthread=10, learning_rate=0.1)

# skf = StratifiedKFold(n_splits=10)
# pred_test =np.zeros((len(test),1))
# cv_score=[]
# for train_index,test_index in skf.split(X,y):
#     xtrain,xval = X.iloc[train_index],X.iloc[test_index]
#     ytrain,yval = y.iloc[train_index],y.iloc[test_index]
    
#     ctv.fit(list(xtrain) + list(xval)+list(test.text_clean))
#     xtrain_ctv =  ctv.transform(xtrain) 
#     xval_ctv = ctv.transform(xval)
#     test_ctv = ctv.transform(test.text_clean)
    
    
#     clf.fit(xtrain_ctv.tocsc(),ytrain)
#     score = accuracy_score(yval,clf.predict(xval_ctv.tocsc()))
#     cv_score.append(score)
#     print(round(score,3),end=",")

#     pred_test += clf.predict_proba(test_ctv.tocsc())[:,1].reshape(-1,1)
# pred_test=pred_test/10 
# print("\n",round(np.mean(cv_score),3))
# test['lgbm_ctv']=pred_test
# del(xtrain_ctv,xval_ctv,test_ctv,xtrain,ytrain,xval,yval)

In [None]:
# Location of the pre-trained GloVe vectors and the sizes used below.
GLOVE_PATH = os.path.expanduser('~/Desktop/glove.6B/glove.6B.100d.txt')
EMBEDDING_DIM = 100   # width of each GloVe vector in the file above
MAX_SEQ_LEN = 100     # every tweet is padded/truncated to this many tokens

# Parse the GloVe text file into {word: float32 vector}. Each line is a word
# followed by its coefficients. The context manager closes the file even if
# parsing fails, and the explicit encoding avoids depending on the locale default.
embeddings_index = {}
with open(GLOVE_PATH, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs


# The vocabulary is learned from the training split only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(xtrain))

# Convert each text into a sequence of integer word ids.
x_tr_seq  = tokenizer.texts_to_sequences(xtrain)
x_val_seq = tokenizer.texts_to_sequences(xvalid)
test_seq = tokenizer.texts_to_sequences(test.text_clean)

# Bring every sequence to the same fixed length.
x_tr_seq  = pad_sequences(x_tr_seq, maxlen=MAX_SEQ_LEN)
x_val_seq = pad_sequences(x_val_seq, maxlen=MAX_SEQ_LEN)
test_seq = pad_sequences(test_seq, maxlen=MAX_SEQ_LEN)

# One extra row beyond the number of known words (row 0 is never assigned below).
size_of_vocabulary = len(tokenizer.word_index) + 1
print(size_of_vocabulary)


# Row i holds the GloVe vector of the word with tokenizer id i; words without
# a pre-trained vector keep an all-zero row.
embedding_matrix = np.zeros((size_of_vocabulary, EMBEDDING_DIM))

for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        

In [None]:
# Sequence classifier: frozen pre-trained embeddings -> LSTM -> max-pool over
# time -> dense head with a sigmoid output for the binary target.
model = Sequential()

# Embedding weights come from embedding_matrix and are not updated in training.
model.add(Embedding(size_of_vocabulary, 100, weights=[embedding_matrix],
                    input_length=100, trainable=False))

model.add(LSTM(128, return_sequences=True, dropout=0.2))
model.add(GlobalMaxPooling1D())

model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))


model.compile(optimizer='adam', loss='binary_crossentropy', metrics=["acc"])

# Stop once validation loss has not improved for 3 epochs; independently keep
# the model with the best validation accuracy on disk.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max',
                     save_best_only=True, verbose=1)

history = model.fit(np.array(x_tr_seq), np.array(ytrain),
                    batch_size=100, epochs=100,
                    validation_data=(np.array(x_val_seq), np.array(yvalid)),
                    verbose=1, callbacks=[es, mc])

# After early stopping the in-memory model holds the weights of the last epoch,
# not the best one. Reload the checkpoint written by ModelCheckpoint so the
# test predictions come from the best-scoring epoch.
model.load_weights('best_model.h5')

pred = model.predict(test_seq)
# predict() returns one column per output unit; flatten the single sigmoid
# column to 1-D before storing it in the frame.
test['glove_keras'] = pred.ravel()

In [None]:
# Blend model probabilities into the final prediction and write the submission.
#
# The intended blend is the mean of 'lr_tfv' and 'lgbm_tfv'. Those columns are
# only produced by the (currently commented-out) cross-validation cells, so on
# a fresh run they may be missing. In that case fall back to the mean of
# whichever known prediction columns exist instead of failing with a bare KeyError.
preferred_cols = ['lr_tfv', 'lgbm_tfv']
known_cols = preferred_cols + ['lr_ctv', 'lgbm_ctv', 'glove_keras']

if all(col in test.columns for col in preferred_cols):
    blend_cols = preferred_cols
else:
    blend_cols = [col for col in known_cols if col in test.columns]
    if not blend_cols:
        raise KeyError(
            "No prediction columns found in `test`; expected at least one of "
            + ", ".join(known_cols)
            + ". Run one of the model cells first."
        )
    print("Blending available prediction columns:", blend_cols)

# Unweighted mean of the selected probability columns.
test['target'] = sum(test[col] for col in blend_cols) / len(blend_cols)

# Threshold at 0.5 and keep only the label, indexed by tweet id.
temp = test.copy()
temp['target'] = np.where(temp['target'] >= 0.5, 1, 0)
temp.index = temp.id
temp = temp[['target']]
temp.to_csv('try2.csv')

In [None]:
temp.head()