## Load Data

In [87]:
# !pip install nltk

In [122]:
import numpy as np
import xgboost as xgb
from tqdm import tqdm
from sklearn.svm import SVC
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
stop_words = stopwords.words('english')

In [89]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

print(df_train.describe())
print(df_train.head())

                 id      target
count   7613.000000  7613.00000
mean    5441.934848     0.42966
std     3137.116090     0.49506
min        1.000000     0.00000
25%     2734.000000     0.00000
50%     5408.000000     0.00000
75%     8146.000000     1.00000
max    10873.000000     1.00000
   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  


In [90]:
df_train[df_train['target'] == 0]['text'].values[0]

"What's up man?"

In [91]:
df_train[df_train['target'] == 1]['text'].values[0]

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

In [92]:
df_train['text'].values[0]

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

In [93]:
print(df_train['target'].value_counts())

0    4342
1    3271
Name: target, dtype: int64


In [109]:
x_train, x_valid, y_train, y_valid = train_test_split(df_train['text'].values, df_train['target'].values, 
                                                      stratify=df_train['target'].values, 
                                                     random_state=42, test_size=0.1, shuffle=True)

In [110]:
print(x_train.shape)
print(x_valid.shape)

(6851,)
(762,)


## Data Preprocessing

In [96]:
porter = PorterStemmer()
sw = stopwords.words('english')

def porter_stemmer(text):
    return [porter.stem(word) for word in text.split() if word not in sw]

In [106]:
# tf-idf data
tfidf_vectorizer = TfidfVectorizer(min_df=3, max_features=None, stop_words='english', strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1,3), use_idf=1, smooth_idf=1, sublinear_tf=1)
tfidf_vectorizer.fit(list(x_train) + list(x_valid))

x_train_tfv = tfidf_vectorizer.transform(x_train)
x_valid_tfv = tfidf_vectorizer.transform(x_valid)
print(x_train_tfv[0].todense().shape)
print(x_valid_tfv[0].todense())

(1, 9229)
[[0. 0. 0. ... 0. 0. 0.]]


In [107]:
# test_vectors = tfidf_vectorizer.transform(df_test['text'])
# print(test_vectors[0].todense().shape)
# print(test_vectors[0].todense())

In [113]:
# Count vectorizer data
count_vtz = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), stop_words = 'english')

# Fitting Count Vectorizer to both training and test sets (semi-supervised learning)
count_vtz.fit(list(x_train) + list(x_valid))
x_train_ctvz =  count_vtz.transform(x_train) 
x_valid_ctvz = count_vtz.transform(x_valid)
print(x_train_ctvz[0].todense().shape)
print(x_valid_ctvz[0].todense())

(1, 132070)
[[0 0 0 ... 0 0 0]]


In [124]:
# Since SVMs take a lot of time, we will reduce the number of features from the TF-IDF using Singular Value Decomposition before applying SVM.
# Also, note that before applying SVMs, we must standardize the data.
svd = decomposition.TruncatedSVD(n_components=120)
svd.fit(x_train_tfv)
x_train_svd = svd.transform(x_train_tfv)
x_valid_svd = svd.transform(x_valid_tfv)

# Scale the data obtained from SVD. Renaming variable to reuse without scaling.
scl = preprocessing.StandardScaler()
scl.fit(x_train_svd)
xtrain_svd_scl = scl.transform(x_train_svd)
xvalid_svd_scl = scl.transform(x_valid_svd)

## Train Models

In [117]:
import xgboost
print(xgboost.__version__)
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

1.3.3


In [133]:
# Create models
xgb_clf = XGBClassifier(booster='gbtree', objective="binary:logistic", max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1, random_state=42)
lr = LogisticRegression(C=1.0, random_state=42)
nb = MultinomialNB()
svc = SVC(C=1.0, probability=True, random_state=42) 
models = [('Logistic Regression', lr), ('Multinomial Naive Bayes', nb), ('XGBoost',xgb_clf), ('SVC', svc)]

In [131]:
# Cross-val tf-idf
for name, algo in models:
    if name == 'SVC':
        scores = model_selection.cross_val_score(algo, xtrain_svd_scl, y_train, cv=3, scoring='f1')
    else:
        scores = model_selection.cross_val_score(algo, x_train_tfv, y_train, cv=3, scoring='f1')
    print(f'{name}: {scores}')

Logistic Regression: [0.72157773 0.7260274  0.73190045]
Multinomial Naive Bayes: [0.70644967 0.72289157 0.72264932]
XGBoost: [0.69371429 0.68284424 0.72460497]
SVC: [0.67792662 0.70778098 0.71478463]


In [135]:
# Cross-val count vectorizer
for name, algo in models:
    if name == 'SVC':
        scores = model_selection.cross_val_score(algo, xtrain_svd_scl, y_train, cv=3, scoring='f1')
    else:
        scores = model_selection.cross_val_score(algo, x_train_ctvz, y_train, cv=3, scoring='f1')
    print(f'{name}: {scores}')

Logistic Regression: [0.70767428 0.71403611 0.73048433]
Multinomial Naive Bayes: [0.73678756 0.75218061 0.7429451 ]
XGBoost: [0.69103774 0.71126358 0.71777003]
SVC: [0.67792662 0.70778098 0.71478463]


# Grid Search

In [None]:
# TODO

# Submit

In [136]:
# sample_submission = pd.read_csv('sample_submission.csv')

In [137]:
# sample_submission['target'] = clf.predict(test_vectors)
# sample_submission.head()

In [138]:
# sample_submission.to_csv('submission.csv', index=False)