In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import xgboost as xgb
import re
import nltk
import string
from nltk.tokenize import TweetTokenizer
import unidecode

import warnings
warnings.filterwarnings('ignore')

stopwords = nltk.corpus.stopwords.words('english')

In [2]:
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")

In [3]:
def remove_punctuation(word):        
    clean_word = ''.join([char for char in word if char not in string.punctuation])
    return clean_word

In [4]:
def cleaning_text(text):
    tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
    text_tokenize = tokenizer.tokenize(text)
    wordlist = []
    for word in text_tokenize:
        word = word.lower()
        word = re.sub('(?P<url>https?://[^\s]+)', ' ', word)
        word = remove_punctuation(word)
        word = re.sub(r'[^\w]', ' ', word)
        word = unidecode.unidecode(word)
        word = re.sub(r'[0-9]','', word)
        if((word != '')&(word != ' ')&(word not in stopwords)):
            wordlist.append(word)
    clean_text = ' '.join(wordlist)
    return clean_text

In [5]:
train["clean_text"] = train["text"].apply(lambda x: cleaning_text(x))
train.head()

Unnamed: 0,id,keyword,location,text,target,clean_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,deeds reason earthquake may allah forgive us
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,residents asked shelter place notified officer...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders cal...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo ruby alaska smoke wildfires pou...


In [7]:
train.rename(columns={"target":"target_label"}, inplace=True)
train.rename(columns={"location":"location_original"}, inplace=True)
train.rename(columns={"id":"id_original"}, inplace=True)
train.rename(columns={"text":"text_original"}, inplace=True)
train.rename(columns={"keyword":"keyword_original"}, inplace=True)

In [6]:
test["clean_text"] = test["text"].apply(lambda x: cleaning_text(x))
test.head()

Unnamed: 0,id,keyword,location,text,clean_text
0,0,,,Just happened a terrible car crash,happened terrible car crash
1,2,,,"Heard about #earthquake is different cities, s...",heard earthquake different cities stay safe ev...
2,3,,,"there is a forest fire at spot pond, geese are...",forest fire spot pond geese fleeing across str...
3,9,,,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting spokane wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudelor kills china taiwan


In [8]:
test.rename(columns={"location":"location_original"}, inplace=True)
test.rename(columns={"id":"id_original"}, inplace=True)
test.rename(columns={"text":"text_original"}, inplace=True)
test.rename(columns={"keyword":"keyword_original"}, inplace=True)

### Bag of Words

In [31]:
vectorizer = CountVectorizer(stop_words='english')
vectorizer.fit(train["clean_text"])

vec_train = vectorizer.fit_transform(train["clean_text"])
feature_words = vectorizer.get_feature_names()
print(len(feature_words))
df_bow_train = pd.DataFrame(vec_train.toarray(), columns=feature_words)
df_bow_train.head()

14190


Unnamed: 0,aa,aaa,aaaand,aaalll,aaarrrgghhh,aaemiddleaged,aal,aan,aannnd,aar,...,zones,zonewolf,zoom,zotar,zouma,zrnf,zss,zumiez,zurich,zzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
vec_test = vectorizer.transform(test["clean_text"])
feature_words = vectorizer.get_feature_names()
print(len(feature_words))
df_bow_test = pd.DataFrame(vec_test.toarray(), columns=feature_words)
df_bow_test.head()

14190


Unnamed: 0,aa,aaa,aaaand,aaalll,aaarrrgghhh,aaemiddleaged,aal,aan,aannnd,aar,...,zones,zonewolf,zoom,zotar,zouma,zrnf,zss,zumiez,zurich,zzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
print(df_bow_train.shape)
print(df_bow_test.shape)

(7613, 14190)
(3263, 14190)


In [12]:
print(df_bow_train.shape)
print(df_bow_train.loc[:,(df_bow_train.sum()>1)].shape)
print(df_bow_train.loc[:,(df_bow_train.sum()>2)].shape)
print(df_bow_train.loc[:,(df_bow_train.sum()>3)].shape)
print(df_bow_train.loc[:,(df_bow_train.sum()>5)].shape)
print(df_bow_train.loc[:,(df_bow_train.sum()>=10)].shape)

(7613, 14190)
(7613, 6049)
(7613, 4042)
(7613, 3101)
(7613, 2080)
(7613, 1323)


In [35]:
#Nos quedamos con las palabras que al menos tienen frecuencia >= 10
df_filter_train = df_bow_train.loc[:,(df_bow_train.sum()>=10)]
df_filter_train.head()

Unnamed: 0,aba,abandoned,abc,ablaze,able,absolutely,accident,according,account,act,...,yo,york,youll,young,youre,youth,youtube,yr,yyc,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
df_filter_test = df_bow_test.loc[:,(df_bow_test.sum()>=10)]
df_filter_test.head()

Unnamed: 0,abc,ablaze,accident,actually,affected,aftershock,ago,air,airplane,airport,...,wrecked,yeah,year,yearold,years,yes,yesterday,youre,yr,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
train_final = pd.concat([train,df_filter_train], axis="columns")
print(train_final.shape)
test_final = pd.concat([test,df_filter_test], axis="columns")
print(test_final.shape)

(7613, 1329)
(3263, 613)


In [16]:
#Construimos los datos de entrenamiento y de test
X = train_final.drop(["id_original","keyword_original","location_original","text_original","target_label",
                      "clean_text"], axis=1)
y = train_final["target_label"]

In [17]:
print(X.shape)
print(y.shape)

(7613, 1323)
(7613,)


In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123)

In [19]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(5709, 1323)
(1904, 1323)
(5709,)
(1904,)


In [24]:
model = xgb.XGBClassifier(n_estimators=200, objective='binary:logistic', max_depth=11, learning_rate=0.1,
                          subsample=1, colsample_bytree=0.7, n_jobs=1)
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=11,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=200, n_jobs=1, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [25]:
y_test_hat = model.predict(X_test)
print("Accuracy score: %f" % (accuracy_score(y_test, y_test_hat)))

Accuracy score: 0.788340


In [51]:
X = vec_train
y = train_final["target_label"]

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123)

In [53]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(5709, 14190)
(1904, 14190)
(5709,)
(1904,)


In [58]:
model = xgb.XGBClassifier(n_estimators=500, objective='binary:logistic', max_depth=11, learning_rate=0.1,
                          subsample=1, colsample_bytree=0.7, n_jobs=1)
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=11,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=500, n_jobs=1, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [59]:
y_test_hat = model.predict(X_test)
print("Accuracy score: %f" % (accuracy_score(y_test, y_test_hat)))

Accuracy score: 0.804622


In [60]:
X_test = vec_test

In [61]:
model = xgb.XGBClassifier(n_estimators=500, objective='binary:logistic', max_depth=11, learning_rate=0.1,
                          subsample=1, colsample_bytree=0.7, n_jobs=1)
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=11,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=500, n_jobs=1, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [62]:
y_pred = model.predict(X_test)
y_pred

array([1, 1, 1, ..., 1, 1, 0], dtype=int64)

In [64]:
test["target"] = y_pred
test[["id_original","target"]]

Unnamed: 0,id_original,target
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [65]:
test[["id_original","target"]].rename(columns={"id_original":"id"}).to_csv("../data/pred3_XGB_BoW", index=False)

#### Ajustando hiper-parametros (usando 741 feature words):
    - n_estimators=10, max_depth=11, learning_rate=0.1, subsample=1, colsample_bytree=0.7
    SCORE 0.713761
    - n_estimators=50, max_depth=11, learning_rate=0.1, subsample=1, colsample_bytree=0.7
    SCORE 0.754202
    - n_estimators=50, max_depth=15, learning_rate=0.1, subsample=1, colsample_bytree=0.7
    SCORE 0.766282
    - n_estimators=100, max_depth=11, learning_rate=0.1, subsample=1, colsample_bytree=0.7
    SCORE 0.774685
    - n_estimators=100, max_depth=15, learning_rate=0.1, subsample=1, colsample_bytree=0.7
    SCORE 0.780462
    - n_estimators=100, max_depth=15, learning_rate=0.1, subsample=1, colsample_bytree=0.7, lambda=0.1
    SCORE 0.783613
    - n_estimators=200, max_depth=11, learning_rate=0.1, subsample=1, colsample_bytree=0.5, lambda=0.1
    SCORE 0.787815
    - n_estimators=200, max_depth=11, learning_rate=0.1, subsample=1, colsample_bytree=0.7, lambda=0.1
    SCORE 0.783613
    - n_estimators=200, max_depth=11, learning_rate=0.1, subsample=1, colsample_bytree=0.5, lambda=0.5
    SCORE 0.783613
    Mejor SCORE: 0.787815

In [26]:
model_xgb = xgb.XGBClassifier(n_estimators=200, objective='binary:logistic', max_depth=11, learning_rate=0.1,
                          subsample=1, colsample_bytree=0.5, reg_lambda=0.1, n_jobs=1)
model_xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=11,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=200, n_jobs=1, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=0.1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [22]:
y_test_hat = model.predict(X_test)
print("Accuracy score: %f" % (accuracy_score(y_test, y_test_hat)))

Accuracy score: 0.775735


In [None]:
#probar en otra notebook buscando las mejores feature-words utilizando reg logistica tomando el df.sum()>1 (shape=7613,6199)

In [17]:
model_xgb = xgb.XGBClassifier(n_estimators=200, objective='binary:logistic', max_depth=11, learning_rate=0.1,
                          subsample=1, colsample_bytree=0.5, reg_lambda=0.5, n_jobs=1)

In [19]:
kfold = KFold(n_splits=4, random_state=132)
resultados = cross_val_score(model_xgb, X_train, y_train, cv=kfold)
print("Accuracy: %f" % (resultados.mean()*100))

Accuracy: 77.824467


#### XGBoost c/bag of words y cross validation 4-fold - mejor resultado: 0.778245

In [11]:
#prediciendo test.csv
tweets_test = pd.read_csv("../data/test.csv")

In [12]:
tweets_test["clean_text"] = tweets_test["text"].apply(lambda x: cleaning_text(x))
tweets_test.head()

Unnamed: 0,id,keyword,location,text,clean_text
0,0,,,Just happened a terrible car crash,happened terrible car crash
1,2,,,"Heard about #earthquake is different cities, s...",heard earthquake different cities stay safe ev...
2,3,,,"there is a forest fire at spot pond, geese are...",forest fire spot pond geese fleeing across str...
3,9,,,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting spokane wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudelor kills china taiwan


In [13]:
vectorizer = CountVectorizer()
X_t = vectorizer.fit_transform(tweets_test["clean_text"])
X_t.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [14]:
feature_words = vectorizer.get_feature_names()
len(feature_words)

8819

In [15]:
df_test = pd.DataFrame(X_t.toarray(), columns=feature_words)
df_test.head()

Unnamed: 0,aa,aaaaaa,aabn,aapatwork,aaron,aba,abandoned,abandons,abba,abbog,...,zirngast,zix,zombie,zombies,zone,zones,zouis,zouma,zuma,zx
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
df_filter_test = df_test.loc[:,(df_test.sum()>=10)]
df_filter_test.head()

Unnamed: 0,abc,ablaze,accident,actually,affected,aftershock,ago,air,airplane,airport,...,yeah,year,yearold,years,yes,yesterday,yet,youre,yr,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
X_train = df_filter.drop(["target_(label)"], axis=1)
y = df_filter["target_(label)"]

In [29]:
model_xgb = xgb.XGBClassifier(n_estimators=200, objective='binary:logistic', max_depth=11, learning_rate=0.1,
                          subsample=1, colsample_bytree=0.5, reg_lambda=0.5, n_jobs=1)

In [30]:
kfold = KFold(n_splits=4, random_state=100)
resultados = cross_val_score(model_xgb, X_train, y, cv=kfold)
print("Accuracy: %f" % (resultados.mean()))

Accuracy: 0.709703


In [31]:
kfold = StratifiedKFold(n_splits=4, random_state=100)
resultados = cross_val_score(model_xgb, X_train, y, cv=kfold)
print("Accuracy: %f" % (resultados.mean()))

Accuracy: 0.671348


In [32]:
#Voy a probar otra manera
vectorizer = CountVectorizer()
vectorizer.fit(tweets["clean_text"])
# X_t = vectorizer.fit_transform(tweets_test["clean_text"])
# X_t.toarray()

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [33]:
vec_train = vectorizer.fit_transform(tweets["clean_text"])
vec_test = vectorizer.fit_transform(tweets_test["clean_text"])

In [34]:
print(vec_train[5].todense())

[[0 0 0 ... 0 0 0]]


In [35]:
model_xgb = xgb.XGBClassifier(n_estimators=200, objective='binary:logistic', max_depth=11, learning_rate=0.1,
                          subsample=1, colsample_bytree=0.5, reg_lambda=0.5, n_jobs=1)

In [36]:
kfold = KFold(n_splits=4, random_state=100)
resultados = cross_val_score(model_xgb, vec_train, y, cv=kfold)
print("Accuracy: %f" % (resultados.mean()))

Accuracy: 0.703398
