In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import xgboost as xgb
import re
import nltk
import string
from nltk.tokenize import TweetTokenizer
import unidecode

import warnings
warnings.filterwarnings('ignore')

stopwords = nltk.corpus.stopwords.words('english')

In [2]:
tweets = pd.read_csv("../data/train.csv")

In [3]:
def remove_punctuation(word):        
    clean_word = ''.join([char for char in word if char not in string.punctuation])
    return clean_word

In [4]:
def cleaning_text(text):
    tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
    text_tokenize = tokenizer.tokenize(text)
    wordlist = []
    for word in text_tokenize:
        word = word.lower()
        word = re.sub('(?P<url>https?://[^\s]+)', ' ', word)
        word = remove_punctuation(word)
        word = re.sub(r'[^\w]', ' ', word)
        word = unidecode.unidecode(word)
        word = re.sub(r'[0-9]','', word)
        if((word != '')&(word != ' ')&(word not in stopwords)):
            wordlist.append(word)
    clean_text = ' '.join(wordlist)
    return clean_text

In [5]:
tweets["clean_text"] = tweets["text"].apply(lambda x: cleaning_text(x))
tweets.head()

Unnamed: 0,id,keyword,location,text,target,clean_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,deeds reason earthquake may allah forgive us
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,residents asked shelter place notified officer...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders cal...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo ruby alaska smoke wildfires pou...


In [6]:
tweets.tail()

Unnamed: 0,id,keyword,location,text,target,clean_text
7608,10869,,,Two giant cranes holding a bridge collapse int...,1,two giant cranes holding bridge collapse nearb...
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1,control wild fires california even northern pa...
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,utc km volcano hawaii
7611,10872,,,Police investigating after an e-bike collided ...,1,police investigating ebike collided car little...
7612,10873,,,The Latest: More Homes Razed by Northern Calif...,1,latest homes razed northern california wildfir...


In [7]:
vectorizer = CountVectorizer()
df_text = tweets["clean_text"]

X = vectorizer.fit_transform(df_text)
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [8]:
feature_words = vectorizer.get_feature_names()
len(feature_words)

14350

In [9]:
df = pd.DataFrame(X.toarray(), columns=feature_words)
df.head()

Unnamed: 0,aa,aaa,aaaand,aaalll,aaarrrgghhh,aaemiddleaged,aal,aan,aannnd,aar,...,zones,zonewolf,zoom,zotar,zouma,zrnf,zss,zumiez,zurich,zzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
print(df.shape)
print(df.loc[:,(df.sum()>1)].shape)
print(df.loc[:,(df.sum()>2)].shape)
print(df.loc[:,(df.sum()>3)].shape)
print(df.loc[:,(df.sum()>5)].shape)
print(df.loc[:,(df.sum()>=10)].shape)

(7613, 14350)
(7613, 6199)
(7613, 4183)
(7613, 3229)
(7613, 2188)
(7613, 1416)


In [10]:
#Nos quedamos con las palabras que al menos tienen frecuencia >= 10
df_filter = df.loc[:,(df.sum()>=10)]
df_filter.head()

Unnamed: 0,aba,abandoned,abc,ablaze,able,absolutely,accident,according,account,across,...,yo,york,youll,young,youre,youth,youtube,yr,yyc,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
df_filter["target_(label)"] = tweets.target
df_filter.head(20)

Unnamed: 0,aba,abandoned,abc,ablaze,able,absolutely,accident,according,account,across,...,york,youll,young,youre,youth,youtube,yr,yyc,zone,target_(label)
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
8,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [20]:
tweets.loc[:,["clean_text","target"]].head(9)

Unnamed: 0,clean_text,target
0,deeds reason earthquake may allah forgive us,1
1,forest fire near la ronge sask canada,1
2,residents asked shelter place notified officer...,1
3,people receive wildfires evacuation orders cal...,1
4,got sent photo ruby alaska smoke wildfires pou...,1
5,rockyfire update california hwy closed directi...,1
6,flood disaster heavy rain causes flash floodin...,1
7,im top hill see fire woods,1
8,theres emergency evacuation happening building...,1


In [21]:
tweets.iloc[8,5]

'theres emergency evacuation happening building across street'

In [12]:
#Construimos los datos de entrenamiento y de test
X = df_filter.drop(["target_(label)"], axis=1)
y = df_filter["target_(label)"]

In [24]:
print(X.shape)
print(y.shape)

(7613, 1416)
(7613,)


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123)

In [14]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(5709, 1416)
(1904, 1416)
(5709,)
(1904,)


#### Ajustando hiper-parametros (usando 1416 feature words):
    - n_estimators=10, max_depth=6, learning_rate=0.1, subsample=1, colsample_bytree=0.7
    SCORE 0.659139
    - n_estimators=10, max_depth=11, learning_rate=0.05, subsample=1, colsample_bytree=0.5
    SCORE 0.695378
    - n_estimators=10, max_depth=11, learning_rate=0.1, subsample=1, colsample_bytree=0.5
    SCORE 0.704307
    - n_estimators=50, max_depth=9, learning_rate=0.1, subsample=1, colsample_bytree=0.5
    SCORE 0.742647
    - n_estimators=50, max_depth=15, learning_rate=0.1, subsample=1, colsample_bytree=0.5
    SCORE 0.755252
    - n_estimators=50, max_depth=11, learning_rate=0.1, subsample=1, colsample_bytree=0.5, alpha=0.1, lambda=0.1
    SCORE 0.753151
    - n_estimators=100, max_depth=11, learning_rate=0.1, subsample=1, colsample_bytree=0.5, alpha=0.1, lambda=0.1
    SCORE 0.771534
    - n_estimators=100, max_depth=15, learning_rate=0.1, subsample=1, colsample_bytree=0.5, alpha=0.1, lambda=0.1
    SCORE 0.783088
    - n_estimators=100, max_depth=15, learning_rate=0.1, subsample=1, colsample_bytree=0.7, lambda=1
    SCORE 0.774160
    Mejor SCORE: 0.783088

In [51]:
model_xgb = xgb.XGBClassifier(n_estimators=100, objective='binary:logistic', max_depth=15, learning_rate=0.1,
                          subsample=1, colsample_bytree=0.7, reg_lambda=1, n_jobs=1)
model_xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints='',
       learning_rate=0.1, max_delta_step=0, max_depth=15,
       min_child_weight=1, missing=nan, monotone_constraints='()',
       n_estimators=100, n_jobs=1, num_parallel_tree=1,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
       validate_parameters=1, verbosity=None)

In [52]:
y_test_hat = model_xgb.predict(X_test)
print("Accuracy score: %f" % (accuracy_score(y_test, y_test_hat)))

Accuracy score: 0.774160


In [45]:
#vamos a probar utilizando menos feature words por temas de performance
print(df.loc[:,(df.sum()==11)].shape[1])
print(df.loc[:,(df.sum()==12)].shape[1])
print(df.loc[:,(df.sum()==13)].shape[1])
print(df.loc[:,(df.sum()==14)].shape[1])
print(df.loc[:,(df.sum()==15)].shape[1])
print(df.loc[:,(df.sum()>15)].shape[1])

94
92
80
55
48
936


In [46]:
print(df.loc[:,(df.sum()>=15)].shape[1])
print(df.loc[:,(df.sum()>=20)].shape[1])

984
741


In [15]:
df_filter = df.loc[:,(df.sum()>=20)]  #tomo las feature words con frecuencia 20 o mas
df_filter["target_(label)"] = tweets.target
X = df_filter.drop(["target_(label)"], axis=1)
y = df_filter["target_(label)"]

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=100)

#### Ajustando hiper-parametros (usando 741 feature words):
    - n_estimators=10, max_depth=11, learning_rate=0.1, subsample=1, colsample_bytree=0.7
    SCORE 0.713761
    - n_estimators=50, max_depth=11, learning_rate=0.1, subsample=1, colsample_bytree=0.7
    SCORE 0.754202
    - n_estimators=50, max_depth=15, learning_rate=0.1, subsample=1, colsample_bytree=0.7
    SCORE 0.766282
    - n_estimators=100, max_depth=11, learning_rate=0.1, subsample=1, colsample_bytree=0.7
    SCORE 0.774685
    - n_estimators=100, max_depth=15, learning_rate=0.1, subsample=1, colsample_bytree=0.7
    SCORE 0.780462
    - n_estimators=100, max_depth=15, learning_rate=0.1, subsample=1, colsample_bytree=0.7, lambda=0.1
    SCORE 0.783613
    - n_estimators=200, max_depth=11, learning_rate=0.1, subsample=1, colsample_bytree=0.5, lambda=0.1
    SCORE 0.787815
    - n_estimators=200, max_depth=11, learning_rate=0.1, subsample=1, colsample_bytree=0.7, lambda=0.1
    SCORE 0.783613
    - n_estimators=200, max_depth=11, learning_rate=0.1, subsample=1, colsample_bytree=0.5, lambda=0.5
    SCORE 0.783613
    Mejor SCORE: 0.787815

In [76]:
model_xgb = xgb.XGBClassifier(n_estimators=200, objective='binary:logistic', max_depth=11, learning_rate=0.1,
                          subsample=1, colsample_bytree=0.5, reg_lambda=0.5, n_jobs=1)
model_xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.5, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints='',
       learning_rate=0.1, max_delta_step=0, max_depth=11,
       min_child_weight=1, missing=nan, monotone_constraints='()',
       n_estimators=200, n_jobs=1, num_parallel_tree=1,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=0.5, scale_pos_weight=1, subsample=1,
       tree_method='exact', validate_parameters=1, verbosity=None)

In [77]:
y_test_hat = model_xgb.predict(X_test)
print("Accuracy score: %f" % (accuracy_score(y_test, y_test_hat)))

Accuracy score: 0.787290


In [None]:
#probar en otra notebook buscando las mejores feature-words utilizando reg logistica tomando el df.sum()>1 (shape=7613,6199)

In [17]:
model_xgb = xgb.XGBClassifier(n_estimators=200, objective='binary:logistic', max_depth=11, learning_rate=0.1,
                          subsample=1, colsample_bytree=0.5, reg_lambda=0.5, n_jobs=1)

In [19]:
kfold = KFold(n_splits=4, random_state=132)
resultados = cross_val_score(model_xgb, X_train, y_train, cv=kfold)
print("Accuracy: %f" % (resultados.mean()*100))

Accuracy: 77.824467


#### XGBoost c/bag of words y cross validation 4-fold - mejor resultado: 0.778245