In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import xgboost as xgb
import re
import nltk
from nltk.tokenize import TweetTokenizer
import string
import unidecode

import warnings
warnings.filterwarnings('ignore')

stopwords = nltk.corpus.stopwords.words('english')

In [2]:
tweets = pd.read_csv("../data/train.csv")

In [3]:
def concatenate(x,char):
    words = ""
    for word in x:
        if word.startswith(char):
            words = words + word + " "
    return words

def count_vowels(x):
    return (x.count('a') + x.count('e') + x.count('i') + x.count('o') + x.count('u'))

def count_short_words(x):
    count = 0
    words = x.split(' ')
    for word in words:
        if 1 <= len(word) <= 3:
            count += 1
    return count

def count_stopwords(x):
    count = 0
    words = x.split(' ')
    for word in words:
        if word in stopwords:
            count += 1
    return count

In [4]:
def remove_punctuation(word):        
    clean_word = ''.join([char for char in word if char not in string.punctuation])
    return clean_word

def cleaning_text(text):
    tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
    text_tokenize = tokenizer.tokenize(text)
    wordlist = []
    for word in text_tokenize:
        word = word.lower()
        word = re.sub('(?P<url>https?://[^\s]+)', ' ', word)
        word = remove_punctuation(word)
        word = re.sub(r'[^\w]', ' ', word)
        word = unidecode.unidecode(word)
        word = re.sub(r'[0-9]','', word)
        if((word != '')&(word != ' ')&(word not in stopwords)):
            wordlist.append(word)
    clean_text = ' '.join(wordlist)
    return clean_text

In [5]:
tweets["special_chars_count"] =  tweets["text"]
tweets["special_chars_count"] =  tweets["special_chars_count"].str.lower()
tweets["special_chars_count"] = tweets["special_chars_count"].apply(lambda x: re.sub(r'[a-z]','',x))
tweets["special_chars_count"] = tweets["special_chars_count"].str.strip()
tweets["special_chars_count"] = tweets["special_chars_count"].apply(lambda x: re.sub(' +','', x))
tweets["special_chars_count"] = tweets["special_chars_count"].apply(lambda x: re.sub(r'[0-9]','', x))
tweets["special_chars_count"] = tweets["special_chars_count"].str.len()

tweets["hashtags"] = tweets["text"].str.lower().str.split(' ').apply(lambda x: concatenate(x,'#'))
tweets["labels"] = tweets["text"].str.lower().str.split(' ').apply(lambda x: concatenate(x,'@'))
tweets["hashtags_count"] = tweets["hashtags"].str.split(' ').apply(lambda x: len(x))-1
tweets["labels_count"] = tweets["labels"].str.split(' ').apply(lambda x: len(x))-1

tweets["num_chars_count"] = tweets["text"]
tweets["num_chars_count"] =  tweets["num_chars_count"].str.lower()
tweets["num_chars_count"] = tweets["num_chars_count"].apply(lambda x: re.sub(r'[a-z]','',x))
tweets["num_chars_count"] = tweets["num_chars_count"].apply(lambda x: re.sub(r'[^\w]','',x))
tweets["num_chars_count"] = tweets["num_chars_count"].str.strip()
tweets["num_chars_count"] = tweets["num_chars_count"].str.len()

tweets["clean_text"] = tweets["text"].apply(lambda x: cleaning_text(x))

tweets["text"] = tweets["text"].str.lower()
tweets["text"] = tweets["text"].apply(lambda x: re.sub('(?P<url>https?://[^\s]+)', ' ', x))
tweets["text"] = tweets["text"].apply(lambda x: re.sub(r'[^\w]', ' ', x))
tweets["text"] = tweets["text"].apply(lambda x: re.sub(r'_', ' ', x))
tweets["text"] = tweets["text"].apply(lambda x: re.sub(r'[0-9]',' ', x))
tweets["text"] = tweets["text"].apply(lambda x: re.sub(' +',' ', x))
tweets["text"] = tweets["text"].apply(lambda x: unidecode.unidecode(x))
tweets["text"] = tweets["text"].str.strip()
tweets["text_length"] = tweets["text"].str.len()

tweets["vowels_count"] = tweets["text"].apply(lambda x: count_vowels(x))
tweets["short_words_count"] = tweets["text"].apply(lambda x: count_short_words(x))
tweets["stopwords_count"] = tweets["text"].apply(lambda x: count_stopwords(x))
tweets["text"] = tweets["text"].apply(lambda x: re.sub(r'\b\w{1}\b', '', x))
tweets["words_count"] = tweets["text"].str.split(' ').apply(lambda x: len(x))

tweets.rename(columns={"target":"target_label"}, inplace=True)
tweets.head()

Unnamed: 0,id,keyword,location,text,target_label,special_chars_count,hashtags,labels,hashtags_count,labels_count,num_chars_count,clean_text,text_length,vowels_count,short_words_count,stopwords_count,words_count
0,1,,,our deeds are the reason of this earthquake ma...,1,1,#earthquake,,1,0,0,deeds reason earthquake may allah forgive us,68,25,7,6,13
1,4,,,forest fire near la ronge sask canada,1,1,,,0,0,0,forest fire near la ronge sask canada,37,13,1,0,7
2,5,,,all residents asked to shelter in place are be...,1,3,,,0,0,0,residents asked shelter place notified officer...,130,45,9,11,22
3,6,,,people receive wildfires evacuation orders in ...,1,2,#wildfires,,1,0,5,people receive wildfires evacuation orders cal...,56,24,1,1,7
4,7,,,just got sent this photo from ruby alaska as s...,1,2,#alaska #wildfires,,2,0,0,got sent photo ruby alaska smoke wildfires pou...,85,25,3,7,16


In [6]:
tweets["keyword"] = tweets["keyword"].str.replace('%20',' ')
tweets["keyword"] = tweets["keyword"].astype('category')

In [7]:
#One Hot Encoding
dummies = pd.get_dummies(tweets["keyword"], prefix="keyword")
dummies.columns

Index(['keyword_ablaze', 'keyword_accident', 'keyword_aftershock',
       'keyword_airplane accident', 'keyword_ambulance', 'keyword_annihilated',
       'keyword_annihilation', 'keyword_apocalypse', 'keyword_armageddon',
       'keyword_army',
       ...
       'keyword_weapons', 'keyword_whirlwind', 'keyword_wild fires',
       'keyword_wildfire', 'keyword_windstorm', 'keyword_wounded',
       'keyword_wounds', 'keyword_wreck', 'keyword_wreckage',
       'keyword_wrecked'],
      dtype='object', length=221)

In [8]:
tweets_ohe = pd.concat([tweets,dummies], axis="columns")
tweets_ohe.shape

(7613, 238)

In [9]:
#BOW
vectorizer = CountVectorizer(stop_words='english')
df_text = tweets["clean_text"]
X = vectorizer.fit_transform(df_text)
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [10]:
feature_words = vectorizer.get_feature_names()
df_words = pd.DataFrame(X.toarray(), columns=feature_words)
df_words.head()

Unnamed: 0,aa,aaa,aaaand,aaalll,aaarrrgghhh,aaemiddleaged,aal,aan,aannnd,aar,...,zones,zonewolf,zoom,zotar,zouma,zrnf,zss,zumiez,zurich,zzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
df_filter = df_words.loc[:,(df_words.sum()>5)]
df_filter.shape

(7613, 2080)

In [12]:
tweets_final = pd.concat([tweets_ohe,df_filter], axis="columns")
tweets_final.shape

(7613, 2318)

In [13]:
tweets_final.columns

Index(['id', 'keyword', 'location', 'text', 'target_label',
       'special_chars_count', 'hashtags', 'labels', 'hashtags_count',
       'labels_count',
       ...
       'young', 'youre', 'youth', 'youtube', 'youve', 'yr', 'yrs', 'yyc',
       'zombie', 'zone'],
      dtype='object', length=2318)

In [14]:
X = tweets_final.drop(["id","keyword","location","text","target_label","hashtags","labels","clean_text"], axis=1)
y = tweets_final["target_label"]

In [15]:
print(X.shape)
print(y.shape)

(7613, 2308)
(7613,)


In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=100)

In [17]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(5709, 2308)
(1904, 2308)
(5709,)
(1904,)


In [18]:
#usando las 2308 features
model_xgb = xgb.XGBClassifier()
model_xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [19]:
y_test_hat = model_xgb.predict(X_test)
print("Accuracy score: %f" % (accuracy_score(y_test, y_test_hat)))

Accuracy score: 0.787290


In [20]:
model_xgb.feature_importances_

array([0.00190368, 0.00159241, 0.00504938, ..., 0.0025172 , 0.        ,
       0.        ], dtype=float32)

In [21]:
df_feat_importances = pd.DataFrame(model_xgb.feature_importances_, index=X_train.columns, columns=["importancia"]).\
        sort_values(by="importancia",ascending=False)
df_feat_importances.importancia.value_counts()

0.000000    1985
0.002930       1
0.002050       1
0.002889       1
0.003072       1
            ... 
0.003383       1
0.001356       1
0.004263       1
0.002116       1
0.001778       1
Name: importancia, Length: 324, dtype: int64

In [22]:
#primero usemos 700 features
#luego las 323 diferentes a 0

In [23]:
list_fi = df_feat_importances.index[:700].tolist()
X = X.filter(items=list_fi)
X.shape

(7613, 700)

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=100)

In [25]:
#usando 700 features
model_xgb = xgb.XGBClassifier()
model_xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [26]:
y_test_hat = model_xgb.predict(X_test)
print("Accuracy score: %f" % (accuracy_score(y_test, y_test_hat)))

Accuracy score: 0.790441


In [47]:
model_xgb = xgb.XGBClassifier(n_estimators=250, colsample_bytree=0.5, learning_rate=0.1, max_depth=13)
model_xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=13,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=250, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [48]:
y_test_hat = model_xgb.predict(X_test)
print("Accuracy score: %f" % (accuracy_score(y_test, y_test_hat)))

Accuracy score: 0.790441


In [46]:
hiper_parametros = {"n_estimators":[500],
                   "max_depth":[6,7,8],
                   "colsample_bytree":[0.5,0.7],
                   "subsample":[1],
                    "n_jobs": [1]}

In [50]:
clasif = GridSearchCV(model_xgb, hiper_parametros, cv=2, scoring='accuracy')
clasif.fit(X_train, y_train)

GridSearchCV(cv=2, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=0.5, gamma=0, gpu_id=-1,
                                     importance_type='gain',
                                     interaction_constraints='',
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=13, min_child_weight=1,
                                     missing=nan, monotone_constraints='()',
                                     n_estimators=250, n_jobs=0,
                                     n...
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, subsample=1,
                                     tree_method='exact', validate_parameters=1,
                                     verbosity=Non

In [51]:
print(clasif.best_estimator_)
print(clasif.best_score_)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=500, n_jobs=1, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)
0.7829727411185579


In [52]:
#usando 323 features
list_fi = df_feat_importances.index[:323].tolist()
X = X.filter(items=list_fi)
X.shape

(7613, 323)

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=100)

In [54]:
model_xgb = xgb.XGBClassifier()
model_xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [55]:
y_test_hat = model_xgb.predict(X_test)
print("Accuracy score: %f" % (accuracy_score(y_test, y_test_hat)))

Accuracy score: 0.789391


In [66]:
model_xgb = xgb.XGBClassifier(n_estimators=550, colsample_bytree=0.5, learning_rate=0.1, max_depth=7)
model_xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=7,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=550, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [67]:
y_test_hat = model_xgb.predict(X_test)
print("Accuracy score: %f" % (accuracy_score(y_test, y_test_hat)))

Accuracy score: 0.797794


In [68]:
#TF-IDF
vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(tweets["clean_text"])
X.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [69]:
feature_words = vectorizer.get_feature_names()
df_words = pd.DataFrame(X.toarray(), columns=feature_words)
df_words.head()

Unnamed: 0,aa,aaa,aaaand,aaalll,aaarrrgghhh,aaemiddleaged,aal,aan,aannnd,aar,...,zones,zonewolf,zoom,zotar,zouma,zrnf,zss,zumiez,zurich,zzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [70]:
df_filter = df_words.loc[:,(df_words.sum()>2)]
df_filter.shape

(7613, 2072)

In [71]:
tweets_final = pd.concat([tweets_ohe,df_filter], axis="columns")
tweets_final.shape

(7613, 2310)

In [72]:
X = tweets_final.drop(["id","keyword","location","text","target_label","hashtags","labels","clean_text"], axis=1)
y = tweets_final["target_label"]

In [73]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=100)

In [74]:
#usando las 2300 features
model_xgb = xgb.XGBClassifier()
model_xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [75]:
y_test_hat = model_xgb.predict(X_test)
print("Accuracy score: %f" % (accuracy_score(y_test, y_test_hat)))

Accuracy score: 0.784664


In [76]:
df_feat_importances = pd.DataFrame(model_xgb.feature_importances_, index=X_train.columns, columns=["importancia"]).\
        sort_values(by="importancia",ascending=False)
df_feat_importances["importancia"].value_counts()

0.000000    1964
0.004863       1
0.004618       1
0.001681       1
0.002477       1
            ... 
0.002940       1
0.001524       1
0.001775       1
0.004782       1
0.001461       1
Name: importancia, Length: 337, dtype: int64

In [78]:
list_fi = df_feat_importances.index[:346].tolist()
X = X.filter(items=list_fi)
X.shape

(7613, 346)

In [79]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=100)

In [102]:
#usando las 346 features
model_xgb = xgb.XGBClassifier(n_estimators=200, colsample_bytree=0.5, learning_rate=0.1, max_depth=13)
model_xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=13,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=200, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [103]:
y_test_hat = model_xgb.predict(X_test)
print("Accuracy score: %f" % (accuracy_score(y_test, y_test_hat)))

Accuracy score: 0.795693
