In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import lightgbm as lgb 
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import re
import unidecode

plt.style.use('default')
plt.rcParams['figure.figsize'] = (8,5)

sns.set(style="whitegrid")

pd.options.display.float_format = '{:20,.2f}'.format

import warnings
warnings.filterwarnings('ignore')

In [3]:
tweets = pd.read_csv("../data/train.csv")

In [4]:
tweets["text"] = tweets["text"].str.lower()
tweets["text"] = tweets["text"].apply(lambda x: re.sub('(?P<url>https?://[^\s]+)', ' ', x))
tweets["text"] = tweets["text"].apply(lambda x: re.sub(r'[^\w]', ' ', x))
tweets["text"] = tweets["text"].apply(lambda x: re.sub(r'_', ' ', x))
tweets["text"] = tweets["text"].apply(lambda x: re.sub(r'[0-9]',' ', x))
tweets["text"] = tweets["text"].apply(lambda x: re.sub(' +',' ', x))
tweets["text"] = tweets["text"].apply(lambda x: unidecode.unidecode(x))
tweets["text"] = tweets["text"].str.strip()
tweets["text_length"] = tweets["text"].str.len()
tweets["text"] = tweets["text"].apply(lambda x: re.sub(r'\b\w{1}\b', '', x))
tweets["words_count"] = tweets["text"].str.split(' ').apply(lambda x: len(x))

In [5]:
tweets["keyword"] = tweets["keyword"].str.replace('%20',' ')
tweets["keyword"] = tweets["keyword"].astype('category')

In [6]:
by_keyword = tweets.groupby("keyword").agg({"target":"mean"}).reset_index()
by_keyword.rename(columns={"target":"keyword_mean"}, inplace=True)
by_keyword.head()

Unnamed: 0,keyword,keyword_mean
0,ablaze,0.36
1,accident,0.69
2,aftershock,0.0
3,airplane accident,0.86
4,ambulance,0.53


In [17]:
tweets_me = pd.merge(tweets, by_keyword, on="keyword", how="left")
tweets_me["keyword_mean"] = tweets_me["keyword_mean"].fillna(value=0) #LGBM no se lleva bien con los NaNs
tweets_me.head()

Unnamed: 0,id,keyword,location,text,target,text_length,words_count,keyword_mean
0,1,,,our deeds are the reason of this earthquake ma...,1,68,13,0.0
1,4,,,forest fire near la ronge sask canada,1,37,7,0.0
2,5,,,all residents asked to shelter in place are be...,1,130,22,0.0
3,6,,,people receive wildfires evacuation orders in ...,1,56,7,0.0
4,7,,,just got sent this photo from ruby alaska as s...,1,85,16,0.0


In [18]:
#Construimos los datos de entrenamiento y de test
X = tweets_me.loc[:,["text_length", "words_count", "keyword_mean"]]
y = tweets_me["target"]

In [19]:
X.head(20).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
text_length,68.0,37.0,130.0,56.0,85.0,98.0,92.0,56.0,79.0,49.0,43.0,127.0,63.0,37.0,51.0,13.0,13.0,16.0,17.0,22.0
words_count,13.0,7.0,22.0,7.0,16.0,15.0,14.0,16.0,13.0,11.0,9.0,27.0,11.0,7.0,10.0,4.0,3.0,3.0,5.0,3.0
keyword_mean,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123)

In [12]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(5709, 3)
(1904, 3)
(5709,)
(1904,)


#### Ajustando hiper-parametros:
    - n_estimators=100, learning_rate=0.1, max_depth=-1, colsample_bytree=1, min_child_samples=20, subsample=1, num_leaves=31 --> SCORE: 0.751576  (valores por default)
    - n_estimators=100, learning_rate=0.1, max_depth=4, colsample_bytree=1, min_child_samples=20, subsample=1, num_leaves=7 --> SCORE: 0.745139
    - n_estimators=100, learning_rate=0.1, max_depth=4, colsample_bytree=0.3, min_child_samples=20, subsample=1, num_leaves=7 --> SCORE: 0.746015
    - n_estimators=100, learning_rate=0.1, max_depth=4, colsample_bytree=0.3, min_child_samples=20, subsample=1, num_leaves=7, lambda=0.1 --> SCORE: 0.746365

In [22]:
model_lgb = lgb.LGBMClassifier()
model_lgb.fit(X_train, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [23]:
y_test_hat = model_lgb.predict(X_test)
print("Accuracy score: %f" % (accuracy_score(y_test, y_test_hat)))

Accuracy score: 0.751576


In [140]:
#Usaremos grid-search CV para tunear mejor el modelo   #num_leaves: nro de hojas
hiper_parametros = {"num_leaves":[7],
                   "max_depth":[4],
                    "n_estimators":[500],
                   "colsample_bytree":[0.3,0.7],
                   "reg_lambda":[0.1,0.2,0.05],
                    "n_jobs": [-1,4]}

In [145]:
clasif = GridSearchCV(model_lgb, hiper_parametros, cv=8, scoring='accuracy')

In [146]:
clasif.fit(X_train, y_train)

GridSearchCV(cv=8, error_score='raise-deprecating',
       estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.3,
        importance_type='split', learning_rate=0.1, max_depth=4,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=500, n_jobs=4, num_leaves=7, objective='binary',
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1, subsample_for_bin=200000, subsample_freq=0),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'num_leaves': [7], 'max_depth': [4], 'n_estimators': [500], 'colsample_bytree': [0.3, 0.7], 'reg_lambda': [0.1, 0.2, 0.05], 'n_jobs': [-1, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [147]:
print(clasif.best_estimator_)
print(clasif.best_score_)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.3,
        importance_type='split', learning_rate=0.1, max_depth=4,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=500, n_jobs=-1, num_leaves=7, objective='binary',
        random_state=None, reg_alpha=0.0, reg_lambda=0.05, silent=True,
        subsample=1, subsample_for_bin=200000, subsample_freq=0)
0.7381327728148538


In [137]:
model_lgb = lgb.LGBMClassifier(num_leaves=7, objective="binary", colsample_bytree=0.3, subsample=1, max_depth=4, n_jobs=4,
                              n_estimators=500, min_child_samples=20, learning_rate=0.1)
model_lgb.fit(X_train, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.3,
        importance_type='split', learning_rate=0.1, max_depth=4,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=500, n_jobs=4, num_leaves=7, objective='binary',
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1, subsample_for_bin=200000, subsample_freq=0)

In [138]:
y_test_hat = model_lgb.predict(X_test)
print("Accuracy score: %f" % (accuracy_score(y_test, y_test_hat)))

Accuracy score: 0.753676
