In [147]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import re
import unidecode
from sklearn.model_selection import KFold

plt.style.use('default')
plt.rcParams['figure.figsize'] = (8,5)

sns.set(style="whitegrid")

pd.options.display.float_format = '{:20,.2f}'.format

import warnings
warnings.filterwarnings('ignore')

In [2]:
tweets = pd.read_csv("../data/train.csv")

In [3]:
tweets["keyword"] = tweets["keyword"].str.replace('%20',' ')
tweets["keyword"] = tweets["keyword"].astype('category')
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
id          7613 non-null int64
keyword     7552 non-null category
location    5080 non-null object
text        7613 non-null object
target      7613 non-null int64
dtypes: category(1), int64(2), object(2)
memory usage: 264.6+ KB


In [4]:
tweets["text"] = tweets["text"].str.lower()
tweets["text"] = tweets["text"].apply(lambda x: re.sub('(?P<url>https?://[^\s]+)', ' ', x))
tweets["text"] = tweets["text"].apply(lambda x: re.sub(r'[^\w]', ' ', x))
tweets["text"] = tweets["text"].apply(lambda x: re.sub(r'_', ' ', x))
tweets["text"] = tweets["text"].apply(lambda x: re.sub(r'[0-9]',' ', x))
tweets["text"] = tweets["text"].apply(lambda x: re.sub(' +',' ', x))
tweets["text"] = tweets["text"].apply(lambda x: unidecode.unidecode(x))
tweets["text"] = tweets["text"].str.strip()
tweets["text_length"] = tweets["text"].str.len()
tweets["text"] = tweets["text"].apply(lambda x: re.sub(r'\b\w{1}\b', '', x))
tweets["words_count"] = tweets["text"].str.split(' ').apply(lambda x: len(x))

In [6]:
by_keyword = tweets.groupby("keyword").agg({"target":"mean"}).reset_index()
by_keyword.rename(columns={"target":"keyword_mean"}, inplace=True)
by_keyword.head()

Unnamed: 0,keyword,keyword_mean
0,ablaze,0.36
1,accident,0.69
2,aftershock,0.0
3,airplane accident,0.86
4,ambulance,0.53


In [13]:
tweets_me = pd.merge(tweets, by_keyword, on="keyword", how="left")
tweets_me["keyword_mean"] = tweets_me["keyword_mean"].fillna(value=0)
tweets_me.head()

Unnamed: 0,id,keyword,location,text,target,text_length,words_count,keyword_mean
0,1,,,our deeds are the reason of this earthquake ma...,1,68,13,0.0
1,4,,,forest fire near la ronge sask canada,1,37,7,0.0
2,5,,,all residents asked to shelter in place are be...,1,130,22,0.0
3,6,,,people receive wildfires evacuation orders in ...,1,56,7,0.0
4,7,,,just got sent this photo from ruby alaska as s...,1,85,16,0.0


In [14]:
tweets_me["keyword_mean"].value_counts().nlargest()

0.15    170
0.67    165
0.14    147
0.15    132
0.12    120
Name: keyword_mean, dtype: int64

In [15]:
#Construimos los datos de entrenamiento y de test
X = tweets_me.loc[:,["text_length", "words_count", "keyword_mean"]]
y = tweets_me["target"]

In [16]:
X.head()

Unnamed: 0,text_length,words_count,keyword_mean
0,68,13,0.0
1,37,7,0.0
2,130,22,0.0
3,56,7,0.0
4,85,16,0.0


In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123)

In [18]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(5709, 3)
(1904, 3)
(5709,)
(1904,)


#### Ajustando hiper-parametros (usando 25% de set de test):
    - n_estimators=10, max_depth=7, min_samples_split=10, min_samples_leaf=1
    SCORE 0.744748
    - n_estimators=20, max_depth=7, min_samples_split=10, min_samples_leaf=1
    SCORE 0.748424
    - n_estimators=50, max_depth=7, min_samples_split=10, min_samples_leaf=1
    SCORE 0.746849
    - n_estimators=100, max_depth=7, min_samples_split=10, min_samples_leaf=1
    SCORE 0.747374
    - n_estimators=200, max_depth=7, min_samples_split=10, min_samples_leaf=1
    SCORE 0.744748
    - n_estimators=20, max_depth=7, min_samples_split=35, min_samples_leaf=25
    SCORE 0.752101
    - n_estimators=50, max_depth=13, min_samples_split=30, min_samples_leaf=15
    SCORE 0.755777
    - n_estimators=75, max_depth=9, min_samples_split=25, min_samples_leaf=15
    SCORE 0.754727
    - n_estimators=100, max_depth=13, min_samples_split=30, min_samples_leaf=15
    SCORE 0.753151
    - n_estimators=150, max_depth=11, min_samples_split=25, min_samples_leaf=5
    SCORE 0.756828
    - n_estimators=200, max_depth=15, min_samples_split=10, min_samples_leaf=5
    SCORE 0.757878
    - n_estimators=300, max_depth=11, min_samples_split=10, min_samples_leaf=5
    SCORE 0.760504

In [139]:
model_rf = RandomForestClassifier(n_estimators=300, max_depth=11, min_samples_split=10, min_samples_leaf=5,random_state=123)
model_rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=11, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
            oob_score=False, random_state=123, verbose=0, warm_start=False)

In [140]:
y_test_hat = model_rf.predict(X_test)
print("Accuracy score: %f" % (accuracy_score(y_test, y_test_hat)))

Accuracy score: 0.760504


In [141]:
model_rf.score(X_test, y_test)*100

76.05042016806722

In [142]:
#Utilizamos Cross-validation
kfold = KFold(n_splits=4, random_state=123)

In [143]:
for train_idx, test_idx in kfold.split([1,2,3,4,5,6,7,8]):
    print(train_idx, test_idx)

[2 3 4 5 6 7] [0 1]
[0 1 4 5 6 7] [2 3]
[0 1 2 3 6 7] [4 5]
[0 1 2 3 4 5] [6 7]


In [151]:
# scores = []

# for train_idx, test_idx in kfold.split(X):
#     X_train, X_test = X[train_idx], X[test_idx]
#     y_train, y_test = y[train_idx], y[test_idx]
#     #Entrenamos modelo
#     model_rf.fit(X_train, y_train)
#     #Calculamos el puntaje
#     scores.append(model_rf.score(X_test, y_test))

In [149]:
cv_resultados = cross_val_score(model_rf, X_train, y_train, cv=kfold, scoring='accuracy')
print(cv_resultados.mean())

0.7339296265107305


In [152]:
kfold = KFold(n_splits=8, random_state=123)
cv_resultados = cross_val_score(model_rf, X_train, y_train, cv=kfold, scoring='accuracy')
print(cv_resultados.mean())

0.7405938532495748


In [154]:
kfold = KFold(n_splits=3)
cv_resultados = cross_val_score(model_rf, X_train, y_train, cv=kfold, scoring='accuracy')
print(cv_resultados.mean())

0.7365563145909967


#### RandomForest - mejor resultado: 0.760504
#### RandomForest c/ cross-valid - mejor resultado: 0.740594