## Load Data

In [1]:
import numpy as np
import pandas as pd
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

print(df_train.describe())
print(df_train.head())

                 id      target
count   7613.000000  7613.00000
mean    5441.934848     0.42966
std     3137.116090     0.49506
min        1.000000     0.00000
25%     2734.000000     0.00000
50%     5408.000000     0.00000
75%     8146.000000     1.00000
max    10873.000000     1.00000
   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  


In [3]:
df_train[df_train['target'] == 0]['text'].values[0]

"What's up man?"

In [4]:
df_train[df_train['target'] == 1]['text'].values[0]

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

In [5]:
df_train['text'].values[0]

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

In [19]:
print(df_train['text'].values)

['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'
 'Forest fire near La Ronge Sask. Canada'
 "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected"
 ... 'M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http://t.co/zDtoyd8EbJ'
 'Police investigating after an e-bike collided with a car in Little Portugal. E-bike rider suffered serious non-life threatening injuries.'
 'The Latest: More Homes Razed by Northern California Wildfire - ABC News http://t.co/YmY4rSkQ3d']


## Create Features

In [6]:
# let's get counts for the first 5 tweets in the data
count_vectorizer = feature_extraction.text.CountVectorizer()
example_train_vectors = count_vectorizer.fit_transform(df_train["text"][0:5])

## we use .todense() here because these vectors are "sparse" (only non-zero elements are kept to save space)
print(example_train_vectors[0].todense().shape)
print(example_train_vectors[0].todense())

(1, 54)
[[0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0
  0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0]]


In [7]:
# let's get counts for all tweets in the data
count_vectorizer = feature_extraction.text.CountVectorizer()
train_vectors = count_vectorizer.fit_transform(df_train['text'])
test_vectors = count_vectorizer.transform(df_test['text'])

print(train_vectors[0].todense().shape)
print(test_vectors[0].todense().shape)

(1, 21637)
(1, 21637)


In [18]:
print(sample_submission['target'])
print(df_train['target'].describe())
print(pd.DataFrame(train_vectors).describe())

0       0
1       1
2       1
3       0
4       1
5       0
6       0
7       0
8       0
9       0
10      0
11      0
12      0
13      0
14      0
15      1
16      0
17      1
18      0
19      0
20      0
21      0
22      0
23      1
24      0
25      0
26      0
27      0
28      0
29      1
       ..
3233    1
3234    1
3235    1
3236    1
3237    1
3238    1
3239    1
3240    1
3241    0
3242    0
3243    1
3244    0
3245    0
3246    0
3247    0
3248    0
3249    0
3250    0
3251    0
3252    1
3253    1
3254    1
3255    1
3256    1
3257    1
3258    1
3259    1
3260    1
3261    1
3262    0
Name: target, Length: 3263, dtype: int64
count    7613.00000
mean        0.42966
std         0.49506
min         0.00000
25%         0.00000
50%         0.00000
75%         1.00000
max         1.00000
Name: target, dtype: float64
             0            1            2            3            4      \
count  7613.000000  7613.000000  7613.000000  7613.000000  7613.000000   
mean      0.

## Train Models

In [8]:
import xgboost
print(xgboost.__version__)
from xgboost import XGBClassifier

1.3.3


In [9]:
classifier = XGBClassifier(objective="reg:squarederror",
                    max_depth=5,
                    min_child_weight=1,
                    gamma=0,
                    subsample = 0.8,
                    colsample_bytree = 0.8,
                    use_label_encoder=False,
                    scale_pos_weight = 1,
                    cv=3,
                    random_state=42)

In [10]:
scores = model_selection.cross_val_score(XGBClassifier(objective='reg:squarederror', use_label_encoder=False,), train_vectors, df_train['target'], cv=3, scoring='f1')
print(scores)

[0.59610028 0.54219949 0.63914522]


In [11]:
# Make submission
classifier.fit(train_vectors, df_train["target"])

sample_submission = pd.read_csv('sample_submission.csv')

sample_submission['target'] = classifier.predict(test_vectors)
sample_submission.head()
sample_submission.to_csv('submission.csv', index=False)

Parameters: { cv } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [12]:
print(sample_submission['target'])

0       0
1       1
2       1
3       0
4       1
5       0
6       0
7       0
8       0
9       0
10      0
11      0
12      0
13      0
14      0
15      1
16      0
17      1
18      0
19      0
20      0
21      0
22      0
23      1
24      0
25      0
26      0
27      0
28      0
29      1
       ..
3233    1
3234    1
3235    1
3236    1
3237    1
3238    1
3239    1
3240    1
3241    0
3242    0
3243    1
3244    0
3245    0
3246    0
3247    0
3248    0
3249    0
3250    0
3251    0
3252    1
3253    1
3254    1
3255    1
3256    1
3257    1
3258    1
3259    1
3260    1
3261    1
3262    0
Name: target, Length: 3263, dtype: int64


## Find Best Model

In [51]:
from sklearn.model_selection import train_test_split, learning_curve, validation_curve
from sklearn.model_selection import cross_validate, RandomizedSearchCV, GridSearchCV, KFold

from sklearn.metrics import roc_curve, roc_auc_score, f1_score, confusion_matrix
from sklearn.metrics import make_scorer, accuracy_score, classification_report

In [14]:
# Assumption: max_depth and max_leaf_nodes are somewhat redundant as pruning techniques
# Reference: https://datascience.stackexchange.com/questions/29520/how-to-plot-learning-curve-and-validation-curve-while-using-pipeline
def find_best_estimator(estimator, scoring, X_train, X_test, y_train, y_test, **parameters):
    # Search for near-optimal hyperparameters
    cv = KFold(n_splits=5, shuffle=True)

    rs_cv = RandomizedSearchCV(estimator=estimator, param_distributions=parameters, 
                                scoring=scoring, n_jobs=5, refit=True, cv=cv, verbose=1, 
                                random_state=0, return_train_score=True)
    rs_cv = rs_cv.fit(X_train, y_train)
    print('Tuned randomized search best parameters: {}'.format(rs_cv.best_params_))

    print('Training Report')
    ypred = rs_cv.predict(X_train)
    print(classification_report(y_train, ypred))
    print('\n')
    print('Testing Report')
    ypred2 = rs_cv.predict(X_test)
    print(classification_report(y_test, ypred2))

    return rs_cv, X_train, X_test, y_train, y_test

def create_learning_curve(scoring, s_cv, X_train, y_train):
    cv = KFold(n_splits=5, shuffle=True)
    
    train_sizes, train_scores, test_scores = learning_curve(
        estimator=s_cv.best_estimator_, X=X_train, y=y_train, random_state=0,
        train_sizes=np.arange(0.05, 1.05, 0.1), cv=cv, scoring=scoring, n_jobs=2)
    
    return train_sizes, train_scores, test_scores

def create_validation_curve(scoring, s_cv, X_train, y_train, param_name, param_range):
    cv = KFold(n_splits=5, shuffle=True)
    
    train_scores, test_scores = validation_curve(
        estimator=s_cv.best_estimator_, X=X_train, y=y_train, 
        param_name=param_name, param_range=param_range, cv=cv, scoring=scoring, n_jobs=2)

    return train_scores, test_scores
    
def plot_learning_curve(train_sizes, train_scores, test_scores, title, alpha=0.1):
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    plt.plot(train_sizes, train_mean, label='train score', color='blue', marker='o')
    plt.fill_between(train_sizes, train_mean + train_std,
                     train_mean - train_std, color='blue', alpha=alpha)
    plt.plot(train_sizes, test_mean, label='test score', color='red', marker='o')

    plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std, color='red', alpha=alpha)
    plt.title(title)
    plt.xlabel('Number of training points')
    plt.ylabel('F1')
    plt.grid(ls='--')
    plt.legend(loc='best')
    plt.show()

def plot_validation_curve(param_range, train_scores, test_scores, title, alpha=0.1):
    sort_idx = np.argsort(param_range)
    param_range=np.array(param_range)[sort_idx]
    train_mean = np.mean(train_scores, axis=1)[sort_idx]
    train_std = np.std(train_scores, axis=1)[sort_idx]
    test_mean = np.mean(test_scores, axis=1)[sort_idx]
    test_std = np.std(test_scores, axis=1)[sort_idx]
    plt.plot(param_range, train_mean, label='train score', color='blue', marker='o')
    plt.fill_between(param_range, train_mean + train_std,
                 train_mean - train_std, color='blue', alpha=alpha)
    plt.plot(param_range, test_mean, label='test score', color='red', marker='o')
    plt.fill_between(param_range, test_mean + test_std, test_mean - test_std, color='red', alpha=alpha)
    plt.title(title)
    plt.grid(ls='--')
    plt.xlabel('Parameter Value')
    plt.ylabel('Average values and standard deviation')
    plt.legend(loc='best')
    plt.show()

"""
# Learning curves
train_sizes, train_scores, test_scores = create_learning_curve(
    df, pl_classifier, scoring, rs_cv, X_train, y_train)

plot_learning_curve(train_sizes, train_scores, test_scores, 'Learning Curve', alpha=0.1)

# Model complexity curves
# parameter: max_depth
param_name = 'classifier__max_depth'
param_range = np.arange(3, 30, 3)
train_scores, test_scores = create_validation_curve(
    df, pl_classifier, scoring, rs_cv, X_train, y_train, param_name, param_range)

plot_validation_curve(param_range, train_scores, test_scores, param_name, alpha=0.1)

# parameter: min_samples_leaf
param_name = 'classifier__min_samples_leaf'
param_range = np.arange(1, 31, 5)
train_scores, test_scores = create_validation_curve(
    df, pl_classifier, scoring, rs_cv, X_train, y_train, param_name, param_range)

plot_validation_curve(param_range, train_scores, test_scores, param_name, alpha=0.1)
"""

"\n# Learning curves\ntrain_sizes, train_scores, test_scores = create_learning_curve(\n    df, pl_classifier, scoring, rs_cv, X_train, y_train)\n\nplot_learning_curve(train_sizes, train_scores, test_scores, 'Learning Curve', alpha=0.1)\n\n# Model complexity curves\n# parameter: max_depth\nparam_name = 'classifier__max_depth'\nparam_range = np.arange(3, 30, 3)\ntrain_scores, test_scores = create_validation_curve(\n    df, pl_classifier, scoring, rs_cv, X_train, y_train, param_name, param_range)\n\nplot_validation_curve(param_range, train_scores, test_scores, param_name, alpha=0.1)\n\n# parameter: min_samples_leaf\nparam_name = 'classifier__min_samples_leaf'\nparam_range = np.arange(1, 31, 5)\ntrain_scores, test_scores = create_validation_curve(\n    df, pl_classifier, scoring, rs_cv, X_train, y_train, param_name, param_range)\n\nplot_validation_curve(param_range, train_scores, test_scores, param_name, alpha=0.1)\n"

In [16]:
# Split training and testing data
X = train_vectors
y = df_train["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=10)

# Find an initial optimal n_estimators
classifier = XGBClassifier(
             learning_rate=0.1,
             n_estimators=1000,
             max_depth=5,
             min_child_weight=1,
             gamma=0,
             subsample=0.8,
             colsample_bytree=0.8,
             objective= 'reg:squarederror',
             nthread=4,
             scale_pos_weight=1,
             seed=27)
xgb_param = classifier.get_xgb_params()
xgtrain = xgboost.DMatrix(X, label=y)
cvresult = xgboost.cv(xgb_param, xgtrain, num_boost_round=classifier.get_params()['n_estimators'], nfold=5,
            metrics='rmse', early_stopping_rounds=50)
classifier.set_params(n_estimators=cvresult.shape[0])

XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
       colsample_bynode=None, colsample_bytree=0.8, gamma=0, gpu_id=None,
       importance_type='gain', interaction_constraints=None,
       learning_rate=0.1, max_delta_step=None, max_depth=5,
       min_child_weight=1, missing=nan, monotone_constraints=None,
       n_estimators=509, n_jobs=None, nthread=4, num_parallel_tree=None,
       objective='reg:squarederror', random_state=None, reg_alpha=None,
       reg_lambda=None, scale_pos_weight=1, seed=27, subsample=0.8,
       tree_method=None, use_label_encoder=True, validate_parameters=None,
       verbosity=None)

In [83]:
# Tune tree specific parameters
tree_parameters = { 'max_depth': [3, 5, 10],
                    'min_child_weight': [0.1, 1, 5],
                    'gamma': [0.5, 1, 1.5, 2, 5],
                    'subsample': [0.5, 0.75, 1.0],
                    'colsample_bytree': [0.5, 0.75, 1.0]
                  }

scoring = make_scorer(f1_score)

rs_cv, X_train, X_test, y_train, y_test = find_best_estimator(
    classifier, scoring, X_train, X_test, y_train, y_test, **tree_parameters)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.


KeyboardInterrupt: 

In [38]:
#Tuned randomized search best parameters: {'subsample': 0.75, 'min_child_weight': 0.1, 'max_depth': 3, 'gamma': 1, 'colsample_bytree': 1.0}
# Reduce min_child_weight from 1 to 0.1 reduces underfitting
# Reduce max_depth from 5 to 3 reduces overfitting
# Increase gamma from 0 to 1 requires more improvement for a split
# colsample_bytree from 0.8 to 1 requires all columns
# BEST SUBMISSION FOR XG BOOST - LATER ONES HAVE WORSE PERFORMANCE
classifier = XGBClassifier(
             learning_rate=0.1,
             n_estimators=509,
             max_depth=3,
             min_child_weight=0.1,
             gamma=1,
             subsample=0.75,
             colsample_bytree=1.0,
             objective= 'reg:squarederror',
             nthread=4,
             scale_pos_weight=1,
             seed=27)

In [39]:
# Make submission
classifier.fit(train_vectors, df_train["target"])

sample_submission = pd.read_csv('sample_submission.csv')

sample_submission['target'] = classifier.predict(test_vectors)
sample_submission.head()
sample_submission.to_csv('submission.csv', index=False)

In [22]:
# Tune tree specific parameters
tree_parameters = { 'max_depth': [2, 3, 4],
                    'min_child_weight': [0, 0.1, 0.2],
                    'gamma': [0.8, 1, 1.2],
                    'subsample': [0.7, 0.8, 0.9],
                    'colsample_bytree': [1.0]
                  }

scoring = make_scorer(f1_score)

rs_cv, X_train, X_test, y_train, y_test = find_best_estimator(
    classifier, scoring, X_train, X_test, y_train, y_test, **tree_parameters)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed: 18.8min
[Parallel(n_jobs=5)]: Done  50 out of  50 | elapsed: 23.8min finished


Tuned randomized search best parameters: {'subsample': 0.8, 'min_child_weight': 0.1, 'max_depth': 3, 'gamma': 0.8, 'colsample_bytree': 1.0}
Training Report
              precision    recall  f1-score   support

           0       0.83      0.94      0.88      3711
           1       0.90      0.74      0.81      2760

   micro avg       0.85      0.85      0.85      6471
   macro avg       0.86      0.84      0.84      6471
weighted avg       0.86      0.85      0.85      6471



Testing Report
              precision    recall  f1-score   support

           0       0.76      0.90      0.82       631
           1       0.84      0.66      0.74       511

   micro avg       0.79      0.79      0.79      1142
   macro avg       0.80      0.78      0.78      1142
weighted avg       0.80      0.79      0.79      1142



In [26]:
#Tuned randomized search best parameters: {'subsample': 0.8, 'min_child_weight': 0.1, 'max_depth': 3, 'gamma': 0.8, 'colsample_bytree': 1.0}
classifier = XGBClassifier(
             learning_rate=0.1,
             n_estimators=736,
             max_depth=3,
             min_child_weight=0.1,
             gamma=0.8,
             subsample=0.8,
             colsample_bytree=1.0,
             objective= 'reg:squarederror',
             nthread=4,
             scale_pos_weight=1,
             seed=27)

In [25]:
# Find new optimal number of estimators
xgb_param = classifier.get_xgb_params()
xgtrain = xgboost.DMatrix(X, label=y)
cvresult = xgboost.cv(xgb_param, xgtrain, num_boost_round=classifier.get_params()['n_estimators'], nfold=5,
            metrics='rmse', early_stopping_rounds=50)
classifier.set_params(n_estimators=cvresult.shape[0])

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1.0, gamma=0.8, gpu_id=-1,
       importance_type='gain', interaction_constraints='',
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=0.1, missing=nan, monotone_constraints='()',
       n_estimators=736, n_jobs=4, nthread=4, num_parallel_tree=1,
       objective='reg:squarederror', random_state=27, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=27, subsample=0.8,
       tree_method='exact', use_label_encoder=True, validate_parameters=1,
       verbosity=None)

In [28]:
# Make submission
classifier.fit(train_vectors, df_train["target"])

sample_submission = pd.read_csv('sample_submission.csv')

sample_submission['target'] = classifier.predict(test_vectors)
sample_submission.head()
sample_submission.to_csv('submission.csv', index=False)

In [29]:
# Tune alpha
# Tune tree specific parameters
tree_parameters = { 'alpha': [1e-5, 1e-2, 0.1, 1, 100]
                  }

scoring = make_scorer(f1_score)

rs_cv, X_train, X_test, y_train, y_test = find_best_estimator(
    classifier, scoring, X_train, X_test, y_train, y_test, **tree_parameters)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  25 out of  25 | elapsed: 16.5min finished


Tuned randomized search best parameters: {'alpha': 1e-05}
Training Report
              precision    recall  f1-score   support

           0       0.84      0.94      0.88      3711
           1       0.90      0.75      0.82      2760

   micro avg       0.86      0.86      0.86      6471
   macro avg       0.87      0.84      0.85      6471
weighted avg       0.86      0.86      0.86      6471



Testing Report
              precision    recall  f1-score   support

           0       0.77      0.89      0.82       631
           1       0.83      0.67      0.74       511

   micro avg       0.79      0.79      0.79      1142
   macro avg       0.80      0.78      0.78      1142
weighted avg       0.80      0.79      0.79      1142



In [30]:
# Reduce learning rate and add trees
#Tuned randomized search best parameters: {'subsample': 0.75, 'min_child_weight': 0.1, 'max_depth': 3, 'gamma': 1, 'colsample_bytree': 1.0}
classifier = XGBClassifier(
             learning_rate=0.01,
             n_estimators=2500,
             max_depth=3,
             min_child_weight=0.1,
             gamma=0.8,
             subsample=0.8,
             colsample_bytree=1.0,
             alpha=1e-5,
             objective= 'reg:squarederror',
             nthread=4,
             scale_pos_weight=1,
             seed=27)

In [31]:
# Make submission
classifier.fit(train_vectors, df_train["target"])

sample_submission = pd.read_csv('sample_submission.csv')

sample_submission['target'] = classifier.predict(test_vectors)
sample_submission.head()
sample_submission.to_csv('submission.csv', index=False)