In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#Ignore Warnings
# Installs a global 'ignore' filter, so warnings raised by the cells below are not shown.
import warnings
warnings.filterwarnings('ignore')

In [5]:
# explore the algorithm wrapped by RFE
#from sklearn.feature_selection import RFE
#from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectKBest

import xgboost as xgb
from xgboost import XGBClassifier

from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif

#Calculate Accuracy
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.decomposition import PCA
from sklearn import preprocessing

#Stat test
from scipy.stats import f_oneway
from scipy.stats import ttest_ind

from pprint import pprint
import pandas as pd
import seaborn as sns
import numpy as np
import scipy.stats as stats

from matplotlib import pyplot

import matplotlib.pylab as plt
%matplotlib inline

from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 8

#This is similar to pd.DataFrame
import dask.dataframe as dd

#This is an API to call for local Dask Cluster
from dask.distributed import Client, LocalCluster


In [4]:
#Function to reduce memory usage
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the smallest dtype that holds their range.

    Integer columns are narrowed to int8, int16 or int32 when their minimum and
    maximum fit (limits inclusive); otherwise they are left as they are. Float
    columns become float16 when their range fits and float32 otherwise. Columns
    whose dtype is not in ``numerics`` are not touched.

    ``df`` is modified in place and the same object is returned. Casting to
    float16 rounds the stored values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are downcast.
    verbose : bool, default True
        Print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with narrowed column dtypes.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32)  # tried from narrowest to widest
    float16_limits = np.finfo(np.float16)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column whose extreme equals the dtype limit still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NaN min/max (e.g. an all-NaN column) fail both comparisons and fall to float32.
            if float16_limits.min <= c_min and c_max <= float16_limits.max:
                df[col] = df[col].astype(np.float16)
            else:
                df[col] = df[col].astype(np.float32)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

# feature selection with score_func indicated
def select_features(X_train, y_train, score_func):
    """Score every feature of ``X_train`` against ``y_train`` with ``score_func``.

    Parameters
    ----------
    X_train : feature matrix
    y_train : target values
    score_func : callable
        Scoring function handed to ``SelectKBest``.

    Returns
    -------
    tuple
        ``(X_train_fs, fs)``: the transformed training data and the fitted
        ``SelectKBest`` object.
    """
    # k='all' keeps every column; the selector is fitted for its per-feature scores
    selector = SelectKBest(score_func=score_func, k='all').fit(X_train, y_train)
    return selector.transform(X_train), selector
 

In [5]:
# Load the first 10,000 rows of the competition train and test files and
# downcast the numeric columns of each with reduce_mem_usage.
train_head = pd.read_csv("../input/tabular-playground-series-oct-2021/train.csv", nrows=10000)
train_sample = reduce_mem_usage(train_head)
#train = reduce_mem_usage(pd.read_csv("train.csv", nrows=10000))
test_head = pd.read_csv("../input/tabular-playground-series-oct-2021/test.csv", nrows=10000)
test_sample = reduce_mem_usage(test_head)

In [6]:
# Base assumption: the population can be inferred from samples.
# Draw repeated random sub-samples of the training sample and record the mean
# of `target` for each draw: once with 500 rows per draw, once with 100.

n = 1000
# range(1, n) gives n - 1 draws, labelled 1 .. n-1
row = list(range(1, n))

# A single seeded generator makes the draws repeatable while keeping them distinct.
sampling_rng = np.random.RandomState(27)

# Each curve gets its own list of means. Previously both loops appended to the
# same `row`/`means` lists, so curve2 also contained all of the 500-row means.
curve = pd.Series(
    [train_sample.sample(n=500, random_state=sampling_rng).target.mean() for _ in row],
    index=row,
)
curve2 = pd.Series(
    [train_sample.sample(n=100, random_state=sampling_rng).target.mean() for _ in row],
    index=row,
)


In [7]:
# One-sample t-test: are the means of the 500-row sub-samples consistent with
# the mean of `target` over the whole training sample?
pop_mean = train_sample["target"].mean()
stats.ttest_1samp(curve, popmean=pop_mean)

In [8]:
#Null hypothesis of the t-test above: the two means are equal; the alternative is that they differ.
def plot_distribution(inp, n=0):
    """Plot the distribution of ``inp`` with a dashed line and a label at its mean.

    Parameters
    ----------
    inp : pandas.Series
        Values to plot; ``inp.mean()`` is used for the marker and the label.
    n : int, default 0
        Sample size shown in the plot title.

    Returns
    -------
    matplotlib.figure.Figure
        The figure that holds the plot (previously the ``plt.figure`` function
        itself was returned by mistake).
    """
    # displot creates its own figure, so no plt.figure() call is made first;
    # that call only left an extra empty figure in the output.
    grid = sns.displot(inp)
    ax = grid.ax

    mean_value = inp.mean()
    ax.axvline(mean_value, color='k', linestyle='dashed', linewidth=5)
    _, max_ = ax.get_ylim()
    # Place the label slightly to the right of the mean line, near the top of the axes
    ax.text(mean_value + mean_value / 10, max_ - max_ / 10, "Mean: {:.2f}".format(mean_value))
    ax.set_title(str(n) + ' samples')

    return ax.figure

# The trailing semicolons stop the notebook from echoing the returned figure.
plot_distribution(curve, 500);
plot_distribution(curve2, 100);

In [11]:
# Overlay the two sampling distributions of the mean with a dashed line at the
# mean of each. kdeplot + rugplot replace the deprecated distplot(hist=False, rug=True).
fig, ax = plt.subplots(figsize=(20, 10))
for sample_means, colour, label in (
    (curve, 'green', '500-row samples'),
    (curve2, 'orange', '100-row samples'),
):
    # Curve, rug and mean line share one colour so each distribution is easy to follow
    sns.kdeplot(sample_means, color=colour, label=label, ax=ax)
    sns.rugplot(sample_means, color=colour, ax=ax)
    ax.axvline(np.mean(sample_means), color=colour, linestyle='dashed', linewidth=3)
ax.set_xlabel('Mean of target')
ax.legend()
plt.show()

In [9]:
# Drop the two Series of sample means; no later cell in this notebook refers to them.
del curve, curve2

In [10]:
# Mutual-information scoring of the float16 feature columns.
# reduce_mem_usage stores float columns as float16 when their range fits.
# NOTE(review): a float column that was cast to float32 instead would be skipped here.
X = train_sample.drop(columns=['id','target']).select_dtypes(include='float16')
y = train_sample.target


def seeded_mutual_info(features, labels):
    """mutual_info_classif with a fixed random_state, so the scores are repeatable."""
    return mutual_info_classif(features, labels, random_state=27)


# feature selection (the seed keeps the later `score > 0` selection stable between runs)
X_train_fs, fs = select_features(X, y, seeded_mutual_info)

# plot the score of every feature, by column position
pyplot.bar(range(len(fs.scores_)), fs.scores_)
pyplot.xlabel('Feature position')
pyplot.ylabel('Mutual information')
pyplot.show()


In [11]:
# Positions (within X.columns) of the features whose mutual-information score is positive
mi_list = [pos for pos, score in enumerate(fs.scores_) if score > 0]

# The matching columns of the training sample
mi_data = train_sample[X.columns[mi_list]]

In [12]:
# Preview the first rows of the features kept by the mutual-information filter
mi_data.head()

In [14]:
# Chi-squared scoring of the integer-typed feature columns.
# NOTE(review): only int8 and int32 columns are kept; reduce_mem_usage can also
# produce int16 columns, which this filter drops. Confirm that is intended.
feature_frame = train_sample.drop(columns=['id','target'])
chi_train = feature_frame.select_dtypes(include=['int8','int32'])

# Score every kept column against the target (y is set in the mutual-information cell)
X_train_fx, fx = select_features(chi_train, y, chi2)

# Bar chart of the chi-squared score at each feature position
score_positions = list(range(len(fx.scores_)))
pyplot.bar(score_positions, fx.scores_)
pyplot.show()

In [15]:
# Positions (within chi_train.columns) of the features whose chi-squared score exceeds 0.1
chi2_list = [pos for pos, score in enumerate(fx.scores_) if score > 0.1]

# The matching columns of the training sample
chi_data = train_sample[chi_train.columns[chi2_list]]

In [16]:
# Put the chi-squared-selected and mutual-information-selected columns into one frame
new_df = chi_data.join(mi_data)

In [17]:
def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=10, target=None):
    """Fit ``alg`` on ``dtrain[predictors]`` and report its performance on the training data.

    When ``useTrainCV`` is true, ``xgb.cv`` is run first with the estimator's own
    parameters (AUC metric, early stopping) and ``alg``'s ``n_estimators`` is
    replaced by the number of rows in the CV result. The estimator is then
    fitted, training accuracy and AUC are printed, and the booster's feature
    scores are drawn as a bar chart.

    Parameters
    ----------
    alg : XGBClassifier
        Estimator to fit; its parameters are modified in place.
    dtrain : pandas.DataFrame
        Frame that holds the predictor columns.
    predictors : list of str
        Names of the columns of ``dtrain`` used as features.
    useTrainCV : bool, default True
        Run ``xgb.cv`` first to choose ``n_estimators``.
    cv_folds : int, default 5
        Number of folds passed to ``xgb.cv``.
    early_stopping_rounds : int, default 10
        Early-stopping patience passed to ``xgb.cv``.
    target : array-like, optional
        Labels, row-for-row with ``dtrain``. When omitted, the notebook-level
        ``train_sample['target']`` is used, which is what this function always
        read before the parameter existed.
    """
    # One source for the labels, instead of reading the global frame in three places.
    labels = train_sample['target'] if target is None else target
    dtrain_y = np.asarray(labels)

    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain_y)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])

    #Fit the algorithm on the data
    # eval_metric is passed to fit() on purpose: it is not stored on `alg`,
    # and later cells read the "logloss" entry of evals_result().
    # NOTE(review): newer xgboost releases may not accept eval_metric in fit(); confirm the installed version.
    alg.fit(dtrain[predictors], labels, eval_metric='auc')

    #Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]

    #Print model report:
    print ("\nModel Report")
    print ("Accuracy : %.4g" % metrics.accuracy_score(dtrain_y, dtrain_predictions))
    print ("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain_y, dtrain_predprob))

    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    

In [18]:
# Working copy of the selected feature columns
trainb = new_df.copy()

target = 'target'
IDcol = 'id'
# Every column that is neither the label nor the row id
predictors = [col for col in trainb.columns if col not in (target, IDcol)]

# Baseline classifier; modelfit() replaces n_estimators with the value chosen by xgb.cv
baseline_params = dict(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    #objective= 'binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
xgb1 = XGBClassifier(**baseline_params)

modelfit(xgb1, trainb, predictors)


In [20]:
# Hold out 25% of the sample for validation and refit xgb1 with early stopping.
X = train_sample.loc[:,new_df.columns]
# random_state pins the split, so the loss curves and the best tree count are repeatable.
X_train, X_validation, y_train, y_validation = train_test_split(
    X, train_sample.target, test_size=0.25, random_state=27
)
# NOTE(review): newer xgboost releases may expect early_stopping_rounds on the
# constructor rather than in fit(); confirm against the installed version.
xgb1.fit(X_train, y_train,eval_set=[(X_train, y_train), (X_validation, y_validation)], early_stopping_rounds=10)

In [21]:
# Training vs. validation log-loss per boosting round, with the early-stopping optimum marked
results = xgb1.evals_result()

fig, ax = plt.subplots(figsize=(10, 7))
# validation_0 / validation_1 follow the order of eval_set: training split first, then validation
for eval_key, curve_label in (("validation_0", "Training loss"), ("validation_1", "Validation loss")):
    ax.plot(results[eval_key]["logloss"], label=curve_label)
ax.axvline(xgb1.best_ntree_limit, color="gray", label="Optimal tree number")
ax.set_xlabel("Number of trees")
ax.set_ylabel("Loss")
ax.legend()

In [47]:
# Keep the tree count that early stopping selected for the fit above
bntl = xgb1.best_ntree_limit

In [50]:
# Coordinate-descent style tuning, step 1: coarse grid over tree depth and child weight
param_test1 = dict(
    max_depth=range(3, 10, 2),
    min_child_weight=range(1, 8, 2),
)

gsearch1 = GridSearchCV(
    estimator=XGBClassifier(seed=27),
    param_grid=param_test1,
    scoring='roc_auc',
    n_jobs=-1,
    cv=5,
)

In [24]:
# Run the depth / child-weight search on the selected features.
# verbose=2 is passed as an extra fit keyword (presumably forwarded to XGBClassifier.fit; confirm).
gsearch1.fit(trainb[predictors],train_sample[target], verbose=2)


In [None]:
# Best parameter combination and its cross-validated ROC AUC
print(gsearch1.best_params_,gsearch1.best_score_)

In [None]:
#Further tuning of depth and child weight on a finer grid
param_test2 = {
 'max_depth':[2,3,4],
 'min_child_weight':[4,5,6]
}
# Built once: the identical GridSearchCV was previously constructed twice in a row.
gsearch2 = GridSearchCV(estimator = XGBClassifier(seed=27),
 param_grid = param_test2, scoring='roc_auc',n_jobs=-1, cv=5)


In [None]:
# Run the finer depth / child-weight search.
# verbose=2 is passed as an extra fit keyword (presumably forwarded to XGBClassifier.fit; confirm).
gsearch2.fit(trainb[predictors],train_sample[target], verbose=2)

In [None]:
# Best parameter combination found by the finer search
gsearch2.best_params_

In [None]:
# With max_depth fixed at 2, search min_child_weight on its own
param_test2b = dict(min_child_weight=[2, 3, 4, 5])
depth2_model = XGBClassifier(max_depth=2, seed=27)
gsearch2b = GridSearchCV(
    estimator=depth2_model,
    param_grid=param_test2b,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
)

gsearch2b.fit(trainb[predictors], train_sample[target], verbose=2)

In [None]:
# Print both values: as two bare expressions, only the last one (best_score_) was displayed.
print(gsearch2b.best_params_, gsearch2b.best_score_)

In [None]:
# Tune gamma over 0.0 .. 4.9 in steps of 0.1, with depth and child weight fixed.
# NOTE(review): later cells fix gamma=4.9, the largest value in this grid;
# if that was the winner here, a wider range may be worth searching.
gamma_grid = [step / 10.0 for step in range(0, 50)]
param_test3 = {'gamma': gamma_grid}
gamma_base_model = XGBClassifier(max_depth=2, min_child_weight=4, seed=27)
gsearch3 = GridSearchCV(
    estimator=gamma_base_model,
    param_grid=param_test3,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
)

gsearch3.fit(trainb[predictors], train_sample[target], verbose=2)
#gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_

In [None]:
# Best gamma and its cross-validated ROC AUC
print(gsearch3.best_params_, gsearch3.best_score_)

In [None]:
# Coarse search over row and column subsampling: 0.1 .. 0.9 in steps of 0.1 for each
tenths = [step / 10.0 for step in range(1, 10)]
param_test4 = {
    'subsample': list(tenths),
    'colsample_bytree': list(tenths),
}
sampling_base_model = xgb.XGBClassifier(max_depth=2, min_child_weight=4, gamma=4.9, seed=27)
gsearch4 = GridSearchCV(
    estimator=sampling_base_model,
    param_grid=param_test4,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
)
gsearch4.fit(trainb[predictors], train_sample[target], verbose=2)


In [None]:
# Best subsample / colsample_bytree pair and its cross-validated ROC AUC
print(gsearch4.best_params_, gsearch4.best_score_)

In [None]:
# Fine search in steps of 0.01: subsample 0.85 .. 0.94, colsample_bytree 0.05 .. 0.14
subsample_grid = [pct / 100.0 for pct in range(85, 95, 1)]
colsample_grid = [pct / 100.0 for pct in range(5, 15, 1)]
param_test5 = {
    'subsample': subsample_grid,
    'colsample_bytree': colsample_grid,
}

fine_sampling_model = XGBClassifier(max_depth=2, min_child_weight=4, gamma=4.9, seed=27)
gsearch5 = GridSearchCV(
    estimator=fine_sampling_model,
    param_grid=param_test5,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
)

gsearch5.fit(trainb[predictors], train_sample[target], verbose=2)


In [None]:
# Best fine-grained subsample / colsample_bytree pair and its cross-validated ROC AUC
print(gsearch5.best_params_, gsearch5.best_score_)

In [None]:
# Coarse search over the L1 regularisation strength (reg_alpha), across several orders of magnitude
param_test6 = dict(reg_alpha=[1e-5, 1e-2, 0.1, 1, 100])

alpha_base_model = XGBClassifier(
    max_depth=2,
    min_child_weight=4,
    gamma=4.9,
    colsample_bytree=0.14,
    subsample=0.88,
    seed=27,
)
gsearch6 = GridSearchCV(
    estimator=alpha_base_model,
    param_grid=param_test6,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
)

gsearch6.fit(trainb[predictors], train_sample[target], verbose=2)


In [None]:
# Best coarse reg_alpha and its cross-validated ROC AUC
print(gsearch6.best_params_, gsearch6.best_score_)

In [None]:
# Finer reg_alpha search over small values (0 .. 0.05)
param_test7 = dict(reg_alpha=[0, 0.001, 0.005, 0.01, 0.05])

fine_alpha_model = XGBClassifier(
    max_depth=2,
    min_child_weight=4,
    gamma=4.9,
    colsample_bytree=0.14,
    subsample=0.88,
    seed=27,
)
gsearch7 = GridSearchCV(
    estimator=fine_alpha_model,
    param_grid=param_test7,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
)

gsearch7.fit(trainb[predictors], train_sample[target], verbose=2)

In [None]:
# Best fine-grained reg_alpha and its cross-validated ROC AUC
print(gsearch7.best_params_, gsearch7.best_score_)

In [None]:
#Tuning lambda (reg_lambda) reduced accuracy, so this search is left disabled
#param_test8 = {
# 'reg_lambda':[0.1, 1.0, 5.0, 10.0, 50.0, 100.0]
#}

#gsearch8 = GridSearchCV(estimator = XGBClassifier(max_depth=2,
# min_child_weight=4, gamma=4.9, colsample_bytree=0.95,
# subsample=0.26,reg_alpha= 0.05,seed=27), 
# param_grid = param_test8, scoring='roc_auc',n_jobs=4, cv=5)

#gsearch8.fit(trainb[predictors],train_sample[target], verbose=2)

In [22]:
# Final classifier: low learning rate with up to 1000 trees, early-stopped on the validation split.
# NOTE(review): colsample_bytree=0.95 / subsample=0.26 differ from the 0.14 / 0.88 used in the
# reg_alpha searches above and lie outside the param_test5 grid. Confirm which values are intended.
final_params = dict(
    learning_rate=0.01,
    n_estimators=1000,
    max_depth=2,
    min_child_weight=4,
    gamma=4.9,
    colsample_bytree=0.95,
    subsample=0.26,
    reg_alpha=0.05,
    seed=27,
)
xgb1 = XGBClassifier(**final_params)

xgb1.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_validation, y_validation)],
    early_stopping_rounds=10,
)
results = xgb1.evals_result()

# Training vs. validation log-loss per boosting round, with the early-stopping optimum marked
fig, ax = plt.subplots(figsize=(10, 7))
for eval_key, curve_label in (("validation_0", "Training loss"), ("validation_1", "Validation loss")):
    ax.plot(results[eval_key]["logloss"], label=curve_label)
ax.axvline(xgb1.best_ntree_limit, color="gray", label="Optimal tree number")
ax.set_xlabel("Number of trees")
ax.set_ylabel("Loss")
ax.legend()


In [25]:
# Start a local Dask cluster with two workers and connect a client to it
cluster = LocalCluster(n_workers = 2)
client = Client(cluster)

# Read the full training CSV as a Dask DataFrame (the blocksize override is left commented out)
train_dask = dd.read_csv('../input/tabular-playground-series-oct-2021/train.csv')#,blocksize=64e6)


In [26]:
# persist() returns a new collection backed by the computed data; it does not modify
# train_dask in place, so the result has to be kept (it was previously discarded).
train_dask = train_dask.persist()

# Same feature columns as the tuned model, plus the label
X = train_dask.loc[:,new_df.columns]
y = train_dask['target']

dtrain = xgb.dask.DaskDMatrix(client,X,y)

In [31]:
params = {
 # The native training API does not set a classification objective on its own
 # (XGBClassifier does), so it is stated here together with the metric to report.
 'objective' : 'binary:logistic',
 'eval_metric' : 'auc',
 'learning_rate' : 0.01,
 'max_depth': 2,
 'min_child_weight' : 4,
 'gamma' : 4.9,
 'colsample_bytree' : 0.95,
 'subsample' : 0.26,
 'reg_alpha': 0.05,
 'nthread' : 4
}

# train the model
#%%time
# The second element of each evals tuple is only a display name, not a metric, so the
# same matrix is now listed once as 'train' instead of twice as 'accuracy' and 'auc'.
# NOTE(review): early stopping monitors the training data because no validation
# DaskDMatrix is built; consider holding out a validation split.
output = xgb.dask.train(
    client, params, dtrain, num_boost_round = 1000,
    evals=[(dtrain, 'train')], early_stopping_rounds = 10
)

booster = output['booster']  # booster is the trained model
history = output['history']  # A dictionary containing the evaluation results


In [7]:
# save
#xgb1.save_model("my_xgboost.json")

# load
# Nothing in this notebook writes this file (the save above is commented out), so the
# load is guarded: a fresh "Restart & Run All" no longer fails here when the file is absent.
# NOTE(review): the path treats "submission.csv" as a directory; confirm it is correct.
model_path = "./submission.csv/my_xgboost.json"
new_xgb = xgb.XGBClassifier()
if os.path.isfile(model_path):
    new_xgb.load_model(model_path)
else:
    print("Saved model not found at {}; skipping load".format(model_path))

# check optimal number of trees of loaded model
#new_xgb.best_ntree_limit


In [34]:
test_dask = dd.read_csv('../input/tabular-playground-series-oct-2021/test.csv')
# persist() returns a new collection; keep it (the result was previously discarded)
test_dask = test_dask.persist()

# Row ids for the submission, and the same feature columns the model was trained on
test_id = test_dask['id']
test = test_dask.loc[:,new_df.columns]


In [2]:
# Predict with the Dask-trained model. The Dask API lives on the xgboost module (xgb.dask),
# not on the XGBClassifier instance xgb1, which has no `dask` attribute.
preds = xgb.dask.predict(client, output, test)

# Materialise predictions and ids as plain arrays before building the frame, so the
# rows pair up by position rather than by the per-partition indexes.
y_test = np.asarray(preds.compute())
submission = pd.DataFrame({'id': np.asarray(test_id.compute()), 'target': y_test})
submission.to_csv('submission.csv', index=False)

In [36]:
# Close the client, then the LocalCluster created earlier, so its workers do not keep running
client.close()
cluster.close()