In [83]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from vecstack import stacking
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

In [2]:
train=pd.read_csv("train.csv")
test=pd.read_csv("test.csv")

In [77]:
y = train['target'] 
x = train.drop(["ID_code", "target"], axis=1)

In [79]:
training, valid, ytraining, yvalid = train_test_split(x, y, test_size = 0.25)

In [7]:
param = {
        'bagging_freq': 5,
    'bagging_fraction': 0.335,
    'boost_from_average':'false',
    'boost': 'gbdt',
    'feature_fraction': 0.041,
    'learning_rate': 0.0083,
    'max_depth': -1,
    'metric':'auc',
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 13,
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary', 
    'verbosity': -1
    }
d_train = lgb.Dataset(training, label=ytraining)

In [9]:
model1 = RandomForestClassifier()
model2 = GaussianNB()
mlp = MLPClassifier(max_iter=100)

In [10]:
parameter_space = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}

In [11]:
from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=3)
clf.fit(training, ytraining)



GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=100, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'hidden_layer_sizes': [(50, 50, 50), (50, 100, 50), (100,)], 'activation': ['tanh', 'relu'], 'solver': ['sgd', 'adam'], 'alpha': [0.0001, 0.05], 'learning_rate': ['constant', 'adaptive']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [12]:
# Best paramete set
print('Best parameters found:\n', clf.best_params_)

# All results
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

Best parameters found:
 {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'constant', 'solver': 'sgd'}
0.905 (+/-0.004) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'constant', 'solver': 'sgd'}
0.908 (+/-0.003) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'constant', 'solver': 'adam'}
0.904 (+/-0.001) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'adaptive', 'solver': 'sgd'}
0.907 (+/-0.004) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'adaptive', 'solver': 'adam'}
0.904 (+/-0.003) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 100, 50), 'learning_rate': 'constant', 'solver': 'sgd'}
0.908 (+/-0.000) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 100, 50), 'learning_rate': 'constant', 'solver':

In [13]:
model1.fit(training, ytraining)
model2.fit(training, ytraining)
lgbm = lgb.train(params_tuned, d_train, 100)



In [14]:
preds1 = model1.predict(valid)
preds2 = model2.predict(valid)
preds3 = clf.predict(valid)
preds4 = lgbm.predict(valid)

In [65]:
print(preds3)

[0 0 0 ... 0 0 0]


In [25]:
test1 = test.drop(["ID_code"], axis=1)
test_preds1 = model1.predict(test1)
test_preds2 = model2.predict(test1)
test_preds3 = clf.predict(test1)
test_preds4 = lgbm.predict(test1)

In [60]:
print(test_preds3)

[0 0 0 ... 0 0 0]


In [16]:
stacked_predictions = np.column_stack((preds1, preds2, preds3, preds4))

In [58]:
print(stacked_predictions)

[[0.         0.         0.         0.23776937]
 [0.         0.         0.         0.24218211]
 [0.         0.         0.         0.23642642]
 ...
 [0.         0.         0.         0.24551201]
 [0.         0.         0.         0.2476628 ]
 [0.         0.         0.         0.25669693]]


In [26]:
stacked_test_predictions = np.column_stack((test_preds1, test_preds2, test_preds3, test_preds3))

In [59]:
print(stacked_test_predictions)

[[0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 ...
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]]


In [66]:
meta_model = MLPClassifier()

In [67]:
training1, valid1, ytraining1, yvalid1 = train_test_split(stacked_predictions, yvalid, test_size = 0.25)

In [68]:
meta_model.fit(training1, ytraining1)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [69]:
predictions_validSet = meta_model.predict(valid1)

In [70]:
print(predictions_validSet)

[0 0 0 ... 0 0 1]


In [71]:
ras = roc_auc_score(predictions_validSet, yvalid1)
print(ras)

0.8377923616807947


In [72]:
final_predictions = meta_model.predict(stacked_test_predictions)

In [73]:
final_predictions

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [74]:
print('Saving the Submission File')
sub = pd.DataFrame({"ID_code": test.ID_code.values})
sub["target"] = final_predictions
sub.to_csv('submission_9th_April.csv', index=False)

Saving the Submission File


In [None]:
y2 = train['target'] 
x2 = train.drop(["ID_code", "target"], axis=1)

In [None]:
import warnings
warnings.filterwarnings('ignore')
rs = StratifiedKFold(n_splits=5, random_state=None)

In [None]:
# parameter_space = {
#    'hidden_layer_sizes': [(50, 50, 50), (50, 100, 50), (100,)],
#                                                   'activation': ['tanh', 'relu'],
#                                                   'solver': ['sgd', 'adam'], 'alpha': [0.0001, 0.05], 
#                                                   'learning_rate': ['constant']
# }

In [124]:
classifiers = {'Logistic Regression':LogisticRegression()}
#                'Random Forest Classifier': RandomForestClassifier(),
#                  'Neural Network' : MLPClassifier(),
#               'Gradient Boosting Classifier':GradientBoostingClassifier(),
#                'Adaptive Boosting Classifier':AdaBoostClassifier()}

# log_cols = ["roc-auc_Score"]
# #metrics_cols = []
# log = pd.DataFrame(columns=log_cols)

In [128]:
rs.get_n_splits(training,ytraining)
for Name,classify in classifiers.items():
    for train_index, test_index in rs.split(training,ytraining):
        #print("TRAIN:", train_index, "TEST:", test_index)
        X,X_test = training.iloc[train_index], training.iloc[test_index]
        y,y_test = ytraining.iloc[train_index], ytraining.iloc[test_index]
        cls = classify
        cls =cls.fit(X,y)
        
    y_out = cls.predict(valid)
    Y_out = cls.predict(test1)
    stacked_predictions1 = np.column_stack((y_out))
    stacked_test_predictions1 = np.column_stack((Y_out))
        
    roc_auc = roc_auc_score(y_out,yvalid)
#         log_entry = pd.DataFrame([[roc_auc]], columns=log_cols)
        
#         log = log.append(log_entry)
        
# print(log)
print(stacked_predictions1)
print(stacked_test_predictions1)

[[0 0 0 ... 0 0 0]]
[[0 0 0 ... 0 0 0]]


In [130]:
len(stacked_predictions1)

1

In [116]:
param = {
        'bagging_freq': 5,
    'bagging_fraction': 0.335,
    'boost_from_average':'false',
    'boost': 'gbdt',
    'feature_fraction': 0.041,
    'learning_rate': 0.0083,
    'max_depth': -1,
    'metric':'auc',
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 13,
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary', 
    'verbosity': -1
}

In [117]:
df = train
nfold = 11
target = 'target'
predictors = df.columns.values.tolist()[2:] 

In [119]:
skf = StratifiedKFold(n_splits=nfold, shuffle=False, random_state=2719)

oof = np.zeros(len(df))
predictions = np.zeros(len(df))

i = 1
for train_index, valid_index in skf.split(df, df.target.values):
    print("\nfold {}".format(i))
    xg_train = lgb.Dataset(df.iloc[train_index][predictors].values,
                           label=df.iloc[train_index][target].values,
                           feature_name=predictors)
                        
                           
    xg_valid = lgb.Dataset(df.iloc[valid_index][predictors].values,
                           label=df.iloc[valid_index][target].values,
                           feature_name=predictors)
                           
                             

    nround = 1000000
    clf = lgb.train(param, xg_train, nround, valid_sets = [xg_valid], verbose_eval=5000,early_stopping_rounds = 4000)
    oof[valid_index] = clf.predict(df.iloc[valid_index][predictors].values, num_iteration=nround) 
    
    predictions += clf.predict(df[predictors], num_iteration=nround) / nfold
    i = i + 1

print("\n\nCV AUC: {:<0.4f}".format(metrics.roc_auc_score(df.target.values, oof)))


fold 1
Training until validation scores don't improve for 4000 rounds.
[5000]	valid_0's auc: 0.898056
[10000]	valid_0's auc: 0.900623
[15000]	valid_0's auc: 0.901141
Early stopping, best iteration is:
[14826]	valid_0's auc: 0.901212

fold 2
Training until validation scores don't improve for 4000 rounds.
[5000]	valid_0's auc: 0.896843
[10000]	valid_0's auc: 0.89824
[15000]	valid_0's auc: 0.897862
Early stopping, best iteration is:
[12398]	valid_0's auc: 0.89828

fold 3
Training until validation scores don't improve for 4000 rounds.
[5000]	valid_0's auc: 0.890566
[10000]	valid_0's auc: 0.89323
[15000]	valid_0's auc: 0.892995
Early stopping, best iteration is:
[12375]	valid_0's auc: 0.893433

fold 4
Training until validation scores don't improve for 4000 rounds.
[5000]	valid_0's auc: 0.900415
[10000]	valid_0's auc: 0.90206
[15000]	valid_0's auc: 0.902308
Early stopping, best iteration is:
[13993]	valid_0's auc: 0.902384

fold 5
Training until validation scores don't improve for 4000 roun

NameError: name 'metrics' is not defined

In [120]:
predictions

array([0.01081147, 0.37433181, 0.00676112, ..., 0.06206862, 0.06215014,
       0.00692393])

In [123]:
len(predictions)

200000

In [132]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, random_state=None)
# x is the feature set and y is the target
for train_index, test_index in skf.split(x_train,y_train):
    x_train1, x_test1 = x_train.iloc[train_index], x_train.iloc[test_index] 
    y_train1, y_test1 = y_train.iloc[train_index], y_train.iloc[test_index]
    model1=GaussianNB()
    model1.fit(x_train1, y_train1)
    pred1 = model1.predict_proba(x_test)
#     test_preds1=model1.predict_proba(xt)

In [133]:
pred1

array([[0.99729011, 0.00270989],
       [0.99566634, 0.00433366],
       [0.99230091, 0.00769909],
       ...,
       [0.94644766, 0.05355234],
       [0.94621528, 0.05378472],
       [0.99735127, 0.00264873]])

In [134]:
len(pred1)

39999