In [None]:
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
from sklearn.linear_model import SGDRegressor
from matplotlib import pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, scale, RobustScaler
import gc
from skimage.restoration import denoise_wavelet
import scipy.signal as signal
import scipy.stats as stats
import time
import itertools
from sklearn.preprocessing import PolynomialFeatures

# Use the fully-qualified option names: on pandas >= 1.4 the short patterns
# "max_columns" / "max_rows" also match the "styler.render.*" options and
# pd.set_option raises OptionError ("Pattern matched multiple keys").
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 200)
gc.enable()

In [None]:
def mat_cor(y_true, y_pred):
    """Matthews correlation coefficient for hard binary labels.

    Parameters
    ----------
    y_true, y_pred : array-like
        Equal-length sequences. Only entries exactly equal to 1 (positive) or
        0 (negative) are counted; any other value falls in no cell of the
        confusion matrix.

    Returns
    -------
    float
        (tp*tn - fp*fn) / sqrt((tp+fp)(tp+fn)(tn+fp)(tn+fn)). A tiny epsilon is
        added to the denominator so a degenerate confusion matrix gives 0.0
        instead of a division by zero.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    assert y_true.shape[0] == y_pred.shape[0]

    # Counts are converted to Python floats: the denominator multiplies four
    # marginals, which silently wraps around in int64 once the inputs reach
    # the order of 1e5 rows.
    tp = float(np.sum((y_true == 1) & (y_pred == 1)))
    tn = float(np.sum((y_true == 0) & (y_pred == 0)))
    fp = float(np.sum((y_true == 0) & (y_pred == 1)))
    fn = float(np.sum((y_true == 1) & (y_pred == 0)))

    numerator = tp * tn - fp * fn
    denominator = ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) ** .5

    return numerator / (denominator + 1e-15)

In [None]:
def read_metadata():
    """Read the train and test metadata CSVs from ../input.

    Returns
    -------
    tuple
        (train_metadata, test_metadata) as pandas DataFrames, in that order.
    """
    metadata_paths = ('../input/metadata_train.csv', '../input/metadata_test.csv')
    return tuple(pd.read_csv(path) for path in metadata_paths)

def resample_train():
    """Build an undersampled, shuffled set of training labels and save it.

    Reads ../input/train.parquet, transposes it, attaches the 'target' column
    from the train metadata, keeps every target == 1 row plus 1777 randomly
    chosen target == 0 rows, shuffles the result and writes the 'target' column
    to train_us_target.csv.

    Side effects: reseeds NumPy's global RNG with 311 and writes a CSV file.

    Returns
    -------
    pandas.Index
        Index labels of the selected rows, in shuffled order.
    """
    signals = pq.read_pandas('../input/train.parquet').to_pandas().transpose()
    labels = read_metadata()[0]['target'].values
    # assumes the transposed frame has one row per metadata row -- TODO confirm
    signals['target'] = labels

    negative_ids = signals[signals.target == 0].index
    np.random.seed(311)
    sampled_negative_ids = np.random.choice(negative_ids, 1777, replace=False)

    negatives = signals.loc[sampled_negative_ids][['target']]
    positives = signals[labels == 1][['target']]
    balanced = pd.concat([negatives, positives]).sample(frac=1.0, random_state=311)

    balanced.to_csv('train_us_target.csv', index=False)
    return balanced.index


In [None]:
def chunk_limits(start, end, size=1000):
    """Split the half-open range [start, end) into consecutive (lo, hi) pairs.

    Every pair spans `size` positions except possibly the last one, which is
    clipped to `end`. An empty range yields an empty list, and a range whose
    length is an exact multiple of `size` gets no zero-length trailing pair.
    """
    return [(lo, min(lo + size, end)) for lo in range(start, end, size)]


# Chunk boundaries in steps of 1000 over 0..8712 and 8712..29049
# (presumably the train and test signal ranges -- TODO confirm).
tr_limits = chunk_limits(0, 8712)
ts_limits = chunk_limits(8712, 29049)

In [None]:
# Load both metadata tables once; later cells read the labels from
# train_metadata['target'] and the ids from test_metadata['signal_id'].
train_metadata, test_metadata = read_metadata()
# resampling_index = resample_train()

In [None]:
# Columns removed from the v2 feature files right after loading.
dropped_v2_columns = ['std_peak_width', 'skew_peak_width', 'std_peak_prom', 'skew_peak_prom']

# Train feature sets. The v3 / v4 files ('final_v3_tr.csv', 'final_v4_tr.csv')
# are not loaded in this version of the notebook.
tr_v1 = pd.read_csv('final_v1_tr.csv')
tr_v2 = pd.read_csv('final_v2_trs.csv').drop(columns=dropped_v2_columns)

# Test feature sets ('final_v3_ts.csv' / 'final_v4_ts.csv' likewise not loaded).
ts_v1 = pd.read_csv('final_v1_ts.csv')
ts_v2 = pd.read_csv('final_v2_tss.csv').drop(columns=dropped_v2_columns)

In [None]:
# Features kept from every feature set.
feat_names = ['h_bt_8_dist_bt_5', 'h_bt_8_dist_bt_1000', 'height_more_15', 'height_more_8', 'h_bt_3_w_bt_5', 
              'threshold_more_16', 'h_bt_5_dist_bt_5', 'h_bt_8_dist_bt_11111', 'sg_skew', 
              'h_bt_5_dist_bt_11111', 'h_bt_8_dist_bt_7', 'max_peak_prom', 'sg_min', 'h_bt_3_dist_bt_11111', 
              'sg_mean', 'sg_std', 'h_bt_5_dist_bt_75', 'h_bt_3_dist_bt_1000', 'h_bt_3_dist_bt_75', 
              'min_peak_prom', 'max_peak_width', 'h_bt_3_dist_bt_7', 'sg_max', 'h_bt_3_w_bt_10', 
              'height_more_3', 'h_bt_3_dist_bt_25', 'h_bt_5_dist_bt_111', 'min_peak_width', 
              'threshold_more_3', 'height_more_5', 'h_bt_5_dist_bt_1000', 'threshold_more_10', 
              'h_bt_5_dist_bt_25', 'h_bt_10_dist_bt_1000']

# Element-wise v1 - v2 difference, taken on the full frames before any column
# is selected or renamed.
tr_diff = tr_v1 - tr_v2
ts_diff = ts_v1 - ts_v2

# v1 keeps the bare feature names; v2 and the difference set get a prefix so
# the three sets can sit side by side without column-name clashes.
# NOTE: this cell rebinds tr_v1 / tr_v2 / ts_v1 / ts_v2, so running it a second
# time without re-running the load cell fails (the v2 columns are already renamed).
tr_v1, ts_v1 = tr_v1[feat_names], ts_v1[feat_names]
tr_v2 = tr_v2[feat_names].add_prefix('ng-df_')
ts_v2 = ts_v2[feat_names].add_prefix('ng-df_')
tr_v3 = tr_diff[feat_names].add_prefix('sub-df_')
ts_v3 = ts_diff[feat_names].add_prefix('sub-df_')

In [None]:
# Quick look at the first rows of the v1 - v2 difference features.
tr_v3.head()

In [None]:
# Feature sets that go into the final design matrices (a v4 set is not used).
tr_list = [tr_v1, tr_v2, tr_v3]
ts_list = [ts_v1, ts_v2, ts_v3]

# One shape per line: the three train frames first, then the three test frames.
print(*(frame.shape for frame in tr_list + ts_list), sep='\n')

In [None]:
# Column-wise concatenation: the frames are aligned on their row index and
# placed side by side to form the train / test design matrices.
tr = pd.concat(tr_list, axis=1)
ts = pd.concat(ts_list, axis=1)

In [None]:
# poly_feat = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
# tr_poly = poly_feat.fit_transform(tr)
# ts_poly = poly_feat.fit_transform(ts)
# tr_poly = pd.DataFrame(tr_poly)
# ts_poly = pd.DataFrame(ts_poly)
# print(tr_poly.shape, ts_poly.shape)

# tr_sub = tr_v1 - tr_v2
# ts_sub = ts_v1 - ts_v2   
# print(tr_sub.shape, ts_sub.shape)

# tr = pd.concat([tr_poly, tr_sub], axis=1)
# ts = pd.concat([ts_poly, ts_sub], axis=1)
# print(tr.shape, ts.shape)
# del tr_poly, ts_poly, tr_sub, ts_sub

In [None]:
def evalerror(preds, dtrain, threshold=0.5):
    """LightGBM custom eval function reporting the Matthews correlation.

    Parameters
    ----------
    preds : numpy array
        Scores handed over by LightGBM for the rows of `dtrain`.
    dtrain : lightgbm.Dataset
        Dataset being evaluated; its labels are read with get_label().
    threshold : float, default 0.5
        Scores strictly above this value are counted as class 1.

    Returns
    -------
    tuple
        ('matthews', value, True) -- metric name, metric value and the
        "higher is better" flag expected from a LightGBM feval.
    """
    labels = dtrain.get_label()
    # mat_cor counts exact 0/1 values, so the continuous scores have to be
    # binarised first; passing them through unchanged leaves almost every
    # prediction outside the confusion matrix.
    hard_preds = np.where(preds > threshold, 1, 0)
    return ('matthews', mat_cor(labels, hard_preds), True)

train_data = lgb.Dataset(tr, label=train_metadata['target'])
params={'learning_rate': 0.1, 'objective':'binary', 'metric':'None', 
        'num_leaves': 777, 'verbose': 1, 'seed':311, 'max_depth': 11,
        'bagging_fraction': 0.7, 'feature_fraction': 1.0, 
        'feature_fraction_seed': 311, 'min_data_in_leaf': 33, 
        'is_unbalance': True}#, 'lambda_l1': 500, 'histogram_pool_size': 6000}
num_round = 15000

# NOTE(review): no 'bagging_freq' is set; per the LightGBM parameter docs
# bagging_fraction only takes effect together with a non-zero bagging_freq --
# confirm whether row subsampling was intended.
# NOTE(review): no valid_sets are passed to lgb.train, so `feval` does not
# appear to be evaluated during training -- confirm.

# Average of several LightGBM models that differ in seed, learning rate and
# tree size. A previous stand-alone 15000-round fit was removed: both its model
# and its prediction were overwritten by this loop before being used.
n_light_models = 3
light_pred = np.zeros(ts.shape[0])
for i in range(1, n_light_models + 1):
    params['bagging_fraction'] = 1.0 - (i/10)
    params['seed'] = i*110
    params['learning_rate'] = 0.03 * i + 0.01
    params['max_depth'] = 9 + i*2
    params['num_leaves'] = 553 + 100*i
    # `light` keeps the last fitted booster; later cells use it for feature
    # importances and the in-sample check.
    light = lgb.train(params, train_data, num_round, feval=evalerror)
    light_pred += light.predict(ts)
light_pred /= n_light_models

In [None]:
# Distribution of the averaged LightGBM scores, plus the class counts obtained
# with the 0.51 cut-off.
fig, ax = plt.subplots(figsize=(10, 12))
ax.hist(light_pred, bins=100);
light_labels = (light_pred > 0.51).astype(int)
print(pd.Series(light_labels.ravel()).value_counts())

In [None]:
from catboost import CatBoostClassifier, Pool

# CatBoost with default settings apart from the fixed seed.
cat = CatBoostClassifier(random_seed=77)

train_pool = Pool(tr, label=train_metadata['target'])
cat.fit(train_pool)

# Column 1 of predict_proba is kept as the class-1 score.
test_pool = Pool(ts)
cat_pred = cat.predict_proba(test_pool)[:, 1]

In [None]:
# Distribution of the CatBoost scores, plus the class counts obtained with the
# 0.51 cut-off.
fig, ax = plt.subplots(figsize=(10, 12))
ax.hist(cat_pred, bins=100);
cat_labels = (cat_pred > 0.51).astype(int)
print(pd.Series(cat_labels.ravel()).value_counts())

In [None]:
import xgboost

dtrain = xgboost.DMatrix(tr, label=train_metadata['target'])
dtest = xgboost.DMatrix(ts)

# Base configuration; subsample, seed, learning_rate and min_child_weight are
# overwritten for every model in the loop below.
params = dict(nthread=4, seed=3, subsample=0.9, reg_lambda=11, reg_alpha=11,
              learning_rate=0.15, gamma=0, colsample_bytree=0.8,
              colsample_bylevel=0.9, max_depth=50, objective="binary:logistic",
              min_child_weight=33)
num_round = 3000

# Average of three boosters that differ in the four per-model settings.
xgb_pred = np.zeros(ts.shape[0])
for i in (1, 2, 3):
    params.update(subsample=1.0 - i/10,
                  seed=i*19,
                  learning_rate=0.005 + i/1000,
                  min_child_weight=11*i)
    xgb = xgboost.train(params, dtrain, num_round)
    xgb_pred += xgb.predict(dtest)
xgb_pred /= 3


In [None]:
# Distribution of the averaged XGBoost scores, plus the class counts obtained
# with the 0.51 cut-off.
fig, ax = plt.subplots(figsize=(10, 12))
ax.hist(xgb_pred, bins=100);
xgb_labels = (xgb_pred > 0.51).astype(int)
print(pd.Series(xgb_labels.ravel()).value_counts())

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier

# Standardisation is fitted on the train features only; the same fitted
# `scaler` is reused by the neural-network cell further down.
scaler = StandardScaler()
scaler.fit(tr)

knn = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='ball_tree', leaf_size=30, 
                           p=2, metric='minkowski', metric_params=None, n_jobs=None)

# The wrapped estimator is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# later removed, while the first positional slot works on both.
knn_bg = BaggingClassifier(knn, n_estimators=3, max_samples=0.8, 
                      max_features=1.0, bootstrap=True, bootstrap_features=False, 
                      oob_score=False, warm_start=False, n_jobs=1, 
                      random_state=7, verbose=3)

knn_bg.fit(scaler.transform(tr), train_metadata['target'])
# Column 1 of predict_proba is kept as the class-1 score; classes_ is printed
# so the column order can be checked.
knn_pred = knn_bg.predict_proba(scaler.transform(ts))[:,1]
print(knn_bg.classes_)
knn_pred.shape

In [None]:
# Distribution of the bagged KNN scores, plus the class counts obtained with
# the 0.51 cut-off.
fig, ax = plt.subplots(figsize=(10, 12))
ax.hist(knn_pred, bins=100);
knn_labels = (knn_pred > 0.51).astype(int)
print(pd.Series(knn_labels.ravel()).value_counts())

In [None]:
from sklearn.neural_network import MLPClassifier

# Three hidden layers of 146 units; batch_size, learning_rate_init and alpha
# are overwritten for every model in the loop below.
nn = MLPClassifier(
    hidden_layer_sizes=(146, 146, 146), activation="relu", solver="adam", alpha=1e-7,
    batch_size=128, learning_rate="constant", learning_rate_init=0.001, power_t=0.5,
    max_iter=5000, shuffle=True, random_state=2, tol=0.0001, verbose=False,
    warm_start=False, momentum=0.9, nesterovs_momentum=True, early_stopping=False,
    validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08, n_iter_no_change=10,
)

# The scaler fitted in the KNN cell is applied once up front instead of inside
# every loop iteration.
tr_scaled = scaler.transform(tr)
ts_scaled = scaler.transform(ts)

# Average of three networks; the running sum is printed after every fit.
nn_pred = np.zeros(ts.shape[0])
for i in range(1, 4):
    nn.set_params(batch_size=108 + i*10,
                  learning_rate_init=0.0005 + i/2000,
                  alpha=i * 1e-7)
    nn.fit(tr_scaled, train_metadata['target'])
    nn_pred += nn.predict_proba(ts_scaled)[:, 1]
    print(nn_pred, nn_pred.shape)
nn_pred /= 3
print(nn.classes_)
nn_pred

In [None]:
# Distribution of the averaged MLP scores, plus the class counts obtained with
# the 0.51 cut-off.
fig, ax = plt.subplots(figsize=(10, 12))
ax.hist(nn_pred, bins=100);
nn_labels = (nn_pred > 0.51).astype(int)
print(pd.Series(nn_labels.ravel()).value_counts())

In [None]:
from sklearn.linear_model import LogisticRegression

# `n_jobs` and `multi_class` are intentionally not passed: the liblinear
# solver ignores n_jobs (scikit-learn only emits a warning for it), and
# multi_class='ovr' is deprecated in recent scikit-learn releases and makes no
# difference for a binary target.
lr = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, 
                        class_weight=None, random_state=11, solver='liblinear', max_iter=1000, 
                        verbose=0, warm_start=False)

# Average of three fits that differ in random_state and regularisation
# strength C. Note that the unscaled `tr` / `ts` frames are used here.
lr_pred = np.zeros(ts.shape[0])
for i in range(1,4):
    lr.set_params(random_state = 11 + i)
    lr.set_params(C = 1.0 * i)
    lr.fit(tr, train_metadata['target'])
    lr_pred += lr.predict_proba(ts)[:,1]
lr_pred /= 3
lr.classes_

In [None]:
# Distribution of the averaged logistic-regression scores, plus the class
# counts obtained with the 0.51 cut-off.
fig, ax = plt.subplots(figsize=(10, 12))
ax.hist(lr_pred, bins=100);
lr_labels = (lr_pred > 0.51).astype(int)
print(pd.Series(lr_labels.ravel()).value_counts())

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()

# The wrapped estimator is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# later removed, while the first positional slot works on both.
gnb_bg = BaggingClassifier(gnb, n_estimators=3, max_samples=0.8, 
                      max_features=1.0, bootstrap=True, bootstrap_features=False, 
                      oob_score=False, warm_start=False, n_jobs=1, 
                      random_state=7, verbose=3)

gnb_bg.fit(tr, train_metadata['target'])
# Column 1 of predict_proba is kept as the class-1 score; classes_ is shown so
# the column order can be checked.
gnb_pred = gnb_bg.predict_proba(ts)[:,1]
gnb_bg.classes_

In [None]:
# Distribution of the bagged naive-Bayes scores, plus the class counts obtained
# with the 0.51 cut-off.
fig, ax = plt.subplots(figsize=(10, 12))
ax.hist(gnb_pred, bins=100);
gnb_labels = (gnb_pred > 0.51).astype(int)
print(pd.Series(gnb_labels.ravel()).value_counts())

In [None]:
# Weighted average of the per-model scores; LightGBM counts three times, every
# other model once.
preds = [light_pred, cat_pred, xgb_pred, knn_pred, nn_pred, lr_pred, gnb_pred]
weights = [3, 1, 1, 1, 1, 1, 1]

pred = np.zeros(ts.shape[0])
for weight, model_pred in zip(weights, preds):
    pred += weight * model_pred
pred = pred / np.sum(weights)

In [None]:
# Distribution of the blended scores, plus the class counts obtained with the
# 0.51 cut-off.
fig, ax = plt.subplots(figsize=(10,17))
ax.hist(pred, bins=100);
blend_labels = (pred > 0.51).astype(int)
print(pd.Series(blend_labels.ravel()).value_counts())

In [None]:
# Binarise the blended score with the same 0.51 cut-off used in the histogram
# cells. `pred` is overwritten, so the continuous scores are no longer
# available after this cell.
pred = np.where(pred > 0.51, 1, 0)

In [None]:
# Two-column submission: the test signal ids next to the 0/1 predictions,
# written without the index column.
submission = test_metadata[['signal_id']].assign(target=pred)

submission.to_csv('submission.csv', index=False)

In [None]:
def plot_feat_importances(feature_names, fi, figsize=(12,8), color="blue"):
    """Draw a horizontal bar chart of the largest feature importances.

    Parameters
    ----------
    feature_names : sequence
        Labels, one per entry of `fi`.
    fi : sequence of numbers
        Importance values.
    figsize : tuple, default (12, 8)
        Size of the created figure.
    color : str, default "blue"
        Bar colour.

    At most the 100 largest importances are plotted, in descending order, and
    the names of the first 25 of them are printed. Returns None.
    """
    ranked = pd.Series(fi, index=feature_names).sort_values(ascending=False)
    top_features = ranked.iloc[:100]

    fig, ax = plt.subplots(figsize=figsize)
    sns.barplot(x=top_features, y=top_features.index, color=color, ax=ax);
    ax.set_xlabel('Feature Importance')
    ax.set_ylabel('Feature')

    print(top_features.head(25).index)
# Importances come from `light`, i.e. the last LightGBM booster fitted above;
# the names are taken from `ts`, which is expected to share its column order
# with the training frame `tr`.
feature_importances = light.feature_importance()
feature_names = ts.columns.values
plot_feat_importances(feature_names, feature_importances, figsize=(12, 30))

In [None]:
# In-sample sanity check: `val` is a bootstrap sample (70% of the rows, drawn
# with replacement) of the same frame `light` was trained on, so the score is
# optimistic and not a hold-out estimate. The cut-off used here is 0.8, not
# the 0.51 applied to the submission.
val = (
    tr.assign(target=train_metadata['target'])
      .sample(frac=0.7, replace=True, random_state=2)
)
tr_scores = light.predict(val.drop('target', axis=1))
tr_pred = np.where(tr_scores > 0.8, 1, 0)
mat_cor(val['target'], tr_pred)

In [None]:
# pq.read_metadata('../input/test.parquet')

## 