# Pipeline Template

In [None]:
# Enable IPython's autoreload extension. Mode 1 reloads only the modules
# registered with %aimport (util.data, in the imports cell) before each execution.
%load_ext autoreload 
%autoreload 1

In [None]:
import numpy as np
import pandas as pd

from sklearn.metrics import r2_score
from sklearn.svm import SVR
from sklearn.preprocessing import normalize
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, BaggingRegressor, AdaBoostRegressor
from sklearn.model_selection import cross_val_score
from matplotlib import pyplot as plt
from sklearn.neural_network import MLPRegressor
from sklearn.decomposition import PCA

from fancyimpute import KNN, NuclearNormMinimization, SoftImpute, IterativeImputer, BiScaler, MatrixFactorization

%aimport util.data

### Reading Dataset

In [None]:
# Load the training and test tables through the project helper util.data.load().
# NOTE(review): later cells read columns "id" and "y" from train_data — confirm against util.data.
train_data, test_data = util.data.load()

# Preprocessing

## Fill Missing Values (Remove NaNs)

Run exactly one of the following imputation cells to fill the missing values; each of them (re)defines `X`.

In [None]:
# Separate the training table into the raw feature matrix (NaNs still present)
# and the regression target.
feature_frame = train_data.drop(["id", "y"], axis=1)
X_incomplete = feature_frame.values
y = train_data.loc[:, "y"].values

#### k-nearest neighbors

In [None]:
# KNN imputation: fill each row's missing features from the k nearest rows
# that have those features observed.
k = 6
knn_imputer = KNN(k=k)
X = knn_imputer.fit_transform(X_incomplete)

#### Singular value thresholding (SoftImpute)

In [None]:
# Rather than solving the nuclear norm objective directly,
# induce sparsity using singular value thresholding (SoftImpute).
# The matrix is rescaled with BiScaler first and never transformed back,
# so the resulting X lives in the BiScaler-normalized scale.
bi_scaler = BiScaler()
X_incomplete_normalized = bi_scaler.fit_transform(X_incomplete)
soft_imputer = SoftImpute()
X = soft_imputer.fit_transform(X_incomplete_normalized)

#### Matrix factorization

In [None]:
# Impute the missing entries with fancyimpute's MatrixFactorization
# (rank 40, learning rate 0.001) applied to the raw, un-normalized matrix.
X = MatrixFactorization(learning_rate= 0.001, rank=40).fit_transform(X_incomplete)

#### mean

In [None]:
# Mean imputation: every NaN is replaced by the mean of its own column,
# then the identifier and target columns are dropped to obtain X.
train_mean_values = train_data.mean()
train_data_mean = train_data.fillna(value=train_mean_values)
mean_filled_features = train_data_mean.drop(["id", "y"], axis=1)
X = mean_filled_features.values

## Data Normalization / Standardization

Run one of the following cells to normalize the data. They expect `X` to contain no NaNs, so run an imputation cell first.

[Compare the effect of different scalers on data with outliers](http://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html#sphx-glr-auto-examples-preprocessing-plot-all-scaling-py)

#### L1 - Normalization 

In [None]:
# Rescale each feature column (axis=0) to unit L1 norm. Overwrites X in place.
X = normalize(X, axis=0, norm='l1')

#### L2 - Normalization 

In [None]:
# Rescale each feature column (axis=0) to unit L2 norm. Overwrites X in place.
X = normalize(X, axis=0, norm='l2')

#### Standard Scaler
Removes the mean and divides by the standard deviation of each feature (**sensitive to outliers**, so probably not a good idea for this data).

In [None]:
# Standardize every feature column with StandardScaler's default settings. Overwrites X in place.
X = StandardScaler().fit_transform(X)

#### Robust Scaler
Scale features using statistics that are robust to outliers. [scikit doc](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html#sklearn.preprocessing.RobustScaler)



In [None]:
# Robust scaling: RobustScaler with the 10th-90th percentile range as the scale
# (instead of the default interquartile range). Overwrites X in place.
X = RobustScaler(quantile_range=(10, 90)).fit_transform(X)

## Outlier Detection and Removal

## Feature Selection

In [None]:
# Option A: random forest as the ranking model for feature selection.
# random_state is fixed so the selection is reproducible across kernel restarts.
estimator = RandomForestRegressor(n_jobs=-1, n_estimators=50, random_state=0)

In [None]:
# Option B: extremely randomized trees as the ranking model for feature selection.
# random_state is fixed so the selection is reproducible across kernel restarts.
estimator = ExtraTreesRegressor(n_jobs=-1, n_estimators=50, random_state=0)

In [None]:
# Feature selection by Recursive Feature Elimination with Cross-Validation (RFECV).
# Uses whichever `estimator` was defined last in the cells above and the
# current imputed/scaled `X` with target `y`.

n_rm_features_per_iteration = 4  # features eliminated per RFE step
cv_k = 3                         # number of cross-validation folds

rfecv = RFECV(
    estimator,
    step=n_rm_features_per_iteration,
    cv=cv_k,
    scoring='r2',
    verbose=1,
)

rfecv.fit(X, y)


In [None]:
# Report the RFECV result and plot the mean CV score for every feature-subset size.

# `grid_scores_` was deprecated in scikit-learn 1.0 and later removed in favour of
# `cv_results_`; read whichever attribute this installation provides.
cv_results = getattr(rfecv, "cv_results_", None)
if cv_results is not None:
    mean_cv_scores = np.asarray(cv_results["mean_test_score"])
else:
    mean_cv_scores = np.asarray(rfecv.grid_scores_)

print("Optimal number of features : %d" % rfecv.n_features_) #47
print(f"Validation Score: {mean_cv_scores.max()}")

# Subset size belonging to each score. Scores are ordered from the smallest subset
# to the full feature set. RFE starts from all features and removes `step` per
# round, with the final round clipped so it lands on the minimum subset size, so
# the sizes are NOT 1, 1 + step, 1 + 2 * step, ...
n_features_tested = None if cv_results is None else cv_results.get("n_features")
if n_features_tested is None:
    n_total_features = len(rfecv.support_)
    min_features = getattr(rfecv, "min_features_to_select", 1)
    n_subsets = len(mean_cv_scores)
    n_features_tested = [
        max(n_total_features - (n_subsets - 1 - i) * n_rm_features_per_iteration, min_features)
        for i in range(n_subsets)
    ]

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (R^2)")
plt.plot(n_features_tested, mean_cv_scores)
plt.show()

# Map the boolean support mask back to column names. Dropping the same columns
# that were dropped to build X avoids assuming "id" is first and "y" is last.
feature_names = train_data.drop(["id", "y"], axis=1).columns.values
selected_feature_cols = feature_names[rfecv.support_]

print('Selected Features: ')
print(selected_feature_cols)

# Model Selection

#### K-Fold Cross Validation

In [None]:
# RandomForestRegressor
# Hard-coded feature subset; overrides any `selected_feature_cols` computed by the RFECV cell.
# NOTE(review): presumably a saved RFECV result obtained with RandomForestRegressor — confirm.

selected_feature_cols = ['x0', 'x42', 'x80', 'x82', 'x89', 'x94', 'x96', 'x120', 'x137', 'x178', 'x185',
 'x186', 'x192', 'x224', 'x234', 'x237', 'x264', 'x273', 'x280', 'x291', 'x300',
 'x309', 'x320', 'x328', 'x331', 'x333', 'x340', 'x349', 'x391', 'x400', 'x424',
 'x426', 'x449', 'x450', 'x470', 'x479', 'x499', 'x520', 'x529', 'x536', 'x547',
 'x555', 'x559', 'x604', 'x618', 'x644', 'x651', 'x658', 'x664', 'x673', 'x685',
 'x686', 'x687', 'x722', 'x730', 'x739', 'x743', 'x746', 'x751', 'x800', 'x810',
 'x871', 'x882']

In [None]:
# ExtraTreesRegressor
# Hard-coded feature subsets. Running this cell replaces any `selected_feature_cols`
# set by an earlier cell.
# NOTE(review): presumably saved RFECV results obtained with ExtraTreesRegressor — confirm.

# Earlier, larger subset; not read by any cell visible in this notebook.
selected_feature_cols_old = ['x0', 'x7', 'x60', 'x66', 'x80', 'x82', 'x88', 'x89', 'x94', 'x96', 'x117', 'x135',
 'x137', 'x142', 'x178', 'x185', 'x192', 'x200', 'x210', 'x224', 'x229', 'x230',
 'x234', 'x251', 'x273', 'x291', 'x297', 'x300', 'x309', 'x312', 'x333', 'x340',
 'x349', 'x363', 'x370', 'x374', 'x388', 'x391', 'x424', 'x426', 'x428', 'x449',
 'x450', 'x453', 'x470', 'x479', 'x490', 'x499', 'x504', 'x529', 'x547', 'x555',
 'x559', 'x560', 'x586', 'x591', 'x599', 'x604', 'x608', 'x609', 'x613', 'x618',
 'x622', 'x632', 'x636', 'x641', 'x643', 'x644', 'x651', 'x652', 'x664', 'x665',
 'x666', 'x673', 'x685', 'x686', 'x687', 'x716', 'x722', 'x730', 'x734', 'x746',
 'x751', 'x763', 'x789', 'x800', 'x803', 'x810', 'x838', 'x840', 'x844', 'x853',
 'x870', 'x871', 'x882']

# Subset used by the model-selection and prediction cells below.
selected_feature_cols = ['x0', 'x80', 'x82', 'x89', 'x96', 'x185', 'x192', 'x200', 'x224', 'x229', 'x273',
 'x291', 'x309', 'x333', 'x340', 'x349', 'x370', 'x374', 'x391', 'x426', 'x457',
 'x470', 'x479', 'x482', 'x499', 'x520', 'x529', 'x547', 'x555', 'x591', 'x599',
 'x604', 'x613', 'x651', 'x664', 'x673', 'x685', 'x686', 'x687', 'x716', 'x722',
 'x730', 'x746', 'x751', 'x803', 'x810', 'x823', 'x838', 'x853', 'x870', 'x882']

In [None]:
# Model selection: 12-fold cross-validated R^2 on the selected features.
from sklearn.preprocessing import MinMaxScaler  # only needed for the optional scaler below

# Neighbour count for the optional KNN imputation; the later RandomForest cell
# that calls KNN(k=k) also reads this value.
k = 20

# --- Imputation: replace every NaN by the mean of its column ---------------------
train_mean_values = train_data.mean()
train_data_mean =  train_data.fillna(train_mean_values)

X_sel = train_data_mean[selected_feature_cols].values
y_sel = train_data["y"].values

# Alternative imputation (replaces the mean imputation above when enabled):
#X_sel_incomplete= train_data[selected_feature_cols].values
#X_sel =  KNN(k=k).fit_transform(X_sel_incomplete)

# --- Scaling: enable exactly one -------------------------------------------------
#X_sel = MinMaxScaler(feature_range=(0, 1)).fit_transform(X_sel)
X_sel = RobustScaler(quantile_range=(10, 90)).fit_transform(X_sel)
#X_sel = StandardScaler().fit_transform(X_sel)

# --- Estimator: enable exactly one -----------------------------------------------
# Only the active line is assigned; previously an MLPRegressor was constructed and
# then immediately overwritten. random_state makes the CV scores reproducible.
#estimator = MLPRegressor(activation='logistic', max_iter=15000, early_stopping=True,
#                         validation_fraction=0.1, tol=0.0000000000001, verbose=True)
#estimator = RandomForestRegressor(n_jobs=-1, n_estimators=60)
estimator = ExtraTreesRegressor(n_jobs=-1, n_estimators=20, random_state=0)

# cross_val_score clones and fits the estimator on every fold, so fitting it on
# the full data beforehand is unnecessary; the prediction cell fits it itself.
score = cross_val_score(estimator, X_sel, y_sel, scoring='r2', cv=12)
print(score)
print(np.mean(score))


In [None]:
# Fit the chosen estimator on the full training set and predict the test set.
plt.scatter(range(y_sel.shape[0]), y_sel)

estimator.fit(X_sel,y_sel)

# The test set must be preprocessed with statistics learned on the TRAINING data.
# Imputing with test means and fitting a fresh scaler on the test rows (as was done
# before) puts the test features in a different coordinate system from the one the
# model was trained on. This mirrors the mean imputation + RobustScaler used to
# build X_sel in the model-selection cell.
train_sel = train_data[selected_feature_cols]
train_sel_means = train_sel.mean()
test_scaler = RobustScaler(quantile_range=(10, 90)).fit(train_sel.fillna(train_sel_means).values)

X_test_sel = test_data[selected_feature_cols].fillna(train_sel_means).values
X_test_sel = test_scaler.transform(X_test_sel)

y_pred = estimator.predict(X_test_sel)

print(y_pred)

In [None]:
# Sanity check: range of the predictions, then sorted predictions overlaid on the
# sorted training targets to compare the two distributions.
print(min(y_pred))
print(max(y_pred))

pred_sorted = np.sort(y_pred)
plt.scatter(range(len(pred_sorted)), pred_sorted)

train_sorted = train_data.sort_values(by=['y']).reset_index(drop=True)
train_targets_sorted = train_sorted["y"]
plt.scatter(range(len(train_targets_sorted)), train_targets_sorted)

In [None]:
# 12-fold cross-validated R^2 of a RandomForestRegressor on KNN-imputed,
# robust-scaled selected features.
X_sel_incomplete= train_data[selected_feature_cols].values
y_sel = train_data["y"].values

# KNN imputation. The mean-imputed X_sel that used to be computed here was
# overwritten immediately and has been removed.
# NOTE: `k` is not set in this cell; it comes from whichever earlier cell assigned
# it last (20 in a top-to-bottom run).
X_sel =  KNN(k=k).fit_transform(X_sel_incomplete)

X_sel = RobustScaler(quantile_range=(10, 90)).fit_transform(X_sel)

# random_state makes the reported score reproducible.
score = cross_val_score(RandomForestRegressor(n_jobs=-1, n_estimators=200, random_state=0), X_sel, y_sel, scoring='r2', cv=12)
print(score.mean())


#### Bayesian Optimization

In [None]:
def svr_model(gamma, C, epsilon):
    """Objective for the hyper-parameter search: mean cross-validated R^2 of an SVR.

    Builds a polynomial-kernel SVR with the given ``gamma``, ``C`` and ``epsilon``
    and scores it with ``cross_val_score`` (default fold setting) on the
    notebook-level ``X`` and ``y``.

    Returns:
        The mean of the per-fold R^2 scores.
    """
    candidate = SVR(kernel='poly', gamma=gamma, C=C, epsilon=epsilon)
    fold_scores = cross_val_score(candidate, X, y, scoring='r2')
    return fold_scores.mean()

In [None]:
# Bayesian Optimization

# Bayesian optimization of the SVR hyper-parameters, maximizing svr_model.
from bayes_opt import BayesianOptimization

# Search box: (lower, upper) bound for each argument of svr_model.
svr_param_bounds = {'gamma': (0.01, 5.0), 'C': (0.1, 100), 'epsilon': (0.0001, 1)}
bo = BayesianOptimization(svr_model, svr_param_bounds)

# 5 initial points followed by 15 optimization iterations.
# NOTE(review): kappa is presumably the exploration weight of the acquisition
# function — confirm against the installed bayes_opt version.
bo.maximize(init_points=5, n_iter=15, kappa=10)

# Best result found so far.
# NOTE(review): `res['max']` looks like the older bayes_opt result API — confirm
# it matches the installed version.
print(bo.res['max'])


In [None]:
# Official Metric (R^2), estimated on the training data.
# `y_pred` holds predictions for the TEST rows while `y` holds the TRAINING targets,
# so r2_score(y, y_pred) compared different samples. Score out-of-fold predictions
# for the training rows instead: every row is predicted by a model that did not
# see it during fitting.
from sklearn.model_selection import cross_val_predict

y_oof = cross_val_predict(estimator, X_sel, y_sel, cv=12)
score = r2_score(y_sel, y_oof)
score

## Evaluation

## Submission

In [None]:
# Attach the predictions as column "y" and write the submission file.
# assign() returns a new frame, so the shared `test_data` is not mutated and
# earlier cells see the same table when they are re-run.
submission = test_data.assign(y=y_pred)
util.data.write_submission(submission, "nku")