In [1]:
# Import libraries
import pandas as pd
import numpy as np

import matplotlib
from matplotlib import pyplot as plt

import scipy
from scipy.stats import zscore

import sklearn
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import KFold
from sklearn.cross_decomposition import PLSRegression

# Seaborn is optional: apply its default theme when it can be imported and
# fall back to plain matplotlib styling otherwise. Only a failed import is
# tolerated, so unrelated errors (or a Ctrl-C) are no longer swallowed.
try:
    import seaborn as sns
except ImportError:
    use_seaborn = False
else:
    sns.set()
    use_seaborn = True

# Notebook-wide switches read by the cells below.
showfig = False       # draw one feature-vs-target scatter plot per column
delOutliers = False   # drop training rows containing an extreme z-score
downSizing  = False   # drop one feature of each highly correlated pair

In [2]:
# Load the feature table and the target column.
# Y1.csv is read without a header row, so its single column is named here.
X1 = pd.read_csv("X1.csv")
Y1 = pd.read_csv("Y1.csv",header=None,names =['shares'])

if showfig:
    # One scatter plot (feature vs. target) per column, five plots per row.
    # The row count is derived from the number of columns so the grid always
    # fits, instead of assuming at most 12 x 5 = 60 features.
    n_cols = 5
    n_rows = max(1, -(-len(X1.columns) // n_cols))  # ceiling division
    fig = plt.figure(figsize=(6.4*2, 4.8*n_rows/2))
    gs  = fig.add_gridspec(nrows=n_rows, ncols=n_cols)
    for (i,header) in enumerate(X1.columns):
        row, col = divmod(i, n_cols)
        ax = fig.add_subplot(gs[row, col])
        ax.scatter(X1[header],Y1.values, s=5)
        ax.set_xlabel(header)
    fig.tight_layout()

X1_val = X1.values
Y1_val = Y1.values

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X1_train, X1_test, Y1_train, Y1_test = train_test_split(X1_val, Y1_val,random_state=1, test_size=0.2)

# From here on X1_val / Y1_val refer to the training split only.
X1_val = X1_train
Y1_val = Y1_train

In [3]:
if delOutliers:
    # Remove training rows in which any feature lies 4 or more standard
    # deviations away from that feature's mean.
    z_scores = zscore(X1_val)
    abs_z_scores = np.abs(z_scores)
    # A zero-variance column yields NaN z-scores, and NaN never compares as
    # "< 4". Without the isnan term a single constant feature would mark
    # every row as an outlier and empty the data set.
    X1_filtered_ind = ((abs_z_scores < 4) | np.isnan(abs_z_scores)).all(axis=1)
    X1_val = X1_val[X1_filtered_ind]
    Y1_val = Y1_val[X1_filtered_ind]

In [4]:
if downSizing:
    # Prune redundant features. Correlations are taken between columns (hence
    # the transpose); for every pair above the cutoff, the column with the
    # smaller index is the one that gets removed.
    corr = np.corrcoef(X1_val.T)
    # Keep the upper triangle and subtract the identity so that the
    # self-correlations on the diagonal cannot exceed the cutoff.
    corr = np.triu(corr) - np.identity(corr.shape[0])
    delt = np.nonzero(corr > 1 - 1e-1)

    # Column indices that never occur as the row of a flagged pair.
    keep = np.flatnonzero(~np.isin(np.arange(corr.shape[0]), delt[0]))
    print(X1_val.shape[1], len(keep))

    X1_val = X1_val[:, keep]

In [5]:
#score computation
def scoref1(ytrue, ypred, th):
    """Return the F1 score of the binary events ``ytrue > th`` and ``ypred > th``.

    Parameters
    ----------
    ytrue, ypred : array-like
        True and predicted values; ``(n,)`` and ``(n, 1)`` layouts may be
        mixed because both are flattened before the comparison.
    th : number
        Threshold that turns the values into boolean labels.

    Returns
    -------
    float
        F1 score as computed by ``sklearn.metrics.f1_score``.
    """
    # Imported explicitly so the function does not depend on
    # ``sklearn.metrics`` having been loaded as a side effect of another
    # scikit-learn import.
    from sklearn.metrics import f1_score

    true_above = np.ravel(ytrue) > th
    pred_above = np.ravel(ypred) > th
    return f1_score(true_above, pred_above)

def scoreregression(ytrue, ypred, thresholds=(500, 1400, 5000, 10000)):
    """Average the thresholded F1 score over several thresholds.

    Parameters
    ----------
    ytrue, ypred : array-like
        True and predicted values, forwarded unchanged to ``scoref1``.
    thresholds : iterable of numbers, optional
        Thresholds at which ``scoref1`` is evaluated. The default reproduces
        the four values that were previously hard-coded.

    Returns
    -------
    float
        Mean of the per-threshold F1 scores.
    """
    scores = [scoref1(ytrue, ypred, th=th) for th in thresholds]
    return np.mean(scores)

In [6]:
#feature selections
def correlation_selection(X,Y,nb):
    """Keep the ``nb`` columns of ``X`` most strongly correlated with ``Y``.

    Columns are ranked by the absolute value of their Pearson correlation
    with the target, so strong negative relationships count as much as
    strong positive ones. Columns whose correlation is undefined (NaN, e.g.
    a constant column) are ranked last.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    Y : array-like with n_samples entries
        Target values; ``(n,)``, ``(n, 1)`` and ``(1, n)`` are all accepted.
    nb : int
        Number of columns to keep. Values above the number of features are
        clamped; values <= 0 give an array with zero columns.

    Returns
    -------
    numpy.ndarray of shape (n_samples, min(nb, n_features))
        The selected columns, in no particular order.
    """
    X = np.asarray(X)
    nb = min(int(nb), X.shape[1])
    if nb <= 0:
        # ``argpartition(..., -0)[-0:]`` would return every column instead.
        return X[:, :0]

    # The target becomes one extra row below the transposed features.
    target_row = np.asarray(Y, dtype=float).reshape(1, -1)
    data = np.concatenate((X.T, target_row))
    # Last column = correlation of each row with the target; the final entry
    # (target with itself) is dropped.
    corr = np.corrcoef(data)[:-1, -1]

    strength = np.abs(corr)
    # argpartition orders NaN after every number, which would make undefined
    # correlations look like the best ones; rank them lowest instead.
    strength = np.where(np.isnan(strength), 0.0, strength)

    idxs = np.argpartition(strength, -nb)[-nb:]
    return X[:, idxs]

def PCA_selection(X, nb, random_state=0):
    """Project standardized ``X`` onto its first ``nb`` principal components.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix; every column is centred and scaled to unit variance
        before the PCA is fitted.
    nb : int
        Number of principal components to keep.
    random_state : int or None, optional
        Forwarded to ``PCA``. It only matters when scikit-learn picks a
        randomized solver; a fixed default keeps repeated runs identical.

    Returns
    -------
    numpy.ndarray of shape (n_samples, nb)
        The transformed data.

    Notes
    -----
    Both the scaler and the PCA are fitted on all rows of ``X``.
    """
    scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
    pca = PCA(n_components=nb, random_state=random_state)
    return pca.fit_transform(scaler.fit_transform(X))

def mutual_info_selection(X, Y, nb, random_state=0):
    """Keep the ``nb`` columns of ``X`` with the highest mutual information with ``Y``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    Y : array-like with n_samples entries
        Target values; flattened to 1-D before being passed to scikit-learn.
    nb : int
        Number of columns to keep. Values above the number of features are
        clamped; values <= 0 give an array with zero columns.
    random_state : int or None, optional
        Forwarded to ``mutual_info_regression`` so that the ranking is the
        same on every run.

    Returns
    -------
    numpy.ndarray of shape (n_samples, min(nb, n_features))
        The selected columns, in no particular order.
    """
    X = np.asarray(X)
    nb = min(int(nb), X.shape[1])
    if nb <= 0:
        # ``argpartition(..., -0)[-0:]`` would return every column instead.
        return X[:, :0]

    mutual_information = mutual_info_regression(
        X, np.ravel(Y), random_state=random_state
    )
    idxs = np.argpartition(mutual_information, -nb)[-nb:]
    return X[:, idxs]

In [7]:
# Correlation-ranked feature subsets of the training data, one per subset size.
X_correlation = [
    correlation_selection(X1_val, Y1_val, nb_feat)
    for nb_feat in (10, 12, 15, 17, 20)
]

# Individual names kept for cells that refer to one specific subset.
(X_correlation_10, X_correlation_12, X_correlation_15,
 X_correlation_17, X_correlation_20) = X_correlation

In [8]:
# PCA projections of the training data, one per number of components.
X_pca = [PCA_selection(X1_val, nb_feat) for nb_feat in (10, 12, 15, 17, 20)]

# Individual names kept for cells that refer to one specific projection.
X_pca_10, X_pca_12, X_pca_15, X_pca_17, X_pca_20 = X_pca

In [9]:
# Disabled cell: the statements below are wrapped in a bare string literal,
# so none of them run and this cell does not create X_mutual_info or the
# X_mutual_info_<n> arrays. Remove the enclosing triple quotes to build the
# mutual-information feature sets (sizes 10, 12, 15, 17 and 20).
"""
X_mutual_info_10=mutual_info_selection(X1_val,Y1_val,10)
X_mutual_info_12=mutual_info_selection(X1_val,Y1_val,12)
X_mutual_info_15=mutual_info_selection(X1_val,Y1_val,15)
X_mutual_info_17=mutual_info_selection(X1_val,Y1_val,17)
X_mutual_info_20=mutual_info_selection(X1_val,Y1_val,20)

X_mutual_info=[X_mutual_info_10,X_mutual_info_12,X_mutual_info_15,X_mutual_info_17,X_mutual_info_20]
"""

'\nX_mutual_info_10=mutual_info_selection(X1_val,Y1_val,10)\nX_mutual_info_12=mutual_info_selection(X1_val,Y1_val,12)\nX_mutual_info_15=mutual_info_selection(X1_val,Y1_val,15)\nX_mutual_info_17=mutual_info_selection(X1_val,Y1_val,17)\nX_mutual_info_20=mutual_info_selection(X1_val,Y1_val,20)\n\nX_mutual_info=[X_mutual_info_10,X_mutual_info_12,X_mutual_info_15,X_mutual_info_17,X_mutual_info_20]\n'

In [10]:
def linear_regression(X,Y,f_selection):
    """Score an ordinary least-squares model with 5-fold cross-validation.

    Parameters
    ----------
    X : unused
        Kept so existing calls keep working; the model is fitted on
        ``f_selection`` only.
    Y : numpy.ndarray
        Targets, one entry per row of ``f_selection``; indexed with the fold
        index arrays.
    f_selection : numpy.ndarray of shape (n_samples, n_features)
        Already-selected feature matrix used for fitting and prediction.

    Returns
    -------
    float
        Mean of ``scoreregression`` over the five folds. The same value is
        printed together with the number of features.
    """
    features = f_selection
    nb_features = len(features[0])

    fold_scores = []
    # Consecutive, unshuffled folds: the split is the same on every call.
    kf = KFold(n_splits=5, shuffle=False)

    for train, test in kf.split(features):
        regr = linear_model.LinearRegression()
        regr.fit(features[train], Y[train])
        Y1_pred = regr.predict(features[test])

        fold_scores.append(scoreregression(Y[test], Y1_pred))

    mean_score = np.mean(fold_scores)
    print(mean_score,"nb_feat:",nb_features,sep=" ")
    # Returned as well so callers can compare configurations in code.
    return mean_score

In [11]:
# Cross-validate the linear model on each PCA feature set. Iterate over
# X_correlation (or X_mutual_info, once it is built) instead of X_pca to try
# another feature-selection method.
for i in X_pca:
    linear_regression(X=X1_val, Y=Y1_val, f_selection=i)

0.4652507987002097 nb_feat: 10
0.4737156732717459 nb_feat: 12
0.47639723713135884 nb_feat: 15
0.4764154336145602 nb_feat: 17
0.47705964952393537 nb_feat: 20


In [12]:
def knn_regression(X,Y,f_selection,nb_neigh):
    """Score a k-nearest-neighbours regressor with 5-fold cross-validation.

    Parameters
    ----------
    X : unused
        Kept so existing calls keep working; the model is fitted on
        ``f_selection`` only.
    Y : numpy.ndarray
        Targets, one entry per row of ``f_selection``; indexed with the fold
        index arrays.
    f_selection : numpy.ndarray of shape (n_samples, n_features)
        Already-selected feature matrix used for fitting and prediction.
    nb_neigh : int
        Number of neighbours given to ``KNeighborsRegressor``.

    Returns
    -------
    float
        Mean of ``scoreregression`` over the five folds. The same value is
        printed together with the number of features and neighbours.
    """
    features = f_selection
    nb_features = len(features[0])

    fold_scores = []
    # Consecutive, unshuffled folds: the split is the same on every call.
    kf = KFold(n_splits=5, shuffle=False)

    for train, test in kf.split(features):
        knn = KNeighborsRegressor(nb_neigh)
        knn.fit(features[train], Y[train])
        Y1_pred = knn.predict(features[test])

        fold_scores.append(scoreregression(Y[test], Y1_pred))

    mean_score = np.mean(fold_scores)
    print(mean_score,"nb_feat:",nb_features,"nb_neighb:",nb_neigh,sep=" ")
    # Returned as well so callers can compare configurations in code.
    return mean_score

In [13]:

# Neighbour counts to sweep: 8 through 20 inclusive.
neighbours = list(range(8, 21))

# Cross-validate every (neighbour count, feature set) combination. Iterate
# over X_correlation (or X_mutual_info, once it is built) instead of X_pca to
# try another feature-selection method.
for i in neighbours:
    for j in X_pca:
        knn_regression(X=X1_val, Y=Y1_val, f_selection=j, nb_neigh=i)


0.48335588127574225 nb_feat: 10 nb_neighb: 8
0.48984924677501496 nb_feat: 12 nb_neighb: 8
0.48461319909121003 nb_feat: 15 nb_neighb: 8
0.4877117005951764 nb_feat: 17 nb_neighb: 8
0.48070895955858095 nb_feat: 20 nb_neighb: 8
0.4861606948633911 nb_feat: 10 nb_neighb: 9
0.4884196299588255 nb_feat: 12 nb_neighb: 9
0.4878562394146158 nb_feat: 15 nb_neighb: 9
0.4902076009284463 nb_feat: 17 nb_neighb: 9
0.48222786350599806 nb_feat: 20 nb_neighb: 9
0.48514350959039537 nb_feat: 10 nb_neighb: 10
0.48948343019356216 nb_feat: 12 nb_neighb: 10
0.4882160028618171 nb_feat: 15 nb_neighb: 10
0.4870568528294121 nb_feat: 17 nb_neighb: 10
0.4833477761785992 nb_feat: 20 nb_neighb: 10
0.48539212325948533 nb_feat: 10 nb_neighb: 11
0.4884761863433873 nb_feat: 12 nb_neighb: 11
0.48722560507646495 nb_feat: 15 nb_neighb: 11
0.4872793724932659 nb_feat: 17 nb_neighb: 11
0.48138073618942656 nb_feat: 20 nb_neighb: 11
0.48447196850264895 nb_feat: 10 nb_neighb: 12
0.48695248694505117 nb_feat: 12 nb_neighb: 12
0.486204

In [14]:
def mlp_regression(X,Y,f_selection,layers,learning_r,random_state=None):
    """Score a multi-layer perceptron regressor with 5-fold cross-validation.

    Parameters
    ----------
    X : unused
        Kept so existing calls keep working; the model is fitted on
        ``f_selection`` only.
    Y : numpy.ndarray
        Targets, one entry per row of ``f_selection``; indexed with the fold
        index arrays. A single-column ``(n, 1)`` target is flattened before
        fitting.
    f_selection : numpy.ndarray of shape (n_samples, n_features)
        Already-selected feature matrix used for fitting and prediction.
    layers : int or tuple of int
        Passed as ``hidden_layer_sizes``.
    learning_r : str
        Passed as ``learning_rate``.
    random_state : int or None, optional
        Passed to ``MLPRegressor``; set it to make the scores reproducible.
        The default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    float
        Mean of ``scoreregression`` over the five folds. The same value is
        printed together with the configuration.
    """
    features = f_selection
    nb_features = len(features[0])

    fold_scores = []
    # Consecutive, unshuffled folds: the split is the same on every call.
    kf = KFold(n_splits=5, shuffle=False)

    for train, test in kf.split(features):
        # Hand the network a 1-D target when there is a single output column;
        # this avoids scikit-learn's column-vector conversion warning.
        y_train = np.asarray(Y[train])
        if y_train.ndim == 2 and y_train.shape[1] == 1:
            y_train = y_train.ravel()

        # NOTE(review): the solver is left at its default. According to the
        # scikit-learn documentation ``learning_rate`` is only used with
        # solver="sgd", so with the default solver the schedule passed in
        # ``learning_r`` should not change the fit - confirm before comparing
        # schedules.
        reg = MLPRegressor(
            hidden_layer_sizes=layers,
            activation="relu",
            learning_rate=learning_r,
            max_iter=200,
            random_state=random_state,
        )
        reg.fit(features[train], y_train)
        Y1_pred = reg.predict(features[test])

        fold_scores.append(scoreregression(Y[test], Y1_pred))

    mean_score = np.mean(fold_scores)
    print(mean_score,"nb_feat:",nb_features,"layers:",layers,"lear_rate:",learning_r,sep=" ")
    # Returned as well so callers can compare configurations in code.
    return mean_score


In [15]:
# Disabled cell: the MLP sweep below is wrapped in a bare string literal and
# does not run. It iterates over X_mutual_info, which is only assigned inside
# another disabled cell, so that cell must be enabled first (or the loop
# pointed at X_pca / X_correlation).
# Note that "(10)" is the integer 10, not a one-element tuple; a single
# hidden layer written as a tuple would be "(10,)".
"""
layers=[(10),(12),(15),(10,10),(12,12),(15,15)]
learning_r = ["constant","invscaling","adaptive"]

for i in layers:
    for j in X_mutual_info: #here you choose which kind of feature selection to use(X_correlation,X_pca,X_mutual_info)
        for k in learning_r:
            mlp_regression(X1_val,Y1_val,j,i,k)
"""

'\nlayers=[(10),(12),(15),(10,10),(12,12),(15,15)]\nlearning_r = ["constant","invscaling","adaptive"]\n\nfor i in layers:\n    for j in X_mutual_info: #here you choose which kind of feature selection to use(X_correlation,X_pca,X_mutual_info)\n        for k in learning_r:\n            mlp_regression(X1_val,Y1_val,j,i,k)\n'

In [22]:
def pls_regression(X,Y,f_selection,nb_comp):
    """Score a partial-least-squares regressor with 5-fold cross-validation.

    Parameters
    ----------
    X : unused
        Kept so existing calls keep working; the model is fitted on
        ``f_selection`` only.
    Y : numpy.ndarray
        Targets, one entry per row of ``f_selection``; indexed with the fold
        index arrays.
    f_selection : numpy.ndarray of shape (n_samples, n_features)
        Already-selected feature matrix used for fitting and prediction.
    nb_comp : int
        Passed as ``n_components`` to ``PLSRegression``.

    Returns
    -------
    float
        Mean of ``scoreregression`` over the five folds. The same value is
        printed together with the number of features and components.
    """
    features = f_selection
    nb_features = len(features[0])

    fold_scores = []
    # Consecutive, unshuffled folds: the split is the same on every call.
    kf = KFold(n_splits=5, shuffle=False)

    for train, test in kf.split(features):
        pls = PLSRegression(n_components=nb_comp, max_iter=200)
        pls.fit(features[train], Y[train])
        Y1_pred = pls.predict(features[test])

        fold_scores.append(scoreregression(Y[test], Y1_pred))

    mean_score = np.mean(fold_scores)
    print(mean_score,"nb_feat:",nb_features,"nb_comp:",nb_comp,sep=" ")
    # Returned as well so callers can compare configurations in code.
    return mean_score

In [24]:
# Numbers of PLS components to sweep.
nb_comp = [1, 2, 4, 6, 8, 10]

# Cross-validate every (component count, feature set) combination. Iterate
# over X_correlation (or X_mutual_info, once it is built) instead of X_pca to
# try another feature-selection method.
for i in nb_comp:
    for j in X_pca:
        pls_regression(X=X1_val, Y=Y1_val, f_selection=j, nb_comp=i)

0.4646835782297223 nb_feat: 10 nb_comp: 1
0.47404944383573966 nb_feat: 12 nb_comp: 1
0.4757976771001632 nb_feat: 15 nb_comp: 1
0.4756746535682469 nb_feat: 17 nb_comp: 1
0.4769720675498269 nb_feat: 20 nb_comp: 1
0.4654788523078771 nb_feat: 10 nb_comp: 2
0.473859414642248 nb_feat: 12 nb_comp: 2
0.4762849353150128 nb_feat: 15 nb_comp: 2
0.4764706069560436 nb_feat: 17 nb_comp: 2
0.4770223421989496 nb_feat: 20 nb_comp: 2
0.4652507987002097 nb_feat: 10 nb_comp: 4
0.4737156732717459 nb_feat: 12 nb_comp: 4
0.47639723713135884 nb_feat: 15 nb_comp: 4
0.4764154336145602 nb_feat: 17 nb_comp: 4
0.47705964952393537 nb_feat: 20 nb_comp: 4
0.4652507987002097 nb_feat: 10 nb_comp: 6
0.4737156732717459 nb_feat: 12 nb_comp: 6
0.47639723713135884 nb_feat: 15 nb_comp: 6
0.4764154336145602 nb_feat: 17 nb_comp: 6
0.47705964952393537 nb_feat: 20 nb_comp: 6
0.4652507987002097 nb_feat: 10 nb_comp: 8
0.4737156732717459 nb_feat: 12 nb_comp: 8
0.47639723713135884 nb_feat: 15 nb_comp: 8
0.4764154336145602 nb_feat: 1

In [20]:
# Disabled cell: the explained-variance analysis below is wrapped in a bare
# string literal and does not run. When enabled it standardizes X1_val, fits
# a PCA with as many components as X1 has columns, computes the cumulative
# share of variance captured by the first P components, reports the smallest
# P whose share exceeds tau = 0.95 and plots the cumulative curve.
"""
scaler_bis = StandardScaler(copy=True,with_mean=True,with_std=True)
X1_normalized = scaler_bis.fit_transform(X1_val)

pca_bis = PCA(n_components=X1.shape[-1])

data_transformed = pca_bis.fit_transform(X1_normalized)
eig_val = pca_bis.explained_variance_
eig_vec = pca_bis.components_

# Compute an array E, where E(P) indicates the variance captured in the first P component.
E = np.array([eig_val[:p+1].sum()/eig_val.sum() for p in range(len(eig_val))])
tau = 0.95 # Threshold

# Find the minimum P that captures \tau portion of the variance
P = np.where(E>tau)[0][0] +1

print('Minimum number of components that preserve {} of the variance = {} \n' .format(tau,P))
fig = plt.figure(figsize=(20,5))
ax1 = fig.add_subplot() #(131)
ax1.plot(np.arange(1,E.shape[0]+1), E, 'o-', markersize=8, color='blue', alpha=0.5)
ax1.set_xlabel('number of components')
ax1.set_ylabel('Preserved variance')
"""

"\nscaler_bis = StandardScaler(copy=True,with_mean=True,with_std=True)\nX1_normalized = scaler_bis.fit_transform(X1_val)\n\npca_bis = PCA(n_components=X1.shape[-1])\n\ndata_transformed = pca_bis.fit_transform(X1_normalized)\neig_val = pca_bis.explained_variance_\neig_vec = pca_bis.components_\n\n# Compute an array E, where E(P) indicates the variance captured in the first P component.\nE = np.array([eig_val[:p+1].sum()/eig_val.sum() for p in range(len(eig_val))])\ntau = 0.95 # Threshold\n\n# Find the minimum P that captures \tau portion of the variance\nP = np.where(E>tau)[0][0] +1\n\nprint('Minimum number of components that preserve {} of the variance = {} \n' .format(tau,P))\nfig = plt.figure(figsize=(20,5))\nax1 = fig.add_subplot() #(131)\nax1.plot(np.arange(1,E.shape[0]+1), E, 'o-', markersize=8, color='blue', alpha=0.5)\nax1.set_xlabel('number of components')\nax1.set_ylabel('Preserved variance')\n"

In [21]:
# Disabled cell: the MLP hyper-parameter grid search below is wrapped in a
# bare string literal and does not run. It would search over hidden layer
# sizes and alpha values with GridSearchCV and print the best combination.
# NOTE(review): it fits on X1_train_pca, which is not assigned anywhere in
# the visible part of this notebook - define it before enabling the cell.
"""
parameters={
'learning_rate': ["adaptive"],
'hidden_layer_sizes':[(20),(30),(35),(20,20),(30,30),(35,35)],
'alpha': [0.001,0.0001,0.00001],
'activation': ["relu"]
}

mlp=MLPRegressor()

clf = GridSearchCV(mlp, parameters)

clf.fit(X1_train_pca, Y1_train)

print(clf.best_params_)
"""

'\nparameters={\n\'learning_rate\': ["adaptive"],\n\'hidden_layer_sizes\':[(20),(30),(35),(20,20),(30,30),(35,35)],\n\'alpha\': [0.001,0.0001,0.00001],\n\'activation\': ["relu"]\n}\n\nmlp=MLPRegressor()\n\nclf = GridSearchCV(mlp, parameters)\n\nclf.fit(X1_train_pca, Y1_train)\n\nprint(clf.best_params_)\n'