In [4]:
# Import libraries
import pandas as pd
import numpy as np

import matplotlib
from matplotlib import pyplot as plt

import scipy
from scipy.stats import zscore

import sklearn
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import KFold

try:
    import seaborn as sns
    use_seaborn = True
    sns.set()
except:
    use_seaborn = False

showfig = False

In [5]:
#load data 
X1 = pd.read_csv("X1.csv")
Y1 = pd.read_csv("Y1.csv",header=None,names =['shares'])

if showfig:
    fig = plt.figure(figsize=(6.4*2,4.8*6))
    gs  = fig.add_gridspec(nrows=12, ncols=5)
    for (i,header) in enumerate(X1.columns):
        ax = fig.add_subplot(gs[int(i/5),i%5])
        ax.scatter(X1[header],Y1.values, s=5)
        ax.set_xlabel(header)
    fig.tight_layout()

X1_val = X1.values
Y1_val = Y1.values

In [6]:
#Removing outliers
z_scores = zscore(X1_val)
abs_z_scores = np.abs(z_scores)
X1_filtered_ind = (abs_z_scores < 4).all(axis=1)
X1_filtered = X1_val[X1_filtered_ind]
Y1_filtered = Y1_val[X1_filtered_ind]

In [7]:
#score computation
def scoref1(ytrue, ypred, th):
    return sklearn.metrics.f1_score(ytrue>th, ypred>th)

def scoreregression(ytrue, ypred):
    scores = [
        scoref1(ytrue, ypred, th=th) for th in [ 500, 1400, 5000, 10000]
    ]
    return np.mean(scores)

In [8]:
#feature selections
def correlation_selection(X,Y,nb):
    data = np.concatenate((np.transpose(X),np.transpose(Y)))
    corr = np.corrcoef(data)[:-1]
    idxs = np.argpartition(corr[:][57], -nb)[-nb:]
    X_corr = X[:,idxs]
    #ici il y a encore un souci!!!!
    return X_corr

def PCA_selection(X,nb):
    # To do 1: initiate StandardScaler class
    scaler = StandardScaler(copy=True,with_mean=True,with_std=True)
    # Initialize PCA
    pca = PCA(n_components=nb)
    X_pca = pca.fit_transform(scaler.fit_transform(X))
    
    return X_pca

def mutual_info_selection(X,Y,nb):
    mutual_information = mutual_info_regression(X,Y)
    idxs = np.argpartition(mutual_information,-nb)[-nb:]
    X_mutual_info = X[:,idxs[:]]

    return X_mutual_info

In [9]:
X_mutual_info=mutual_info_selection(X1_val,Y1_val,17)

In [10]:
np.shape(X_mutual_info)

(19822, 17)

In [54]:
def linear_regression(X,Y,nb_features):
    X_corr=correlation_selection(X,Y,nb_features)
    #X_corr=PCA_selection(X,nb_features)
    #X_corr=X_mutual_info

    Scores=[]
    kf=KFold(n_splits=5,shuffle=False)

    for train,test in kf.split(X_corr):
        
        X1_train_corr=X_corr[train]
        X1_test_corr=X_corr[test]
        Y1_train_corr=Y[train]
        Y1_test_corr=Y[test]

        regr = linear_model.LinearRegression()
        regr.fit(X1_train_corr, Y1_train_corr)
        Y1_pred = regr.predict(X1_test_corr)
    
        Scores.append(scoreregression(Y1_test_corr,Y1_pred))

    print(np.mean(Scores),nb_features,sep="   ")

In [61]:
linear_regression(X1_val,Y1_val,20)

0.4849599136705819   20


In [46]:
def knn_regression(X,Y,nb_features):
    #X_knn=correlation_selection(X,Y,nb_features)
    #X_knn=PCA_selection(X,nb_features)
    X_knn=X_mutual_info

    Scores=[]
    kf=KFold(n_splits=5,shuffle=False)

    for train,test in kf.split(X_knn):
        
        X1_train_pca=X_knn[train]
        X1_test_pca=X_knn[test]
        Y1_train_pca=Y[train]
        Y1_test_pca=Y[test]

        knn = KNeighborsRegressor(nb_features)
        knn.fit(X1_train_pca, Y1_train_pca)
        Y1_pred = knn.predict(X1_test_pca)

        Scores.append(scoreregression(Y1_test_pca,Y1_pred))

    print(np.mean(Scores),nb_features,sep="   ")

In [47]:
neighbours=[3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24]

for i in neighbours:
    knn_regression(X1_val,Y1_val,i)

0.4724468058999357   3
0.4778710804654097   4
0.48487126273445424   5
0.4874683355818049   6
0.4870852501523822   7
0.48747334741499204   8
0.4875302235782426   9
0.4902332738810177   10
0.48828877340797383   11
0.48881064317901063   12
0.48949613516796253   13
0.4889513607062398   14
0.48734851370808724   15
0.4878633868827243   16
0.4879243791722162   17
0.4870469044371368   18
0.48639423559770273   19
0.4856509179624937   20
0.48739001546220545   21
0.48594623815054394   22
0.48556652780460957   23
0.48510174632537434   24


In [34]:
def mlp_regression(X,Y,nb_features,layers):
    #X_mlp=correlation_selection(X,Y,nb_features)
    #X_mlp=PCA_selection(X,nb_features)
    X_mlp=X_mutual_info

    Scores=[]
    kf=KFold(n_splits=5,shuffle=False)

    for train,test in kf.split(X_mlp):
        
        X1_train_pca=X_mlp[train]
        X1_test_pca=X_mlp[test]
        Y1_train_pca=Y[train]
        Y1_test_pca=Y[test]

        reg = MLPRegressor(hidden_layer_sizes=layers,activation="relu" ,random_state=1, max_iter=200)
        reg.fit(X1_train_pca, Y1_train_pca)
        Y1_pred=reg.predict(X1_test_pca)

        Scores.append(scoreregression(Y1_test_pca,Y1_pred))

    print(np.mean(Scores),nb_features,layers,sep="   ")


In [35]:
mlp_regression(X1_val,Y1_val,17,(15))

0.48926678621854086   17   15


In [36]:
layers=[(15),(15,15),(12,12),(13,13),(14,14),(16,16),(10,10)]
nb=[17]

for i in layers:
    for j in nb:
        mlp_regression(X1_val,Y1_val,j,i)



0.48926678621854086   17   15
0.4884931424260236   17   (15, 15)
0.4859887247245722   17   (12, 12)
0.489980447363199   17   (13, 13)
0.48756894162883385   17   (14, 14)
0.49392601059907826   17   (16, 16)
0.48786532500115964   17   (10, 10)


In [29]:
"""
scaler_bis = StandardScaler(copy=True,with_mean=True,with_std=True)
X1_normalized = scaler_bis.fit_transform(X1_val)

pca_bis = PCA(n_components=X1.shape[-1])

data_transformed = pca_bis.fit_transform(X1_normalized)
eig_val = pca_bis.explained_variance_
eig_vec = pca_bis.components_

# Compute an array E, where E(P) indicates the variance captured in the first P component.
E = np.array([eig_val[:p+1].sum()/eig_val.sum() for p in range(len(eig_val))])
tau = 0.95 # Threshold

# Find the minimum P that captures \tau portion of the variance
P = np.where(E>tau)[0][0] +1

print('Minimum number of components that preserve {} of the variance = {} \n' .format(tau,P))
fig = plt.figure(figsize=(20,5))
ax1 = fig.add_subplot() #(131)
ax1.plot(np.arange(1,E.shape[0]+1), E, 'o-', markersize=8, color='blue', alpha=0.5)
ax1.set_xlabel('number of components')
ax1.set_ylabel('Preserved variance')
"""

"\nscaler_bis = StandardScaler(copy=True,with_mean=True,with_std=True)\nX1_normalized = scaler_bis.fit_transform(X1_val)\n\npca_bis = PCA(n_components=X1.shape[-1])\n\ndata_transformed = pca_bis.fit_transform(X1_normalized)\neig_val = pca_bis.explained_variance_\neig_vec = pca_bis.components_\n\n# Compute an array E, where E(P) indicates the variance captured in the first P component.\nE = np.array([eig_val[:p+1].sum()/eig_val.sum() for p in range(len(eig_val))])\ntau = 0.95 # Threshold\n\n# Find the minimum P that captures \tau portion of the variance\nP = np.where(E>tau)[0][0] +1\n\nprint('Minimum number of components that preserve {} of the variance = {} \n' .format(tau,P))\nfig = plt.figure(figsize=(20,5))\nax1 = fig.add_subplot() #(131)\nax1.plot(np.arange(1,E.shape[0]+1), E, 'o-', markersize=8, color='blue', alpha=0.5)\nax1.set_xlabel('number of components')\nax1.set_ylabel('Preserved variance')\n"

In [30]:
"""
parameters={
'learning_rate': ["adaptive"],
'hidden_layer_sizes':[(20),(30),(35),(20,20),(30,30),(35,35)],
'alpha': [0.001,0.0001,0.00001],
'activation': ["relu"]
}

mlp=MLPRegressor()

clf = GridSearchCV(mlp, parameters)

clf.fit(X1_train_pca, Y1_train)

print(clf.best_params_)
"""

'\nparameters={\n\'learning_rate\': ["adaptive"],\n\'hidden_layer_sizes\':[(20),(30),(35),(20,20),(30,30),(35,35)],\n\'alpha\': [0.001,0.0001,0.00001],\n\'activation\': ["relu"]\n}\n\nmlp=MLPRegressor()\n\nclf = GridSearchCV(mlp, parameters)\n\nclf.fit(X1_train_pca, Y1_train)\n\nprint(clf.best_params_)\n'