In [2]:
# Import libraries
import pandas as pd
import numpy as np

import matplotlib
from matplotlib import pyplot as plt

import scipy
from scipy.stats import zscore

import sklearn
from sklearn.model_selection import train_test_split, KFold
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import mutual_info_regression

from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.cross_decomposition import PLSRegression

showfig = False
delOutliers = False
downSizing  = False

In [None]:
#load data 
X1 = pd.read_csv("../Data/X1.csv")
Y1 = pd.read_csv("../Data/Y1.csv",header=None,names =['shares'])

if showfig:
    fig = plt.figure(figsize=(6.4*2,4.8*6))
    gs  = fig.add_gridspec(nrows=12, ncols=5)
    for (i,header) in enumerate(X1.columns):
        ax = fig.add_subplot(gs[int(i/5),i%5])
        ax.scatter(X1[header],Y1.values, s=5)
        ax.set_xlabel(header)
    fig.tight_layout()

    #fig.savefig("../Figs/feature_plots.jpg")

X1_val = X1.values
Y1_val = Y1.values

X1_train, X1_test, Y1_train, Y1_test = train_test_split(X1_val, Y1_val,random_state=1, test_size=0.2)

X1_val = X1_train
Y1_val = Y1_train

In [4]:
if delOutliers:
    # Removing outliers
    z_scores = zscore(X1_val)
    abs_z_scores = np.abs(z_scores)
    X1_filtered_ind = (abs_z_scores < 4).all(axis=1)
    X1_val = X1_val[X1_filtered_ind]
    Y1_val = Y1_val[X1_filtered_ind]

In [5]:
if downSizing:  
    corr = np.corrcoef(X1_val.transpose())
    corr = np.triu(corr) - np.eye(corr.shape[0]) 
    delt = np.where(corr>1-1e-1)

    keep = np.where([idx not in delt[0] for idx in range(corr.shape[0])])[0]
    X1_val = X1_val[:,keep]

In [6]:
# Score Computation
def scoref1(ytrue, ypred, th):
    return sklearn.metrics.f1_score(ytrue>th, ypred>th)

def scoreregression(ytrue, ypred):
    scores = [
        scoref1(ytrue, ypred, th=th) for th in [ 500, 1400, 5000, 10000]
    ]
    return np.mean(scores)

In [None]:
if False:
    scaler_bis = StandardScaler(copy=True,with_mean=True,with_std=True)
    X1_normalized = scaler_bis.fit_transform(X1_val)

    pca_bis = PCA(n_components=X1.shape[-1])

    data_transformed = pca_bis.fit_transform(X1_normalized)
    eig_val = pca_bis.explained_variance_
    eig_vec = pca_bis.components_

    # Compute an array E, where E(P) indicates the variance captured in the first P component.
    E = np.array([eig_val[:p+1].sum()/eig_val.sum() for p in range(len(eig_val))])
    tau = 0.95 # Threshold

    # Find the minimum P that captures \tau portion of the variance
    P = np.where(E>tau)[0][0] +1

    print('Minimum number of components that preserve {} of the variance = {} \n' .format(tau,P))
    fig = plt.figure()
    ax1 = fig.add_subplot() #(131)
    ax1.plot(np.arange(1,E.shape[0]+1), E, 'o-', markersize=5, color='blue', alpha=0.5)
    ax1.set_xlabel('number of components')
    ax1.set_ylabel('Preserved variance')

    #fig.savefig("../Figs/PCA.pdf")

In [8]:
# Feature Selections
def correlation_selection(X,Y,nb):
    data = np.concatenate((np.transpose(X),np.transpose(Y)))
    corr = np.corrcoef(data)[:-1,-1]
    idxs = np.argpartition(corr, -nb)[-nb:]
    return X[:, idxs]

def PCA_selection(X,nb):
    # Initiate StandardScaler class
    scaler = StandardScaler(copy=True,with_mean=True,with_std=True)
    # Initialize PCA
    pca = PCA(n_components=nb)
    X_pca = pca.fit_transform(scaler.fit_transform(X))
    return X_pca

def mutual_info_selection(X,Y,nb):
    mutual_information = mutual_info_regression(X,Y)
    idxs = np.argpartition(mutual_information, -nb)[-nb:]
    return X[:, idxs]

In [9]:
X_correlation = []
for nb in [10,12,15,17,20]:
    X_correlation.append(correlation_selection(X1_val,Y1_val, nb))

In [10]:
X_pca = []
for nb in [10,12,15,17,20]:
    X_pca.append(PCA_selection(X1_val, nb))

In [11]:
X_mutual_info = []
for nb in [10,12,15,17,20]:
    X_mutual_info.append(mutual_info_selection(X1_val,Y1_val, nb))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [12]:
#linear regression
def linear_regression(X,Y, f_selection):
    score = []
    kf = KFold(n_splits=5,shuffle=False)
    regr = linear_model.LinearRegression()

    for train,test in kf.split(f_selection):
        #cross valdation
        X1_train_corr = f_selection[train]
        X1_test_corr  = f_selection[test]
        Y1_train_corr = Y[train]
        Y1_test_corr  = Y[test]

        regr.fit(X1_train_corr, Y1_train_corr)
        Y1_pred = regr.predict(X1_test_corr)

        score.append(scoreregression(Y1_test_corr,Y1_pred))

    return np.mean(score)

In [14]:
#Linear regression model selection
feat_selection = "corr" # Choose here which kind of feature selection to use(X_correlation,X_pca,X_mutual_info)
if feat_selection=="corr":
    file = open("../Scores/linear_corr.txt", "w")
    feat_vector = X_correlation
elif feat_selection=="pca":
    file = open("../Scores/linear_pca.txt", "w")
    feat_vector = X_pca
elif feat_selection=="mutual_info":
    file = open("../Scores/linear_mutual_info.txt", "w")
    feat_vector = X_mutual_info

file.write("score, nb_features\n")
for feat in feat_vector:
    score = linear_regression(X1_val,Y1_val, feat)
    file.write("%.5f, %d\n" %(score, feat.shape[1]))
file.close()

In [15]:
#knn regression
def knn_regression(X,Y,f_selection,nb_neigh):
    score = []
    kf  = KFold(n_splits=5,shuffle=False)
    knn = KNeighborsRegressor(nb_neigh)

    for train,test in kf.split(f_selection):
        #cross valdation
        X1_train_knn = f_selection[train]
        X1_test_knn  = f_selection[test]
        Y1_train_knn = Y[train]
        Y1_test_knn  = Y[test]

        knn.fit(X1_train_knn, Y1_train_knn)
        Y1_pred = knn.predict(X1_test_knn)

        score.append(scoreregression(Y1_test_knn,Y1_pred))

    return np.mean(score)

In [16]:
#KNN model selection
neighbours=[8,9,10,11,12,13,14,15,16,17,18,19,20]

feat_selection = "pca"  # Choose here which kind of feature selection to use(X_correlation,X_pca,X_mutual_info)
if feat_selection=="corr":
    file = open("../Scores/knn_corr.txt", "w")
    feat_vector = X_correlation
elif feat_selection=="pca":
    file = open("../Scores/knn_pca.txt", "w")
    feat_vector = X_pca
elif feat_selection=="mutual_info":
    file = open("../Scores/knn_mutual_info.txt", "w")
    feat_vector = X_mutual_info

file.write("score, nb_features, nb_neighbours\n")
for ngb in neighbours:
    for feat in feat_vector:
        score=knn_regression(X1_val,Y1_val, feat,ngb)
        file.write("%.5f, %d, %d\n" %(score, feat.shape[1], ngb))
file.close()

In [17]:
#mlp regression
def mlp_regression(X,Y,f_selection,layers,learning_r):
    score = []
    kf  = KFold(n_splits=5,shuffle=False)
    reg = MLPRegressor(hidden_layer_sizes=layers,
                       activation="relu",
                       learning_rate=learning_r,
                       max_iter=200)

    for train,test in kf.split(f_selection):
        #cross valdation
        X1_train_mlp = f_selection[train]
        X1_test_mlp  = f_selection[test]
        Y1_train_mlp = Y[train]
        Y1_test_mlp  = Y[test]

        reg.fit(X1_train_mlp, Y1_train_mlp)
        Y1_pred = reg.predict(X1_test_mlp)

        score.append(scoreregression(Y1_test_mlp,Y1_pred))

    return np.mean(score)

In [18]:
#MLP model selection
layers = [(10),(12),(15),(17),(10,10),(12,12),(15,15),(17,17)]
learning = ["constant","invscaling","adaptive"]

feat_selection = "pca"  # Choose here which kind of feature selection to use(X_correlation,X_pca,X_mutual_info)
if feat_selection=="corr":
    file = open("../Scores/mlp_corr.txt", "w")
    feat_vector = X_correlation
elif feat_selection=="pca":
    file = open("../Scores/mlp_pca.txt", "w")
    feat_vector = X_pca
elif feat_selection=="mutual_info":
    file = open("../Scores/mlp_mutual_info.txt", "w")
    feat_vector = X_mutual_info

file.write("score, nb_features, nb_layers, learning_rate\n")
for lyr in layers:
    for feat in feat_vector:
        for lrn in learning:
            score = mlp_regression(X1_val,Y1_val, feat,lyr,lrn)
            if type(lyr)==int:
                file.write("%.5f, %d, %d, %s\n" %(score,feat.shape[1],lyr,lrn))
            else:
                file.write("%.5f, %d, (%d,%d), %s\n" %(score,feat.shape[1],lyr[0],lyr[1],lrn))
file.close()

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

In [19]:
#pls regression
def pls_regression(X,Y, f_selection,nb_comp):
    score = []
    kf  = KFold(n_splits=5,shuffle=False)
    pls = PLSRegression(n_components=nb_comp,max_iter=200)

    for train,test in kf.split(f_selection):
        #cross valdation
        X1_train_pls = f_selection[train]
        X1_test_pls  = f_selection[test]
        Y1_train_pls = Y[train]
        Y1_test_pls  = Y[test]

        pls.fit(X1_train_pls, Y1_train_pls)
        Y1_pred = pls.predict(X1_test_pls)

        score.append(scoreregression(Y1_test_pls,Y1_pred))

    return np.mean(score)

In [20]:
#Pls model selection
nb_components = [1,2,4,6,8,10]

feat_selection = "corr"  # Choose here which kind of feature selection to use(X_correlation,X_pca,X_mutual_info)
if feat_selection=="corr":
    file = open("../Scores/pls_corr.txt", "w")
    feat_vector = X_correlation
elif feat_selection=="pca":
    file = open("../Scores/pls_pca.txt", "w")
    feat_vector = X_pca
elif feat_selection=="mutual_info":
    file = open("../Scores/pls_mutual_info.txt", "w")
    feat_vector = X_mutual_info

file.write("score, nb_features, nb_components\n")
for nb in nb_components:
    for feat in feat_vector:
        score = pls_regression(X1_val,Y1_val, feat,nb)
        file.write("%.5f, %d, %d\n" %(score,feat.shape[1],nb))
file.close()

In [21]:
#final model evaluation
X_pca_train_15 = PCA_selection(X1_val,15)
X_pca_test_15 = PCA_selection(X1_test,15)

knn = KNeighborsRegressor(18)
knn.fit(X_pca_train_15,Y1_val)
Y1_pred = knn.predict(X_pca_test_15)

print(scoreregression(Y1_test,Y1_pred))

0.482253981080392
