In [14]:
# Import libraries
import pandas as pd
import numpy as np

import matplotlib
from matplotlib import pyplot as plt

import sklearn
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import mutual_info_regression

# seaborn is optional: it is only used to restyle the matplotlib figures, so a
# missing installation must not stop the notebook.
try:
    import seaborn as sns
except ImportError:
    # Catch only the "not installed" case. A bare `except:` would also hide
    # unrelated failures (and even KeyboardInterrupt).
    use_seaborn = False
else:
    use_seaborn = True
    sns.set()

# Set to True to draw the per-feature scatter plots in the data-loading cell.
showfig = False

In [15]:
# Load the feature table and the target column.
# Y1.csv is read without a header row, so its single column is named
# 'shares' explicitly.
X1 = pd.read_csv("X1.csv")
Y1 = pd.read_csv("Y1.csv", header=None, names=['shares'])

# Optional overview: one scatter plot of the target against each feature,
# placed row by row on a 12 x 5 grid.
if showfig:
    fig = plt.figure(figsize=(6.4*2, 4.8*6))
    gs = fig.add_gridspec(nrows=12, ncols=5)
    for i, header in enumerate(X1.columns):
        row, col = divmod(i, 5)
        ax = fig.add_subplot(gs[row, col])
        ax.scatter(X1[header], Y1.values, s=5)
        ax.set_xlabel(header)
    fig.tight_layout()

# Plain NumPy views of the data, used by the modelling cells below.
# Y1_val keeps the 2-D (n_samples, 1) shape of the one-column DataFrame.
X1_val = X1.values
Y1_val = Y1.values

In [16]:
#score computation
def scoref1(ytrue, ypred, th):
    """Return the F1 score of the binary decision "value is above `th`".

    Both the true values and the predictions are binarised with the same
    threshold, then compared with scikit-learn's `f1_score`.

    Parameters
    ----------
    ytrue : array-like
        Ground-truth target values.
    ypred : array-like
        Predicted target values.
    th : float
        Threshold; entries strictly greater than `th` form the positive class.

    Returns
    -------
    float
        F1 score of `ypred > th` against `ytrue > th`.
    """
    # `import sklearn` alone does not guarantee that the `sklearn.metrics`
    # submodule is loaded; import it explicitly instead of relying on another
    # scikit-learn import having pulled it in as a side effect.
    from sklearn.metrics import f1_score

    return f1_score(ytrue > th, ypred > th)

def scoreregression(ytrue, ypred, thresholds=(500, 1400, 5000, 10000)):
    """Score a regression as the mean F1 over several decision thresholds.

    For each threshold the regression is turned into a binary "above the
    threshold" problem and scored with `scoref1`; the scores are averaged.

    Parameters
    ----------
    ytrue : array-like
        Ground-truth target values.
    ypred : array-like
        Predicted target values.
    thresholds : iterable of float, optional
        Thresholds to evaluate. Defaults to the four values this notebook
        has always used: 500, 1400, 5000 and 10000.

    Returns
    -------
    float
        Mean of the per-threshold F1 scores.

    Raises
    ------
    ValueError
        If `thresholds` is empty (the mean would be undefined).
    """
    thresholds = list(thresholds)
    if not thresholds:
        raise ValueError("thresholds must contain at least one value")

    scores = [scoref1(ytrue, ypred, th=th) for th in thresholds]
    return np.mean(scores)

In [17]:
#feature selections
def correlation_selection(X, Y, nb_features=3):
    """Keep the features with the highest Pearson correlation to the target.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    Y : array-like of shape (n_samples,) or (n_samples, 1)
        Single target variable.
    nb_features : int, optional
        Number of columns to keep (default 3, the number the original
        implementation returned). Capped at the number of features in `X`.

    Returns
    -------
    numpy.ndarray of shape (n_samples, min(nb_features, n_features))
        The selected columns of `X`, ordered from most to least correlated.

    Raises
    ------
    ValueError
        If `X` is not 2-D, if `Y` does not hold one value per sample, or if
        `nb_features` is smaller than 1.
    """
    features = np.asarray(X)
    target = np.asarray(Y, dtype=float).reshape(-1)

    if features.ndim != 2:
        raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
    if target.shape[0] != features.shape[0]:
        raise ValueError("Y must contain exactly one target value per row of X")
    if nb_features < 1:
        raise ValueError("nb_features must be at least 1")

    # One row per variable: all the features first, the target last.
    data = np.concatenate((features.T.astype(float), target[np.newaxis, :]))
    # Constant columns have zero variance and produce NaN correlations;
    # silence the resulting floating-point warnings and handle the NaNs below.
    with np.errstate(divide='ignore', invalid='ignore'):
        corr = np.corrcoef(data)

    # Correlation of each feature with the target. Taking the last row and
    # dropping its last entry excludes the target's self-correlation
    # explicitly, whatever the number of features (no hard-coded row index,
    # no reliance on the unordered output of argpartition).
    target_corr = corr[-1, :-1]
    # NaN would otherwise sort as the largest value and be selected.
    target_corr = np.where(np.isnan(target_corr), -np.inf, target_corr)

    nb_features = min(nb_features, target_corr.shape[0])
    idxs = np.argsort(target_corr)[::-1][:nb_features]

    return features[:, idxs]

def PCA_selection(X, nb_components):
    """Standardise `X` and project it onto its first principal components.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    nb_components : int
        Forwarded to `PCA(n_components=...)`.

    Returns
    -------
    numpy.ndarray
        The standardised data expressed in the PCA basis.
    """
    # Zero-mean / unit-variance scaling first, so that no feature dominates
    # the decomposition only because of its scale.
    standardizer = StandardScaler(copy=True, with_mean=True, with_std=True)
    X_standardized = standardizer.fit_transform(X)

    projector = PCA(n_components=nb_components)
    return projector.fit_transform(X_standardized)

def mutual_info_selection(X, Y, nb_features=3, random_state=None):
    """Keep the features sharing the most mutual information with the target.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    Y : array-like of shape (n_samples,) or (n_samples, 1)
        Single target variable; flattened before being handed to
        scikit-learn, which expects a 1-D target.
    nb_features : int, optional
        Number of columns to keep (default 3, the number of columns the
        original code was written to select). Capped at the number of
        features in `X`.
    random_state : int, RandomState instance or None, optional
        Forwarded to `mutual_info_regression` to make the estimate
        reproducible.

    Returns
    -------
    numpy.ndarray of shape (n_samples, min(nb_features, n_features))
        The selected columns of `X`, ordered from most to least informative.

    Raises
    ------
    ValueError
        If `nb_features` is smaller than 1.
    """
    if nb_features < 1:
        raise ValueError("nb_features must be at least 1")

    features = np.asarray(X)
    target = np.ravel(Y)

    # `mutual_info_regression` returns one score per feature (a 1-D array),
    # so the scores are ranked directly. Indexing a single entry, as the
    # previous code did, yields a scalar that cannot be partitioned.
    mutual_information = mutual_info_regression(
        features, target, random_state=random_state
    )

    nb_features = min(nb_features, mutual_information.shape[0])
    idxs = np.argsort(mutual_information)[::-1][:nb_features]

    return features[:, idxs]

In [5]:
def linear_regression(X, Y):
    """Fit and score a linear regression on correlation-selected features.

    The features are reduced with `correlation_selection`, split 70/30 into
    train and test sets (fixed `random_state=1`), fitted with ordinary least
    squares, and the `scoreregression` value on the test set is printed.

    Parameters
    ----------
    X : numpy.ndarray of shape (n_samples, n_features)
        Feature matrix.
    Y : numpy.ndarray of shape (n_samples, 1)
        Target values.

    Returns
    -------
    None
        The score is printed, not returned.
    """
    X_corr = correlation_selection(X, Y)

    # Split against the `Y` that was passed in. The previous code used the
    # notebook-level `Y1_val` here, silently ignoring the argument.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X_corr, Y, random_state=1, test_size=0.3
    )

    regr = linear_model.LinearRegression()
    regr.fit(X_train, Y_train)
    Y_pred = regr.predict(X_test)

    print(scoreregression(Y_test, Y_pred))

In [6]:
# Run the linear-regression baseline on the loaded data; the function prints its score.
linear_regression(X1_val,Y1_val)

0.47776142900688745


In [7]:
def knn_regression(X, Y=None):
    """Fit and score a 20-nearest-neighbours regressor on 5 PCA components.

    The features are standardised and projected with `PCA_selection`, split
    70/30 into train and test sets (fixed `random_state=1`), and the
    `scoreregression` value on the test set is printed.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    Y : array-like of shape (n_samples, 1), optional
        Target values. Defaults to the notebook-level `Y1_val`, which is what
        the function always used before this parameter existed.

    Returns
    -------
    None
        The score is printed, not returned.
    """
    if Y is None:
        Y = Y1_val

    # Use the `X` that was passed in. The previous code read the global
    # `X1_val` here, silently ignoring the argument.
    # NOTE(review): scaler and PCA are fitted on all rows before the split,
    # so the test rows influence the projection.
    X_knn = PCA_selection(X, 5)

    X_train, X_test, Y_train, Y_test = train_test_split(
        X_knn, Y, random_state=1, test_size=0.3
    )

    knn = KNeighborsRegressor(20)
    knn.fit(X_train, Y_train)
    Y_pred = knn.predict(X_test)

    print(scoreregression(Y_test, Y_pred))

In [8]:
# Run the k-nearest-neighbours experiment on the loaded data; the function prints its score.
knn_regression(X1_val)

0.4954803984890093


In [9]:
def mlp_regression(X, Y=None):
    """Fit and score a (20, 20) ReLU multi-layer perceptron on 25 PCA components.

    The features are standardised and projected with `PCA_selection`, split
    70/30 into train and test sets (fixed `random_state=1`), and the
    `scoreregression` value on the test set is printed.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    Y : array-like of shape (n_samples, 1), optional
        Single-target values. Defaults to the notebook-level `Y1_val`, which
        is what the function always used before this parameter existed.

    Returns
    -------
    None
        The score is printed, not returned.
    """
    if Y is None:
        Y = Y1_val

    # Use the `X` that was passed in. The previous code read the global
    # `X1_val` here, silently ignoring the argument.
    # NOTE(review): scaler and PCA are fitted on all rows before the split,
    # so the test rows influence the projection.
    X_mlp = PCA_selection(X, 25)

    X_train, X_test, Y_train, Y_test = train_test_split(
        X_mlp, Y, random_state=1, test_size=0.3
    )

    reg = MLPRegressor(hidden_layer_sizes=(20, 20), activation="relu",
                       random_state=1, max_iter=200)
    # MLPRegressor expects a 1-D target for single-output regression;
    # flatten the column vector explicitly rather than letting scikit-learn
    # convert it with a warning.
    reg.fit(X_train, np.ravel(Y_train))
    Y_pred = reg.predict(X_test)

    print(scoreregression(Y_test, Y_pred))


In [10]:
# Run the multi-layer-perceptron experiment on the loaded data; the function prints its score.
mlp_regression(X1_val)

0.48769223361889025


In [11]:
# Disabled experiment: the code below is wrapped in a bare string literal, so
# none of it runs. Because the string is the cell's last expression, the
# notebook simply echoes it back as the cell output.
# What it would do if re-enabled: standardise X1_val, fit a PCA with as many
# components as X1 has columns, find the smallest number of components P whose
# cumulative explained variance exceeds tau = 0.95, and plot the curve.
# NOTE: inside this non-raw string, `\tau` and `\n` are escape sequences
# (tab + "au", newline), which is why the echoed text differs from the source.
"""
scaler_bis = StandardScaler(copy=True,with_mean=True,with_std=True)
X1_normalized = scaler_bis.fit_transform(X1_val)

pca_bis = PCA(n_components=X1.shape[-1])

data_transformed = pca_bis.fit_transform(X1_normalized)
eig_val = pca_bis.explained_variance_
eig_vec = pca_bis.components_

# Compute an array E, where E(P) indicates the variance captured in the first P component.
E = np.array([eig_val[:p+1].sum()/eig_val.sum() for p in range(len(eig_val))])
tau = 0.95 # Threshold

# Find the minimum P that captures \tau portion of the variance
P = np.where(E>tau)[0][0] +1

print('Minimum number of components that preserve {} of the variance = {} \n' .format(tau,P))
fig = plt.figure(figsize=(20,5))
ax1 = fig.add_subplot() #(131)
ax1.plot(np.arange(1,E.shape[0]+1), E, 'o-', markersize=8, color='blue', alpha=0.5)
ax1.set_xlabel('number of components')
ax1.set_ylabel('Preserved variance')
"""

"\nscaler_bis = StandardScaler(copy=True,with_mean=True,with_std=True)\nX1_normalized = scaler_bis.fit_transform(X1_val)\n\npca_bis = PCA(n_components=X1.shape[-1])\n\ndata_transformed = pca_bis.fit_transform(X1_normalized)\neig_val = pca_bis.explained_variance_\neig_vec = pca_bis.components_\n\n# Compute an array E, where E(P) indicates the variance captured in the first P component.\nE = np.array([eig_val[:p+1].sum()/eig_val.sum() for p in range(len(eig_val))])\ntau = 0.95 # Threshold\n\n# Find the minimum P that captures \tau portion of the variance\nP = np.where(E>tau)[0][0] +1\n\nprint('Minimum number of components that preserve {} of the variance = {} \n' .format(tau,P))\nfig = plt.figure(figsize=(20,5))\nax1 = fig.add_subplot() #(131)\nax1.plot(np.arange(1,E.shape[0]+1), E, 'o-', markersize=8, color='blue', alpha=0.5)\nax1.set_xlabel('number of components')\nax1.set_ylabel('Preserved variance')\n"

In [12]:
# Disabled experiment: the code below is wrapped in a bare string literal, so
# none of it runs; the notebook only echoes the string as the cell output.
# What it would do if re-enabled: grid-search MLPRegressor over the hidden
# layer sizes and the L2 penalty `alpha`, then print the best combination.
# NOTE(review): `X1_train_pca` and `Y1_train` are not defined at notebook
# level in the cells visible here (`X1_train_pca` only exists as a local
# inside knn_regression / mlp_regression), so they would have to be created
# before this cell can be re-enabled.
"""
parameters={
'learning_rate': ["adaptive"],
'hidden_layer_sizes':[(20),(30),(35),(20,20),(30,30),(35,35)],
'alpha': [0.001,0.0001,0.00001],
'activation': ["relu"]
}

mlp=MLPRegressor()

clf = GridSearchCV(mlp, parameters)

clf.fit(X1_train_pca, Y1_train)

print(clf.best_params_)
"""

'\nparameters={\n\'learning_rate\': ["adaptive"],\n\'hidden_layer_sizes\':[(20),(30),(35),(20,20),(30,30),(35,35)],\n\'alpha\': [0.001,0.0001,0.00001],\n\'activation\': ["relu"]\n}\n\nmlp=MLPRegressor()\n\nclf = GridSearchCV(mlp, parameters)\n\nclf.fit(X1_train_pca, Y1_train)\n\nprint(clf.best_params_)\n'