## Online News

In [1]:
%matplotlib notebook
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPRegressor
from neupy import algorithms, layers, estimators, environment
from tqdm import tqdm


df_train = pd.read_csv("NormalizedPCA.csv")
df_test = pd.read_csv("NormalizedPCA_test.csv")

df_train.head(2)

Unnamed: 0.1,Unnamed: 0,shares,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,...,PC17,PC18,PC19,PC20,PC21,PC22,PC23,PC24,PC25,shares_normalized
0,1,3800,-0.514464,0.270349,0.150524,0.658502,-0.40058,-1.14124,0.653797,-0.123796,...,-1.564416,-0.367786,-0.34344,0.761752,-0.13682,-0.336795,-0.2384,-0.050292,-0.004404,0.550262
1,2,1500,-0.962429,-0.916608,-0.05813,0.201494,0.517517,-0.486776,-0.234707,-0.538595,...,-0.199748,-0.173496,-0.515639,-0.413213,0.135065,-6.1e-05,-0.193192,-0.070345,-0.003562,0.217121


In [2]:
df_train.columns

Index(['Unnamed: 0', 'shares', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7',
       'PC8', 'PC9', 'PC10', 'PC11', 'PC12', 'PC13', 'PC14', 'PC15', 'PC16',
       'PC17', 'PC18', 'PC19', 'PC20', 'PC21', 'PC22', 'PC23', 'PC24', 'PC25',
       'shares_normalized'],
      dtype='object')

In [3]:
X_train = df_train.loc[:,'PC1':'PC20']
y_train = df_train['shares']

X_test = df_test.loc[:,'PC1':'PC20']
y_test = df_test['shares']

## MLP

In [4]:
result = []
for thisactivation in tqdm(['tanh', 'relu', 'logistic']):
    for thisalpha in [0.0001, 1.0, 100]:
        for sz in [10, 50, 100]:
            mlpreg = MLPRegressor(hidden_layer_sizes= [sz,sz,sz],
                                 activation=thisactivation,
                                 alpha=thisalpha, solver='lbfgs').fit(X_train, y_train)
            
            result.append([sz, thisalpha, thisactivation, mlpreg.score(X_test, y_test)])

# The coefficient R^2 is defined as (1 - u/v), 
# where u is the residual sum of squares ((y_true - y_pred) ** 2).sum() and 
# v is the total sum of squares ((y_true - y_true.mean()) ** 2).sum(). 
# The best possible score is 1.0 and it can be negative (because the model 
# can be arbitrarily worse). A constant model that always predicts the 
# expected value of y, disregarding the input features, would get a R^2 score of 0.0.

results = pd.DataFrame(result, columns=['size x size', 'alpha', 'activation', 'R^2 score'])
results = results.sort_values('R^2 score', ascending=0)
results.head(5)

100%|██████████| 3/3 [06:54<00:00, 138.14s/it]


Unnamed: 0,size x size,alpha,activation,R^2 score
4,50,1.0,tanh,0.002131
21,10,1.0,logistic,0.000774
1,50,0.0001,tanh,0.000694
18,10,0.0001,logistic,0.000247
6,10,100.0,tanh,0.000225


## GRNN

In [5]:
result = []

for s in tqdm(np.arange(5e-2, 0.5, 5e-2)):
    grnnet = algorithms.GRNN(std=s, verbose=False)
    grnnet.train(X_train, y_train)
    p_result = grnnet.predict(X_test)
    result.append([s, estimators.rmsle(p_result, y_test)])
        
results = pd.DataFrame(result, columns=['std', 'RMSLE'])
results = results.sort_values('RMSLE', ascending=1)
results.head(5)

100%|██████████| 9/9 [06:33<00:00, 43.68s/it]


Unnamed: 0,std,RMSLE
0,0.05,
1,0.1,
2,0.15,
3,0.2,
4,0.25,
