In [None]:

# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)




In [2]:
import pandas as pd

test= pd.read_csv(r"C:\Users\alesi\Documents\group_project_ml\test.csv")
train= pd.read_csv(r"C:\Users\alesi\Documents\group_project_ml\train.csv")

In [3]:
print(test)
print(train)

             0         1         2         3         4         5         6  \
0    -0.670595 -0.839068 -0.734415 -0.587261 -0.788800 -0.975857 -0.774088   
1     0.188165  0.166410  0.321011  0.318078  0.641710  0.951932  1.170069   
2     0.886510  0.760716  0.751800  0.052198 -0.050958 -0.140734 -0.173480   
3    -0.187722  0.030199 -0.072558 -0.098400 -0.110795 -0.127632 -0.241193   
4     0.248822  0.168815  0.260804  0.505885  0.471486  1.018661  0.971406   
...        ...       ...       ...       ...       ...       ...       ...   
3136  1.033977  1.043510  1.029088  1.038866  1.044243  1.032999  1.034221   
3137  0.998393  0.997933  0.989993  1.001384  0.995516  0.983090  0.989993   
3138  1.288652  1.279170  1.271384  1.280467  1.276575  1.245434  1.208504   
3139  0.804169  0.821925  0.824755  0.836336  0.840196  0.827586  0.834277   
3140  1.063461  1.063461  1.065692  1.081557  1.058999  1.056272  1.066931   

             7         8         9  ...        40        41    

In [4]:
train.head()

Unnamed: 0,w,y,0,1,2,3,4,5,6,7,...,40,41,42,43,44,45,46,47,48,49
0,1,0.48313,0.79098,0.702555,0.52822,0.298746,0.025488,-0.17348,-0.24529,-0.405057,...,0.954288,1.143901,1.359252,1.081061,1.364409,1.449354,1.195431,1.195992,1.165327,0.77111
1,1,1.135624,0.765286,0.604512,0.414197,0.241638,0.181862,-0.03192,-0.070617,-0.18598,...,0.561545,0.678086,0.84895,1.133852,1.041396,1.242806,1.248121,1.331348,1.267123,1.292718
2,1,0.686081,0.702834,0.637708,0.798416,0.755065,0.705225,0.535391,0.613129,0.549732,...,1.328694,1.324254,1.272889,1.074786,0.75395,0.539693,0.402041,0.442759,0.487557,0.699007
3,1,-1.342005,0.018378,-0.097297,-0.02085,-0.083325,-0.268512,-0.486335,-0.73113,-0.924458,...,-0.386676,-0.357946,-0.612069,-0.698063,-0.891789,-1.127624,-1.535678,-1.490786,-1.85684,-1.441472
4,1,-0.241418,0.46312,0.665307,0.446953,0.48078,0.3925,0.309231,0.158462,0.190963,...,-0.174944,-0.529541,-0.191921,-0.258093,-0.542403,-0.414866,-0.48558,-0.76882,-0.662573,-0.211837


In [5]:
test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,-0.670595,-0.839068,-0.734415,-0.587261,-0.7888,-0.975857,-0.774088,-1.021334,-1.110286,-0.893337,...,-0.997627,-1.103794,-1.092988,-0.989165,-0.827528,-0.813729,-0.532411,-0.289483,-0.40772,-0.407505
1,0.188165,0.16641,0.321011,0.318078,0.64171,0.951932,1.170069,1.177711,0.987763,0.981345,...,0.743405,0.916254,0.866453,0.953677,0.716259,0.692816,0.446713,0.539733,0.279293,0.180641
2,0.88651,0.760716,0.7518,0.052198,-0.050958,-0.140734,-0.17348,0.178508,0.198187,0.357906,...,0.444142,0.492294,0.573348,0.546323,0.373874,0.699132,0.808303,1.118522,1.284887,1.541929
3,-0.187722,0.030199,-0.072558,-0.0984,-0.110795,-0.127632,-0.241193,-0.374608,-0.651771,-0.513491,...,-0.340927,-0.268253,-0.654777,-1.133722,-1.484557,-1.446644,-1.654337,-1.521009,-1.593825,-1.110684
4,0.248822,0.168815,0.260804,0.505885,0.471486,1.018661,0.971406,1.062348,0.986871,0.947982,...,0.422044,0.688196,0.382416,0.344843,0.177595,0.330549,0.595061,0.88486,1.125103,1.220779


In [6]:
financial_train = train.drop("y", axis=1) # drop labels for training 
y=train["y"].copy()

In [7]:
financial_train


Unnamed: 0,w,0,1,2,3,4,5,6,7,8,...,40,41,42,43,44,45,46,47,48,49
0,1,0.790980,0.702555,0.528220,0.298746,0.025488,-0.173480,-0.245290,-0.405057,-0.371818,...,0.954288,1.143901,1.359252,1.081061,1.364409,1.449354,1.195431,1.195992,1.165327,0.771110
1,1,0.765286,0.604512,0.414197,0.241638,0.181862,-0.031920,-0.070617,-0.185980,-0.188546,...,0.561545,0.678086,0.848950,1.133852,1.041396,1.242806,1.248121,1.331348,1.267123,1.292718
2,1,0.702834,0.637708,0.798416,0.755065,0.705225,0.535391,0.613129,0.549732,0.472387,...,1.328694,1.324254,1.272889,1.074786,0.753950,0.539693,0.402041,0.442759,0.487557,0.699007
3,1,0.018378,-0.097297,-0.020850,-0.083325,-0.268512,-0.486335,-0.731130,-0.924458,-0.995159,...,-0.386676,-0.357946,-0.612069,-0.698063,-0.891789,-1.127624,-1.535678,-1.490786,-1.856840,-1.441472
4,1,0.463120,0.665307,0.446953,0.480780,0.392500,0.309231,0.158462,0.190963,0.273600,...,-0.174944,-0.529541,-0.191921,-0.258093,-0.542403,-0.414866,-0.485580,-0.768820,-0.662573,-0.211837
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7321,1,1.002615,1.007519,0.990882,1.009554,1.009953,1.016928,0.980784,0.919103,0.957826,...,1.003996,1.015111,1.014566,1.032765,1.035272,1.036289,1.027571,1.028043,1.032729,1.021831
7322,1,1.067617,1.075324,1.068344,1.086520,1.088120,1.071688,1.086666,1.074451,1.070816,...,1.022539,1.050458,1.064563,1.055257,1.057292,1.048132,1.050167,1.048422,1.051767,1.044205
7323,1,0.976882,0.978272,0.972884,0.978098,0.977229,0.968538,0.967669,0.954632,0.958978,...,1.050409,1.073527,1.069529,1.075787,1.072658,1.066400,1.069355,1.074744,1.074917,1.068486
7324,1,0.851952,0.853579,0.871475,0.878525,0.882863,0.880152,0.886117,0.882321,0.881236,...,1.042842,1.045553,1.039588,1.060737,1.053688,1.047180,1.052061,1.050434,1.056399,1.056941


In [8]:
financial_train.isnull()

Unnamed: 0,w,0,1,2,3,4,5,6,7,8,...,40,41,42,43,44,45,46,47,48,49
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7321,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7322,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7323,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7324,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [9]:
financial_train.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7326 entries, 0 to 7325
Data columns (total 51 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   w       7326 non-null   int64  
 1   0       7326 non-null   float64
 2   1       7326 non-null   float64
 3   2       7326 non-null   float64
 4   3       7326 non-null   float64
 5   4       7326 non-null   float64
 6   5       7326 non-null   float64
 7   6       7326 non-null   float64
 8   7       7326 non-null   float64
 9   8       7326 non-null   float64
 10  9       7326 non-null   float64
 11  10      7326 non-null   float64
 12  11      7326 non-null   float64
 13  12      7326 non-null   float64
 14  13      7326 non-null   float64
 15  14      7326 non-null   float64
 16  15      7326 non-null   float64
 17  16      7326 non-null   float64
 18  17      7326 non-null   float64
 19  18      7326 non-null   float64
 20  19      7326 non-null   float64
 21  20      7326 non-null   float64
 22  

In [10]:
sample_incomplete_rows = financial_train[financial_train.isnull().any(axis=1)].head()
sample_incomplete_rows

Unnamed: 0,w,0,1,2,3,4,5,6,7,8,...,40,41,42,43,44,45,46,47,48,49


In [19]:
financial_train.mean(axis=0)

#financial_train.std(axis=0)

w     1.000000
0     0.016903
1     0.018974
2     0.023204
3     0.027638
4     0.031068
5     0.039904
6     0.045646
7     0.051259
8     0.055237
9     0.059982
10    0.063486
11    0.060599
12    0.059525
13    0.057512
14    0.054321
15    0.049322
16    0.046435
17    0.044036
18    0.040652
19    0.041987
20    0.040917
21    0.040409
22    0.040925
23    0.046406
24    0.048168
25    0.051337
26    0.055149
27    0.055678
28    0.057295
29    0.059418
30    0.054513
31    0.052467
32    0.050844
33    0.047891
34    0.043695
35    0.041315
36    0.039177
37    0.036693
38    0.034440
39    0.030586
40    0.036622
41    0.039024
42    0.043605
43    0.045400
44    0.049477
45    0.055759
46    0.060715
47    0.070436
48    0.078284
49    0.081028
dtype: float64

In [12]:
import sys
sys.executable


'C:\\Users\\alesi\\anaconda3\\python.exe'

In [13]:
import sklearn
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
print(scaler.fit(financial_train))
print(scaler.transform(financial_train))

StandardScaler()
[[0.         1.47338019 1.30235825 ... 1.30566091 1.23364047 0.7724038 ]
 [0.         1.42447536 1.11556794 ... 1.46267542 1.34916464 1.35623541]
 [0.         1.30560409 1.17881301 ... 0.43189994 0.46446768 0.69169975]
 ...
 [0.         1.82722656 1.82765497 ... 1.16501074 1.13103868 1.10525443]
 [0.         1.58943544 1.59008974 ... 1.136811   1.11002293 1.09233256]
 [0.         2.16700344 2.16893165 ... 1.13269484 1.09770608 1.05858625]]


In [20]:
fin_train_tr=scaler.fit_transform(financial_train)

In [21]:
fin_train_tr

array([[0.        , 1.47338019, 1.30235825, ..., 1.30566091, 1.23364047,
        0.7724038 ],
       [0.        , 1.42447536, 1.11556794, ..., 1.46267542, 1.34916464,
        1.35623541],
       [0.        , 1.30560409, 1.17881301, ..., 0.43189994, 0.46446768,
        0.69169975],
       ...,
       [0.        , 1.82722656, 1.82765497, ..., 1.16501074, 1.13103868,
        1.10525443],
       [0.        , 1.58943544, 1.59008974, ..., 1.136811  , 1.11002293,
        1.09233256],
       [0.        , 2.16700344, 2.16893165, ..., 1.13269484, 1.09770608,
        1.05858625]])

In [22]:
fin_train_tr.shape

(7326, 51)

In [24]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
reg=lin_reg.fit(fin_train_tr, y)

In [25]:
reg.coef_


array([-3.22655118e-17, -3.34435281e-02, -2.18638984e-02, -4.79024293e-03,
        1.00354517e-02,  9.06220587e-03,  3.37682379e-03, -2.60761139e-02,
       -4.78667970e-03, -9.22734994e-03, -6.35231575e-03, -1.64575017e-02,
        1.18452641e-02, -4.95130008e-03,  4.94386796e-03,  4.89572843e-03,
       -1.73787263e-02,  3.21569799e-03,  9.23936053e-04,  1.92482862e-02,
        2.33734567e-02,  2.88237312e-02,  3.28647979e-02,  4.46000499e-03,
        3.52186374e-03,  1.41236562e-02,  5.80923820e-03,  1.39946248e-02,
        5.15499646e-02,  3.94230051e-02,  2.66438936e-02, -1.18676667e-02,
        2.86494500e-03, -2.85030348e-03,  3.21493729e-02,  5.47392023e-02,
        6.09082667e-02,  3.87584383e-02, -1.53845735e-02, -4.30981803e-02,
       -5.68740933e-02, -5.75771621e-03,  2.81220450e-02,  5.59567792e-02,
        3.56888898e-02, -2.53793003e-03, -1.30826349e-01, -1.70443946e-01,
       -6.74140281e-02,  2.44145141e-01,  7.11379589e-01])

In [30]:

from sklearn.metrics import mean_squared_error

fin_predictions = lin_reg.predict(fin_train_tr)
lin_mse = mean_squared_error(y, fin_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

0.15053862493315537

array([0.7321062 , 1.12924309, 0.7445552 , ..., 1.14963541, 1.16255139,
       1.08235199])

0       0.483130
1       1.135624
2       0.686081
3      -1.342005
4      -0.241418
          ...   
7321    1.026517
7322    1.039843
7323    1.064836
7324    1.052061
7325    1.028131
Name: y, Length: 7326, dtype: float64

In [39]:
from sklearn.metrics import mean_absolute_error

lin_mae = mean_absolute_error(y, fin_predictions)
lin_mae

0.12126773129689675