# MAT330 - 3. Try a linear model
<span style="color:blue">**This notebook performs a regression with data leakage.**</span>

<span style="color:red">**Compare the performance between a correct training/validation split and a split with leakage.**</span>


In [None]:
%matplotlib inline
import os
import numpy as np
from scipy import io
import matplotlib.pyplot as plt
import pandas as pd

## Load data

In [None]:
# Remote copies of the datasets; used only when no local copy is available.
train_url = 'https://raw.githubusercontent.com/brajard/MAT330-Practical-work/master/data/train.csv'
test_url = 'https://raw.githubusercontent.com/brajard/MAT330-Practical-work/master/data/test.csv'

# Prefer the local file and fall back to the URL. Previously the URL was
# assigned and then unconditionally overwritten, so it was never used and
# the cell failed whenever ./data/ was missing.
train_filename = './data/train.csv'
if not os.path.exists(train_filename):
    train_filename = train_url
data_train = pd.read_csv(train_filename)

# Predictors are every column except 'target'; leaving the target among the
# predictors would leak the answer to the model.
X = data_train.drop('target', axis=1)
y = data_train['target']

# The test file is read as-is (no column is dropped from it here).
test_filename = './data/test.csv'
if not os.path.exists(test_filename):
    test_filename = test_url
Xtest = pd.read_csv(test_filename)


## Linear model

### 1. Feature selection

In [None]:
from sklearn.preprocessing import StandardScaler

# Use every feature without preprocessing, except 'stormid' which is removed,
# then cast the remaining columns to float so both frames share a single dtype.
Xin = X.drop(columns='stormid').astype(float)
Xin_test = Xtest.drop(columns='stormid').astype(float)

### 2. Split into Train/Val

#### Correct split:

In [None]:
from sklearn.utils import shuffle

# Seed the global NumPy RNG before shuffling the storm identifiers.
np.random.seed(10)

# Shuffle the distinct storm ids: the split is done per storm, not per row.
ids = shuffle(X['stormid'].unique())

# The first 80% of the shuffled ids go to training, the rest to validation.
limit_train = int(0.8 * len(ids))

# Row masks telling which rows belong to a training / validation storm.
in_train = X['stormid'].isin(ids[:limit_train])
in_val = X['stormid'].isin(ids[limit_train:])

# Index labels of the training / validation rows.
idx_train = X.index[in_train]
idx_val = X.index[in_val]

# Slice the float feature frame and the target with those labels.
X_train = Xin.loc[idx_train]
y_train = y.loc[idx_train]
X_val = Xin.loc[idx_val]
y_val = y.loc[idx_val]

#### Split with leakage:

In [None]:
from sklearn.model_selection import train_test_split

# Row-wise random split that ignores 'stormid': rows of the same storm may end
# up on both sides, which is the leakage this notebook demonstrates.
np.random.seed(10)
leak_parts = train_test_split(Xin, y)
X_train_leak, X_val_leak, y_train_leak, y_val_leak = leak_parts

### 3. Standardization 


In [None]:
# Correct datasets: the scaling statistics are learned on the training rows
# only and then applied unchanged to the validation and test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(Xin_test)

# Leaky datasets: this pipeline gets its own scaler, fit on its own training
# rows. It was previously fit on X_train, i.e. on the training set of the
# other split, so the leaky model was standardized with foreign statistics.
scaler_leak = StandardScaler().fit(X_train_leak)
X_train_scaled_leak = scaler_leak.transform(X_train_leak)
X_val_scaled_leak = scaler_leak.transform(X_val_leak)

### 4. Training

In [None]:
from sklearn.ensemble import RandomForestRegressor

# One default-configured forest per split.
rfreg = RandomForestRegressor()
rfreg_leak = RandomForestRegressor()

# Fit on the storm-wise (correct) training set first...
rfreg.fit(X_train_scaled, y_train)

# ...then on the row-wise (leaky) training set.
rfreg_leak.fit(X_train_scaled_leak, y_train_leak)

### 5. Validate the model

In [None]:
# Predict on each model's own validation set.
y_val_predict = rfreg.predict(X_val_scaled)
y_val_predict_leak = rfreg_leak.predict(X_val_scaled_leak)

# Predicted vs. true target for both splits on shared axes.
fig, ax = plt.subplots()
ax.scatter(y_val, y_val_predict, label='correct')
ax.scatter(y_val_leak, y_val_predict_leak, label='leak')

# Identity line: points on it are perfect predictions.
# NOTE(review): 0-140 presumably covers the range of the target - confirm.
ax.plot([0, 140], [0, 140], '-k')

# Label the axes so it is clear which one is the truth and which the prediction.
ax.set_xlabel('true target (validation)')
ax.set_ylabel('predicted target')
ax.set_title('Validation predictions: correct split vs. split with leakage')
ax.legend()
plt.show()

In [None]:
# Score each model on its own validation set using the estimator's built-in
# score method.
score = rfreg.score(X_val_scaled, y_val)
score_leak = rfreg_leak.score(X_val_scaled_leak, y_val_leak)

# The models are RandomForestRegressor instances, so the labels say so
# (they previously read "linear regression").
print('random forest score (correct): {:.3f}'.format(score))
print('random forest score (with leakage): {:.3f}'.format(score_leak))