In [13]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [14]:
# Load the de-correlated dataset and discard the 'Unnamed: 0' column
# (presumably an index written out by an earlier to_csv call -- confirm),
# then preview the first rows.
non_correlated_data = (
    pd.read_csv('removed_correlations.csv')
    .drop(columns=['Unnamed: 0'])
)
non_correlated_data.head()

Unnamed: 0,0,1,2,3,5,6
0,1.0,1.0,0.0,341.120176,95.554853,75.181807
1,0.0,0.0,0.0,118.610087,66.456064,93.011581
2,1.0,1.0,0.0,77.254576,40.766024,-1.588632
3,1.0,0.0,1.0,476.980523,144.335903,512.778893
4,1.0,0.0,1.0,258.225846,144.487597,364.063824


In [15]:
# Column dtypes and non-null counts: a quick check for missing values.
non_correlated_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2550 entries, 0 to 2549
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       2550 non-null   float64
 1   1       2550 non-null   float64
 2   2       2550 non-null   float64
 3   3       2550 non-null   float64
 4   5       2550 non-null   float64
 5   6       2550 non-null   float64
dtypes: float64(6)
memory usage: 119.7 KB


In [16]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
non_correlated_data.describe()

Unnamed: 0,0,1,2,3,5,6
count,2550.0,2550.0,2550.0,2550.0,2550.0,2550.0
mean,0.492941,0.507059,0.508235,252.348506,89.852496,175.96188
std,0.500048,0.500048,0.50003,146.556638,35.302143,125.695888
min,0.0,0.0,0.0,0.050706,29.013959,-113.121946
25%,0.0,0.0,0.0,128.646969,59.582918,85.310735
50%,0.0,1.0,1.0,248.703167,89.919219,159.972378
75%,1.0,1.0,1.0,380.42592,119.406005,256.186915
max,1.0,1.0,1.0,511.547524,151.990754,545.342187


In [17]:
# Every column except the last is a predictor; the last column is the
# regression target. Despite its name, y_predicted holds the observed target
# values, not model predictions.
feature_columns = non_correlated_data.iloc[:, :-1]
target_column = non_correlated_data.iloc[:, -1]
X_features = feature_columns.to_numpy()
y_predicted = target_column.to_numpy()
print(y_predicted)

[ 75.18180738  93.01158133  -1.58863177 ... 187.1155391  265.6732347
 352.7798772 ]


## Feature Scaling

In [18]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column (StandardScaler defaults: zero mean,
# unit variance).
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split, so test-set statistics leak into the training data.
# The model pipelines further down also apply their own StandardScaler.
# Consider fitting on the training portion only.
sc = StandardScaler().fit(X_features)
X_features = sc.transform(X_features)

## Splitting the dataset into the Training set and Test set

In [19]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# repeatable across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_features,
    y_predicted,
    test_size=0.20,
    random_state=42,
)

In [20]:
from sklearn import preprocessing

# Map each distinct value in Y_train to an integer class index.
# NOTE(review): the target is continuous and the regressors below are trained
# on Y_train itself; training_scores_encoded is not used anywhere in the
# visible part of the notebook -- confirm it is still needed.
lab_enc = preprocessing.LabelEncoder()
lab_enc.fit(Y_train)
training_scores_encoded = lab_enc.transform(Y_train)
print(Y_train)

[ 35.59668763 169.7109423  111.9011765  ... 362.3678406  412.5520636
 137.0461167 ]


In [28]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from xgboost.sklearn import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
# Candidate regressors as (pipeline name, estimator step name, estimator).
# Each estimator is wrapped in a Pipeline behind its own StandardScaler, so
# the scaler is re-fitted on the training part of every cross-validation fold.
# NOTE(review): 'ScaledCART' and 'ScaledDT' are both a default
# DecisionTreeRegressor; one of the two looks redundant.
# NOTE(review): the tree, GBM and random-forest models are given no
# random_state, so their scores may vary slightly between runs.
candidate_models = [
    ('ScaledLR', 'LR', LinearRegression()),
    ('ScaledLASSO', 'LASSO', Lasso()),
    ('ScaledEN', 'EN', ElasticNet()),
    ('ScaledXGB', 'XGB', XGBRegressor()),
    ('ScaledKNN', 'KNN', KNeighborsRegressor()),
    ('ScaledCART', 'CART', DecisionTreeRegressor()),
    ('ScaledSVR', 'SVR', SVR()),
    ('ScaledGBM', 'GBM', GradientBoostingRegressor()),
    ('ScaledDT', 'DT', DecisionTreeRegressor()),
    ('ScaledRF', 'RF', RandomForestRegressor()),
]
pipelines = [
    (pipeline_name, Pipeline([('Scaler', StandardScaler()), (step_name, estimator)]))
    for pipeline_name, step_name, estimator in candidate_models
]

# Build the splitter once; it does not depend on the model being evaluated.
# random_state is deliberately not passed: without shuffle=True it has no
# effect, and scikit-learn >= 0.24 raises a ValueError for that combination.
# The rows of X_train were already shuffled by train_test_split, so
# contiguous folds are acceptable here.
kfold = KFold(n_splits=10)

results = []
names = []
for name, model in pipelines:
    # Scores are negated mean squared errors: values closer to zero are better.
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='neg_mean_squared_error')
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)



ScaledLR: -1256.504564 (140.262848)
ScaledLASSO: -1261.033782 (143.775453)
ScaledEN: -2975.418745 (366.749553)
ScaledXGB: -229.598272 (29.218381)




ScaledKNN: -208.601387 (22.225857)
ScaledCART: -354.583473 (31.354199)
ScaledSVR: -2052.753292 (425.153801)




ScaledGBM: -229.670129 (27.960958)
ScaledDT: -347.430015 (27.724121)




ScaledRF: -205.108723 (20.277402)


### From the output above, the scaled Random Forest (RF) pipeline performs best, with the lowest cross-validated mean squared error (about 205). From this point onward, I will build the regression model using RF. I will also implement KNN regression, which had the second-lowest error (about 209), and XGBoost, which had the third-lowest error (about 230, essentially tied with GBM).