In [1]:
# Importing Libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

In [2]:
# Ignoring Warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Read the CSV file from the working directory into a DataFrame
df = pd.read_csv(filepath_or_buffer='data.csv')

In [4]:
# Preview the first five rows of the freshly loaded frame
df.head(n=5)

Unnamed: 0,ID,T_degC,Salnty
0,0,10.5,33.44
1,1,10.46,33.44
2,2,10.46,33.437
3,3,10.45,33.42
4,4,10.45,33.421


In [5]:
# Data set dimensions as (rows, columns), before any cleaning
df.shape

(864863, 3)

In [6]:
# Data info: per-column non-null counts and dtypes, plus the frame's memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 864863 entries, 0 to 864862
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   ID      864863 non-null  int64  
 1   T_degC  853900 non-null  float64
 2   Salnty  817509 non-null  float64
dtypes: float64(2), int64(1)
memory usage: 19.8 MB


In [7]:
# Number of missing values in each column
df.isna().sum()

ID            0
T_degC    10963
Salnty    47354
dtype: int64

In [8]:
# Remove the ID column, leaving only T_degC and Salnty
df = df.drop(columns='ID')

In [9]:
# Drop every row that has a missing value in any column.
# The result is rebound to `df` instead of using inplace=True: the cell then
# reads as a plain transformation of its input and never mutates a frame that
# another reference might still point to.
df = df.dropna(axis=0, how='any')

In [10]:
# Re-check the per-column missing-value counts; all should be zero after the drop
df.isna().sum()

T_degC    0
Salnty    0
dtype: int64

In [11]:
# Dimensions (rows, columns) of the data set after removing every row that contained a null
df.shape

(814247, 2)

In [12]:
# Renumber the rows from 0 after the null rows were removed; drop=True discards
# the old index labels instead of keeping them as a new column.
# Rebinding (rather than inplace=True) avoids mutating the frame in place.
df = df.reset_index(drop=True)

In [13]:
# Preview the first five rows of the cleaned, re-indexed frame
df.head(n=5)

Unnamed: 0,T_degC,Salnty
0,10.5,33.44
1,10.46,33.44
2,10.46,33.437
3,10.45,33.42
4,10.45,33.421


In [14]:
# Feature set (T_degC) and target set (Salnty) as NumPy arrays.
# Columns are selected by name rather than by position, so the split does not
# silently pick the wrong column if the column order changes or the ID column
# has not been dropped yet.
X = df['T_degC'].to_numpy()
y = df['Salnty'].to_numpy()

In [15]:
# Turn the 1-D feature vector into a single-column 2-D array of shape (n_samples, 1)
X = np.reshape(X, (-1, 1))

In [16]:
# Linear Regression: unfitted estimator with default settings; it is handed to
# cross_val_score in the following cell
lin_reg = LinearRegression()

In [17]:
# 5-fold cross-validated score of the linear model.
# scoring='neg_mean_squared_error' returns the *negated* MSE of each fold, so the
# mean shown below is negative (closer to zero is better).
mse = cross_val_score(estimator=lin_reg, X=X, y=y, cv=5, scoring='neg_mean_squared_error')
mean_mse = mse.mean()
mean_mse

-0.1748181221229268

In [18]:
# Ridge Regression: unfitted estimator with default settings; it serves as the
# base estimator for the grid search in the following cell, which varies alpha
ridge = Ridge()

In [19]:
# Hyper-parameter tuning for Ridge: 5-fold grid search over the regularisation strength

# Candidate alpha values, from 1e-15 up to 100
# NOTE(review): the recorded run selected alpha=100, the largest candidate, so the
# optimum may lie beyond this grid; consider widening it.
parameters = dict(
    alpha=[1e-15, 1e-10, 1e-8, 1e-3, 1e-2, 1, 5, 10, 20, 30, 35, 40, 50, 65, 80, 90, 100],
)

# Grid search scored by negated MSE; n_jobs=-1 runs the fits on all available cores
ridge_regressor = GridSearchCV(ridge, parameters, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)

# Fit on the full data; the fitted search object is the value displayed by the cell
ridge_regressor.fit(X, y)

GridSearchCV(cv=5, estimator=Ridge(), n_jobs=-1,
             param_grid={'alpha': [1e-15, 1e-10, 1e-08, 0.001, 0.01, 1, 5, 10,
                                   20, 30, 35, 40, 50, 65, 80, 90, 100]},
             scoring='neg_mean_squared_error')

In [20]:
# Winning alpha together with its mean cross-validated score (negated MSE)
ridge_regressor.best_params_, ridge_regressor.best_score_

({'alpha': 100}, -0.17481798588184377)

In [21]:
# All scores: the raw cross-validation results dictionary of the fitted search
# (arrays such as mean/std fit and score times, one entry per alpha candidate)
ridge_regressor.cv_results_

{'mean_fit_time': array([0.09911413, 0.07379751, 0.07501688, 0.07629952, 0.07389874,
        0.07470636, 0.07350106, 0.06835427, 0.0734468 , 0.07871523,
        0.07485089, 0.06690221, 0.07063293, 0.0712584 , 0.07464323,
        0.07317266, 0.07428617]),
 'std_fit_time': array([0.03167128, 0.00406161, 0.003532  , 0.00440742, 0.00477866,
        0.00457188, 0.00334429, 0.00451548, 0.00411524, 0.0127064 ,
        0.01304947, 0.0065708 , 0.00509523, 0.00750298, 0.00782122,
        0.00630031, 0.00760377]),
 'mean_score_time': array([0.00942678, 0.01031041, 0.00819788, 0.0066153 , 0.0098196 ,
        0.00977859, 0.00990424, 0.01181722, 0.0099555 , 0.00860839,
        0.01030202, 0.01166348, 0.00911946, 0.01040111, 0.01106663,
        0.01017299, 0.00920286]),
 'std_score_time': array([0.00175882, 0.00012309, 0.00410057, 0.00461164, 0.00085765,
        0.00082052, 0.00077652, 0.00432142, 0.0013274 , 0.00184051,
        0.00598888, 0.00377783, 0.00116936, 0.0020587 , 0.0021952 ,
        0.00