In [1]:
# Importing Libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso

In [2]:
# Ignoring Warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Importing Data: read the CSV file 'data.csv' (path is relative to the
# current working directory) into the DataFrame `df` used by the later cells.
df = pd.read_csv('data.csv')

In [4]:
# Preview the first five rows of the freshly loaded frame
df.head(n=5)

Unnamed: 0,ID,T_degC,Salnty
0,0,10.5,33.44
1,1,10.46,33.44
2,2,10.46,33.437
3,3,10.45,33.42
4,4,10.45,33.421


In [5]:
# Data Set Dimensions: (number of rows, number of columns) of the loaded frame
df.shape

(864863, 3)

In [6]:
# Data Info: per-column non-null counts and dtypes, plus the frame's memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 864863 entries, 0 to 864862
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   ID      864863 non-null  int64  
 1   T_degC  853900 non-null  float64
 2   Salnty  817509 non-null  float64
dtypes: float64(2), int64(1)
memory usage: 19.8 MB


In [7]:
# Number of missing values in each column of the raw data
df.isna().sum()

ID            0
T_degC    10963
Salnty    47354
dtype: int64

In [8]:
# Dropping ID Column so that only T_degC and Salnty remain.
# `errors='ignore'` keeps this cell idempotent: re-running it after the
# column has already been removed no longer raises a KeyError.
df = df.drop(columns='ID', errors='ignore')

In [9]:
# Dropping Null Values: keep only the rows in which no column is missing.
# The result is rebound to `df` rather than using inplace=True, matching the
# way the ID column is dropped and avoiding in-place mutation of the frame.
df = df.dropna(axis=0, how='any')

In [10]:
# Missing values per column after the null rows were removed (expected: all zero)
df.isna().sum()

T_degC    0
Salnty    0
dtype: int64

In [11]:
# Dimensions of the dataset, as (rows, columns), after removing every row
# that contained a null
df.shape

(814247, 2)

In [12]:
# Resetting Index: renumber the rows 0..n-1 after the null rows were removed.
# drop=True discards the old index instead of keeping it as a new column;
# the result is rebound to `df` rather than mutating the frame in place.
df = df.reset_index(drop=True)

In [13]:
# First five rows of the cleaned, re-indexed dataset
df.head(n=5)

Unnamed: 0,T_degC,Salnty
0,10.5,33.44
1,10.46,33.44
2,10.46,33.437
3,10.45,33.42
4,10.45,33.421


In [14]:
# Feature Set and Target Set
# Columns are selected by name rather than by position, so the split cannot
# silently swap feature and target if the column order ever changes.
# Both results are 1-D NumPy arrays: X is temperature, y is salinity.
X = df['T_degC'].to_numpy()
y = df['Salnty'].to_numpy()

In [15]:
# Turn the feature vector into a single-column 2-D array (n_samples, 1),
# the layout the estimators below are fitted on
X = np.reshape(X, (-1, 1))

In [16]:
# Linear Regression: unfitted estimator created with its default settings;
# it is scored with cross-validation in the next cell
lin_reg = LinearRegression()

In [17]:
# Cross-validated error of the linear model over 5 folds.
# The scorer is 'neg_mean_squared_error', so every fold score is the
# *negated* MSE (closer to zero is better) and the mean is negative too.
mse = cross_val_score(
    estimator=lin_reg,
    X=X,
    y=y,
    cv=5,
    scoring='neg_mean_squared_error',
)
mean_mse = mse.mean()
mean_mse

-0.1748181221229268

In [18]:
# Lasso Regression: unfitted estimator created with its default settings;
# its `alpha` is tuned by the grid search in the next cell
lasso = Lasso()

In [19]:
# Hyper-parameter tuning for Lasso via exhaustive grid search

# Candidate values for the regularisation strength `alpha`
parameters = dict(
    alpha=[
        1e-15, 1e-10, 1e-8, 1e-3, 1e-2,
        1, 5, 10, 20, 30, 35, 40, 50, 65, 80, 90, 100,
    ],
)

# 5-fold cross-validated search scored by negated MSE, using all CPU cores
lasso_regressor = GridSearchCV(
    lasso,
    parameters,
    cv=5,
    n_jobs=-1,
    scoring='neg_mean_squared_error',
)

# Run the search on the full feature/target arrays
lasso_regressor.fit(X, y)

GridSearchCV(cv=5, estimator=Lasso(), n_jobs=-1,
             param_grid={'alpha': [1e-15, 1e-10, 1e-08, 0.001, 0.01, 1, 5, 10,
                                   20, 30, 35, 40, 50, 65, 80, 90, 100]},
             scoring='neg_mean_squared_error')

In [20]:
# Winning `alpha` from the grid search and its mean cross-validated score
# (negated MSE, so closer to zero is better)
lasso_regressor.best_params_, lasso_regressor.best_score_

({'alpha': 0.01}, -0.17476260712980246)

In [21]:
# All Scores: the raw results dictionary of the grid search, holding one
# array entry per alpha candidate (e.g. mean/std fit and score times).
# NOTE(review): the raw dict is verbose; wrapping it in
# pd.DataFrame(lasso_regressor.cv_results_) would give a tabular view.
lasso_regressor.cv_results_

{'mean_fit_time': array([1.49650655, 0.13202353, 0.1389502 , 0.14242501, 0.13798056,
        0.12810135, 0.12351098, 0.11774726, 0.12345796, 0.12473102,
        0.12780175, 0.121667  , 0.12279301, 0.13099332, 0.12935138,
        0.12981057, 0.12086096]),
 'std_fit_time': array([2.73420522e+00, 3.78464467e-03, 8.13457316e-03, 5.71230213e-03,
        7.96164004e-03, 8.23586721e-03, 4.27744831e-03, 4.56581215e-03,
        7.25805061e-03, 4.06207910e-03, 7.02719714e-03, 9.19060003e-03,
        1.67263967e-03, 5.81903380e-03, 1.03356811e-02, 3.91753878e-03,
        3.00653631e-03]),
 'mean_score_time': array([0.00830541, 0.01445956, 0.01193299, 0.01359372, 0.01062574,
        0.01386247, 0.01268706, 0.00946517, 0.01269774, 0.01024199,
        0.01069016, 0.01428671, 0.01646686, 0.01144233, 0.01436496,
        0.01178608, 0.01041288]),
 'std_score_time': array([4.17061081e-03, 4.85033724e-03, 3.19890667e-03, 4.15525856e-03,
        8.14306496e-04, 3.74638101e-03, 2.96569076e-03, 3.80767511e-