In [149]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import r2_score
from sklearn.model_selection import learning_curve
from sklearn.linear_model import SGDRegressor, LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import RFE

In [66]:
# Read the modelling dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="NPPE1_ModelBuilding3.csv")

In [67]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
train, test = train_test_split(
    df,
    random_state=42,
    test_size=0.3,
)

In [68]:
# Every column except the last becomes a feature; the last column is the
# value being predicted.
X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1]
X_test, y_test = test.iloc[:, :-1], test.iloc[:, -1]

In [69]:
# Sanity check: shapes of the train/test features and targets, in that order.
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((2800, 14), (2800,), (1200, 14), (1200,))

In [70]:
# L2-regularised linear regression fitted with the SAGA solver.
# NOTE(review): scikit-learn documents that 'saga' only converges quickly on
# features of roughly equal scale -- confirm the inputs are scaled.
ridge = Ridge(
    alpha=10,
    solver="saga",
    tol=1e-4,
    random_state=42,
)

In [71]:
# fit() returns the estimator itself, so training and test-set prediction
# can be chained; `ridge` is still fitted in place.
y_pred = ridge.fit(X_train, y_train).predict(X_test)

In [75]:
# Coefficient of determination of the ridge predictions on the test set.
r2_score(y_true=y_test, y_pred=y_pred)

0.6613547575262211

In [76]:
# Bias term learned by the fitted ridge model.
intercept = ridge.intercept_
intercept

30.28690798037617

In [77]:
# Per-feature weights of the fitted ridge model, in column order of X_train.
coeff = ridge.coef_
coeff

array([ -0.4825441 ,   3.74601838,  -0.73583331,   0.54199933,
        -9.89014109,   5.80114296,  -5.06099736,  -9.45015598,
         4.73124885, -23.51321982,  11.31863371,   0.49450664,
        -0.89196134,   0.89196134])

In [78]:
# Exhaustive 5-fold search over penalty type, regularisation strength and
# stopping tolerance for an SGD-trained linear regressor, ranked by
# (negated) mean absolute error.
# NOTE(review): SGD is sensitive to feature scale -- confirm X_train is scaled.
sgd = SGDRegressor(random_state=42)
param_grid = {
    "penalty": ["l1", "l2"],
    "alpha": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
    "tol": [1e-4, 1e-3, 1e-2, 1e-1],
}
grid_search = GridSearchCV(
    sgd,
    param_grid,
    scoring="neg_mean_absolute_error",
    cv=5,
)
grid_search.fit(X_train, y_train)

In [79]:
# Hyper-parameter combination with the best cross-validated score.
best_param = grid_search.best_params_
best_param

{'alpha': 0.001, 'penalty': 'l2', 'tol': 0.0001}

In [80]:
# Take the winning SGD regressor from the grid search and score the
# held-out rows with it.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X=X_test)

In [81]:
# Mean absolute error of the tuned SGD model on the test set.
# scikit-learn metrics take (y_true, y_pred); passing them in that order keeps
# this call consistent with the r2_score(y_test, y_pred) calls in this
# notebook. MAE is symmetric, so the reported value is unchanged.
mean_absolute_error(y_test, y_pred)

3.8131121797994014

In [85]:
# Default-configured Lasso and PCA instances.
# NOTE(review): the pipeline cell visible below constructs its own PCA()/Lasso()
# objects, so these two names look unused -- confirm before removing.
lasso, pca = Lasso(), PCA()

In [112]:
# Search space for the PCA -> Lasso pipeline. The "<step>__<param>" keys
# route each value list to the matching pipeline step.
param_grid = {
    # Float n_components: keep enough components to reach this variance share.
    "pca__n_components": [0.9, 0.95],
    "lasso__alpha": [10, 1, 0.01, 0.001],
}

In [113]:
# Two-stage model: project onto principal components, then fit a Lasso.
pipe = Pipeline(steps=[("pca", PCA()), ("lasso", Lasso())])

In [114]:
# 5-fold grid search over the pipeline, ranked by (negated) mean absolute
# error and run on all available CPU cores.
grid_search2 = GridSearchCV(
    pipe,
    param_grid,
    cv=5,
    n_jobs=-1,
    scoring="neg_mean_absolute_error",
)

In [115]:
# Run the pipeline grid search on the training data.
grid_search2.fit(X=X_train, y=y_train)

In [118]:
# Best (n_components, alpha) combination found for the PCA -> Lasso pipeline.
best_param = grid_search2.best_params_
best_param

{'lasso__alpha': 0.01, 'pca__n_components': 0.95}

In [119]:
# Winning pipeline from the second grid search.
best_model = grid_search2.best_estimator_

In [125]:
# No explicit refit is needed: GridSearchCV's default refit=True has already
# fitted best_estimator_ on the whole of X_train, so calling fit() again would
# only repeat that work. Display the already-fitted pipeline instead.
best_model

In [126]:
# Test-set predictions from the tuned PCA -> Lasso pipeline.
y_pred = best_model.predict(X_test)

In [128]:
# Coefficient of determination of the tuned pipeline on the test set.
r2_score(y_true=y_test, y_pred=y_pred)

0.6288625430197575

In [134]:
# Pull the fitted PCA transformer out of the tuned pipeline by step name.
fitted_steps = best_model.named_steps
pca_step = fitted_steps["pca"]

In [135]:
# Fraction of total variance captured by each retained principal component.
explained_variance_ratio = getattr(pca_step, "explained_variance_ratio_")

In [137]:
# Variance share of the first entry, i.e. the leading principal component.
variance_first_pc, *_ = explained_variance_ratio[:1]
variance_first_pc

0.6993757201670407

In [144]:
# Degree-2 polynomial expansion (squares and pairwise products) feeding a
# Lasso with alpha=1.
poly_step = ("Poly", PolynomialFeatures(degree=2, interaction_only=False))
lasso_step = ("lasso", Lasso(alpha=1, random_state=0, warm_start=True))
pipe2 = Pipeline(steps=[poly_step, lasso_step])

In [146]:
# Train the polynomial + Lasso pipeline on the training split.
pipe2.fit(X=X_train, y=y_train)

In [147]:
# Test-set predictions from the polynomial + Lasso pipeline.
y_pred = pipe2.predict(X_test)

In [148]:
# Coefficient of determination of the polynomial + Lasso pipeline on the test set.
r2_score(y_true=y_test, y_pred=y_pred)

0.157678032410551

In [151]:
# Recursive feature elimination around ordinary least squares: keep all
# features but one. In ranking_, 1 marks a selected feature and larger
# values mark features that were eliminated.
lin_reg = LinearRegression()
n_keep = X_train.shape[1] - 1
selector = RFE(estimator=lin_reg, n_features_to_select=n_keep).fit(X_train, y_train)
selector.ranking_

array([1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])