In [2]:
#Regression Metrics

In [3]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, cross_val_score

# Load the California Housing dataset
california = fetch_california_housing()

# Extract features and target
X = california.data
y = california.target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

# Check the shapes of the training and testing sets
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")


from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Train a linear regression model
lr = LinearRegression()
lr.fit(X_train, y_train)

# Make predictions
y_pred = lr.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")


X_train shape: (16512, 8)
X_test shape: (4128, 8)
y_train shape: (16512,)
y_test shape: (4128,)
Mean Squared Error: 0.5278524621633102


In [7]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import RandomizedSearchCV

knn_reg = KNeighborsRegressor()
param_dist = {'n_neighbors': list(range(3,20,1))}
rs = RandomizedSearchCV(knn_reg,param_dist,cv=10,n_iter=17)
rs.fit(X_train, y_train)
rs.best_score_

0.14859234211698963

In [9]:
from sklearn.linear_model import Ridge
cross_val_score(Ridge(),X_train,y_train,cv=10).mean()

0.6030848681950755

In [11]:
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
cross_val_score(GradientBoostingRegressor(max_depth=7),X_train,y_train,cv=10).mean()

0.8372895710178948

In [12]:
cross_val_score(RandomForestRegressor(),X_train,y_train,cv=10).mean()

0.807653808687444

In [14]:
param_dist = {'n_estimators': [4000], 'learning_rate': [0.01], 'max_depth':[1,2,3,5,7]}
rs_inst_a = RandomizedSearchCV(GradientBoostingRegressor(), param_dist, n_iter = 5, n_jobs=-1)
rs_inst_a.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
rs_inst_a.best_params_
{'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 4000}
rs_inst_a.best_score_

In [None]:
def mape_score(y_test, y_pred):
    return (np.abs(y_test - y_pred)/y_test).mean()

In [None]:
from __future__ import division
from numba import autojit

@autojit
def mape_score(y_test, y_pred):
    sum_total = 0
    y_vec_length = len(y_test)
    for index in range(y_vec_length):
        sum_total += (1 - (y_pred[index]/y_test[index]))
 
    return sum_total/y_vec_length

from sklearn.metrics import make_scorer
mape_scorer = make_scorer(mape_score, greater_is_better=False)

In [None]:
param_dist = {'n_estimators': [4000], 'learning_rate': [0.01], 'max_depth':[1,2,3,4,5]}
rs_inst_b = RandomizedSearchCV(GradientBoostingRegressor(), param_dist, n_iter = 3, n_jobs=-1,scoring = mape_scorer)
rs_inst_b.fit(X_train, y_train)
rs_inst_b.best_score_

In [25]:
rs_inst_b.best_params_

{'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 4000}