In [1]:
import pandas as pd
from sklearn import datasets

In [3]:
# Load the Boston housing dataset that ships with scikit-learn.
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell only runs on an older scikit-learn release.
load_df = datasets.load_boston()

In [4]:
# Wrap the raw feature matrix in a DataFrame, labelling columns with the dataset's feature names.
df = pd.DataFrame(
    data=load_df.data,
    columns=load_df.feature_names,
)

In [5]:
# Background gradients come from the DataFrame Styler; a Series cannot be styled this way.
df.head(5).style.background_gradient()

In [6]:
# Append the regression target as a new 'Price' column alongside the feature columns.
df['Price'] = load_df.target

In [7]:
# Preview the first five rows, now including the 'Price' target column.
df.head(n=5)

In [8]:
# Print each column's dtype and non-null count (used here to spot missing values);
# memory_usage=False leaves the memory-usage line out of the report.
df.info(memory_usage=False)

In [10]:
# Summary statistics for every column, shaded so large and small values stand out.
(
    df.describe()
    .style
    .background_gradient()  # optionally pass cmap='magma'
)

In [11]:
import seaborn as sns
import matplotlib.pyplot as plt

In [12]:
# Histogram + KDE of the target column to inspect how prices are distributed.
sns.distplot(a=df['Price'])

In [13]:
# Pairwise Pearson correlation (the pandas default) between all columns, target included.
cor = df.corr(method='pearson')

In [14]:
# Annotated heatmap of the correlation matrix on a wide canvas so the numbers stay legible.
plt.figure(figsize=(12, 6))
sns.heatmap(data=cor, annot=True, cmap='Blues')

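No pair of features has a correlation greater than 0.85. (TODO: re-check the RAD–TAX cell of the heatmap; that pair is usually reported at about 0.91 for this dataset.)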

In [24]:
# Separate the features from the target.
# Select by column name rather than by position (the original sliced the first 13
# columns), so the split stays correct if columns are added or reordered upstream.
X = df.drop(columns=['Price'])
y = df['Price']

In [37]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the shuffled split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=101,
)

In [39]:
from sklearn.preprocessing import StandardScaler
# Standardise the features to zero mean / unit variance.
scaler = StandardScaler()
# Learn the scaling statistics (mean, std) from the training split only ...
X_train = scaler.fit_transform(X_train)
# ... and reuse them for the test split. Calling fit_transform here would re-fit the
# scaler on test data, putting train and test on different scales and leaking
# test-set statistics into the evaluation.
X_test = scaler.transform(X_test)

In [80]:
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression 

In [81]:
# Candidate regressors to compare with default hyper-parameters.
# The stochastic estimators are seeded (same seed as the train/test split) so the
# reported scores are reproducible from run to run; LinearRegression is deterministic.
models = {
    'Forest': RandomForestRegressor(random_state=101),
    'Adaboost': AdaBoostRegressor(random_state=101),
    'Gradient': GradientBoostingRegressor(random_state=101),
    'xgboost': XGBRegressor(random_state=101),
    'linear': LinearRegression(),
}

In [82]:
# Fit each candidate on the training split and report its test-set MSE and R^2.
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f'Mean Squared Error For {model_name}', mean_squared_error(y_test, y_pred), sep='\n')
    # The trailing '' reproduces the blank separator line between models.
    print(f'r2 Score For {model_name}', r2_score(y_test, y_pred), '', sep='\n')

# Model Tuning

In [71]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np
import math

In [50]:
# Hyper-parameter search space for tuning RandomForestRegressor with RandomizedSearchCV.

# Whether each tree is grown on a bootstrap sample or on the full training set.
bootstrap = [True, False]

# Ten forest sizes spread evenly between 80 and 200 trees.
n_estimators = [int(x) for x in np.linspace(start=80, stop=200, num=10)]

# 1.0 means "consider every feature at each split", which is what the former 'auto'
# setting meant for regressors. 'auto' was deprecated in scikit-learn 1.1 and removed
# in 1.3, whereas a float fraction is accepted by old and new releases alike.
max_features = [1.0, 'sqrt']

# Six tree depths between 5 and 28 (truncated to integers).
max_depth = [int(x) for x in np.linspace(5, 28, num=6)]

min_samples_leaf = [1, 2, 5, 10]
# NOTE(review): 18 points over [2, 20] truncate to 2..18 plus 20, so 19 is never
# tried; use num=19 if every integer in the range was intended.
min_samples_split = [int(x) for x in np.linspace(start=2, stop=20, num=18)]

# Keys must match RandomForestRegressor constructor arguments.
random = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
}

print(random)

In [54]:
# Randomised search over the forest grid: 5-fold CV, scored by negative MSE,
# single-process, with a fixed seed so the sampled candidates are repeatable.
random_ = RandomizedSearchCV(
    estimator=RandomForestRegressor(),
    param_distributions=random,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=1,
    random_state=101,
)
random_.fit(X_train, y_train)

In [59]:
# Score the tuned forest on the held-out split: R^2 as a percentage, then MSE.
y_prd = random_.predict(X_test)
# The trailing '' emits the blank line that separates the two metrics.
print('R2 Score for RandomForest', r2_score(y_test, y_prd) * 100, '', sep='\n')
print('Mean Squared Error for RandomForest', mean_squared_error(y_test, y_prd), sep='\n')

In [60]:
# Distribution of the tuned forest's predictions. y_prd holds predictions for X_test
# (the held-out split), not for the training data.
# NOTE(review): a roughly bell-shaped curve is only an informal sanity check; it does
# not by itself show that the predictions are accurate.
sns.distplot(y_prd)

In [62]:
# Scatter of true test prices (x) against the tuned forest's predictions (y).
plt.scatter(x=y_test, y=y_prd)
plt.xlabel(xlabel='ACtual')
plt.ylabel(ylabel='predicted values')

In [63]:
# Overlay the distribution of the actual test prices (red) with the tuned forest's
# predictions for the same rows (green). y_prd was predicted from X_test, so it is
# compared against y_test rather than y_train; this also matches the XGBoost overlay
# at the end of the notebook.
sns.distplot(y_test, color='r')
sns.distplot(y_prd, color='g')

In [65]:
# Distribution of the residuals (actual test price minus predicted price) for the tuned forest.
sns.distplot(y_test-y_prd)

# Tuning XGBoost

In [66]:
# Candidate values sampled by the randomised search over XGBRegressor.
params = dict(
    learning_rate=[0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    max_depth=[3, 4, 5, 6, 8, 10, 12, 15],
    min_child_weight=[1, 3, 5, 7],
    gamma=[0.0, 0.1, 0.2, 0.3, 0.4],
    colsample_bytree=[0.3, 0.4, 0.5, 0.7],
)

def timer(start_time=None):
    """Start a stopwatch, or report the time elapsed since one was started.

    Parameters
    ----------
    start_time : datetime.datetime, optional
        Value returned by an earlier ``timer()`` call. When omitted (or falsy),
        a new measurement is started.

    Returns
    -------
    datetime.datetime or None
        The current time when starting a measurement; ``None`` after printing
        the elapsed time.
    """
    # Imported here because no visible cell imports `datetime`; without it the
    # function raised NameError on its first call.
    from datetime import datetime

    if not start_time:
        return datetime.now()

    # Break the elapsed seconds into whole hours, whole minutes and leftover seconds.
    elapsed_seconds = (datetime.now() - start_time).total_seconds()
    thour, temp_sec = divmod(elapsed_seconds, 3600)
    tmin, tsec = divmod(temp_sec, 60)
    print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

In [68]:
# Randomised search over the XGBoost grid: 5 sampled candidates, 5-fold CV, scored by
# negative MSE, using all CPU cores. random_state is fixed (same seed as the
# random-forest search) so the same candidates are sampled on every run.
boost = RandomizedSearchCV(
    XGBRegressor(),
    param_distributions=params,
    n_iter=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=5,
    random_state=101,
)
boost.fit(X_train, y_train)

In [78]:
# Score the tuned XGBoost model on the held-out split: R^2 as a percentage, then RMSE.
y_pred = boost.predict(X_test)
# The trailing '' emits the blank line that follows the R^2 value.
print('R2 Score for XGBoost', r2_score(y_test, y_pred) * 100, '', sep='\n')
mse = mean_squared_error(y_test, y_pred)
RMSE = math.sqrt(mse)  # back in the units of the target
print("Root Mean Square Error", RMSE)


In [79]:
# Overlay the actual test prices (red) with the tuned XGBoost predictions (green).
sns.distplot(a=y_test, color='r')
sns.distplot(a=y_pred, color='g')