## Imports

In [1]:
import sklearn
from sklearn import datasets
from sklearn.ensemble import GradientBoostingRegressor
import pandas as pd
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import binarize
from sklearn.svm import LinearSVC

## Obtain the data

In [2]:
# Fetch the California housing dataset and expose its feature matrix and
# regression target under the short names used by the rest of the notebook.
cal = datasets.fetch_california_housing()
cal_data, cal_target = cal.data, cal.target

## Task 1. Regression. (3%)

### Task 1.a Linear model vs. GBT model

In [3]:
# Compare a plain linear regression against gradient-boosted trees.
# Each model is scored by the mean R^2 over 5-fold cross-validation on the
# full dataset. cross_val_score clones the estimator and fits the clone on
# every training fold, so a separate fit() beforehand would be discarded work.
linreg = sklearn.linear_model.LinearRegression()
linear_performance = cross_val_score(linreg,cal_data,cal_target, cv=5, scoring='r2').mean()

# The score is a fraction; it is scaled by 100 only for display.
print("With a linear regression model, %.2f%% of the data fit the model"%(linear_performance*100))

print("")
print("")

gbtreg = GradientBoostingRegressor()
gbt_performance = cross_val_score(gbtreg,cal_data,cal_target, cv=5, scoring='r2').mean()

print("With a gradient boosting tree regression model, %.2f%% of the data fit the model"%(gbt_performance*100))

With a linear regression model, 55.30% of the data fit the model


With a gradient boosting tree regression model, 66.99% of the data fit the model


### Task 1.b Search for better parameters in GBT model

In [None]:
# Grid-search GradientBoostingRegressor hyper-parameters
# (4 learning rates x 4 depths x 4 ensemble sizes = 64 candidates),
# scoring every candidate by mean R^2 over 5-fold cross-validation.
print("")
parameters={'learning_rate':[.1,.15,.2,.25],'max_depth':[2,3,4,5],'n_estimators':[50,80,100,200]}
gs_model=GridSearchCV(GradientBoostingRegressor(),parameters,scoring="r2",cv=5)
gs_model.fit(cal_data,cal_target)

# cv_results_ is a dict holding one array per metric (including every
# individual split score); printing it raw is unreadable. Show only the
# best-ranked candidates as a compact table instead.
cv_summary = (
    pd.DataFrame(gs_model.cv_results_)
    [['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]
    .sort_values('rank_test_score')
    .head(10)
)
print("")
print(cv_summary.to_string(index=False))
print("")
print("")
print(gs_model.best_score_)
print("")
print(gs_model.best_params_)


### Task 1.c 

In [None]:
# Written conclusion for Task 1.c, emitted as one line of text.
print(
    "At a glance, GBT is a better choice than linear regression. "
    "The ideal learning rate seems to be higher than the default. "
    "The number of estimators is ideally less, at 80 and the ideal max depth is 4. "
)

## Task 2. Classification. (3%)

In [None]:
# Binary classification target: True where the regression target exceeds 2.
# The comparison is done on the whole array at once and then turned back into
# a list so that later slicing (y[:N]) behaves as before.
y = list(cal.target > 2)

### Task 2.a Logistic regression vs. GBT

In [None]:
# Compare logistic regression against a gradient-boosted tree classifier.
# Each model is scored by mean accuracy over 5-fold cross-validation.
# cross_val_score clones the estimator and fits the clone on every training
# fold, so a separate fit() beforehand would be discarded work.
logreg = LogisticRegression()
logistic_performance = cross_val_score(logreg,cal_data,y, cv=5, scoring='accuracy').mean()
# Each message names the model whose score it reports.
print("With a logistic regression model, %.2f%% of the data fit the model"%(logistic_performance*100))

gbt_class=GradientBoostingClassifier()
gbt_class_performance = cross_val_score(gbt_class,cal_data,y, cv=5, scoring='accuracy').mean()
print("With a gradient boosting tree classification model, %.2f%% of the data fit the model"%(gbt_class_performance*100))

### Task 2.b Search for optimal parameters in GBT model

In [None]:
# Grid-search GradientBoostingClassifier hyper-parameters
# (4 learning rates x 4 depths x 4 ensemble sizes = 64 candidates),
# scoring every candidate by mean accuracy over 5-fold cross-validation.
parameters={'learning_rate':[.25,.05,.1,.15],'max_depth':[1,2,3,4],'n_estimators':[25,50,80,100]}
gs_model=GridSearchCV(GradientBoostingClassifier(),parameters,scoring="accuracy",cv=5)
# Only the first 3000 rows are searched - presumably to keep the runtime down.
# NOTE(review): the rows are taken in dataset order without shuffling, so this
# subset may not be representative of the full data - confirm this is intended.
gs_model.fit(cal_data[:3000],y[:3000])

# cv_results_ is a dict holding one array per metric (including every
# individual split score); printing it raw is unreadable. Show only the
# best-ranked candidates as a compact table instead.
cv_summary = (
    pd.DataFrame(gs_model.cv_results_)
    [['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]
    .sort_values('rank_test_score')
    .head(10)
)
print("")
print(cv_summary.to_string(index=False))
print("")
print("")
print(gs_model.best_score_)
print("")
print(gs_model.best_params_)



### Task 2.c SVM


In [None]:
# Grid-search a linear SVM over C, penalty type and stopping tolerance,
# scoring every candidate by mean ROC AUC over 5-fold cross-validation.
#
# C is the inverse of the regularization strength: a smaller C regularizes
# more (less overfitting, more risk of underfitting), while a larger C
# regularizes less (more risk of overfitting).
parameters={'C':[200,300,500,400,1000],'penalty':['l1','l2'],'tol':[1e-2,1e-3,1e-4,1e-5]}
# penalty='l1' combined with the default squared-hinge loss is only supported
# by the primal formulation, so dual=False is needed for the 'l1' half of the
# grid to fit at all; the primal solver handles 'l2' as well.
# NOTE(review): the features are not standardized and C is large, so the
# solver may emit convergence warnings - consider scaling the inputs.
gs_model=GridSearchCV(LinearSVC(dual=False),parameters,scoring="roc_auc",cv=5)
# Only the first 5000 rows are searched - presumably to keep the runtime down.
gs_model.fit(cal_data[:5000],y[:5000])

# cv_results_ is a dict holding one array per metric (including every
# individual split score); printing it raw is unreadable. Show only the
# best-ranked candidates as a compact table instead.
cv_summary = (
    pd.DataFrame(gs_model.cv_results_)
    [['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]
    .sort_values('rank_test_score')
    .head(10)
)
print("")
print(cv_summary.to_string(index=False))
print("")
print("")
print(gs_model.best_score_)
print("")
print(gs_model.best_params_)



