## Imports

In [2]:
import sklearn
from sklearn import datasets
from sklearn.ensemble import GradientBoostingRegressor
import pandas as pd
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import binarize
from sklearn.svm import LinearSVC

## Obtain the data

In [3]:
cal = datasets.fetch_california_housing()
cal_data=cal.data
cal_target=cal.target

## Task 1. Regression. (3%)

### Task 1.a Linear model vs. GBT model

In [4]:
linreg = sklearn.linear_model.LinearRegression()

linreg.fit(cal_data,cal_target)

##score on the second half, use r2 scoring for regression
linear_performance = cross_val_score(linreg,cal_data,cal_target, cv=5, scoring='r2').mean()


print("With a linear regression model, %.2f%% of the data fit the model"%(linear_performance*100))

print("")
print("")

gbtreg = GradientBoostingRegressor()
gbtreg.fit(cal_data,cal_target)
gbt_performance = cross_val_score(gbtreg,cal_data,cal_target, cv=5, scoring='r2').mean()

print("With a gradient boosting tree regression model, %.2f%% of the data fit the model"%(gbt_performance*100))

With a linear regression model, 55.30% of the data fit the model


With a gradient boosting tree regression model, 66.99% of the data fit the model


### Task 1.b Search for better parameters in GBT model

In [5]:
print("")
parameters={'learning_rate':[.1,.15,.2,.25],'max_depth':[2,3,4,5],'n_estimators':[50,80,100,200]}
gs_model=GridSearchCV(GradientBoostingRegressor(),parameters,scoring="r2",cv=5)
gs_model.fit(cal_data[:3000],cal_target[:3000])
print("")
print(gs_model.cv_results_)
print("")
print("")
print(gs_model.best_score_)
print("")
print(gs_model.best_params_)




{'mean_fit_time': array([0.14665346, 0.22673244, 0.27997756, 0.56369224, 0.20268116,
       0.32730737, 0.41502032, 0.86586041, 0.26965899, 0.43215151,
       0.54970722, 1.10267205, 0.33338995, 0.5562964 , 0.68645225,
       1.39176822, 0.15834041, 0.24748878, 0.3029304 , 0.62919321,
       0.21944699, 0.35346737, 0.44704928, 0.89291315, 0.2869946 ,
       0.46365499, 0.58444443, 1.19415865, 0.35465865, 0.57647557,
       0.74034081, 1.48601079, 0.15578809, 0.25206447, 0.33269596,
       0.6322525 , 0.22429876, 0.36995964, 0.45761766, 0.93809252,
       0.30317168, 0.49485235, 0.61064482, 1.22580781, 0.36661615,
       0.59298105, 0.76606483, 1.51183543, 0.16214399, 0.26301661,
       0.32507825, 0.64407029, 0.23137727, 0.37369981, 0.46583171,
       0.93831687, 0.31058278, 0.49857502, 0.6189743 , 1.23076768,
       0.39021239, 0.61397414, 0.76423354, 1.5194869 ]), 'std_fit_time': array([0.00378927, 0.00543168, 0.00296838, 0.0072276 , 0.00289111,
       0.00534916, 0.00316909, 0.057

### Task 1.c 

In [6]:
print("At a glance, GBT is a better choice than linear regression. The ideal learning rate seems to be higher than the default. The number of estimators is ideally less, at 80 and the ideal max depth is 4. ")

At a glance, GBT is a better choice than linear regression. The ideal learning rate seems to be higher than the default. The number of estimators is ideally less, at 80 and the ideal max depth is 4. 


## Task 2. Classification. (3%)

In [7]:
##use binarized version of target values
y = [i > 2 for i in cal.target]

### Task 2.a Logistic regression vs. GBT

In [8]:
logreg = LogisticRegression()
logreg.fit(cal_data,y)

logistic_performance = cross_val_score(logreg,cal_data,y, cv=5, scoring='accuracy').mean()
print("With a gradient boosting tree regression model, %.2f%% of the data fit the model"%(logistic_performance*100))
gbt_class=GradientBoostingClassifier()
gbt_class.fit(cal_data,y)
gbt_class_performance = cross_val_score(gbt_class,cal_data,y, cv=5, scoring='accuracy').mean()
print("With a gradient boosting tree regression model, %.2f%% of the data fit the model"%(gbt_class_performance*100))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

With a gradient boosting tree regression model, 78.62% of the data fit the model
With a gradient boosting tree regression model, 78.84% of the data fit the model


### Task 2.b Search for optimal parameters in GBT model

In [9]:
parameters={'learning_rate':[.25,.05,.1,.15],'max_depth':[1,2,3,4],'n_estimators':[25,50,80,100]}
gs_model=GridSearchCV(GradientBoostingClassifier(),parameters,scoring="accuracy",cv=5)
gs_model.fit(cal_data[:3000],y[:3000])
print("")
print(gs_model.cv_results_)
print("")
print("")
print(gs_model.best_score_)
print("")
print(gs_model.best_params_)




{'mean_fit_time': array([0.04995699, 0.10271187, 0.16323581, 0.20538979, 0.08878207,
       0.17780266, 0.28451414, 0.344771  , 0.13024869, 0.2525053 ,
       0.40617466, 0.51654267, 0.16799817, 0.33625093, 0.5400856 ,
       0.67590065, 0.04937415, 0.09958248, 0.15943666, 0.19139724,
       0.09142232, 0.17515349, 0.28911138, 0.34990087, 0.12687182,
       0.25538616, 0.40731802, 0.5117548 , 0.16965466, 0.33239298,
       0.53185019, 0.66999717, 0.04938951, 0.10378284, 0.15766282,
       0.19829645, 0.09309378, 0.17420292, 0.28262873, 0.35932608,
       0.1335146 , 0.25929503, 0.41168652, 0.71290431, 0.22203531,
       0.43226199, 0.59894695, 0.82921133, 0.05917592, 0.10631442,
       0.17528763, 0.2654912 , 0.13157148, 0.29304276, 0.50246058,
       0.43394322, 0.1621315 , 0.31631436, 0.47837443, 0.72942595,
       0.21592498, 0.40977464, 0.61443076, 0.88016529]), 'std_fit_time': array([0.00241875, 0.00810638, 0.00867658, 0.01199436, 0.00535438,
       0.00535463, 0.01309885, 0.0061

### Task 2.c SVM


In [10]:
#decreasing C will prevent overfitting. Increasing C promotes underfitting
parameters={'C':[200,300,500,400,1000],'penalty':['l1','l2'],'tol':[1e-2,1e-3,1e-4,1e-5]}
gs_model=GridSearchCV(LinearSVC(),parameters,scoring="roc_auc",cv=5)
gs_model.fit(cal_data[:5000],y[:5000])
print("")
print(gs_model.cv_results_)
print("")
print("")
print(gs_model.best_score_)
print("")
print(gs_model.best_params_)





Traceback (most recent call last):
  File "/Library/Python/3.7/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Python/3.7/site-packages/sklearn/svm/_classes.py", line 238, in fit
    self.loss, sample_weight=sample_weight)
  File "/Library/Python/3.7/site-packages/sklearn/svm/_base.py", line 974, in _fit_liblinear
    solver_type = _get_liblinear_solver_type(multi_class, penalty, loss, dual)
  File "/Library/Python/3.7/site-packages/sklearn/svm/_base.py", line 832, in _get_liblinear_solver_type
    % (error_string, penalty, loss, dual))
ValueError: Unsupported set of arguments: The combination of penalty='l1' and loss='squared_hinge' are not supported when dual=True, Parameters: penalty='l1', loss='squared_hinge', dual=True

Traceback (most recent call last):
  File "/Library/Python/3.7/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit


{'mean_fit_time': array([0.00435057, 0.00446711, 0.00376472, 0.00320344, 0.22840586,
       0.27503138, 0.2717567 , 0.21954575, 0.00272794, 0.00208282,
       0.00201688, 0.00188336, 0.21987624, 0.21785893, 0.2189281 ,
       0.2134553 , 0.00191784, 0.00218701, 0.00233283, 0.00205884,
       0.21844554, 0.20605917, 0.24514985, 0.36635861, 0.00430422,
       0.00361419, 0.00336623, 0.00287561, 0.31761298, 0.25007629,
       0.22550769, 0.23486485, 0.0022964 , 0.00247054, 0.00195336,
       0.00215888, 0.23283439, 0.22697678, 0.22595973, 0.22154741]), 'std_fit_time': array([7.00969608e-04, 5.64703202e-04, 4.50693723e-04, 3.85243891e-04,
       2.29112854e-02, 5.34541900e-02, 4.03620375e-02, 1.05498454e-02,
       1.31264188e-03, 3.32586107e-04, 2.65446154e-04, 6.60576268e-05,
       2.14925173e-02, 1.60611298e-02, 9.21109284e-03, 1.51835663e-02,
       1.75920538e-04, 2.02207912e-04, 1.68367858e-04, 8.07632652e-05,
       9.35265839e-03, 9.55087525e-03, 5.59410806e-02, 4.37642447e-02,
 