In [26]:
import pandas as pd
from sklearn import datasets
[attr for attr in dir(datasets) if not attr.startswith('_')]

['base',
 'california_housing',
 'clear_data_home',
 'covtype',
 'dump_svmlight_file',
 'fetch_20newsgroups',
 'fetch_20newsgroups_vectorized',
 'fetch_california_housing',
 'fetch_covtype',
 'fetch_kddcup99',
 'fetch_lfw_pairs',
 'fetch_lfw_people',
 'fetch_mldata',
 'fetch_olivetti_faces',
 'fetch_rcv1',
 'fetch_species_distributions',
 'get_data_home',
 'kddcup99',
 'lfw',
 'load_boston',
 'load_breast_cancer',
 'load_diabetes',
 'load_digits',
 'load_files',
 'load_iris',
 'load_linnerud',
 'load_mlcomp',
 'load_sample_image',
 'load_sample_images',
 'load_svmlight_file',
 'load_svmlight_files',
 'load_wine',
 'make_biclusters',
 'make_blobs',
 'make_checkerboard',
 'make_circles',
 'make_classification',
 'make_friedman1',
 'make_friedman2',
 'make_friedman3',
 'make_gaussian_quantiles',
 'make_hastie_10_2',
 'make_low_rank_matrix',
 'make_moons',
 'make_multilabel_classification',
 'make_regression',
 'make_s_curve',
 'make_sparse_coded_signal',
 'make_sparse_spd_matrix',
 'make_

In [4]:
boston = datasets.load_boston()

In [5]:
print boston.DESCR

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [27]:
boston_df = pd.DataFrame(boston.data, columns=boston.feature_names)
boston_df.head()
# set X y
X = boston.data
y = boston.target

In [7]:
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

In [8]:
from sklearn.model_selection import cross_val_predict
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, RANSACRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.svm import SVR
from sklearn.svm import LinearSVR

In [9]:
regressors=[
    LinearRegression(),
    RANSACRegressor(),
    GaussianProcessRegressor(),
    KNeighborsRegressor(n_neighbors=9, metric='manhattan'),
    SVR(),
    LinearSVR(),
    SVR(kernel='linear')
]

In [17]:
for model in regressors:
    predictions = cross_val_predict(model, X, y, cv=10)
    print(model)
    print '\tExplained variance: %.2f' %explained_variance_score(y, predictions)
    print '\tMean absolute error: %.2f' %mean_absolute_error(y, predictions)
    print '\tR2 score: %.2f' %r2_score(y, predictions)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
	Explained variance: 0.59
	Mean absolute error: 4.00
	R2 score: 0.59
RANSACRegressor(base_estimator=None, is_data_valid=None, is_model_valid=None,
        loss='absolute_loss', max_skips=inf, max_trials=100,
        min_samples=None, random_state=None, residual_metric=None,
        residual_threshold=None, stop_n_inliers=inf, stop_probability=0.99,
        stop_score=inf)
	Explained variance: 0.01
	Mean absolute error: 5.19
	R2 score: -0.08
GaussianProcessRegressor(alpha=1e-10, copy_X_train=True, kernel=None,
             n_restarts_optimizer=0, normalize_y=False,
             optimizer='fmin_l_bfgs_b', random_state=None)
	Explained variance: 0.00
	Mean absolute error: 22.52
	R2 score: -6.01
KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='manhattan',
          metric_params=None, n_jobs=1, n_neighbors=9, p=2,
          weights='uniform')
	Explained variance: -0.03
	Mean absolute error: 6.45
	R2 sco

In [20]:
from sklearn.model_selection import GridSearchCV
knr = KNeighborsRegressor()

parameters = {'n_neighbors': [5,6,7,8,9,10,11,12],
              'weights': ['uniform', 'distance'],
              'metric': ['minkowski', 'chebyshev', 'manhattan']
             }
grid =  GridSearchCV(knr, parameters)
model = grid.fit(X, y)
print(model)

GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [5, 6, 7, 8, 9, 10, 11, 12], 'metric': ['minkowski', 'chebyshev', 'manhattan'], 'weights': ['uniform', 'distance']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)


In [21]:
predictions = cross_val_predict(model, X, y, cv=10)
print 'Explained variance: %.2f' %explained_variance_score(y, predictions)
print 'Mean absolute error: %.2f' %mean_absolute_error(y, predictions)
print 'R2 score: %.2f' %r2_score(y, predictions)

Explained variance: -0.10
Mean absolute error: 6.69
R2 score: -0.10


In [22]:
from sklearn import cluster
spectral = cluster.SpectralClustering(
        n_clusters=4, eigen_solver='arpack',
        affinity="nearest_neighbors")

In [28]:
spectral.fit(boston.data)
boston_df['category'] = spectral.labels_
boston_df['price'] = boston.target
house_clusters = boston_df.groupby('category').mean().sort_values('price')
house_clusters.index = ['low', 'mid_low', 'mid_high', 'high']
house_clusters[['price', 'CRIM', 'RM', 'AGE', 'DIS']]

Unnamed: 0,price,CRIM,RM,AGE,DIS
low,12.8,15.689918,6.103088,89.835294,2.000068
mid_low,17.418447,11.082796,5.974233,90.01165,2.072428
mid_high,21.2125,0.894798,6.087675,82.5075,2.480024
high,25.86609,0.248699,6.471142,54.576817,4.984178
