In [8]:
import pandas as pd
import numpy as np
import sklearn

In [9]:
URL = 'https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.csv'
df = pd.read_csv(URL)
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [10]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

X_train = train_set.drop('median_house_value', axis=1)
y = train_set['median_house_value']

X_num = X_train.drop('ocean_proximity', axis=1)

In [11]:
from sklearn.base import BaseEstimator, TransformerMixin

rooms_IX, bedrooms_IX, population_IX, households_IX = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        rooms_per_household = X[:, rooms_IX]/X[:, households_IX]
        population_per_household = X[:, households_IX]/X[:, population_IX]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_IX]/X[:, rooms_IX]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('attributers', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler())
])


In [13]:
from sklearn.compose import ColumnTransformer

num_attribs = list(X_num)
cat_attribs = ['ocean_proximity']

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(), cat_attribs)
])

In [14]:
X_prepared = full_pipeline.fit_transform(X_train)

In [15]:
X_prepared

array([[ 1.27258656, -1.3728112 ,  0.34849025, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.70916212, -0.87669601,  1.61811813, ...,  0.        ,
         0.        ,  1.        ],
       [-0.44760309, -0.46014647, -1.95271028, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.59946887, -0.75500738,  0.58654547, ...,  0.        ,
         0.        ,  0.        ],
       [-1.18553953,  0.90651045, -1.07984112, ...,  0.        ,
         0.        ,  0.        ],
       [-1.41489815,  0.99543676,  1.85617335, ...,  0.        ,
         1.        ,  0.        ]])

Linear Regression

In [16]:
from sklearn.linear_model import LinearRegression

LR_model = LinearRegression()

In [17]:
LR_model.fit(X_prepared, y)

In [20]:
test_data = X_train.sample(10)
test_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
1796,-122.35,37.93,39.0,2002.0,416.0,1166.0,395.0,1.7257,NEAR BAY
7913,-118.08,33.88,27.0,923.0,186.0,1014.0,204.0,3.825,<1H OCEAN
18594,-122.09,37.11,32.0,2637.0,489.0,1031.0,410.0,3.6474,NEAR OCEAN
4139,-118.18,34.12,29.0,2640.0,737.0,1795.0,655.0,2.369,<1H OCEAN
16319,-121.33,37.99,15.0,4472.0,1079.0,1837.0,976.0,2.5,INLAND
8107,-118.21,33.8,44.0,1387.0,280.0,984.0,302.0,4.25,NEAR OCEAN
4136,-118.18,34.12,52.0,1081.0,311.0,904.0,283.0,1.9219,<1H OCEAN
4901,-118.25,34.01,43.0,1429.0,386.0,1412.0,354.0,1.3287,<1H OCEAN
1585,-122.0,37.84,16.0,7681.0,946.0,2777.0,908.0,9.5271,<1H OCEAN
12497,-121.45,38.56,52.0,3170.0,476.0,1027.0,457.0,4.63,INLAND


In [21]:
test_label = y.loc[test_data.index]
test_label

1796      91500.0
7913     159500.0
18594    231600.0
4139     173400.0
16319    175900.0
8107     143100.0
4136     165100.0
4901     107200.0
1585     500001.0
12497    233800.0
Name: median_house_value, dtype: float64

In [24]:
test_data_prepared = full_pipeline.transform(test_data)
predicted_labels = LR_model.predict(test_data_prepared)
predicted_labels

array([136747.96202757, 164108.51533823, 248635.45616908, 172570.95038695,
       158169.64837829, 234941.00596258, 160564.32755567, 111069.03192125,
       431658.40896473, 228105.91030611])

In [28]:
tp_data = pd.DataFrame({"Actual": test_label, "Predicted": predicted_labels})

In [30]:
import matplotlib.pyplot as plt



AttributeError: 'numpy.ndarray' object has no attribute 'Actual'