<h1>Without the fancy charts</h1><br>
<h2>Load the data</h2>
The table shows the top 5 rows...


In [2]:
import pandas as pd
def loadData(csvLoc): 
    return pd.read_csv(csvLoc)
dSet=loadData("housing.csv")
dSet.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


...and a quick overview for sanity purposes


In [3]:
dSet.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
longitude             20640 non-null float64
latitude              20640 non-null float64
housing_median_age    20640 non-null float64
total_rooms           20640 non-null float64
total_bedrooms        20433 non-null float64
population            20640 non-null float64
households            20640 non-null float64
median_income         20640 non-null float64
median_house_value    20640 non-null float64
ocean_proximity       20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


<h2> Split the data </h1>
We are making a stratified random split for this


In [4]:
import numpy as numpty
from sklearn.model_selection import StratifiedShuffleSplit

def chopper (data, testPortion):
    
    #make a categoricl attribute out of median income
    data["income_cat"]= pd.cut(data["median_income"],bins=[0., 1.5, 3.0, 4.5, 6., numpty.inf], labels=[1, 2, 3, 4, 5])
    
    #make arrays of shuffled indices from the dataset that both have the same distribution
    split=StratifiedShuffleSplit(n_splits=1,test_size=testPortion, random_state=42)
    
    #add the rows from each array to the respective output arrays
    for train_index, test_index in split.split(data, data["income_cat"]):
        testSet=data.loc[test_index]
        trainSet=data.loc[train_index]
        
     #remove the categoric attribute we created earlier   
    for set_ in (testSet,trainSet):
        set_.drop("income_cat",axis=1,inplace=True)        
    #victory!
    return testSet, trainSet    
            
strat_test_set, strat_train_set = chopper(dSet,.2)


overview of the training set - looks right to me...

In [22]:
strat_train_set.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 16512 entries, 17606 to 15775
Data columns (total 10 columns):
longitude             16512 non-null float64
latitude              16512 non-null float64
housing_median_age    16512 non-null float64
total_rooms           16512 non-null float64
total_bedrooms        16354 non-null float64
population            16512 non-null float64
households            16512 non-null float64
median_income         16512 non-null float64
median_house_value    16512 non-null float64
ocean_proximity       16512 non-null object
dtypes: float64(9), object(1)
memory usage: 1.4+ MB


cleaning data

In [17]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import  Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

#drop the median house value and copy it into a set of labels before we start cleaning data
housing=strat_train_set.drop("median_house_value", axis=1)
housing_labels=strat_train_set["median_house_value"].copy()

#make a Transformer class to add some more attributes
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

rooms_ix,bedrooms_ix, population_ix, households_ix=3,4,5,6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):

    def __init__(self,add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self,X,y=None):
        return self

    def transform(self, X, y=None):
        #claculate the extra attributes
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        #claculate the optional one if needed
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]
        
#set up a pipeline to process the numeric attributes using the new class and others    
num_pipe = Pipeline([('imputer',SimpleImputer(strategy="median")),
                    ('attribs_adder',CombinedAttributesAdder()),
                    ('std_scaler',StandardScaler()),                    
                    ])

#split off the numeric attributes...
housingNumeric = housing.drop("ocean_proximity", axis=1)
#...Store the column headings
numeric=list(housingNumeric)
#...same for the categorical attributes (1 of them)
categoric=["ocean_proximity"]

#set up the pipeline to output the complete transformation
full_pipe = ColumnTransformer([
    ("num",num_pipe,numeric),
    ("cat",OneHotEncoder(),categoric),
    ])
#execute the pipeline to produce prepared training data
trainingDataPrepped=full_pipe.fit_transform(housing)
trainingDataPrepped.shape

(16512, 16)

now to fit a model


In [19]:
from sklearn.linear_model import LinearRegression
type(housing_labels)
type(trainingDataPrepped)
trainingDataPrepped 
linReg=LinearRegression()
linReg.fit(trainingDataPrepped,housing_labels)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

OK we has dun a AI!<br>
now how accurate is it?

In [7]:
from sklearn.metrics import mean_squared_error
exampleData = housing.iloc[:5]
exampleLabels = housing_labels.iloc[:5]
preppedExamples=full_pipe.transform(exampleData)
print("predictions: ",linReg.predict(preppedExamples))
print("Lables: ",list(exampleLabels))

predictions = linReg.predict(trainingDataPrepped)
linRegMSE = mean_squared_error(housing_labels,predictions)
linRegRMSE=np.sqrt(linRegMSE)
linRegMSE


predictions:  [210644.60459286 317768.80697211 210956.43331178  59218.98886849
 189747.55849879]
Lables:  [286600.0, 340600.0, 196900.0, 46300.0, 254500.0]


4709829587.971121

not brilliant, but better than the book!? <br>
Lets try a decision tree...

In [8]:
from sklearn.tree import DecisionTreeRegressor 
dtReg=DecisionTreeRegressor()
dtReg.fit(trainingDataPrepped,housing_labels)


dtPredictions=dtReg.predict(trainingDataPrepped)
dtMSE=mean_squared_error(housing_labels,dtPredictions)
dtRMSE=np.sqrt(dtMSE)
dtRMSE

0.0

Bollocks! There is no way it is perfect! There is a whiff of over fitting here<br/>
Let's do some cross-validation, i.e. splitting the training set up into smaller training and test sets. 
We are using k-fold validation which splits the traing data into 10 subsets, using each one in turn for validation and training on the other 9


In [10]:
from sklearn.model_selection import cross_val_score
#gets an array of scores mean squared errors
scores = cross_val_score(dtReg, trainingDataPrepped, housing_labels, scoring="neg_mean_squared_error", cv=10)
#Scikit-Learn’s cross-validation features expect a utility function (greater is better) rather than a cost function (lower is better), 
# so the scoring function is actually the opposite of the MSE (i.e., a negative value), we compute -scores before calculating the square root.
tree_rmse_scores = np.sqrt(-scores)

def getYerScoresOut(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())
    
getYerScoresOut(tree_rmse_scores)

Scores: [68727.83857677 66403.6676539  70529.42529971 69289.29044142
 71539.99347772 75225.57748434 71288.92293203 70401.40948923
 77459.14781861 69298.78004939]
Mean: 71016.40532231268
Standard deviation: 3043.647317294521




ok lets do a random forest


In [11]:
from sklearn.ensemble import RandomForestRegressor

r_ForestRegressor=RandomForestRegressor()
r_ForestRegressor.fit(trainingDataPrepped,housing_labels)
rf_Predictions=r_ForestRegressor.predict(trainingDataPrepped)

rf_MSE=mean_squared_error(housing_labels,rf_Predictions)
rf_RMSE=np.sqrt(rf_MSE)
rf_MSE





509438859.79480076

so this is different from the book again<br>
try putting the prepped data into a data frame



In [16]:
housing_cat = housing[["ocean_proximity"]]
oneHot=OneHotEncoder()
oneHot.fit(housing_cat)


columns=list(housingNumeric)
columns

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income']