In [265]:
import pandas as pd 
import seaborn as sns 
import tensorflow as tf
import numpy as np
import math
import shutil 
import pickle


from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import BaseEstimator, RegressorMixin,ClassifierMixin
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import fbeta_score, make_scorer

In [266]:
# Read data/housing.csv into the notebook's working DataFrame.
df = pd.read_csv("data/housing.csv")

In [267]:
# Keep only rows in which no column is missing.
df = df.dropna(axis=0, how="any")

In [268]:
# Turn the block-level totals into per-household ratios, then discard the
# totals (and the household count) they were derived from.
df = df.assign(
    num_rooms=df['total_rooms'] / df['households'],
    num_bedroom=df['total_bedrooms'] / df['households'],
    pers_per_house=df['population'] / df['households'],
).drop(columns=['total_bedrooms', 'total_rooms', 'population', 'households'])

In [269]:
# Integer code for each `ocean_proximity` label.
# NOTE(review): the numbering imposes an order on what looks like a nominal
# category — confirm that an ordinal encoding is intended for the linear models.
OCEAN_PROXIMITY_CODES = {
    'NEAR BAY': 1,
    'NEAR OCEAN': 2,
    'ISLAND': 3,
    '<1H OCEAN': 4,
    'INLAND': 5,
}

# Recode the whole column in one pass. Values that are not keys of the table
# (including codes written by a previous run of this cell) pass through
# unchanged, so the cell stays safe to re-run. Building a fresh Series lets
# pandas infer a numeric dtype instead of keeping Python ints in an object column.
df['ocean_proximity'] = df['ocean_proximity'].map(
    lambda label: OCEAN_PROXIMITY_CODES.get(label, label)
)

In [270]:
# Feature matrix (six columns) and regression target.
X = df.loc[:, ['housing_median_age', 'num_rooms', 'num_bedroom',
               'median_income', 'ocean_proximity', 'pers_per_house']]
y = df.loc[:, "median_house_value"]

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Separate min-max scalers for the inputs and the target.
X_scaler, y_scaler = MinMaxScaler(), MinMaxScaler()

# The scalers learn their ranges from the training split only; the test
# split is transformed with those same ranges.
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# MinMaxScaler expects 2-D input, hence the single-column reshape.
y_train_scaled = y_scaler.fit_transform(y_train.to_numpy().reshape(-1, 1))
y_test_scaled = y_scaler.transform(y_test.to_numpy().reshape(-1, 1))

In [271]:
# Plain numeric inputs. Insertion order matters: `list(featcols.keys())` below
# selects the DataFrame columns handed to the input functions.
NUMERIC_FEATURES = ['housing_median_age', 'median_income', 'num_rooms',
                    'num_bedroom', 'pers_per_house', 'ocean_proximity']
featcols = {
    colname: tf.feature_column.numeric_column(colname)
    for colname in NUMERIC_FEATURES
}
# Bucketize lat, lon so it's not so high-res; California is mostly N-S, so more lats than lons
featcols['longitude'] = tf.feature_column.bucketized_column(
    tf.feature_column.numeric_column('longitude'),
    np.linspace(-124.3, -114.3, 5).tolist())
featcols['latitude'] = tf.feature_column.bucketized_column(
    tf.feature_column.numeric_column('latitude'),
    np.linspace(32.5, 42, 10).tolist())

# Split into train and eval (~80/20). The mask is drawn from a seeded
# generator so the split is the same on every run.
SPLIT_SEED = 42
msk = np.random.default_rng(SPLIT_SEED).random(len(df)) < 0.8
traindf = df[msk]
evaldf = df[~msk]

# Labels are divided by SCALE for both training and evaluation; the reporting
# helper multiplies the RMSE by the same constant to get back to USD.
SCALE = 100000

BATCH_SIZE = 100

train_input_fn = tf.compat.v1.estimator.inputs.pandas_input_fn(
    x=traindf[list(featcols.keys())],
    y=traindf["median_house_value"] / SCALE,  # same scaling as the eval labels
    batch_size=BATCH_SIZE,
    shuffle=True)

eval_input_fn = tf.compat.v1.estimator.inputs.pandas_input_fn(
    x=evaldf[list(featcols.keys())],
    y=evaldf["median_house_value"] / SCALE,  # note the scaling
    num_epochs=10,
    batch_size=len(evaldf),
    shuffle=False)
print('# of Rows, Columns: ',df.shape)
print(df.head(15))

# of Rows, Columns:  (20433, 9)
    longitude  latitude  housing_median_age  median_income  \
0     -122.23     37.88                41.0         8.3252   
1     -122.22     37.86                21.0         8.3014   
2     -122.24     37.85                52.0         7.2574   
3     -122.25     37.85                52.0         5.6431   
4     -122.25     37.85                52.0         3.8462   
5     -122.25     37.85                52.0         4.0368   
6     -122.25     37.84                52.0         3.6591   
7     -122.25     37.84                52.0         3.1200   
8     -122.26     37.84                42.0         2.0804   
9     -122.25     37.84                52.0         3.6912   
10    -122.26     37.85                52.0         3.2031   
11    -122.26     37.85                52.0         3.2705   
12    -122.26     37.85                52.0         3.0750   
13    -122.26     37.84                52.0         2.6736   
14    -122.26     37.85               

In [272]:
# Factor the labels were divided by; multiplying restores USD.
SCALE = 100000


def print_rmse(model, name, input_fn):
    """Evaluate `model` for one step and print its RMSE in USD.

    Args:
        model: object whose `evaluate(input_fn=..., steps=...)` returns a
            mapping that contains the key 'average_loss'.
        name: label inserted into the printed message.
        input_fn: input function forwarded unchanged to `model.evaluate`.

    Returns:
        None. The result is printed.
    """
    # `steps` is the keyword the evaluate call needs; the former `septs`
    # spelling made the call fail.
    metrics = model.evaluate(input_fn=input_fn, steps=1)
    # average_loss is a mean squared error on SCALE-divided labels, so take
    # the root and multiply back by SCALE.
    rmse_usd = np.sqrt(metrics['average_loss']) * SCALE
    print("RMES sur {}dataset={} USD".format(name, rmse_usd))

In [273]:
# NOTE(review): pickle.load executes arbitrary code from the file — only load
# pickles produced by a trusted source.

# Training split: features and target are read from two pickle files.
with open("data/X_train_scaled.pickle", 'rb') as xtrain, \
        open("data/y_train_scaled.pickle", 'rb') as ytrain:
    X_train = pickle.load(xtrain)
    y_train = pickle.load(ytrain)

# Test split, stored the same way.
with open("data/X_test_scaled.pickle", 'rb') as xtest, \
        open("data/y_test_scaled.pickle", 'rb') as ytest:
    X_test = pickle.load(xtest)
    y_test = pickle.load(ytest)

In [274]:
class BR(BaseEstimator,RegressorMixin):
    """Bayesian ridge regression with iteratively re-estimated precisions.

    `lamd` multiplies the identity matrix and `alph` multiplies ``X.T @ X`` in
    the posterior precision (see `posterior`). `fit` alternates between
    computing the posterior for the current pair and re-estimating both values
    until neither changes by more than `rtol` (relative) or `maxiter`
    iterations have run.

    Note: `fit` overwrites `self.lamd` and `self.alph` with the re-estimated
    values, so after fitting they no longer hold the constructor arguments.

    Attributes set by `fit`:
        X, y: training arrays as used for the fit.
        m_N, S_N: posterior mean and covariance of the coefficients.
        beta: same value as `alph`; kept for code that read the old attribute.
    Attributes set by `predict`:
        mu_s: predictive mean for the last `predict` call.
        cov_s: predictive variance for the last `predict` call.
    """

    def __init__(self,lamd=1.0e-5,alph=1e-5,maxiter=2000,rtol=1.0e-5,verbose=True):
        self.maxiter = maxiter  # upper bound on re-estimation iterations
        self.rtol = rtol        # relative tolerance of the convergence test
        self.lamd = lamd        # initial coefficient (prior) precision
        self.alph = alph        # initial noise precision
        self.verbose = verbose  # print the converged values when truthy

    @staticmethod
    def posterior(X,y,lamd,alph):
        """Return the posterior mean and covariance of the coefficients.

        Args:
            X: 2-D array; its second dimension sets the number of coefficients.
            y: targets, one per row of `X`.
            lamd: factor applied to the identity matrix.
            alph: factor applied to ``X.T @ X``.

        Returns:
            Tuple ``(m_N, S_N)``: posterior mean and posterior covariance.
        """
        ndim = X.shape[1]
        S_N_inv = lamd * np.eye(ndim) + alph * X.T.dot(X)
        # `inv` was never imported in this notebook; use numpy's routine.
        S_N = np.linalg.inv(S_N_inv)
        m_N = alph * S_N.dot(X.T).dot(y)
        return m_N, S_N

    def fit(self,X,y):
        """Fit the model and re-estimate `lamd` and `alph` from the data.

        Args:
            X: 2-D array or DataFrame of numeric features.
            y: array or Series of targets, one per row of `X`.

        Returns:
            self
        """
        # Accept arrays and pandas objects alike, independently for X and y.
        self.X = np.asarray(X, dtype=float)
        self.y = np.asarray(y, dtype=float)
        ntot = self.X.shape[0]

        # Eigenvalues of X^T X are fixed; only their alph scaling changes.
        eig0 = np.linalg.eigvalsh(self.X.T.dot(self.X))

        for niter in range(self.maxiter):
            alph1 = self.alph
            lamd1 = self.lamd
            eig = eig0 * self.alph

            # Posterior for the current pair of precisions.
            self.m_N, self.S_N = self.posterior(self.X, self.y, self.lamd, self.alph)

            gamma = np.sum(eig / (eig + self.lamd))
            self.lamd = gamma / np.sum(self.m_N ** 2)
            noise_var = np.sum((self.y - self.X.dot(self.m_N)) ** 2) / (ntot - gamma)
            # The re-estimated noise precision must feed the next iteration.
            # It used to be stored only in `beta`, so `alph` never moved and
            # its convergence test below was always satisfied.
            self.alph = 1.0 / noise_var
            self.beta = self.alph

            # Stop once both precisions are stable.
            if (np.isclose(lamd1, self.lamd, rtol=self.rtol)
                    and np.isclose(alph1, self.alph, rtol=self.rtol)):
                if self.verbose:
                    print(f'{self.rtol} achieved at {niter+1} iterations.')
                    print(f'Converged Hyperparameters: {self.lamd,self.alph}')
                return self

        return self

    def predict(self,X):
        """Return the predictive mean for `X`.

        The predictive mean and variance are also stored on the instance as
        `mu_s` and `cov_s`.

        Args:
            X: 2-D array or DataFrame with the same columns used in `fit`.

        Returns:
            1-D array of predicted means (2-D if `fit` received 2-D targets).
        """
        # Work on a plain array: with a DataFrame, the elementwise product
        # below would align on column labels rather than positions. The
        # training matrix stored by `fit` is left untouched.
        X_arr = np.asarray(X, dtype=float)
        self.mu_s = X_arr.dot(self.m_N)
        self.cov_s = 1.0 / self.alph + np.sum(X_arr.dot(self.S_N) * X_arr, axis=1)
        return self.mu_s

In [275]:
# Re-create the 80/20 split from the in-memory X and y (same seed as before).
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, test_size=0.2, random_state=42)



In [276]:
# Model Evaluation w/ Cross Validation
def modelEval(ldf,feature='median_house_value',model_id = 'dummy'):
    """Print the 5-fold cross-validated RMSE of one candidate regressor.

    Args:
        ldf: DataFrame holding the feature columns and the target column.
            For compatibility with older calls that passed a placeholder,
            any non-DataFrame value falls back to the notebook-level `df`.
        feature: name of the target column.
        model_id: 'dummy' (DummyRegressor), 'br' (BR with verbose off) or
            'rf' (RandomForestRegressor with 10 trees, random_state=10).

    Raises:
        ValueError: if `model_id` is not one of the names above.

    Returns:
        None. The per-fold scores, their mean and their standard deviation
        are printed.
    """
    # Use the frame that was passed in; this function used to read the global
    # `df` regardless of its argument.
    data = ldf if isinstance(ldf, pd.DataFrame) else df

    # Split feature/target variable
    y = data[feature].copy()
    X = data.drop(columns=[feature])

    # Pick the model (compared with ==; `is` on strings tests identity)
    if model_id == 'dummy':
        model = DummyRegressor()
    elif model_id == 'br':
        model = BR(verbose=False)
    elif model_id == 'rf':
        model = RandomForestRegressor(n_estimators=10, random_state=10)
    else:
        raise ValueError(f"unknown model_id: {model_id!r}")

    # Standard cross validation. The scorer returns negated MSE, so flip the
    # sign before taking the root.
    cv_score = np.sqrt(-cross_val_score(model,X,y,cv=5,scoring='neg_mean_squared_error'))
    print("Scores:",cv_score)
    print("Mean:", cv_score.mean())
    print("std:", cv_score.std())

In [277]:
# Baseline: cross-validated RMSE of the dummy regressor on the prepared frame.
# The DataFrame itself is passed; the earlier argument was a list holding a
# single string of variable names, which is not usable data.
modelEval(df, model_id='dummy')

Scores: [114361.51883352 109621.68676131 125662.15311405 112298.79716652
 122776.84571602]
Mean: 116944.20031828254
std: 6195.024974465456


In [278]:
from sklearn.model_selection import train_test_split

# Every column except the target is a feature; a third of the rows is held out.
y = df['median_house_value']
X = df.drop(columns=['median_house_value'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.33, random_state=42
)

In [279]:
from sklearn.linear_model import LinearRegression

# Linear regression fitted on the training split.
reg = LinearRegression()

reg.fit(X=X_train, y=y_train)

LinearRegression()

In [280]:
# Predictions of the linear model for the held-out rows.
predictions = reg.predict(X=X_test)

In [281]:
# Compare the average target with the average prediction on the test split.
actual_mean = y_test.mean()
predicted_mean = predictions.mean()
print(f'actual:{actual_mean}')
print(f'prediction: {predicted_mean}')

actual:205677.60803796528
prediction: 206280.9631702541


In [282]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Test-split error metrics for the current `predictions`.
print(f'MAE: {mean_absolute_error(y_test, predictions)}')
print(f'MSE: {mean_squared_error(y_test, predictions)}')
print(f'Score: {r2_score(y_test, predictions)}')

MAE: 52360.19190552398
MSE: 5174333830.004562
Score: 0.6097593196371391


In [283]:
from sklearn.model_selection import cross_validate
# 8-fold cross-validation of the linear model, reporting R^2 and MSE per fold.
cross_validate(
    reg, X, y,
    cv=8,
    scoring={'r2': make_scorer(r2_score), 'e2': make_scorer(mean_squared_error)},
    return_train_score=False,
)

{'fit_time': array([0.06678104, 0.03797865, 0.03697824, 0.03498077, 0.03298163,
        0.02998328, 0.02298689, 0.02498651]),
 'score_time': array([0.01599097, 0.00999427, 0.00999355, 0.00999331, 0.00899482,
        0.00699544, 0.00799489, 0.0059967 ]),
 'test_r2': array([0.38451201, 0.52563889, 0.52378341, 0.56264329, 0.61075479,
        0.4400234 , 0.45522649, 0.72493079]),
 'test_e2': array([5.84429972e+09, 6.25733707e+09, 5.95505789e+09, 6.44680767e+09,
        4.58250875e+09, 4.47184498e+09, 7.73444381e+09, 3.96144229e+09])}

In [284]:
from sklearn.ensemble import RandomForestRegressor
# Fixed seed so the forest, and every score derived from it, is the same on
# each run of the notebook.
Reg_model = RandomForestRegressor(random_state=42)

In [285]:
# Fit the forest on the training split.
Reg_model.fit(X=X_train, y=y_train)

RandomForestRegressor()

In [286]:
# Forest predictions for the held-out rows, with the two means side by side.
actual = y_test
predictions = Reg_model.predict(X=X_test)
actual_mean = np.mean(actual)
prediction_mean = np.mean(predictions)
print(f'Actual mean:{actual_mean}')
print(f'Prediction mean: {prediction_mean}')

Actual mean:205677.60803796528
Prediction mean: 206253.72087349842


In [287]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Test-split error metrics for the forest's `predictions`.
print(f'MAE: {mean_absolute_error(y_test, predictions)}')
print(f'MSE: {mean_squared_error(y_test, predictions)}')
print(f'Score: {r2_score(y_test, predictions)}')

MAE: 32338.340965445648
MSE: 2496663663.0347157
Score: 0.811705282552471


In [288]:
from sklearn.model_selection import cross_validate
# 5-fold cross-validation of the forest, reporting R^2 and MSE per fold.
cross_validate(
    Reg_model, X, y,
    cv=5,
    scoring={'r2': make_scorer(r2_score), 'e2': make_scorer(mean_squared_error)},
    return_train_score=False,
)

{'fit_time': array([12.83272982, 12.03180861, 12.00187969, 10.8289535 , 11.16567826]),
 'score_time': array([0.07813883, 0.08726931, 0.09874892, 0.07812381, 0.09374785]),
 'test_r2': array([0.4398467 , 0.64595424, 0.73888425, 0.4451187 , 0.72533808]),
 'test_e2': array([6.02211259e+09, 4.15206530e+09, 3.76569002e+09, 6.51764046e+09,
        3.98682880e+09])}

In [289]:
# R^2 of the current `predictions` on the test split.
print(f'Score: {r2_score(y_test, predictions)}')

Score: 0.811705282552471


In [290]:
from tensorflow.keras.models import Sequential 
from keras.layers.core import Dense

# # define the model
# def larger_model():
#     # create model
#     model = Sequential()
#     model.add(Dense(10, input_dim=10, kernel_initializer='normal', activation='relu'))
#     model.add(Dense(5, kernel_initializer='normal', activation='relu'))
#     model.add(Dense(1, kernel_initializer='normal'))
#     # Compile model
#     model.compile(loss='mean_squared_error', optimizer='adam')
#     return model

def build_and_compile_model(norm):
    """Build and compile a regression network with two hidden layers.

    The layers are reached through `tf.keras` so the function only needs the
    `tf` import from the top of the notebook, not `keras`/`layers` aliases
    that are imported in later cells.

    Args:
        norm: layer placed first in the network.
            NOTE(review): presumably an adapted normalization layer — confirm.

    Returns:
        The compiled `Sequential` model: `norm`, two 64-unit ReLU layers and
        a single linear output, with mean-absolute-error loss and Adam at
        learning rate 0.001.
    """
    model = tf.keras.Sequential([
        norm,
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(1)
    ])

    model.compile(loss='mean_absolute_error',
                  optimizer=tf.keras.optimizers.Adam(0.001))
    return model


In [291]:
def DNN_model(norm):
    """Build and compile a regression network with three hidden layers.

    The layers are reached through `tf.keras` so the function only needs the
    `tf` import from the top of the notebook, not `keras`/`layers` aliases
    that are imported in later cells.

    Args:
        norm: layer placed first in the network.
            NOTE(review): presumably an adapted normalization layer — confirm.

    Returns:
        The compiled `Sequential` model: `norm`, three 64-unit ReLU layers
        and a single linear output, with mean-absolute-error loss and Adam at
        learning rate 0.001.
    """
    model = tf.keras.Sequential([
        norm,
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(1)
    ])

    model.compile(loss='mean_absolute_error',
                  optimizer=tf.keras.optimizers.Adam(0.001))
    return model

In [292]:
# Missing-value count per training feature.
X_train.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
median_income         0
ocean_proximity       0
num_rooms             0
num_bedroom           0
pers_per_house        0
dtype: int64


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Aliases used by the Keras training / evaluation cells further down.
train_features, test_features = X_train, X_test
train_labels, test_labels = y_train, y_test

X_scaler = MinMaxScaler()
y_scaler = MinMaxScaler()

# Fit & transform the train split and just transform the test split
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

y_train_scaled = y_scaler.fit_transform(np.array(y_train).reshape(-1, 1))
y_test_scaled = y_scaler.transform(np.array(y_test).reshape(-1, 1))

# Inputs of the "house" model: the six columns the later fit cell selects.
HOUSE_FEATURES = ['housing_median_age', 'median_income', 'num_rooms',
                  'num_bedroom', 'pers_per_house', 'ocean_proximity']
house = np.asarray(train_features[HOUSE_FEATURES], dtype='float32')

# A Keras normalization layer is what `.adapt` and the Sequential model below
# require; the earlier value was a tuple of sklearn scalers, which has no
# `adapt` and cannot be used as a layer.
house_normalizer = tf.keras.layers.Normalization(axis=-1)
house_normalizer.adapt(house)

# Linear baseline: normalization followed by a single output unit.
house_model = tf.keras.Sequential([
    house_normalizer,
    tf.keras.layers.Dense(units=1)
])

house_model.summary()

# Deeper model on the same inputs; the builder needs the normalization layer.
dnn_house_model = build_and_compile_model(house_normalizer)

dnn_house_model.summary()

# NOTE(review): this cell looks like an unrelated sequence-classification
# experiment pasted into the notebook; it cannot run as written. Problems are
# flagged inline and need a decision from the author (fix or delete).
# NOTE(review): LSTM is used below as a layer class; presumably it is not an
# installable conda package — confirm and remove this line.
!conda install LSTM 
from tensorflow.keras import layers
# NOTE(review): presumably meant `from tensorflow.keras.layers import LSTM`
# — confirm; `Dropout` used below is not imported anywhere in view either.
from tensorflow import LSTM
model = Sequential()

model.add(LSTM(128, activation='relu',
               input_shape=(1000, 1), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
# NOTE(review): sigmoid output with binary cross-entropy and accuracy is a
# binary-classification setup, while the rest of the notebook is regression.
model.add(Dense(1, activation='sigmoid'))

# NOTE(review): `opt` is created but never used; compile() below is given the
# string 'rmsprop' instead.
opt = tf.keras.optimizers.Adam(lr=1e-3, decay=1e-5)

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# NOTE(review): `x_train` / `x_test` (lower-case) are not defined in the
# visible notebook; the splits above are named `X_train` / `X_test`.
model.fit(x_train, y_train, epochs=3, validation_data=(x_test, y_test))

%%time
history = dnn_house_model.fit(
    train_features['housing_median_age','median_income','num_rooms','num_bedroom','pers_per_house','ocean_proximity'],
    train_labels,
    validation_split=0.2,
    verbose=0, epochs=100)

plot_loss(history)

In [None]:
# Dedicated names: assigning to `y` here would overwrite the notebook-level
# regression target that the cross-validation cell at the end still uses.
x_grid = tf.linspace(0.0, 250, 251)
# NOTE(review): this is a single-column grid, while the house model is fitted
# on six columns — confirm what input this sweep is meant to use.
y_grid = dnn_house_model.predict(x_grid)

# NOTE(review): `plot_house` is not defined in the visible notebook.
plot_house(x_grid, y_grid)

In [None]:
# Create the results table on first use; nothing earlier in the notebook does.
try:
    test_results
except NameError:
    test_results = {}

# Evaluate on the same six columns the house model was fitted on. The target
# column is not part of the feature frame, so it cannot be used as input.
test_results['dnn_house_model'] = dnn_house_model.evaluate(
    test_features[['housing_median_age', 'median_income', 'num_rooms',
                   'num_bedroom', 'pers_per_house', 'ocean_proximity']].astype('float32'),
    test_labels,
    verbose=0)

In [None]:
# Normalization layer over every training feature; no `normalizer` is defined
# anywhere earlier in the notebook, so it is created and adapted here.
normalizer = tf.keras.layers.Normalization(axis=-1)
normalizer.adapt(np.asarray(train_features, dtype='float32'))

dnn_model = build_and_compile_model(normalizer)
dnn_model.summary()

In [None]:
%%time
history = dnn_model.fit(
    train_features,
    train_labels,
    validation_split=0.2,
    verbose=0, epochs=100)

In [None]:
# NOTE(review): `plot_loss` is not defined in the visible notebook — confirm
# where it comes from before relying on this cell.
plot_loss(history)

In [None]:
# Create the results table on first use; nothing earlier in the notebook does.
try:
    test_results
except NameError:
    test_results = {}

# Same float32 conversion as used for training.
test_results['dnn_model'] = dnn_model.evaluate(
    np.asarray(test_features, dtype='float32'), test_labels, verbose=0)

In [None]:
test_predictions = dnn_model.predict(np.asarray(test_features, dtype='float32')).flatten()

# NOTE(review): `plt` is not imported anywhere in the visible notebook.
a = plt.axes(aspect='equal')
plt.scatter(test_labels, test_predictions)
# The target is a house value in USD; the former "[MPG]" labels and the fixed
# 0-50 range belonged to a different dataset.
plt.xlabel('True Values [USD]')
plt.ylabel('Predictions [USD]')
# Shared limits taken from the data so the identity line spans the whole plot.
upper = float(max(np.max(test_labels), np.max(test_predictions)))
lims = [0, upper]
plt.xlim(lims)
plt.ylim(lims)
_ = plt.plot(lims, lims)

In [None]:
# Distribution of signed prediction errors on the test split.
# NOTE(review): `plt` is not imported anywhere in the visible notebook.
error = test_predictions - test_labels
plt.hist(error, bins=25)
# The target is a house value in USD, not MPG.
plt.xlabel('Prediction Error [USD]')
_ = plt.ylabel('Count')

In [None]:
# Persist the trained network under the path 'dnn_model'.
dnn_model.save(filepath='dnn_model')

#from keras.wrappers.scikit_learn import KerasRegressor
from tensorflow import keras
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline

def larger_model():
    """Build and compile the small MLP evaluated in the pipeline below.

    The input width is read from the notebook-level feature frame `X`.

    Returns:
        A compiled `Sequential` model (10 -> 5 -> 1 units, MSE loss, Adam).
    """
    model = tf.keras.Sequential([
        tf.keras.Input(shape=(X.shape[1],)),
        tf.keras.layers.Dense(10, kernel_initializer='normal', activation='relu'),
        tf.keras.layers.Dense(5, kernel_initializer='normal', activation='relu'),
        tf.keras.layers.Dense(1, kernel_initializer='normal'),
    ])
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model


# evaluate model with standardized dataset
# The scaler is a pipeline step, so each CV fold is standardized with
# statistics from its own training part only.
# NOTE(review): `tensorflow.keras.wrappers.scikit_learn` is presumably not
# shipped with recent TensorFlow releases — confirm against the installed version.
estimators = [
    ('standardize', StandardScaler()),
    ('mlp', KerasRegressor(build_fn=larger_model, epochs=10, batch_size=5, verbose=1)),
]
pipeline = Pipeline(estimators)
kfold = KFold(n_splits=8)
results = cross_val_score(pipeline, X, y, cv=kfold)

