# Dependencies

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline

# Preprocessing the Data

In [None]:
# Load the raw property listings from CSV into a DataFrame.
# The path is relative, so the file must be in the notebook's working directory.
filepath = "Austin Properties 2019.1.6.csv"
csv_df = pd.read_csv(filepath)

In [None]:
# Display the column labels of the loaded frame; later cells select columns by these exact names
csv_df.columns

In [None]:
# One-hot encode the categorical 'Washer/Dryer' column into 'Washer/Dryer_<value>' indicator columns.
# NOTE: csv_df is rebound here, so re-running this cell without reloading the CSV fails
# because the original 'Washer/Dryer' column no longer exists.
csv_df = pd.get_dummies(csv_df, columns=['Washer/Dryer'])

In [None]:
# Keep only the model inputs plus the target column.
# The target is listed last; the feature/target split further down relies on that.
original_df = csv_df.loc[:, [
    'Bed', 'Bath', 'Avg SF', 'Concessions %', 'Year Built',
    'Walk in Closet', 'Hardwood/Vinyl Floor',
    # indicator columns produced by one-hot encoding 'Washer/Dryer'
    'Washer/Dryer_Yes but not in unit', 'Washer/Dryer_Yes in unit', 'Washer/Dryer_no',
    'Walk Score', 'Transit Score',
    '1 Mile Population', '1 Mile Median Household Income',
    'Miles from Domain', 'Miles from Downtown',
    # target
    'Avg Effective Rent/Unit',
]]

In [None]:
# Discard every row that has a missing value in any of the selected columns
austin_properties_df = original_df.dropna(axis=0, how='any')
# Non-null count per column; after the drop every column should report the same number
austin_properties_df.count()

In [None]:
# Display the dtype of each column in the cleaned frame (the models below need numeric inputs)
austin_properties_df.dtypes

In [None]:
# Preview the first rows of the cleaned frame
austin_properties_df.head()

In [None]:
# Mean of the 'Year Built' column over the cleaned rows
austin_properties_df["Year Built"].mean()

# Determine X and y for Machine Learning

In [None]:
# Split the cleaned frame into the feature matrix X and the target vector y.
# Both are selected by column label (rather than X by position and y by label),
# so the target cannot leak into X if the column order of the frame ever changes.
X = austin_properties_df.drop(columns=['Avg Effective Rent/Unit'])
y = austin_properties_df['Avg Effective Rent/Unit']

# Splitting the data into training and testing sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing and train on the remaining 80%.
# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)

In [None]:
# Print the shapes of the four splits, one per line, as a sanity check
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep="\n")

# Model (I) - Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Fit an ordinary least-squares linear regression on the training split
lin_model = LinearRegression()
lin_model.fit(X=X_train, y=y_train)

In [None]:
# Evaluate the linear model on the held-out test split.
# y_test_predicted is reused by the residual plot at the end of the notebook.
y_test_predicted = lin_model.predict(X_test)
mse_test = mean_squared_error(y_test, y_test_predicted)
r2_test = r2_score(y_test, y_test_predicted)
print(f"Mean Squared Error (MSE): {mse_test}")
# Label typo fixed: previously printed "R-quared"
print(f"R-squared (R2): {r2_test}")

# Model (II) - Polynomial Regression

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

In [None]:
# Degree-2 polynomial regression: expand the features, then fit a linear model on them.
# The regression is fitted without its own intercept (fit_intercept=False).
poly_model = Pipeline(steps=[
    ('a', PolynomialFeatures(2)),
    ('b', LinearRegression(fit_intercept=False)),
])

In [None]:
# Fit the polynomial pipeline on the training split and display its score on the test split
poly_model.fit(X_train, y_train).score(X_test, y_test)

# Model (III) - Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Grid-search the number of trees for a random forest regressor.
# The forest is seeded (same seed as the train/test split) so that the search
# results and the selected model are reproducible across runs.
forest = RandomForestRegressor(random_state=5)
# Candidate tree counts: the odd numbers 1, 3, ..., 59
param_grid = {'n_estimators' : list(range(1,60,2))}
model_rf = GridSearchCV(forest, param_grid, verbose=3)

model_rf.fit(X_train, y_train)
print(model_rf.best_params_)
print(model_rf.best_score_)

# Predictions of the best estimator on the held-out test split
y_pred_rf = model_rf.predict(X_test)

In [None]:
import pickle
# Persist the fitted grid-search object to disk.
# NOTE(review): this is an absolute, machine-specific path; consider a path relative to the project.
rf_model_filepath = '/Users/apple/Documents/Data Projects/Random_Forest_Model.sav'
# The context manager guarantees the file is flushed and closed even if pickling fails
with open(rf_model_filepath, 'wb') as model_file:
    pickle.dump(model_rf, model_file)

In [None]:
# Reload the saved model and confirm it still scores on the test split.
# Unpickling executes arbitrary code, so only load files you created yourself.
# The context manager closes the file handle once loading is done.
with open(rf_model_filepath, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model.score(X_test, y_test)
print(result)

In [None]:
# Build one hypothetical listing to predict on. It is defined here because this is the
# first cell that uses X_new; without it the notebook fails on a fresh kernel run.
# Values follow the feature column order: Bed, Bath, Avg SF, Concessions %, Year Built,
# Walk in Closet, Hardwood/Vinyl Floor, the three Washer/Dryer indicators, Walk Score,
# Transit Score, 1 Mile Population, 1 Mile Median Household Income,
# Miles from Domain, Miles from Downtown.
X_new = np.reshape(
    pd.to_numeric([1, 1, 750, 0.02, 2018, 1, 1, 1, 0, 0, 100, 100, 70000, 50000, 0, 0]),
    (1, 16),
)
# One row, sixteen features
X_new.shape

In [None]:
# Load the copy of the model stored under the application's static resources.
# NOTE(review): this is an absolute, machine-specific path; consider a path relative to the project.
rf_model_filepath2 = '/Users/apple/Documents/Data Projects/Machine_Learning_Apartment_Rent/Application/static/resources/Random_Forest_Model.sav'
# Unpickling executes arbitrary code, so only load files you created yourself.
# The context manager closes the file handle once loading is done.
with open(rf_model_filepath2, 'rb') as model_file:
    loaded_model2 = pickle.load(model_file)

In [None]:
# Display the prediction of the model loaded from the application resources for the sample row X_new
loaded_model2.predict(X_new)

In [None]:
# Display the prediction of the reloaded random forest model for the sample row X_new
loaded_model.predict(X_new)

In [None]:
# One hypothetical listing (values in feature column order), as a 1 x 16 numeric array
X_new = np.reshape(
    pd.to_numeric([1, 1, 750, 0.02, 2018, 1, 1, 1, 0, 0, 100, 100, 70000, 50000, 0, 0]),
    (1, 16),
)
# Prediction of the in-memory grid-searched random forest for that listing
print(model_rf.predict(X_new))

In [None]:
# List installed packages and their versions, as reported by the shell's pip, to record the environment
!pip freeze

# Model (IV) - Neural Network Model

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

In [None]:
# Print the number of features and their names; the count must match the network's input size
print(len(X_train.columns), X_train.columns, sep="\n")

In [None]:
# Instantiate a min-max scaler.
# NOTE(review): in the cells visible here the scaler is never fitted or applied, so the
# network below is trained on unscaled features. Confirm whether scaling was intended.
scaler = MinMaxScaler()

In [None]:
# Define a small fully connected regression network: 12 -> 8 -> 4 -> 1 units.
model_nn = Sequential()
# The input size is taken from the training data instead of a hard-coded 16, so the
# model stays in sync with the selected feature columns.
model_nn.add(Dense(12, input_dim=X_train.shape[1], kernel_initializer='random_uniform', activation='relu'))
model_nn.add(Dense(8, kernel_initializer='random_uniform', activation='relu'))
model_nn.add(Dense(4, kernel_initializer='random_uniform', activation='relu'))
# NOTE(review): the output unit uses ReLU, so predictions can never be negative; a linear
# output is the more common choice for regression. Confirm this is intended.
model_nn.add(Dense(1, kernel_initializer='random_uniform', activation='relu'))

# compile model: mean squared error loss, Adam optimizer
model_nn.compile(loss='mse', optimizer='adam')

In [None]:
# Train for at most 10000 epochs. Training stops early once the training loss has not
# improved by at least 1000 for 300 consecutive epochs.
model_nn.fit(
    x=X_train,
    y=y_train,
    batch_size=300,
    epochs=10000,
    verbose=1,
    callbacks=[EarlyStopping(monitor='loss', min_delta=1000, patience=300)],
)

In [None]:
# Configure a checkpoint callback that monitors the training loss (lower is better),
# keeps only the best model, saves the full model rather than just the weights, and
# uses period=100. The file name embeds the epoch number and the loss value.
# NOTE(review): this callback is created after model_nn.fit(...) has already run and is not
# passed to any fit call in the visible cells, so no checkpoint is written. Confirm intent.
# NOTE(review): this rebinds `filepath`, which earlier held the CSV path.
filepath = "//weights.{epoch:02d}-{loss:.2f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', 
                             verbose=0, save_best_only=True, 
                             save_weights_only=False, mode='min', period=100)

In [None]:
# Score the neural network on the held-out test split using R-squared
y_predict = model_nn.predict(X_test)
print(r2_score(y_test, y_predict))

In [None]:
# One hypothetical listing (values in feature column order), as a 1 x 16 numeric array
X_new = np.reshape(
    pd.to_numeric([1, 1, 750, 0.02, 2018, 1, 1, 1, 0, 0, 100, 100, 70000, 50000, 0, 0]),
    (1, 16),
)

In [None]:
# Display the neural network's prediction for the sample row X_new
model_nn.predict(X_new)

In [None]:
# Display the feature names, i.e. the order in which values must appear in a row such as X_new
X_train.columns

# Model (V) - K Nearest Neighbour

In [None]:
# Grid-search the number of neighbours for a k-nearest-neighbours regressor.
# NOTE(review): the features are not scaled in the visible cells, so distances are
# dominated by the large-magnitude columns. Confirm whether scaling was intended.
from sklearn.neighbors import KNeighborsRegressor
k_neighbour = KNeighborsRegressor()

from sklearn.model_selection import GridSearchCV
# Candidate neighbour counts: the odd numbers 1, 3, ..., 59
param_grid = {'n_neighbors' : list(range(1,61,2))}
model_k_neighbour = GridSearchCV(k_neighbour, param_grid, verbose=3)

model_k_neighbour.fit(X_train, y_train)

# Print the results explicitly: a bare expression in the middle of a cell is never
# displayed, so the best parameters and score were previously not shown.
print(model_k_neighbour.best_params_)
print(model_k_neighbour.best_score_)

# Predictions of the best estimator on the held-out test split
y_pred_k_neighbour = model_k_neighbour.predict(X_test)

# Residuals

In [None]:
# Residual plot for the linear regression model: residual (predicted minus actual)
# against the predicted value on the test split, with a zero reference line.
plt.scatter(y_test_predicted, y_test_predicted - y_test)
plt.hlines(y=0, xmin=y_test_predicted.min(), xmax=y_test_predicted.max())
# Label the figure so it can be read on its own
plt.xlabel("Predicted Avg Effective Rent/Unit")
plt.ylabel("Residual (predicted - actual)")
plt.title("Linear Regression Residuals")
plt.show()