In [1]:
# importing dependencies

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import sklearn.metrics
from sklearn.preprocessing import StandardScaler

In [2]:
# reading in csv of data

df1 = pd.read_csv("Data/Sale_Prices_zip.csv")
df1.head()

Unnamed: 0,RegionID,RegionName,StateName,SizeRank,2008-03,2008-04,2008-05,2008-06,2008-07,2008-08,...,2019-05,2019-06,2019-07,2019-08,2019-09,2019-10,2019-11,2019-12,2020-01,2020-02
0,61639,10025,New York,1,,,,,,,...,1218400.0,1024700.0,1031100.0,837800.0,965500.0,896100.0,873100.0,852000.0,869200.0,910000.0
1,84654,60657,Illinois,2,370100.0,391300.0,388700.0,382200.0,381300.0,378500.0,...,371300.0,398000.0,418200.0,424500.0,425700.0,411900.0,400500.0,390300.0,363800.0,380900.0
2,61637,10023,New York,3,,,,,,,...,1127200.0,1227900.0,1142000.0,1145000.0,1049600.0,1014900.0,1044600.0,1142200.0,1051200.0,1136300.0
3,91982,77494,Texas,4,232000.0,240900.0,243000.0,255500.0,253400.0,259000.0,...,331900.0,328300.0,320900.0,315100.0,317500.0,315200.0,319000.0,313000.0,316900.0,
4,84616,60614,Illinois,5,,,,,,,...,510300.0,490900.0,497300.0,468100.0,489700.0,484000.0,537700.0,566600.0,570900.0,555300.0


In [3]:
# dropping columns that we won't be using for our machine learning algorithm

sales_price_df = df1.drop(["RegionID", "StateName", "SizeRank"], axis=1)
sales_price_df = sales_price_df.rename(columns={'RegionName': 'zip_code'})
sales_price_df = sales_price_df.drop(["2020-01", "2020-02"], axis=1)
sales_price_df.head()

Unnamed: 0,zip_code,2008-03,2008-04,2008-05,2008-06,2008-07,2008-08,2008-09,2008-10,2008-11,...,2019-03,2019-04,2019-05,2019-06,2019-07,2019-08,2019-09,2019-10,2019-11,2019-12
0,10025,,,,,,,,,,...,1066800,1038000.0,1218400.0,1024700.0,1031100.0,837800.0,965500.0,896100.0,873100.0,852000.0
1,60657,370100.0,391300.0,388700.0,382200.0,381300.0,378500.0,364600.0,362300.0,357100.0,...,407000,379600.0,371300.0,398000.0,418200.0,424500.0,425700.0,411900.0,400500.0,390300.0
2,10023,,,,,,,,,,...,1181600,1256500.0,1127200.0,1227900.0,1142000.0,1145000.0,1049600.0,1014900.0,1044600.0,1142200.0
3,77494,232000.0,240900.0,243000.0,255500.0,253400.0,259000.0,253200.0,238900.0,228000.0,...,326100,328300.0,331900.0,328300.0,320900.0,315100.0,317500.0,315200.0,319000.0,313000.0
4,60614,,,,,,,,,,...,526100,491200.0,510300.0,490900.0,497300.0,468100.0,489700.0,484000.0,537700.0,566600.0


In [4]:
# checking for NaN values

sales_price_df.isna()

Unnamed: 0,zip_code,2008-03,2008-04,2008-05,2008-06,2008-07,2008-08,2008-09,2008-10,2008-11,...,2019-03,2019-04,2019-05,2019-06,2019-07,2019-08,2019-09,2019-10,2019-11,2019-12
0,False,True,True,True,True,True,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,True,True,True,True,True,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,True,True,True,True,True,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6425,False,True,True,True,True,True,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
6426,False,True,True,True,True,True,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
6427,False,True,True,True,True,True,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
6428,False,True,True,True,True,True,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False


In [5]:
# iterating through the rows in the df and converting our NaN values 
# into the average value for each column

for columnName, column in sales_price_df.iteritems():
    sales_price_df[columnName] = round(column.fillna(column.mean()),2)

In [6]:
# checking if all columns have been converted

sales_price_df.isna()

Unnamed: 0,zip_code,2008-03,2008-04,2008-05,2008-06,2008-07,2008-08,2008-09,2008-10,2008-11,...,2019-03,2019-04,2019-05,2019-06,2019-07,2019-08,2019-09,2019-10,2019-11,2019-12
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6425,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6426,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6427,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6428,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [7]:
# sales_price_df.to_csv('Tableau_data/tableau_salesprice_raw.csv', index=False)

In [8]:
# creating our % change formula that will be utilized as our testing data
# (ending price - starting price/starting price) = % change

sales_price_df["percent_change"] = (sales_price_df["2019-12"] - sales_price_df["2008-03"])/sales_price_df["2008-03"]
sales_price_df.head()

# assessing our equity risk for investment

sales_price_df.loc[sales_price_df['percent_change'] > 0, 'equity_risk'] = True 
sales_price_df.loc[sales_price_df['percent_change'] <= 0, 'equity_risk'] = False

sales_price_df.head()

Unnamed: 0,zip_code,2008-03,2008-04,2008-05,2008-06,2008-07,2008-08,2008-09,2008-10,2008-11,...,2019-05,2019-06,2019-07,2019-08,2019-09,2019-10,2019-11,2019-12,percent_change,equity_risk
0,10025,269259.83,264101.03,257829.16,253550.8,250369.45,248516.75,246508.74,243987.72,240695.55,...,1218400.0,1024700.0,1031100.0,837800.0,965500.0,896100.0,873100.0,852000.0,2.16423,True
1,60657,370100.0,391300.0,388700.0,382200.0,381300.0,378500.0,364600.0,362300.0,357100.0,...,371300.0,398000.0,418200.0,424500.0,425700.0,411900.0,400500.0,390300.0,0.05458,True
2,10023,269259.83,264101.03,257829.16,253550.8,250369.45,248516.75,246508.74,243987.72,240695.55,...,1127200.0,1227900.0,1142000.0,1145000.0,1049600.0,1014900.0,1044600.0,1142200.0,3.241999,True
3,77494,232000.0,240900.0,243000.0,255500.0,253400.0,259000.0,253200.0,238900.0,228000.0,...,331900.0,328300.0,320900.0,315100.0,317500.0,315200.0,319000.0,313000.0,0.349138,True
4,60614,269259.83,264101.03,257829.16,253550.8,250369.45,248516.75,246508.74,243987.72,240695.55,...,510300.0,490900.0,497300.0,468100.0,489700.0,484000.0,537700.0,566600.0,1.104287,True


In [8]:
# setting up our logistic regression model

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [9]:
# Assign X (data) and y (target)

X = sales_price_df.drop(sales_price_df.columns.to_series()["percent_change":"equity_risk"], axis=1)
y = sales_price_df["equity_risk"]
print(X.shape, y.shape)

(6430, 143) (6430,)


In [10]:
# splitting our values into training and testing variables

X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.8, random_state=42)

In [11]:
# Fit (train) or model using the training data

classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [12]:
# Validate the model using the test data

print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.9998055987558321
Testing Data Score: 0.9930015552099534


In [13]:
# make predictions

predictions = classifier.predict(X_test)
print(f"First 10 Predictions:   {predictions[:10].tolist()}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

First 10 Predictions:   [True, True, True, True, True, True, True, True, True, True]
First 10 Actual labels: [True, True, True, True, True, True, True, True, True, True]


In [14]:
model_alg_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
model_alg_df

Unnamed: 0,Prediction,Actual
0,True,True
1,True,True
2,True,True
3,True,True
4,True,True
...,...,...
1281,False,False
1282,True,True
1283,False,False
1284,True,True


In [15]:
model_alg_df.to_csv("Data/algorithm_predicti")
sales_price_df.to_csv("Data/cleaned_model.csv", index=False)

In [16]:
from sqlalchemy import create_engine

In [17]:
connection_string = "postgres:apartment@homeslice.cjnrjw08kldx.us-east-2.rds.amazonaws.com:5432/HomeSliceDB"
engine = create_engine(f'postgresql://{connection_string}')

In [18]:
sales_price_df.to_sql(name='sales_price', con=engine, if_exists='replace', index=False)