In [1]:
import pandas as pd
import csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


In [2]:
tsv_data = pd.read_csv('Resources/county_market_tracker_2.tsv000', sep='\t')
tsv_data.head()

Unnamed: 0,period_begin,period_end,period_duration,region_type,region_type_id,table_id,is_seasonally_adjusted,region,city,state,...,sold_above_list_yoy,price_drops,price_drops_mom,price_drops_yoy,off_market_in_two_weeks,off_market_in_two_weeks_mom,off_market_in_two_weeks_yoy,parent_metro_region,parent_metro_region_metro_code,last_updated
0,2012-10-01,2012-10-31,30,county,5,377,f,"Denver County, CO",,Colorado,...,0.125,0.357143,0.075092,0.014286,0.545455,-0.054545,0.545455,"Denver, CO",19740.0,2022-05-15 14:43:42
1,2013-06-01,2013-06-30,30,county,5,358,f,"Ventura County, CA",,California,...,0.129977,0.243421,0.141972,0.013374,0.607843,-0.068925,0.102225,"Oxnard, CA",37100.0,2022-05-15 14:43:42
2,2016-12-01,2016-12-31,30,county,5,1950,f,"Cayuga County, NY",,New York,...,0.074074,0.131579,-0.058132,0.015074,0.25,0.146552,0.05,"Auburn, NY",12180.0,2022-05-15 14:43:42
3,2018-05-01,2018-05-31,30,county,5,326,f,"Merced County, CA",,California,...,-0.05,0.363636,0.292208,0.113636,0.5,,0.5,"Merced, CA",32900.0,2022-05-15 14:43:42
4,2014-08-01,2014-08-31,30,county,5,2898,f,"Daggett County, UT",,Utah,...,0.0,,,,0.0,0.0,,Utah nonmetropolitan area,,2022-05-15 14:43:42


In [3]:
california_df=tsv_data.loc[(tsv_data["state"]=="California")]
california_df=california_df.sort_values(["region","period_end"],ascending=True).fillna(0)
california_df[['Year','Month','Day']] = california_df['period_end'].str.split('-', expand=True).astype(int)
california_df[['region', '']]=california_df['region'].str.split(',', expand=True)
california_df.columns

Index(['period_begin', 'period_end', 'period_duration', 'region_type',
       'region_type_id', 'table_id', 'is_seasonally_adjusted', 'region',
       'city', 'state', 'state_code', 'property_type', 'property_type_id',
       'median_sale_price', 'median_sale_price_mom', 'median_sale_price_yoy',
       'median_list_price', 'median_list_price_mom', 'median_list_price_yoy',
       'median_ppsf', 'median_ppsf_mom', 'median_ppsf_yoy', 'median_list_ppsf',
       'median_list_ppsf_mom', 'median_list_ppsf_yoy', 'homes_sold',
       'homes_sold_mom', 'homes_sold_yoy', 'pending_sales',
       'pending_sales_mom', 'pending_sales_yoy', 'new_listings',
       'new_listings_mom', 'new_listings_yoy', 'inventory', 'inventory_mom',
       'inventory_yoy', 'months_of_supply', 'months_of_supply_mom',
       'months_of_supply_yoy', 'median_dom', 'median_dom_mom',
       'median_dom_yoy', 'avg_sale_to_list', 'avg_sale_to_list_mom',
       'avg_sale_to_list_yoy', 'sold_above_list', 'sold_above_list_mom',
 

In [4]:
ml_df=california_df[['Year','Month', 'region','property_type', 'median_dom','median_sale_price']].reset_index(drop=True)
ml_df=ml_df.sort_values('median_sale_price', ascending=True)
ml_df.index.name="Index"
ml_df['region'].unique()
ml_df.to_csv("Resources/ml.csv")
ml_df=pd.read_csv('Resources/ml.csv', sep=',')
dummy_df=pd.get_dummies(ml_df)

In [6]:
X=dummy_df.drop('median_sale_price', axis=1)
y=dummy_df['median_sale_price'].round(-5)
y.value_counts()
len(X.columns)

63

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# y_train_label=LabelEncoder().fit_transform(y_train)
# y_test_label=LabelEncoder().fit_transform(y_test)

In [9]:
clf = RandomForestClassifier(random_state=41, n_estimators=500).fit(X_train_scaled, y_train)

print(f"Training Data Score: {clf.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {clf.score(X_test_scaled, y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.6559552916521132


In [11]:
prediction=clf.predict(X_test_scaled)
print(classification_report(y_test, prediction))

              precision    recall  f1-score   support

         0.0       0.62      0.29      0.40        17
    100000.0       0.72      0.63      0.67       402
    200000.0       0.72      0.73      0.73      1055
    300000.0       0.69      0.74      0.71      1091
    400000.0       0.71      0.71      0.71       959
    500000.0       0.69      0.67      0.68       593
    600000.0       0.65      0.68      0.67       475
    700000.0       0.58      0.51      0.54       265
    800000.0       0.57      0.57      0.57       240
    900000.0       0.49      0.56      0.52       140
   1000000.0       0.38      0.40      0.39        97
   1100000.0       0.44      0.37      0.40        67
   1200000.0       0.40      0.38      0.39        87
   1300000.0       0.35      0.27      0.31        66
   1400000.0       0.43      0.46      0.45        56
   1500000.0       0.25      0.26      0.26        23
   1600000.0       0.35      0.23      0.28        30
   1700000.0       0.14    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
df=pd.DataFrame({'Actual':y, 'Predicted':predictions})
df.head(50)

ValueError: array length 5726 does not match index length 22904

In [23]:
test=[[20223,6,10,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]]

In [24]:
predictions = clf.predict(test)
print(f"The new point was classified at a sales price of ${predictions}00,000")

The new point was classified at a sales price of $[400000.]00,000


In [11]:
prediction_df=X[0:0]
prediction_df.columns=prediction_df.columns.str.replace("region_", "")
prediction_df.columns=prediction_df.columns.str.replace(" County", "")
prediction_df.columns=prediction_df.columns.str.replace("property_type_", "")
prediction_df=prediction_df.reset_index(drop=True)
prediction_df
# prediction_df.to_csv("prediction.csv")

Unnamed: 0,Index,Year,Month,median_dom,Alameda,Alpine,Amador,Butte,Calaveras,Colusa,...,Tulare,Tuolumne,Ventura,Yolo,Yuba,All Residential,Condo/Co-op,Multi-Family (2-4 Unit),Single Family Residential,Townhouse


In [15]:
def ML():
#        prediction_df=prediction_df.append(pd.Series(0, index=prediction_df.columns), ignore_index=True)
    prediction_df=X[0:0]
    prediction_df.columns=prediction_df.columns.str.replace("region_", "")
    prediction_df.columns=prediction_df.columns.str.replace(" County", "")
    prediction_df.columns=prediction_df.columns.str.replace("property_type_", "")
    year=input("what year?")
    month=input("what month?")
    DOM=input("how long has it been on the market?")
    county=input("What County?")
    house_type=input("what type of house?")
    d={'Year': year, 'Month':month, 'median_dom':DOM, f'{county}':1, f'{house_type}':1}
    prediction_df=prediction_df.append(d, ignore_index=True)
    prediction_df=prediction_df.fillna(value=0)
    predictions = clf.predict(prediction_df)
    print(f"The new point was classified at a sales price of ${predictions}")

In [26]:
d={'Year': year, 'Month':month, 'median_dom':DOM, f'{county}':1, f'{house_type}':1}
prediction_df=X[0:0]
prediction_df.columns=prediction_df.columns.str.replace("region_", "")
prediction_df.columns=prediction_df.columns.str.replace(" County", "")
prediction_df.columns=prediction_df.columns.str.replace("property_type_", "")
prediction_df=prediction_df.append(d, ignore_index=True)
prediction_df=prediction_df.fillna(value=0)
prediction_df

NameError: name 'year' is not defined

In [16]:
ML()

what year?2021
what month?1
how long has it been on the market?0
What County?Orange
what type of house?Single Family Residential
The new point was classified at a sales price of $[600000.]


In [22]:
i=0
years=[];
months=[];
sales_price=[]
for x, y in ml_df.iterrows():
    if y[0]>2019 and y[2]==county and y[3]==house_type:
        years.append(y[0])
        months.append(y[1])
        sales_price.append(y[5])
    else:
        next


NameError: name 'county' is not defined