In [2]:
import pandas as pd
import csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


In [3]:
tsv_data = pd.read_csv('county_market_tracker_2.tsv000', sep='\t')
tsv_data.head()

Unnamed: 0,period_begin,period_end,period_duration,region_type,region_type_id,table_id,is_seasonally_adjusted,region,city,state,...,sold_above_list_yoy,price_drops,price_drops_mom,price_drops_yoy,off_market_in_two_weeks,off_market_in_two_weeks_mom,off_market_in_two_weeks_yoy,parent_metro_region,parent_metro_region_metro_code,last_updated
0,2020-07-01,2020-07-31,30,county,5,3101,f,"Stevens County, WA",,Washington,...,0.071523,0.16,-0.133578,-0.088705,0.571429,-0.103571,0.31746,"Spokane, WA",44060.0,2022-04-10 14:38:16
1,2021-05-01,2021-05-31,30,county,5,3182,f,"Fond du Lac County, WI",,Wisconsin,...,0.253968,0.064356,0.00221,-0.075178,0.195652,-0.009167,0.039402,"Fond du Lac, WI",22540.0,2022-04-10 14:38:16
2,2015-02-01,2015-02-28,30,county,5,1870,f,"Humboldt County, NV",,Nevada,...,,0.071429,-0.071429,,,,,"Winnemucca, NV",49080.0,2022-04-10 14:38:16
3,2019-04-01,2019-04-30,30,county,5,2327,f,"Clackamas County, OR",,Oregon,...,-0.073306,0.290008,0.037114,0.00298,0.569444,-0.027194,-0.022595,"Portland, OR",38900.0,2022-04-10 14:38:16
4,2020-11-01,2020-11-30,30,county,5,2168,f,"Butler County, OH",,Ohio,...,0.347403,0.136364,-0.012573,0.042024,0.227273,0.093939,0.183794,"Cincinnati, OH",17140.0,2022-04-10 14:38:16


In [5]:
california_df=tsv_data.loc[(tsv_data["state"]=="California")]
california_df=california_df.sort_values(["region","period_end"],ascending=True).fillna(0)
california_df[['Year','Month','Day']] = california_df['period_end'].str.split('-', expand=True).astype(int)
california_df[['region', '']]=california_df['region'].str.split(',', expand=True)
california_df.columns

Index(['period_begin', 'period_end', 'period_duration', 'region_type',
       'region_type_id', 'table_id', 'is_seasonally_adjusted', 'region',
       'city', 'state', 'state_code', 'property_type', 'property_type_id',
       'median_sale_price', 'median_sale_price_mom', 'median_sale_price_yoy',
       'median_list_price', 'median_list_price_mom', 'median_list_price_yoy',
       'median_ppsf', 'median_ppsf_mom', 'median_ppsf_yoy', 'median_list_ppsf',
       'median_list_ppsf_mom', 'median_list_ppsf_yoy', 'homes_sold',
       'homes_sold_mom', 'homes_sold_yoy', 'pending_sales',
       'pending_sales_mom', 'pending_sales_yoy', 'new_listings',
       'new_listings_mom', 'new_listings_yoy', 'inventory', 'inventory_mom',
       'inventory_yoy', 'months_of_supply', 'months_of_supply_mom',
       'months_of_supply_yoy', 'median_dom', 'median_dom_mom',
       'median_dom_yoy', 'avg_sale_to_list', 'avg_sale_to_list_mom',
       'avg_sale_to_list_yoy', 'sold_above_list', 'sold_above_list_mom',
 

In [151]:
# ml_df=california_df[['Year','Month', 'region','property_type','median_dom', 'median_sale_price']].reset_index(drop=True)
ml_df=california_df[['Year','Month', 'region','property_type', 'median_dom','median_sale_price']].reset_index(drop=True)
ml_df.index.name="Index"
dummy_df=pd.get_dummies(ml_df)
dummy_df['median_sale_price_mom'].min()

-0.9779329608938548

In [127]:
X=dummy_df.drop('median_sale_price', axis=1)
y=dummy_df['median_sale_price'].round(-5)
y.value_counts()
X.columns

Index(['Year', 'Month', 'median_sale_price_mom', 'region_Alameda County',
       'region_Alpine County', 'region_Amador County', 'region_Butte County',
       'region_Calaveras County', 'region_Colusa County',
       'region_Contra Costa County', 'region_El Dorado County',
       'region_Fresno County', 'region_Glenn County', 'region_Kern County',
       'region_Lake County', 'region_Los Angeles County',
       'region_Madera County', 'region_Marin County', 'region_Mariposa County',
       'region_Merced County', 'region_Monterey County', 'region_Napa County',
       'region_Nevada County', 'region_Orange County', 'region_Placer County',
       'region_Riverside County', 'region_Sacramento County',
       'region_San Benito County', 'region_San Bernardino County',
       'region_San Diego County', 'region_San Francisco County',
       'region_San Joaquin County', 'region_San Luis Obispo County',
       'region_San Mateo County', 'region_Santa Barbara County',
       'region_Santa Clara

In [128]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
y_train_label=LabelEncoder().fit_transform(y_train)
y_test_label=LabelEncoder().fit_transform(y_test)

In [129]:
clf = RandomForestClassifier(random_state=41, n_estimators=500).fit(X_train_scaled, y_train_label)

print(f"Training Data Score: {clf.score(X_train_scaled, y_train_label)}")
print(f"Testing Data Score: {clf.score(X_test_scaled, y_test_label)}")

Training Data Score: 1.0
Testing Data Score: 0.6974708171206225


In [94]:
prediction=clf.predict(X_test_scaled)
print(classification_report(y_test_label, prediction))

              precision    recall  f1-score   support

           0       0.20      0.11      0.14         9
           1       0.70      0.69      0.70       307
           2       0.77      0.76      0.76       927
           3       0.72      0.72      0.72       893
           4       0.69      0.70      0.69       822
           5       0.63      0.67      0.65       552
           6       0.68      0.67      0.68       474
           7       0.62      0.54      0.58       278
           8       0.59      0.62      0.61       242
           9       0.42      0.43      0.42       131
          10       0.33      0.36      0.35        97
          11       0.30      0.24      0.26        68
          12       0.37      0.42      0.40        85
          13       0.39      0.24      0.30        71
          14       0.36      0.43      0.39        60
          15       0.17      0.20      0.18        25
          16       0.30      0.38      0.34        26
          17       0.14    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [92]:
df=pd.DataFrame({'Actual':y, 'Predicted':predictions})
df.head(50)

Unnamed: 0_level_0,Actual,Predicted
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,300000.0,4
1,200000.0,4
2,200000.0,2
3,300000.0,4
4,300000.0,4
5,300000.0,4
6,200000.0,2
7,300000.0,4
8,300000.0,4
9,400000.0,4


In [148]:
test=[[2023,6,-10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]]

In [149]:
predictions = clf.predict(test)
print(f"The new point was classified at a sales price of ${predictions}00,000")

The new point was classified at a sales price of $[4]00,000


In [40]:
prediction_df=X[0:0]
prediction_df.columns=prediction_df.columns.str.replace("region_", "")
prediction_df.columns=prediction_df.columns.str.replace(" County", "")
prediction_df.columns=prediction_df.columns.str.replace("property_type_", "")
prediction_df

Unnamed: 0_level_0,Year,Month,median_dom,Alameda,Alpine,Amador,Butte,Calaveras,Colusa,Contra Costa,...,Stanislaus,Sutter,Ventura,Yolo,Yuba,All Residential,Condo/Co-op,Multi-Family (2-4 Unit),Single Family Residential,Townhouse
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [48]:
def ML():
    #    prediction_df=prediction_df.append(pd.Series(0, index=prediction_df.columns), ignore_index=True)
    prediction_df=X[0:0]
    prediction_df.columns=prediction_df.columns.str.replace("region_", "")
    prediction_df.columns=prediction_df.columns.str.replace(" County", "")
    prediction_df.columns=prediction_df.columns.str.replace("property_type_", "")
    year=input("what year?")
    month=input("what month?")
    DOM=input("how long has it been on the market?")
    county=input("What County?")
    house_type=input("what type of house?")
    d={'Year': year, 'Month':month, 'median_dom':DOM, f'{county}':1, f'{house_type}':1}
    prediction_df=prediction_df.append(d, ignore_index=True)
    prediction_df=prediction_df.fillna(value=0)
    predictions = clf.predict(prediction_df)
    print(f"The new point was classified at a sales price of ${predictions}00,000")

In [46]:
d={'Year': year, 'Month':month, 'median_dom':DOM, f'{county}':1, f'{house_type}':1}
prediction_df=X[0:0]
prediction_df.columns=prediction_df.columns.str.replace("region_", "")
prediction_df.columns=prediction_df.columns.str.replace(" County", "")
prediction_df.columns=prediction_df.columns.str.replace("property_type_", "")
prediction_df=prediction_df.append(d, ignore_index=True)
prediction_df=prediction_df.fillna(value=0)
prediction_df

Unnamed: 0,Year,Month,median_dom,Alameda,Alpine,Amador,Butte,Calaveras,Colusa,Contra Costa,...,Sutter,Ventura,Yolo,Yuba,All Residential,Condo/Co-op,Multi-Family (2-4 Unit),Single Family Residential,Townhouse,single family residential
0,2025,6,60,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [113]:
ML()

what year?2015
what month?1
how long has it been on the market?30
What County?Orange
what type of house?Single Family Residential
The new point was classified at a sales price of $[4]00,000
