In [28]:
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from tpot import TPOTRegressor

In [3]:
# Load the realtor listings CSV into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='usa_real_estate/realtor-data.csv')

In [4]:
# Print each column's dtype and non-null count to see where data is missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306000 entries, 0 to 305999
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   status          306000 non-null  object 
 1   bed             250076 non-null  float64
 2   bath            251318 non-null  float64
 3   acre_lot        239877 non-null  float64
 4   city            305948 non-null  object 
 5   state           306000 non-null  object 
 6   zip_code        305803 non-null  float64
 7   house_size      252557 non-null  float64
 8   prev_sold_date  109589 non-null  object 
 9   price           306000 non-null  float64
dtypes: float64(6), object(4)
memory usage: 23.3+ MB


In [5]:
# Preview the first ten rows of the raw listings.
df.head(n=10)

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,prev_sold_date,price
0,for_sale,3.0,2.0,0.12,Adjuntas,Puerto Rico,601.0,920.0,,105000.0
1,for_sale,4.0,2.0,0.08,Adjuntas,Puerto Rico,601.0,1527.0,,80000.0
2,for_sale,2.0,1.0,0.15,Juana Diaz,Puerto Rico,795.0,748.0,,67000.0
3,for_sale,4.0,2.0,0.1,Ponce,Puerto Rico,731.0,1800.0,,145000.0
4,for_sale,6.0,2.0,0.05,Mayaguez,Puerto Rico,680.0,,,65000.0
5,for_sale,4.0,3.0,0.46,San Sebastian,Puerto Rico,612.0,2520.0,,179000.0
6,for_sale,3.0,1.0,0.2,Ciales,Puerto Rico,639.0,2040.0,,50000.0
7,for_sale,3.0,2.0,0.08,Ponce,Puerto Rico,731.0,1050.0,,71600.0
8,for_sale,2.0,1.0,0.09,Ponce,Puerto Rico,730.0,1092.0,,100000.0
9,for_sale,5.0,3.0,7.46,Las Marias,Puerto Rico,670.0,5403.0,,300000.0


In [6]:
# Remove the previous-sale date column from the feature set.
# `df` is rebound (no in-place mutation) and errors='ignore' is passed so the
# cell is idempotent: re-running it once the column is already gone no longer
# raises KeyError.
df = df.drop(columns=['prev_sold_date'], errors='ignore')

In [7]:
# Number of missing values in each column.
# (A bare `df.isna().any()` used to precede this line; its value was discarded
# because it was not the cell's last expression, so it has been removed.)
print(df.isna().sum())

status            0
bed           55924
bath          54682
acre_lot      66123
city             52
state             0
zip_code        197
house_size    53443
price             0
dtype: int64


In [8]:
# Fill each column's missing entries with that column's most frequent value.
# The filled Series is assigned back to the column instead of calling
# `df[i].fillna(..., inplace=True)`: that form mutates an intermediate Series
# (chained assignment), which emits a FutureWarning on pandas 2.x and leaves
# `df` unchanged when Copy-on-Write is enabled.
for i in df.columns:
    df[i] = df[i].fillna(df[i].mode()[0])

# Confirm that no missing values remain.
df.isna().sum()

status        0
bed           0
bath          0
acre_lot      0
city          0
state         0
zip_code      0
house_size    0
price         0
dtype: int64

In [18]:
# Integer-encode the three text columns.
# A single encoder object is refit for every column, so once this cell has run
# it only holds the mapping learned from the last column ("state").
lb_make = LabelEncoder()
for column in ("status", "city", "state"):
    df[column] = lb_make.fit_transform(df[column])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306000 entries, 0 to 305999
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   status      306000 non-null  int64  
 1   bed         306000 non-null  float64
 2   bath        306000 non-null  float64
 3   acre_lot    306000 non-null  float64
 4   city        306000 non-null  int64  
 5   state       306000 non-null  int64  
 6   zip_code    306000 non-null  float64
 7   house_size  306000 non-null  float64
 8   price       306000 non-null  float64
dtypes: float64(6), int64(3)
memory usage: 21.0 MB


In [27]:
# Feature matrix (all predictors) and target vector (price) as NumPy arrays.
x = df.loc[
    :,
    ["status", "bed", "bath", "acre_lot", "city", "state", "zip_code", "house_size"],
].to_numpy()
y = df.loc[:, "price"].to_numpy()

# Hold out 30% of the rows for evaluation; the seed is fixed so the split is
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=2022,
)

In [32]:
# Deliberately small AutoML search: 3 generations with a population of 5,
# run single-process with a fixed seed. `subsample` and `max_eval_time_mins`
# limit the cost of evaluating each candidate pipeline.
tpot = TPOTRegressor(
    generations=3,
    population_size=5,
    subsample=0.5,
    max_eval_time_mins=3,
    n_jobs=1,
    random_state=42,
    verbosity=2,
)
tpot.fit(X_train, y_train)

Version 0.11.7 of tpot is outdated. Version 0.12.0 was released 7 days ago.


Optimization Progress:   0%|          | 0/20 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: -107028643849.56516

Generation 2 - Current best internal CV score: -107028643849.56516

Generation 3 - Current best internal CV score: -107028643849.56516

Best pipeline: KNeighborsRegressor(input_matrix, n_neighbors=51, p=1, weights=distance)


In [33]:
# Predict on the held-out split with the fitted TPOT object and report R^2.
preds = tpot.predict(X_test)
print(metrics.r2_score(y_true=y_test, y_pred=preds))

0.9589975582334129


In [34]:
# Error metrics for the current `preds` on the test split.
# MSE is computed once and reused for the RMSE line.
mse = metrics.mean_squared_error(y_test, preds)
print('MAE:', metrics.mean_absolute_error(y_test, preds))
print('MSE:', mse)
print('RMSE:', np.sqrt(mse), '\n')

MAE: 18394.501406131913
MSE: 67370148975.943214
RMSE: 259557.6024237071 



In [35]:
# Save the best pipeline TPOT found as a standalone Python script.
tpot.export('tpot_regression_test.py')
# With no file name, export() hands back the generated source; print it so the
# chosen pipeline is visible in the notebook.
print(tpot.export())

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=42)

# Average CV score on the training set was: -107028643849.56516
exported_pipeline = KNeighborsRegressor(n_neighbors=51, p=1, weights="distance")
# Fix random state in exported estimator
if hasattr(exported_pipeline, 'random_state'):
    setattr(exported_pipeline, 'random_state', 42)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)



In [45]:
# Rebuild the estimator from TPOT's exported pipeline (KNN with 51 neighbours,
# p=1, distance weighting), fit it on the whole training split and score it on
# the test split.
# KNeighborsRegressor is imported in the notebook's top import cell.
model = KNeighborsRegressor(n_neighbors=51, p=1, weights="distance")
model.fit(X_train, y_train)

# NOTE: this rebinds `preds`; from here on it holds this model's predictions,
# not the ones produced by the TPOT object in the earlier cell.
preds = model.predict(X_test)
print(r2_score(y_test, preds))

# MSE is computed once and reused for the RMSE line.
mse = metrics.mean_squared_error(y_test, preds)
print('MAE:', metrics.mean_absolute_error(y_test, preds))
print('MSE:', mse)
print('RMSE:', np.sqrt(mse),'\n')

0.9836846602343573
MAE: 9427.67249319403
MSE: 26807351544.12662
RMSE: 163729.50724938564 

