## Real Estate price predictor

In [91]:
import pandas as pd
import numpy as np

In [92]:
housing = pd.read_csv("data.csv")

In [93]:
housing.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


### Imputing missing values

In [94]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
imputer.fit(housing)

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='median', verbose=0)

In [95]:
X = imputer.transform(housing)

In [96]:
housing_tr = pd.DataFrame(X, columns=housing.columns)

In [97]:
housing = housing_tr.copy()

In [39]:
%matplotlib inline

## Train-test split

In [40]:
from sklearn.model_selection import train_test_split
train_set, test_set  = train_test_split(housing, test_size=0.2, random_state=42)

In [41]:
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing['CHAS']):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

In [42]:
strat_test_set['CHAS'].value_counts()

0.0    95
1.0     7
Name: CHAS, dtype: int64

In [43]:
housing = strat_train_set.copy()

In [45]:
housing.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0
mean,3.602814,10.836634,11.34495,0.069307,0.558064,6.278597,69.039851,3.74621,9.735149,412.341584,18.473267,353.392822,12.791609,22.509406
std,8.099383,22.150636,6.877817,0.25429,0.116875,0.712367,28.258248,2.099057,8.731259,168.672623,2.129243,96.069235,7.23574,9.385531
min,0.00632,0.0,0.74,0.0,0.389,3.561,2.9,1.1296,1.0,187.0,13.0,0.32,1.73,5.0
25%,0.086963,0.0,5.19,0.0,0.453,5.87875,44.85,2.035975,4.0,284.0,17.4,374.6175,6.8475,16.6
50%,0.286735,0.0,9.9,0.0,0.538,6.208,78.2,3.1222,5.0,337.0,19.0,390.955,11.57,21.15
75%,3.731923,12.5,18.1,0.0,0.631,6.63,94.1,5.1004,24.0,666.0,20.2,395.63,17.1025,25.0
max,73.5341,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,36.98,50.0


In [48]:
housing_label = housing["MEDV"]

In [50]:
housing.drop(labels="MEDV",axis=1,inplace=True)

## Creating pipeline

In [56]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
my_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    #     ..... add as many as you want in your pipeline
    ('std_scaler', StandardScaler()),
])

In [57]:
housing_num_tr = my_pipeline.fit_transform(housing)

## Selecting a model

In [81]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
#model = LinearRegression()
# model = DecisionTreeRegressor()
model = RandomForestRegressor()
model.fit(housing_num_tr, housing_label)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

### Evaluting model

In [82]:
from sklearn.metrics import mean_squared_error
housing_predictions = model.predict(housing_num_tr)
mse = mean_squared_error(housing_label, housing_predictions)
rmse = np.sqrt(mse)

In [83]:
rmse

1.1933083857349471

In [84]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, housing_num_tr, housing_label, scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)

In [85]:
rmse_scores

array([2.8522921 , 2.81695877, 4.42501371, 2.75531342, 3.43868947,
       2.75414277, 4.50029517, 3.30017138, 3.07042074, 3.34204629])

In [86]:
def print_scores(scores):
    print("Scores:", scores)
    print("Mean: ", scores.mean())
    print("Standard deviation: ", scores.std())

In [87]:
print_scores(rmse_scores)

Scores: [2.8522921  2.81695877 4.42501371 2.75531342 3.43868947 2.75414277
 4.50029517 3.30017138 3.07042074 3.34204629]
Mean:  3.3255343822521923
Standard deviation:  0.6166010373716494


## Testing

In [88]:
X_test = strat_test_set.drop("MEDV", axis=1)
Y_test = strat_test_set["MEDV"].copy()
X_test_prepared = my_pipeline.transform(X_test)
final_predictions = model.predict(X_test_prepared)
final_mse = mean_squared_error(Y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

In [89]:
final_rmse

2.999084714950709

In [90]:
features = np.array([[-5.43942006, 4.12628155, -1.6165014, -0.67288841, -1.42262747,
       -11.44443979304, -49.31238772,  7.61111401, -26.0016879 , -0.5778192 ,
       -0.97491834,  0.41164221, -66.86091034]])
model.predict(features)

array([24.208])

In [132]:
x = housing.iloc[0:1]

In [133]:
x

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0


In [134]:
x = x.drop(labels="MEDV",axis=1)

In [135]:
x_num = my_pipeline.fit_transform(x)

In [136]:
x_num

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [144]:
scalar =StandardScaler()
scalar.fit(housing.drop(columns="MEDV"))

StandardScaler(copy=True, with_mean=True, with_std=True)

In [145]:
scalar.transform(x)

array([[-0.41978194,  0.28482986, -1.2879095 , -0.27259857, -0.14421743,
         0.41545467, -0.12001342,  0.1402136 , -0.98284286, -0.66660821,
        -1.45900038,  0.44105193, -1.0755623 ]])

In [149]:
def data(pred):
    housing = pd.read_csv("data.csv")
    housing_tr = my_pipeline.fit(housing.drop(columns="MEDV"))
    return my_pipeline.transform(x)

In [150]:
data([0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98]) 

array([[-0.41978194,  0.28482986, -1.2879095 , -0.27259857, -0.14421743,
         0.41545467, -0.12001342,  0.1402136 , -0.98284286, -0.66660821,
        -1.45900038,  0.44105193, -1.0755623 ]])