## Dragon Real Estate - Price Predictor

In [None]:
import pandas as pd

In [None]:
# Load the raw dataset into a DataFrame; "data.csv" is resolved relative to
# the notebook's current working directory.
housing = pd.read_csv("data.csv")

In [None]:
# Display summary statistics for the loaded frame.
housing.describe()

## Train-Test Splitting

In [None]:
from sklearn.model_selection import train_test_split

# Plain random split: 20% of the rows are held out for testing, and the fixed
# seed keeps the partition repeatable between runs.
train_set, test_set = train_test_split(
    housing,
    test_size=0.2,
    random_state=42,
)
print(f"Records in train set: {len(train_set)}\nRecords in test set: {len(test_set)}\n")

## Stratified Split (Equal CHAS Distribution in Train and Test)

In [None]:
# Count how many rows carry each distinct CHAS value in the full dataset.
housing['CHAS'].value_counts()

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Stratify on CHAS so the train and test sets keep (approximately) the same
# CHAS proportions as the full dataset.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) pair.
train_index, test_index = next(split.split(housing, housing['CHAS']))

# split() yields integer *positions*, not index labels, so rows must be
# selected with .iloc. Label-based .loc only gives the right rows while
# `housing` happens to have a default 0..n-1 RangeIndex.
strat_train_set = housing.iloc[train_index]
strat_test_set = housing.iloc[test_index]

In [None]:
# CHAS value counts in the stratified test set, for comparison with the
# counts of the full dataset.
strat_test_set['CHAS'].value_counts()

In [None]:
# CHAS value counts in the stratified training set, for comparison with the
# counts of the full dataset.
strat_train_set['CHAS'].value_counts()

In [None]:
# From here on `housing` is a copy of the stratified training set only, so
# later exploration does not touch the test rows.
housing = strat_train_set.copy()

## Looking for Correlations

In [None]:
# Pairwise correlation between the columns of the training data.
corr_matrix = housing.corr()
# Rank every column by its correlation with MEDV, largest value first.
corr_matrix.loc[:, 'MEDV'].sort_values(ascending=False)

In [None]:
# Scatter plot of RM against MEDV; alpha < 1 makes overlapping points visible.
housing.plot(kind="scatter", x="RM", y="MEDV", alpha=0.85)

In [None]:
# Separate the target column (MEDV) from the predictor columns of the
# stratified training set.
housing_labels = strat_train_set["MEDV"].copy()
housing = strat_train_set.drop(columns="MEDV")

## Creating a Pipeline: Missing-Value Imputation and Feature Scaling

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Preprocessing applied in order:
#   1. 'imputer'    - fill missing values with the column median
#   2. 'std_scaler' - standardise each feature: (value - mean) / std
my_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ]
)

In [None]:
# Fit the imputer and scaler on the training features and return the
# transformed features in a single call.
housing_num_tr = my_pipeline.fit_transform(housing)

## Selecting a Model for Dragon Real Estate

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# LinearRegression and DecisionTreeRegressor are imported but not used in this
# cell -- presumably kept as alternatives to swap in; the random forest is the
# model that is actually trained.
#
# Seed the forest (same seed as the data splits) so that re-running the
# notebook reproduces the same fitted model, predictions and RMSE instead of
# slightly different numbers on every run.
model = RandomForestRegressor(random_state=42)
model.fit(housing_num_tr, housing_labels)

In [None]:
# First five rows of the training features, used for a quick sanity check.
some_data = housing.iloc[:5]

In [None]:
# First five training labels (MEDV), aligned with the first five feature rows.
some_labels = housing_labels.iloc[:5]

In [None]:
# Apply the already-fitted preprocessing to the sample rows
# (transform only -- the pipeline is not refitted here).
prepared_data = my_pipeline.transform(some_data)

In [None]:
# Model predictions for the five preprocessed sample rows.
model.predict(prepared_data)

In [None]:
# True MEDV values for the same five rows, shown as a plain list so they can
# be compared with the predictions.
list(some_labels)

## Saving the model

In [None]:
from joblib import dump, load
# Persist the fitted model to 'Dragon.joblib' in the current working directory.
# NOTE(review): only `model` is saved; the fitted `my_pipeline` is not, so
# anything loading this file needs already-preprocessed features.
dump(model, 'Dragon.joblib')

## Testing the model on test data

In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error

# Held-out evaluation: split the stratified test set into target and features.
Y_test = strat_test_set["MEDV"].copy()
X_test = strat_test_set.drop(columns="MEDV")

# Reuse the preprocessing fitted on the training data (transform only).
X_test_prepared = my_pipeline.transform(X_test)
final_predictions = model.predict(X_test_prepared)

# RMSE is the square root of the mean squared error on the test set.
final_mse = mean_squared_error(y_true=Y_test, y_pred=final_predictions)
final_rmse = np.sqrt(final_mse)
print(final_predictions, list(Y_test))

In [None]:
# Display the root-mean-squared error measured on the test set.
final_rmse

## Using the model

In [None]:
from joblib import dump, load
import numpy as np

# Reload the persisted model and predict for one already-preprocessed row.
model = load('Dragon.joblib')
# predict() needs a 2-D (n_samples, n_features) input, so keep the first row
# as a one-row array rather than a flat vector.
features = np.array(prepared_data[:1])
model.predict(features)