# Dragon real state price predictor

In [1]:
import pandas as pd
from sklearn import datasets
import matplotlib.pyplot as plt
import numpy as np

In [2]:
housing = datasets.load_boston()
File = pd.DataFrame(housing.data, columns = housing.feature_names)
File = File.assign(Target = housing.target)

In [3]:
#File.info()

In [4]:
File['CHAS'].value_counts()

0.0    471
1.0     35
Name: CHAS, dtype: int64

In [5]:
#File.describe()

In [6]:
#File.hist(bins = 50, figsize = (10,15))

## Train-Test splitting

In [7]:
def split_data(data, test_ratio):
    np.random.seed(42)
    shuffled = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled[:test_set_size]
    train_indices = shuffled[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [8]:
train_set, test_set = split_data(File,0.2)

In [9]:
print(f"Rows in train set : {len(train_set)}\nRows in test set: {len(test_set)}\n")

Rows in train set : 405
Rows in test set: 101



### So 405 dataset is for training and 101 dataset for testing

### What we have done above is present in sklearn library. so we will use 

In [10]:
#from sklearn.model_selection import train_test_split
#train_set, test_set = train_test_split(File, test_size = 0.2, random_state = 42)

In [11]:
#print(f"Rows in train set : {len(train_set)}\nRows in test set: {len(test_set)}\n")

### Now it may happen that the data may not be proprtionaly distributed among the taring and testing datasets.

### So to do that again sklearn comes to resque.

In [12]:
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42)
for train_index, test_index in split.split(File,File['CHAS']):
    strat_train_set = File.loc[train_index]
    strat_test_set = File.loc[test_index]

In [13]:
housing = strat_train_set.copy()

## Looking for correlations

In [14]:
corr_matrix = File.corr()

In [15]:
corr_matrix['Target'].sort_values(ascending = False)

Target     1.000000
RM         0.695360
ZN         0.360445
B          0.333461
DIS        0.249929
CHAS       0.175260
AGE       -0.376955
RAD       -0.381626
CRIM      -0.388305
NOX       -0.427321
TAX       -0.468536
INDUS     -0.483725
PTRATIO   -0.507787
LSTAT     -0.737663
Name: Target, dtype: float64

## Above output is of correlation of the target with features
### RM and ZN are postively correlated. so we will use this for plotting scatter_matrix

In [16]:
#from pandas.plotting import scatter_matrix
#attributes = ['Target', 'RM', 'ZN', 'LSTAT']
#scatter_matrix(File[attributes], figsize = (12,8))

In [17]:
#File.plot(kind = "scatter", x = "RM", y = "Target", alpha = 0.8)

## Separting features and label from training dataset

In [18]:
housing.columns

Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'Target'],
      dtype='object')

In [19]:
housing = strat_train_set.drop("Target", axis = 1)
housing_labels = strat_train_set["Target"].copy()

In [20]:
housing.columns

Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT'],
      dtype='object')

### We can make attribute by ourself using other attributes

In [21]:
#File['TAXRM'] = File['TAX'] / File['RM']
#File['TAXRM']

In [22]:
#corr_matrix = File.corr()
#corr_matrix['Target'].sort_values(ascending = False)

### Handling the cases for missing attributes
+Get rid of the msissing data points
***Get rid of the whole attribute
***Set the value to some value(0, mean or meadian)

### sklearn.impute.imputer does same job

## Scikit-learn design

primarily, three types of objects
1. Estimators - It estimates some parameters based on the dataset Ex. Imputer. It has fit method and transform method. Fit method - Fits the dataset and calculates internal parameters.

2. Transformers - transform method takes input and returns output based on the learnings from the fit(). It also has a convenience function called fit_transform() which fits and then transforms.

3. Predictors - LinearRegression model is and example of predictor. fit() and predict() are two common functions. It also gives score() function which will evaluate the predictions.

## Feature Scaling

Primarily, two types of features scaling methods:
1. Min-max sacling (Normalization)
    (value - min) / (max - min)
    sklearn provides a class called MinMaxScaler for this.
2. Standardization
    (value - mean)/std
    sklearn provides a class called StandardScaler for this.

## Creating pipeline

In [23]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
my_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy = "median")),
    ('std_scaler', StandardScaler()),
])

In [24]:
housing_num_tr = my_pipeline.fit_transform(housing)

In [25]:
housing_num_tr.shape

(404, 13)

## Selecting a desired model for dragon real estate

In [26]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [27]:
#model = DecisionTreeRegressor()
#model = LinearRegression()
model = RandomForestRegressor()
model.fit(housing_num_tr, housing_labels)

RandomForestRegressor()

In [28]:
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
prepared_data = my_pipeline.transform(some_data)
model.predict(prepared_data)

array([22.508, 25.587, 16.363, 23.376, 23.391])

In [29]:
list(some_labels)

[21.9, 24.5, 16.7, 23.1, 23.0]

In [43]:
prepared_data[0]

array([-0.43942006,  3.12628155, -1.12165014, -0.27288841, -1.42262747,
       -0.24141041, -1.31238772,  2.61111401, -1.0016859 , -0.5778192 ,
       -0.97491834,  0.41164221, -0.86091034])

## Evaluating the model

In [30]:
from sklearn.metrics import mean_squared_error
housing_predictions = model.predict(housing_num_tr)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

1.1631531338870584

## Using better evaluation technique - cross validation

In [31]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, housing_num_tr, housing_labels, scoring = "neg_mean_squared_error", cv = 10)
rmse_scores = np.sqrt(-scores)

In [32]:
rmse_scores

array([2.79289168, 2.69441597, 4.40018895, 2.56972379, 3.33073436,
       2.62687167, 4.77007351, 3.27403209, 3.38378214, 3.16691711])

In [33]:
def print_scores(scores):
    print("Scores: ", scores)
    print("Mean: ", scores.mean())
    print("Standard deviation: ", scores.std())

In [34]:
print_scores(rmse_scores)

Scores:  [2.79289168 2.69441597 4.40018895 2.56972379 3.33073436 2.62687167
 4.77007351 3.27403209 3.38378214 3.16691711]
Mean:  3.3009631251857217
Standard deviation:  0.7076841067486248


## Saving the model

In [35]:
from joblib import dump, load
dump(model, 'Dragon.joblib')

['Dragon.joblib']

## Testing the model

In [41]:
X_test = strat_test_set.drop("Target", axis = 1)
Y_test = strat_test_set["Target"].copy()
X_test_prepared = my_pipeline.transform(X_test)
final_predictions = model.predict(X_test_prepared)
final_mse = mean_squared_error(Y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
print(final_predictions, list(Y_test))

[24.895 11.466 25.439 21.974 18.471 14.873 19.897 14.416 31.435 40.68
 20.067 11.776 24.004 28.999 19.503 10.679 31.58  14.492 23.586 18.94
 19.767 17.955 17.534 22.073 18.429 30.549 16.416 32.69   8.952 33.616
 23.715 21.26  22.98  10.808 20.924 11.293 42.541 24.308 23.292 41.623
 23.753 29.423 20.594 20.912 19.565 33.578 44.468 19.92  20.344 21.768
 21.492 14.514 21.169 15.094 24.771 32.651 42.389 28.135 19.353 20.837
 47.322  9.96  18.674 24.579 15.057 32.746 19.461 18.158 19.014 33.84
 27.27  22.85  21.519 22.436 34.997 12.793 15.94  20.048 20.726 21.379
 22.303 21.651 14.429 22.832 20.848 21.168 13.933 21.346 22.005 23.151
 18.865 27.24   7.276 26.14  18.817 29.887 19.692 31.073 14.663 26.607
 20.713 20.087] [16.5, 10.2, 30.1, 23.0, 14.4, 15.6, 19.4, 14.1, 30.3, 35.2, 23.1, 13.8, 25.0, 27.9, 19.5, 12.3, 32.2, 13.5, 23.8, 21.7, 19.2, 19.5, 10.4, 23.2, 18.6, 28.5, 15.2, 32.0, 7.2, 34.6, 20.1, 20.6, 23.6, 13.1, 23.8, 12.7, 43.1, 24.7, 22.2, 44.0, 28.1, 31.0, 21.7, 23.4, 19.5, 33.1, 4

In [37]:
print_scores(final_rmse)

2.948844070638726

In [44]:
prepared_data[0]


array([-0.43942006,  3.12628155, -1.12165014, -0.27288841, -1.42262747,
       -0.24141041, -1.31238772,  2.61111401, -1.0016859 , -0.5778192 ,
       -0.97491834,  0.41164221, -0.86091034])

## Using the model

In [45]:
from joblib import dump, load
model = load('Dragon.joblib')
input = np.array([[-0.43942006,  3.12628155, -1.12165014, -0.27288841, -1.42262747,
       -0.24141041, -1.31238772,  2.61111401, -1.0016859 , -0.5778192 ,
       -0.97491834,  0.41164221, -0.86091034]])
model.predict(input)

array([22.508])