<a href="https://www.kaggle.com/code/arunjangir245/house-price-predictor?scriptVersionId=143531081" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

<div style="text-align: center; background-color: #ccffcc; color: #006600; padding: 20px; border-radius: 5px;">
    <h2 style="margin: 0; font-size: 13px;">Don't forget to upvote if you liked the notebook</h2>
</div>

# Dragon Real Estates - Price Predictor


In [None]:
import pandas as pd

In [None]:
housing = pd.read_csv("/kaggle/input/boston-housing-dataset/BostonHousing.csv")

In [None]:
housing.head()

In [None]:
housing.info()

In [None]:
housing['chas'].value_counts()

In [None]:
housing['rad'].value_counts()

In [None]:
housing.describe()

In [None]:
%matplotlib inline

In [None]:
#for plotting histogram
#import matplotlib.pyplot as plt
#housing.hist(bins=50,figsize=(15,20))
#plt.show()

## Train-Test Splitting

In [None]:
#for learning purpose only
import numpy as np
def split_train_test(data,test_ratio):
    """Split ``data`` into a train and a test set using a fixed-seed shuffle.

    Args:
        data: Object supporting ``len()`` and positional ``.iloc`` indexing
            (e.g. a pandas DataFrame).
        test_ratio: Fraction of rows placed in the test set; the row count is
            truncated toward zero with ``int()``.

    Returns:
        A ``(train, test)`` tuple of ``data.iloc[...]`` selections.
    """
    # A private RandomState(42) yields the same permutation as seeding the
    # global generator with 42, but does not reset NumPy's global random
    # state for everything that runs afterwards.
    rng = np.random.RandomState(42)
    shuffled = rng.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled[:test_set_size]
    train_indices = shuffled[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [None]:
#train_set,test_set = split_train_test(housing,0.2)

In [None]:
#print(f"Rows in train set: {len(train_set)}\nRows in test set: {len(test_set)}\n")
    

In [None]:
#print("Rows in train set:",len(train_set))
#print("Rows in test set:",len(test_set))


In [None]:
from sklearn.model_selection import train_test_split
# Purely random split with 20% of the rows held out; random_state=42 keeps the
# shuffle identical across reruns.
train_set,test_set = train_test_split(housing,test_size=0.2,random_state=42)
print(f"Rows in train set: {len(train_set)}\nRows in test set: {len(test_set)}\n")


In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
# Stratify on 'chas' so the train and test sets keep the same proportion of
# each 'chas' value as the full dataset.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1 produces exactly one (train, test) pair, so take it directly.
train_index, test_index = next(split.split(housing, housing['chas']))
# split() yields positional indices, so select rows by position (.iloc);
# label-based .loc is only equivalent while the index is the default 0..n-1.
strat_train_set = housing.iloc[train_index]
strat_test_set = housing.iloc[test_index]
    

In [None]:
strat_test_set['chas'].value_counts()

In [None]:
strat_test_set

In [None]:
strat_train_set['chas'].value_counts()


In [None]:
#95/7

In [None]:
#376/28

In [None]:
housing = strat_train_set.copy()

## Looking for Correlations

In [None]:
from pandas.plotting import scatter_matrix
attributes = ["medv","rm","zn","lstat"]
scatter_matrix(housing[attributes],figsize = (12,8))

In [None]:
housing.plot(kind="scatter",x="rm",y="medv",alpha=0.8)

## Trying out Attribute Combinations

In [None]:
housing["taxrm"] = housing["tax"]/housing["rm"]

In [None]:
housing["taxrm"]

In [None]:
corr_matrix = housing.corr()
corr_matrix['medv'].sort_values(ascending=False)

In [None]:
housing.plot(kind="scatter",x="taxrm",y="medv",alpha=0.8)

In [None]:
# Separate predictors from the target. Both are taken from strat_train_set, so
# the experimental "taxrm" column added to the earlier working copy is not part
# of these features.
housing = strat_train_set.drop(["medv"],axis=1)
housing_labels = strat_train_set["medv"].copy()

# Missing Attributes

In [None]:
# To take care of missing attributes, you have three options:
#   1. Get rid of the missing data points
#   2. Get rid of the whole attribute
#   3. Set the missing values to some value (0, mean or median)

In [None]:
a=housing.dropna(subset=["rm"]) #option1
a.shape
#Note that the original housing dataframe will remain unchanged

In [None]:
housing.drop("rm",axis=1).shape #option2
#Note that there is no rm column and also note that the original housing dataframe will remain unchanged

In [None]:
median = housing["rm"].median() 
# Compute median for option 3

In [None]:
housing["rm"].fillna(median) #Option3
# Note that the original housing dataframe will remain unchanged

In [None]:
housing.shape

In [None]:
housing.describe()  #before we started filling missing attributes

In [None]:
from sklearn.impute import SimpleImputer
# Learn the median of every column of the training features; the learned
# values are exposed afterwards as imputer.statistics_.
imputer = SimpleImputer(strategy = "median")
imputer.fit(housing)

In [None]:
imputer.statistics_

In [None]:
imputer.statistics_.shape

In [None]:
X = imputer.transform(housing)

In [None]:
# imputer.transform() returns a bare array, so restore both the column names
# and the original row index; without index= the rows would be renumbered
# 0..n-1 and no longer line up with housing / housing_labels.
housing_tr = pd.DataFrame(X, columns=housing.columns, index=housing.index)

In [None]:
housing_tr.describe()

## Scikit-learn Design

Primarily, three types of objects:
1. Estimators - An estimator estimates some parameters based on a dataset, e.g. the imputer.
It has a fit() method and a transform() method.

Fit method - Fits the dataset and calculates internal parameters.
2. Transformers - The transform() method takes input and returns output based on the
learnings from fit(). A transformer also has a convenience function called fit_transform(), which fits and then transforms.
3. Predictors - The LinearRegression model is an example of a predictor. fit() and
predict() are two common functions. It also provides a score() function, which
evaluates the predictions.


# Feature Scaling

Primarily, two types of feature scaling methods:
1. Min-max scaling (Normalization)
  (value - min) / (max - min)
  Sklearn provides a class called MinMaxScaler for this
  
2. Standardization
   (value - mean) / std
   Sklearn provides a class called StandardScaler for this

## Creating a Pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Preprocessing chain applied in order: fill missing values with the column
# median, then standardize each feature. SimpleImputer comes from the import in
# the earlier imputer cell.
my_pipeline  = Pipeline([
    ('imputer',SimpleImputer(strategy="median")),
    #     ....add as many as you want in your pipeline
    ('std_scaler',StandardScaler()),
])

In [None]:
housing_num_tr = my_pipeline.fit_transform(housing)

In [None]:
housing_num_tr.shape

## Selecting a desired model for Dragon Real Estates

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
# Alternative models tried during selection; uncomment one to compare.
#model = LinearRegression()
#model = DecisionTreeRegressor()
# Fix the seed so the forest (and every score reported below) is reproducible
# across runs, matching the random_state=42 used for the data splits.
model = RandomForestRegressor(random_state=42)
model.fit(housing_num_tr,housing_labels)

In [None]:
some_data = housing.iloc[:5]

In [None]:
some_labels = housing_labels.iloc[:5]

In [None]:
prepared_data = my_pipeline.transform(some_data)

In [None]:
model.predict(prepared_data)

In [None]:
list(some_labels)

## Evaluating the model

In [None]:
from sklearn.metrics import mean_squared_error
# RMSE on the same data the model was fitted on (housing_num_tr), i.e. the
# training error rather than an estimate of performance on unseen data.
housing_predictions = model.predict(housing_num_tr)
mse = mean_squared_error(housing_labels, housing_predictions)
rmse = np.sqrt(mse)

In [None]:
rmse

## Using better evaluation technique - Cross Validation

In [None]:
# 10-fold cross-validation (cv=10): folds 1 2 3 4 5 6 7 8 9 10
from sklearn.model_selection import cross_val_score
# NOTE(review): housing_num_tr was imputed and scaled by my_pipeline fitted on
# the whole training set, so every fold shares those preprocessing statistics.
scores = cross_val_score(model, housing_num_tr, housing_labels, scoring = "neg_mean_squared_error",cv=10)
# The scorer reports negated MSE, so flip the sign before taking the root.
rmse_scores = np.sqrt(-scores)

In [None]:
rmse_scores

In [None]:
def print_scores(scores):
    """Print the given scores followed by their mean and standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` methods (e.g. a NumPy
    array). Nothing is returned; the three lines go to standard output.
    """
    print("scores:", scores)
    average = scores.mean()
    print("Mean:", average)
    spread = scores.std()
    print("Standard deviation:", spread)

In [None]:
print_scores(rmse_scores)

## Saving The Model

In [None]:
from joblib import dump, load
# Persist the fitted model to 'Dragon.joblib'. Only the model is saved here;
# my_pipeline is not, so inputs must be preprocessed before calling predict()
# on the reloaded model.
dump(model, 'Dragon.joblib')

## Testing the model on test data

In [None]:
# Final evaluation on the held-out stratified test set.
X_test = strat_test_set.drop("medv",axis=1)
Y_test = strat_test_set["medv"].copy()
# transform() only (no re-fit): the test rows are imputed and scaled with the
# statistics the pipeline learned from the training features.
X_test_prepared = my_pipeline.transform(X_test)
final_predictions = model.predict(X_test_prepared)
final_mse = mean_squared_error(Y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
#print(final_predictions,list(Y_test))

In [None]:
final_rmse

In [None]:
prepared_data[0]

In [None]:
from joblib import dump, load
import numpy as np
# Rebind `model` to the estimator stored in 'Dragon.joblib', replacing the
# in-memory one, to demonstrate using the saved file.
model = load('Dragon.joblib')

In [None]:
# One hand-entered sample of 13 values passed straight to the reloaded model.
# NOTE(review): these look like already-standardized values (presumably a row
# produced by my_pipeline, such as prepared_data[0]) - confirm. Raw feature
# values would need my_pipeline.transform() first, because the model was
# fitted on pipeline output.
Feature = np.array([[-0.43942006,  3.12628155, -1.12165014, -0.27288841, -1.42262747,
       -0.23979304, -1.31238772,  2.61111401, -1.0016859 , -0.5778192 ,
       -0.97491834,  0.41164221, -0.86091034]])
model.predict(Feature)