In [None]:
#import the library
import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error,r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler

In [263]:
# Read the housing CSV (path is relative to the notebook's working directory)
# and preview the first rows; head() shows 5 rows by default.
data = pd.read_csv("HousingData.csv")
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,,36.2


In [265]:
# Count the missing values in each column before any cleaning
null_counts = data.isna().sum()
print(null_counts)

CRIM       20
ZN         20
INDUS      20
CHAS       20
NOX         0
RM          0
AGE        20
DIS         0
RAD         0
TAX         0
PTRATIO     0
B           0
LSTAT      20
MEDV        0
dtype: int64


In [266]:
# Replace every missing value with the mean of its own column.
# Only the columns that reported nulls in the count above are touched.
# NOTE(review): the means are computed over the full dataset, i.e. before the
# train/test split that happens later — confirm this is acceptable.
# NOTE(review): CHAS presumably is a 0/1 indicator; a mean fill gives it a
# fractional value — verify whether a mode fill is more appropriate.
for col in ['CRIM', 'ZN', 'INDUS', 'CHAS', 'AGE', 'LSTAT']:
    data[col] = data[col].fillna(data[col].mean())

data.head(10)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,12.715432,36.2
5,0.02985,0.0,2.18,0.0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21,28.7
6,0.08829,12.5,7.87,0.069959,0.524,6.012,66.6,5.5605,5,311,15.2,395.6,12.43,22.9
7,0.14455,12.5,7.87,0.0,0.524,6.172,96.1,5.9505,5,311,15.2,396.9,19.15,27.1
8,0.21124,12.5,7.87,0.0,0.524,5.631,100.0,6.0821,5,311,15.2,386.63,29.93,16.5
9,0.17004,12.5,7.87,0.069959,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.1,18.9


In [267]:
# Re-count missing values per column to confirm the imputation left none behind

remaining_nulls = data.isna().sum()
print(remaining_nulls)

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
MEDV       0
dtype: int64


In [268]:
# Pick the model inputs (X) and the prediction target (y).
# Feature columns, in the order the scaler and model will see them:
#   LSTAT   - lower status of the population
#   INDUS   - proportion of non-retail business acres per town
#   NOX     - nitric oxides concentration (parts per 10 million)
#   PTRATIO - pupil-teacher ratio by town
#   RM      - average number of rooms per dwelling
#   TAX     - full-value property-tax rate per $10,000
#   DIS     - weighted distances to five Boston employment centres
#   AGE     - proportion of owner-occupied units built prior to 1940
feature_columns = ['LSTAT', 'INDUS', 'NOX', 'PTRATIO', 'RM', 'TAX', 'DIS', 'AGE']
X = data[feature_columns]
# MEDV is the column the model learns to predict
y = data['MEDV']

In [None]:
# Min-max scale the features: fit the scaler on X, then transform X with it.
# MS is kept because the user-input cell and the pickle cell reuse it.
# NOTE: X is rebound from the selected DataFrame to the transformed output, so
# re-running this cell alone would refit the scaler on already-scaled values.
# NOTE(review): the scaler is fitted on all rows, before the train/test split.
MS = MinMaxScaler().fit(X)
X = MS.transform(X)

In [270]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the partition
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Train the random forest on the training partition.
# random_state is pinned (same seed as the train/test split) because the forest
# is built with randomness; without a seed the RMSE / R^2 reported below change
# on every run and the notebook is not reproducible.
RFR = RandomForestRegressor(random_state=42)
# fit() returns the fitted estimator, so `model` and `RFR` are the same object
model = RFR.fit(X_train,y_train)

In [272]:
# Predict the target for the held-out test rows with the fitted model
y_pred = model.predict(X_test)

In [None]:
# Root mean squared error between the held-out targets and the predictions
RMSE = root_mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Root Mean Squared Error:", RMSE)

3.1060785992479145

In [None]:
# Coefficient of determination (R^2) on the held-out test rows
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("Coefficient of Determination (R^2):", r2)


0.8684409306470123

In [275]:
# Collect one house's feature values interactively.
# Each prompt expects a number; float() raises ValueError on anything else.

LSTAT = float(input("Enter the LSTAT Value:  "))
INDUS = float(input("Enter the INDUS Value:  "))
NOX = float(input("Enter the NOX Value:  "))
PTRATIO = float(input("Enter the PTRATIO Value:  "))
RM = float(input("Enter the RM Value:  "))
TAX = float(input("Enter the TAX Value:  "))
DIS = float(input("Enter the DIS Value:  "))
AGE = float(input("Enter the AGE Value:  "))

# Scale the user's row with the already-fitted scaler.
# The scaler was fitted on a DataFrame, so the row is wrapped in a DataFrame
# with the same column names in the same order. Passing a bare ndarray makes
# scikit-learn warn that X has no valid feature names and removes its check
# that the columns line up with the ones used for fitting.
house_data = pd.DataFrame(
    [[LSTAT, INDUS, NOX, PTRATIO, RM, TAX, DIS, AGE]],
    columns=['LSTAT', 'INDUS', 'NOX', 'PTRATIO', 'RM', 'TAX', 'DIS', 'AGE'],
)
house_data = MS.transform(house_data)

# With the scaler's default (0, 1) range, any scaled value outside [0, 1] means
# the input is below the minimum or above the maximum the scaler was fitted on.
# The model would then be extrapolating, so flag it instead of staying silent.
scaled_row = np.asarray(house_data)
if np.any((scaled_row < 0) | (scaled_row > 1)):
    print("Warning: one or more inputs fall outside the range seen during "
          "training; the prediction may be unreliable.")

Enter the LSTAT Value:  23.4
Enter the INDUS Value:  34.5
Enter the NOX Value:  23.4
Enter the PTRATIO Value:  23.4
Enter the RM Value:  54.5
Enter the TAX Value:  32.
Enter the DIS Value:  43.6
Enter the AGE Value:  43.5




In [276]:
# Predict for the single scaled row built from the user's input
house_price = model.predict(house_data)
# Bare expression so the notebook displays the predicted value
house_price

array([30.654])

In [277]:
# Persist the fitted model and the fitted scaler as separate pickle files in
# the working directory. Anything that loads them must run scaler.transform on
# its inputs before model.predict, as the cells above do.
# `pickle` is imported with the other imports in the notebook's first cell.
# NOTE: only unpickle these files from a trusted source — pickle.load can
# execute arbitrary code.
artifacts = {'model.pkl': model, 'scaler.pkl': MS}
for filename, fitted_object in artifacts.items():
    with open(filename, 'wb') as f:
        pickle.dump(fitted_object, f)