# House Price prediction 
## linear regression vs XGBoost 

In [34]:
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error,mean_squared_error,root_mean_squared_error,confusion_matrix,accuracy_score

In [2]:
df=pd.read_csv("1553768847-housing.csv")
df.head(3)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
0,-122.23,37.88,41,880,129.0,322,126,8.3252,NEAR BAY,452600
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,NEAR BAY,358500
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,NEAR BAY,352100


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  int64  
 3   total_rooms         20640 non-null  int64  
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  int64  
 6   households          20640 non-null  int64  
 7   median_income       20640 non-null  float64
 8   ocean_proximity     20640 non-null  object 
 9   median_house_value  20640 non-null  int64  
dtypes: float64(4), int64(5), object(1)
memory usage: 1.6+ MB


In [4]:
df.describe()


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [5]:
df.shape

(20640, 10)

In [6]:
df.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
ocean_proximity         0
median_house_value      0
dtype: int64

In [7]:
# df.isnull().mean()*100

In [8]:
# df["total_bedrooms"].isnull().sum()

In [9]:
df["total_bedrooms"]=df["total_bedrooms"].fillna(df["total_bedrooms"].mean()) 

In [10]:
# df.isnull().sum()

## Encoding

In [11]:
df["ocean_proximity"].unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [12]:
oscea_le=LabelEncoder()

In [13]:
oscea_le.fit(df["ocean_proximity"])

In [14]:
df["ocean_proximity"]=oscea_le.transform(df["ocean_proximity"])

In [15]:
df["ocean_proximity"].head(2)

0    3
1    3
Name: ocean_proximity, dtype: int32

In [16]:
X=df.iloc[:,:-1]
y=df["median_house_value"]

## Scalling 

In [17]:
ss=StandardScaler()
scale=ss.fit_transform(X)


In [18]:
# X.columns

In [19]:
X=pd.DataFrame(scale,columns=X.columns)

## Train Test split

In [20]:
X_train, X_test , y_train, y_test =train_test_split(X,y,test_size=0.2,random_state=42)

## LinearRegression

In [21]:
lr=LinearRegression()
lr.fit(X_train,y_train)

In [22]:
lr.score(X_train,y_train)*100,lr.score(X_test,y_test)*100

(64.01079709888613, 61.42406531011784)

In [45]:
y = y.ravel()

  y = y.ravel()


## XGBoost model

In [46]:
xgb=XGBRegressor(n_estimators=200,
    learning_rate=0.05,
    max_depth=5,
    # subsample=0.8,
    # colsample_bytree=0.8,
    random_state=42,objective='reg:squarederror')
xgb.fit(X_train,y_train)

# model = XGBClassifier(
#     n_estimators=200,
#     learning_rate=0.05,
#     max_depth=4,
#     subsample=0.8,
#     colsample_bytree=0.8,
#     random_state=42
# )

In [48]:
xgb.score(X_train,y_train)*100, xgb.score(X_test,y_test)*100

(85.75775623321533, 80.31818866729736)

In [25]:
y_pred = xgb.predict(X_test)

print("MAE:", mean_absolute_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))

MAE: 34300.3828125
RMSE: 50785.06111052738


In [26]:
rf=RandomForestRegressor(n_estimators=100)
rf.fit(X_train,y_train)

In [27]:
rf.score(X_train,y_train)*100, rf.score(X_test,y_test)*100

(97.45741089799019, 81.0886790271328)

## Cross validation 

In [31]:
from sklearn.model_selection import cross_val_score
cv_score= cross_val_score(xgb,X,y,cv=5 , scoring="r2")


In [32]:
print("Score for each score :",cv_score)
print("score mean :",cv_score.mean())

Score for each score : [0.57957453 0.65909266 0.7201426  0.53944278 0.69742811]
score mean : 0.6391361355781555


## Hyperparameter tuning

In [33]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1]
}

grid = GridSearchCV(
    XGBRegressor(random_state=42),
    param_grid,
    cv=5,
    scoring='r2'
)

grid.fit(X_train, y_train)
grid.best_params_


{'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 300}

In [35]:
best_model= grid.best_estimator_  # best model after huyperparameter tuning

# best model predict 
y_pred_best= best_model.predict(X_test)
mae=mean_absolute_error(y_test,y_pred_best)
print("Mean absolute error after hyperparameter tunning ", mae)

Mean absolute error after hyperparameter tunning  30390.564453125


In [37]:
best_model.score(X_train,y_train)*100, best_model.score(X_test,y_test)*100

(96.61834836006165, 83.52588415145874)

In [57]:
import pandas as pd

importance = best_model.feature_importances_
features = X.columns

imp_df = pd.DataFrame({
    "Feature": features,
    "Importance": importance*100
}).sort_values(by="Importance", ascending=False)

imp_df.head(10)


Unnamed: 0,Feature,Importance
8,ocean_proximity,40.664246
7,median_income,34.486374
1,latitude,6.238896
0,longitude,4.885495
2,housing_median_age,4.266695
5,population,3.116491
4,total_bedrooms,2.651846
6,households,2.03189
3,total_rooms,1.658065
