# California Housing Prices [Linear Regression Model]
_____________

#### Steps :
   - Import dataset
   - Cleaning
   - Dealing with the Outliers
   - EDA
   - Define x , y 
   - Split Dataset [ train - test ]
   - Build and train model
   - Predict the test 
   - Evaluate model

In [1]:
! pip install folium



### Data libraries:

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns                        
import matplotlib.image as mpimg
%matplotlib inline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression           #build model
from sklearn.model_selection import train_test_split        #data splitting
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from math import sqrt
import folium as folium
from folium import plugins
from folium.plugins import HeatMap

## Exploring the data

### Load Dataset
#### Read the `housing.csv` dataset into an object named `df`

In [8]:
pwd

'C:\\Users\\asus\\Downloads\\bootcamp-T5\\project2'

In [5]:
ls

 Volume in drive C is OS
 Volume Serial Number is 38C1-4E93

 Directory of C:\Users\asus\Downloads\bootcamp-T5\project2

12/25/21  03:32 PM    <DIR>          .
12/22/21  11:09 PM    <DIR>          ..
12/25/21  02:38 PM    <DIR>          .ipynb_checkpoints
11/29/21  11:25 PM         3,469,851 California Housing Prices (MVP).ipynb
12/05/21  12:40 AM         2,020,801 California Housing Prices.pptx
12/03/21  08:47 PM         3,636,940 California Housing Prices_.ipynb
12/07/21  10:06 AM         2,333,645 California Housing Prices_s121.ipynb
12/01/21  03:34 PM            65,733 california.png
12/25/21  03:32 PM            47,530 Final Code_California Housing Prices.ipynb
11/24/21  04:27 PM         1,423,529 house.csv
12/03/21  09:16 PM         1,163,664 MVP_California Housing Prices (3).ipynb
12/01/21  03:44 PM         3,765,226 MVP_California Housing Prices.ipynb
12/08/21  01:42 PM           540,547 Report California Housing Prices.docx
              10 File(s)     18,467,466 bytes
       

In [10]:
df = pd.read_csv("\\housing.csv")

FileNotFoundError: [Errno 2] No such file or directory: '\\housing.csv'

In [None]:
# Shows the number of rows and columns in the data
df.shape

In [None]:
df.info()

#### Using `.head()` to look at the first five rows of the data set.

In [None]:
df.head()

In [None]:
df.columns

In [None]:
df=df[(df['median_house_value']>0) & (df['median_house_value']<300000)]

In [None]:
df.sample(10)

#### using `.info` to explore columns index, columns names, non-Null count values and data type in the dataset

In [None]:
df.info()

In [None]:
 # Sum total of the null values 
df.isnull().sum().sum()

####  `.isna().sum()` to sum the null values in the dataset for each column.

In [None]:
df.isnull().sum()

#### Using `.describe()` to see the statistics of our values

In [None]:
df.describe()

In [None]:
df['ocean_proximity'].unique() #show unique Row in column

In [None]:
df.ocean_proximity= df.ocean_proximity.str.replace('<', 'less ') # replace symbol < to 'less' in Row
df.ocean_proximity.unique()

In [None]:
bedroom_median = df['total_bedrooms'].median()
bedroom_median

In [None]:
df['total_bedrooms'].fillna(bedroom_median, inplace=True) # fill the null values with bedroom_median

#### Using `.tail()` to look at the last five rows of the data set.

In [None]:
df.tail()

####  `.duplicated().sum()` to sum the duplicated values in the dataset.

In [None]:
df.duplicated().sum()

In [None]:
df.describe()

#### The result of `.value_counts()` is the **ocean_proximity** column's values as its index.

In [None]:
df['ocean_proximity'].value_counts()

## Dealing with the Outliers

#### Defining a function called `plot_outliers` to show the outliers in my plots.
#### Defining a function called `plot_hist` to visualize, and we will use it in visualizing the outliers.

In [None]:
def plot_outliers(df,col):
    plt.title(col)
    ax = sns.boxplot(data=df, x=col)
    ax.set(xlabel='')
    plt.show()

def plot_hist(df,col):
    plt.hist(x=df[col],bins=40,color='#D11239')
    plt.show();

In [None]:
for col in df.columns:
    if df[col].dtype == 'float64':
        plot_outliers(df,col)
        plot_hist(df,col)

#### Bedrooms

In [None]:
firstQ = df['total_bedrooms'].quantile(0.25)
thirdQ = df['total_bedrooms'].quantile(0.75)
IQR = thirdQ - firstQ
IQR

In [None]:
newB = thirdQ + 3*(IQR)
df.drop(df[df['total_bedrooms']>newB].index, axis = 0, inplace =True)

In [None]:
df.shape

#### population

In [None]:
firstQ_p = df['population'].quantile(0.25)
thirdQ_p = df['population'].quantile(0.75)
IQR_p = thirdQ_p - firstQ_p
IQR_p

In [None]:
newB_p = thirdQ_p + 3*(IQR_p)
df.drop(df[df['population']>newB_p].index, axis = 0, inplace =True)
df.shape

#### households

In [None]:
firstQ_h = df['households'].quantile(0.25)
thirdQ_h = df['households'].quantile(0.75)
IQR_h = thirdQ_h - firstQ_h
IQR_h

In [None]:
newB_h = thirdQ_h + 3*(IQR_h)
df.drop(df[df['households']>newB_h].index, axis = 0, inplace =True)
df.shape

#### Total rooms

In [None]:
firstQ_t = df['total_rooms'].quantile(0.25)
thirdQ_t = df['total_rooms'].quantile(0.75)
IQR_t = thirdQ_t - firstQ_t
IQR_t

In [None]:
newB_t = thirdQ_t + 3*(IQR_t)
df.drop(df[df['total_rooms']>newB_t].index, axis = 0, inplace =True)
df.shape

In [None]:
df.describe()

--------
## EDA 

# Q1: Does the age of the house affect on the house value?

In [None]:
sns.lineplot(data=df, x="housing_median_age", y="median_house_value")   

#### Using `.corr()` to show the correlation between the columns

In [None]:
df.corr()

In [None]:
df.corr()['median_house_value'].sort_values(ascending=False) 
#to show the correlation between median_house_value column and the other columns 

In [None]:
plt.figure(figsize=(8,6))
sns.heatmap(df.corr(),cmap='YlGnBu',annot=True)
plt.show

#visualize the correlation

#### Defining a function called `show_values()` to show values in my plots

In [None]:
def show_values(axs, orient="v", space=.01):
    def _single(ax):
        if orient == "v":
            for p in ax.patches:
                _x = p.get_x() + p.get_width() / 2
                _y = p.get_y() + p.get_height() + (p.get_height()*0.01)
                value = '{:.1f}'.format(p.get_height())
                ax.text(_x, _y, value, ha="center") 
        elif orient == "h":
            for p in ax.patches:
                _x = p.get_x() + p.get_width() + float(space)
                _y = p.get_y() + p.get_height() - (p.get_height()*0.5)
                value = '{:.1f}'.format(p.get_width())
                ax.text(_x, _y, value, ha="left")

    if isinstance(axs, np.ndarray):
        for idx, ax in np.ndenumerate(axs):
            _single(ax)
    else:
        _single(axs)


# Q2: Where does people prefer on their house location?

#### `.countplot()` to count how many time the variable has been repeated.

In [None]:

x1=sns.countplot(x=df['ocean_proximity'], data=df)
show_values(x1)

## Q3: Does the proximity from the house to the ocean affect its price?

#### Using `.imread` in the map plot to insert an image

In [None]:
df.plot(kind = "scatter", x="longitude" , y="latitude" , c =df["median_house_value"], s =df["population"]/100,
        alpha = 0.4, cmap = plt.get_cmap("jet"),label='population', figsize = (12,8))


#load png image
map_image = mpimg.imread("california.png")

plt.imshow(map_image, extent = [-124.55, -114, 32.55, 42.05], alpha = 0.4, cmap = plt.get_cmap("jet"))

plt.xlabel("longitude")
plt.ylabel("latitude")

- the closer we are to the ocean, the more expensive the houses values become

## Q4: What is the most city that had the highest population?

In [None]:
california_map = folium.Map(location=[36.7783,-119.4179], zoom_start = 6, min_zoom=5)
df_map = df[['latitude', 'longitude']]
data = [[row['latitude'],row['longitude']] for index, row in df_map.iterrows()]
_ = HeatMap(data, radius=10).add_to(california_map)
california_map

In [None]:
sns.scatterplot(x=df['median_income'], y=df['median_house_value'], hue=df['median_house_value'],alpha=0.4)

#df.plot(kind='scatter',x='median_income',y='median_house_value',alpha=0.1)

#### Using `.get_dummies` to change the object value to (0,1) values

In [None]:
df = pd.get_dummies(df, prefix=None)
df.head()

---------------
## Define x , y

In [None]:
X = df.drop(columns=["median_house_value","longitude", "latitude","ocean_proximity_ISLAND" ]).astype(float)
y = df.loc[:,'median_house_value'].astype(float)

In [None]:
X

In [None]:
y

In [None]:
sns.pairplot(df, x_vars=['housing_median_age','total_rooms','total_bedrooms'
                         ,'population','households','median_income','ocean_proximity_INLAND','ocean_proximity_NEAR BAY','ocean_proximity_NEAR OCEAN','ocean_proximity_less 1H OCEAN'], y_vars='median_house_value',diag_kind='kde')

---------------

## Split dataset [ train - vaidation - test ]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20, random_state=0)

In [None]:
### Actual train,test and val
y_actual_train=y_train
y_actual_test=y_test

----------
## Scale

In [None]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train.values)
X_test_scaled = scaler.transform(X_test.values)

----------

## The Model

## Linear Regression Model

In [None]:
my_model = LinearRegression()
my_model.fit(X_train , y_train)

In [None]:
print(f'The Linear regression coefficients are: {my_model.coef_}')
print(f'The Linear regression interception is: {my_model.intercept_}')

In [None]:
# Predict Value
y_predict = my_model.predict(X_test)
y_predict

In [None]:
#real Value
y_test

In [None]:
# Regression Score
train_data_score = my_model.score(X_train , y_train)
test_data_score = my_model.score(X_test , y_test)


print(f'The Linear regression train R^2 is: {train_data_score}')
print(f'The Linear regression test R^2 is: {test_data_score}')

In [None]:
y_pred_lr = my_model.predict(X_test)
r2_score(y_test, y_pred_lr)

In [None]:
# Evaluate
import math
MSE = np.square(np.subtract(y_actual_test,y_pred_lr)).mean() 
RMSE = math.sqrt(MSE)
print(f'Mean Absolute Error: {MSE}')
print(f'Root Mean Square Error: {RMSE}')

In [None]:
y_train_pred_linear = my_model.predict(X_train)
plt.figure(figsize=(8, 6), dpi=80)

plt.scatter(y_train, y_train_pred_linear, alpha=0.2, color='slateblue')

m, b = np.polyfit(y_train, y_train_pred_linear, 1)
# m = slope 
# b = intercept

plt.plot(y_train, m*y_train + b, color='midnightblue')

plt.xlabel('Actual Target Train', fontsize=14)
plt.ylabel('Predicted Target Train', fontsize=14)
plt.title('Linear regression Predicted VS. Actual Target Train', fontsize=16)
plt.grid(linewidth=0.5)

------
### Decision tree regressor model

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
DT_model = DecisionTreeRegressor(max_depth=4)
DT_model.fit(X_train,y_train)

In [None]:
# Predict Value
y_predict_dt = DT_model.predict(X_test)
y_predict_dt

In [None]:
#real Value
y_test

In [None]:
## Score
train_data_score_dt = DT_model.score(X_train , y_train)
test_data_score_dt = DT_model.score(X_test , y_test)

print(f'The Decision tree regressor train R^2 is: {train_data_score_dt}')
print(f'The Decision tree regressor test R^2 is: {test_data_score_dt}')

In [None]:
# Evaluate
import math
MSE = np.square(np.subtract(y_actual_test,y_predict_dt)).mean() 
RMSE = math.sqrt(MSE)
print(f'Mean Absolute Error: {MSE}')
print(f'Root Mean Square Error: {RMSE}')

-----
## Lasso Regression

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import Lasso
from numpy import mean
from numpy import std
from numpy import absolute

In [None]:
lasso_model = Lasso(alpha=1.0)

In [None]:
lasso_model.fit(X_train_scaled, y_train)

In [None]:
# Predict Value
y_predict_lasso = lasso_model.predict(X_test)
y_predict_lasso

In [None]:
## Score
train_data_score_ls = lasso_model.score(X_train_scaled , y_train)
test_data_score_ls = lasso_model.score(X_test_scaled , y_test)

print(f'The Ridge regression train R^2 is: {train_data_score_ls}')
print(f'The Ridge regression test R^2 is: {test_data_score_ls}')

In [None]:
# Evaluate
import math
MSE = np.square(np.subtract(y_actual_test,y_predict_lasso)).mean() 
RMSE = math.sqrt(MSE)

print(f'Mean Absolute Error: {MSE}')
print(f'Root Mean Square Error: {RMSE}')

In [None]:
y_train_pred_lasso = lasso_model.predict(X_train)
plt.figure(figsize=(8, 6), dpi=80)

plt.scatter(y_train, y_train_pred_lasso, alpha=0.2, color='slateblue')

m, b = np.polyfit(y_train, y_train_pred_lasso, 1)
# m = slope 
# b = intercept

plt.plot(y_train, m*y_train + b, color='midnightblue')

plt.xlabel('Actual Target Train', fontsize=14)
plt.ylabel('Predicted Target Train', fontsize=14)
plt.title('Predicted VS. Actual Target Train', fontsize=16)
plt.grid(linewidth=0.5)

## Ridge Regression

In [None]:
rl = Ridge(alpha=1)

In [None]:
rl.fit(X_train_scaled, y_train)

In [None]:
#Score
rl_train = rl.score(X_train_scaled, y_train)
rl_test = rl.score(X_test_scaled, y_test)


print(f'The Ridge regression train R^2 is: {rl_train}')
print(f'The Ridge regression test R^2 is: {rl_test}')

In [None]:
# Predict Value
y_predict_ri = rl.predict(X_test)
y_predict_ri

In [None]:
# Evaluate
import math
MSE = np.square(np.subtract(y_actual_test,y_predict_ri)).mean() 
RMSE = math.sqrt(MSE)

print(f'Mean Absolute Error: {MSE}')
print(f'Root Mean Square Error: {RMSE}')

In [None]:
y_train_pred_rl = rl.predict(X_train)
plt.figure(figsize=(8, 6), dpi=80)

plt.scatter(y_train, y_train_pred_rl, alpha=0.2, color='slateblue')

m, b = np.polyfit(y_train, y_train_pred_rl, 1)
# m = slope 
# b = intercept

plt.plot(y_train, m*y_train + b, color='midnightblue')

plt.xlabel('Actual Target Train', fontsize=14)
plt.ylabel('Predicted Target Train', fontsize=14)
plt.title('Predicted VS. Actual Target Train', fontsize=16)
plt.grid(linewidth=0.5)

## Random Forest Regressor Model

In [None]:
forest=RandomForestRegressor(n_estimators=5)
forest.fit(X_train,y_train)

In [None]:
y_pr_f=forest.predict(X_test)
y_pr_f

In [None]:
#real Value
y_test

In [None]:
train_data_score_forest = forest.score(X_train , y_train)
test_data_score_forest = forest.score(X_test , y_test)

print(f'The Random Forest Regressor train R^2 is: {train_data_score_forest}')
print(f'The Random Forest Regressor test R^2 is: {test_data_score_forest}')

In [None]:
# Evaluate
import math
MSE = np.square(np.subtract(y_actual_test,y_pr_f)).mean() 
RMSE = math.sqrt(MSE)

print(f'Mean Absolute Error: {MSE}')
print(f'Root Mean Square Error: {RMSE}')

In [None]:
y_train_pred_forest = forest.predict(X_train)
plt.figure(figsize=(8, 6), dpi=80)

plt.scatter(y_train, y_train_pred_forest, alpha=0.2, color='slateblue')

m, b = np.polyfit(y_train, y_train_pred_forest, 1)
# m = slope 
# b = intercept

plt.plot(y_train, m*y_train + b, color='midnightblue')

plt.xlabel('Actual Target Train', fontsize=14)
plt.ylabel('Predicted Target Train', fontsize=14)
plt.title('Predicted VS. Actual Target Train', fontsize=16)
plt.grid(linewidth=0.5)

## Polynomial Regression Model

In [None]:
poly = PolynomialFeatures(degree=3) 

X_train_poly = poly.fit_transform(X_train.values)
X_test_poly = poly.transform(X_test.values)

poly_model = LinearRegression()

In [None]:
poly_model.fit(X_train_poly, y_train)

In [None]:
X_poly = poly.fit_transform(X_test)
regressor = LinearRegression()
regressor.fit(X_poly,y_test)
y_poly_pred=regressor.predict(X_poly)

In [None]:
#Score
poly_test=poly_model.score(X_test_poly, y_test)
poly_train=poly_model.score(X_train_poly, y_train)

print(f'The polynomial regression train R^2 is: {poly_train}')
print(f'polynomial regression test R^2 is: {poly_test}')

In [None]:
# Evaluate
import math
MSE = np.square(np.subtract(y_actual_test,y_poly_pred)).mean() 
RMSE = math.sqrt(MSE)

print(f'Mean Absolute Error: {MSE}')
print(f'Root Mean Square Error: {RMSE}')

In [None]:
y_train_pred_poly = poly_model.predict(X_train_poly)
plt.figure(figsize=(8, 6), dpi=80)

plt.scatter(y_train, y_train_pred_poly, alpha=0.2, color='slateblue')

m, b = np.polyfit(y_train, y_train_pred_poly, 1)
# m = slope 
# b = intercept

plt.plot(y_train, m*y_train + b, color='midnightblue')

plt.xlabel('Actual Target Train (Global Sales)', fontsize=14)
plt.ylabel('Predicted Target Train (Global Sales)', fontsize=14)
plt.title('Polynomial regression Predicted VS. Actual Target Train', fontsize=16)
plt.grid(linewidth=0.5)