In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### California Housing Price Prediction
###### Nishant Banjade

#### Import necessary Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression



##### Import Data and get the necessary Information

In [3]:
# Load the housing table from the Kaggle input directory and preview its first rows.
csv_path = "../input/california-housing-prices/housing.csv"
data = pd.read_csv(csv_path)
data.head()

In [4]:
# Summary statistics of the freshly loaded frame.
data.describe()

#### Data's Information

In [5]:
# Column names, non-null counts and dtypes of the loaded frame.
data.info()

In [6]:
# (rows, columns) of the loaded frame.
data.shape

#### Data Cleaning 

In [7]:
# Number of missing values in each column.
data.isnull().sum()

In [8]:
# Collect every row that has at least one missing value and preview it.
has_missing = data.isna().any(axis=1)
df_na = data.loc[has_missing]
display(df_na.head())

### Build heatmap for observing Null Value

In [9]:
# Heatmap of the boolean missing-value mask (True where a cell is NaN).
sns.heatmap(data.isna())

#### Create a median imputer for missing values

In [10]:
from sklearn.impute import SimpleImputer
# Median-strategy imputer. NOTE(review): it is only constructed here; no later
# visible cell fits or applies `imputer` — confirm whether it is still needed.
imputer=SimpleImputer(strategy='median')


In [11]:
# Same missing-value heatmap as the earlier cell; `data` has not been reassigned
# or modified between the two calls.
sns.heatmap(data.isna())

In [12]:
# Summary statistics of `data` before any outlier trimming.
data.describe()

### Feature Engineering

#### Get the Outliers and find possible ways to tackle it
#### A histogram of each feature is shown after trimming

In [13]:
# Keep only rows whose median_income lies strictly between the column's
# 1st and 97th percentiles, then summarise what is left.
income = data.median_income
min_thresold, max_thresold = income.quantile([0.01, 0.97])
inside_range = (income > min_thresold) & (income < max_thresold)

data = data[inside_range]
data.median_income.describe()


In [14]:
# Distribution of median_income after trimming.
fig, ax = plt.subplots(figsize=(15, 5))
ax.set_xlabel("Median Income")
ax.set_ylabel("Density")
ax.hist(data.median_income, bins=200, color='g')
plt.show()

In [15]:
# Keep rows whose total_bedrooms lies strictly between the column's 1st and
# 98th percentiles. Comparisons against NaN are always False, so a plain range
# filter would also silently drop every row whose total_bedrooms is missing;
# those rows are kept explicitly so they can be imputed later.
bedrooms = data.total_bedrooms
min_thresold, max_thresold = bedrooms.quantile([0.01, 0.98])
in_range = (bedrooms > min_thresold) & (bedrooms < max_thresold)
data = data[in_range | bedrooms.isna()]

plt.figure(figsize=(15,5))
plt.xlabel("Total Bedrooms")
plt.ylabel("Density")
# Missing values are left out of the histogram only; NaN breaks bin-range detection.
plt.hist(data.total_bedrooms.dropna(),bins=200,color='brown')
plt.show()

In [16]:
# Keep only rows whose households value lies strictly between the column's
# 1st and 99th percentiles, then plot the remaining distribution.
households = data.households
min_thresold, max_thresold = households.quantile([0.01, 0.99])
keep = (households > min_thresold) & (households < max_thresold)
data = data.loc[keep]

fig, ax = plt.subplots(figsize=(15, 5))
ax.set_xlabel("Household")
ax.set_ylabel("Density")
ax.hist(data.households, bins=200, color='black')
plt.show()

In [17]:
# Keep only rows whose total_rooms lies strictly between the column's
# 1st and 98th percentiles, then plot the remaining distribution.
rooms = data.total_rooms
min_thresold, max_thresold = rooms.quantile([0.01, 0.98])
keep = (rooms > min_thresold) & (rooms < max_thresold)
data = data.loc[keep]

fig, ax = plt.subplots(figsize=(15, 5))
ax.set_xlabel("Total Rooms")
ax.set_ylabel("Density")
ax.hist(data.total_rooms, bins=200, color='purple')
plt.show()

In [18]:
# Keep only rows whose population lies strictly between the column's
# 0.1th and 99th percentiles, then plot the remaining distribution.
people = data.population
min_thresold, max_thresold = people.quantile([0.001, 0.99])
keep = (people > min_thresold) & (people < max_thresold)
data = data.loc[keep]

fig, ax = plt.subplots(figsize=(15, 5))
ax.set_xlabel("Population")
ax.set_ylabel("Density")
ax.hist(data.population, bins=200, color='orange')
plt.show()

In [19]:
# Fill missing total_bedrooms with that column's median. Only the column the
# median was computed from is filled, so a gap in any other column is never
# overwritten with a bedroom count. A new frame is assigned instead of
# mutating the existing one in place.
median=data.total_bedrooms.median()
data=data.assign(total_bedrooms=data.total_bedrooms.fillna(median))

In [20]:
# Re-count missing values per column after the fill step.
data.isna().sum()

#### Correlation

In [21]:
# Pairwise correlations, restricted to numeric columns. ocean_proximity is only
# label-encoded in a later cell, so it is presumably still non-numeric here and
# would make a correlation over the whole frame fail on recent pandas versions.
scat_mat=data.select_dtypes(include="number").corr()
# Rank every numeric feature by its correlation with the target.
scat_mat["median_house_value"].sort_values(ascending=False)

In [22]:
# Scatter of median income against median house value.
data.plot.scatter(x='median_income', y='median_house_value')

### Latitude Longitude and Population relation

In [23]:
# Geographic scatter: marker size follows population, colour follows house value.
bubble_sizes = data["population"] / 100
data.plot.scatter(
    x="longitude",
    y="latitude",
    s=bubble_sizes,
    c="median_house_value",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
    alpha=0.4,
    label="population",
    figsize=(10, 7),
    sharex=False,
)
plt.title('California housing prices')
plt.legend()
plt.savefig("housing_prices_scatterplot.png")

### Lower latitude and higher longitude are associated with more expensive housing

In [24]:
# Housing age against total rooms: marker size follows population, colour
# follows house value.
data.plot(kind="scatter", x="housing_median_age", y="total_rooms", alpha=0.4,
    s=data["population"]/100, label="population", figsize=(10,7),
    c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,
    sharex=False)
plt.title('California housing prices')
plt.legend()
# Saved under its own name so the longitude/latitude figure written to
# "housing_prices_scatterplot.png" is not overwritten.
plt.savefig("housing_age_rooms_scatterplot.png")

#### The plot suggests that older houses have lower prices, and vice versa

### Which feature has the largest column total?

In [25]:
# Bar chart of the column sums for every column except the last two.
plt.figure(figsize=(15, 5))
plt.title("Total Features ")
plt.xlabel("Name of the Features")
plt.ylabel("Summation")
column_totals = data.iloc[:, :-2].sum()
column_totals.plot.bar(color='green')

#### Density curves of population, median_house_value and housing_median_age

In [26]:
# Kernel density estimate of the population column.
plt.figure(figsize=(15,5))
plt.title("Population density")
plt.xlabel("Population")
plt.ylabel("Density")

population=np.array(data["population"])

# The deprecated `shade` argument is omitted: the default already draws an
# unfilled curve, so the plot is unchanged and no deprecation warning is raised.
sns.kdeplot(population,color='purple')

In [27]:
# Kernel density estimate of the median_house_value column.
median_house=np.array(data["median_house_value"])
plt.figure(figsize=(15,5))
plt.title("Median House")
plt.xlabel("Price")
plt.ylabel("Density")

# The deprecated `shade` argument is omitted: the default already draws an
# unfilled curve, so the plot is unchanged and no deprecation warning is raised.
sns.kdeplot(median_house,color='green')


In [28]:
# Kernel density estimate of the housing_median_age column.
plt.figure(figsize=(15,5))
plt.title("Housing Age")
plt.xlabel("Age")
plt.ylabel("Density")

age=np.array(data["housing_median_age"])

# The deprecated `shade` argument is omitted: the default already draws an
# unfilled curve, so the plot is unchanged and no deprecation warning is raised.
sns.kdeplot(age,color='red')

In [29]:
# Pie chart of the column sums, skipping the first two and the last two columns.
plt.figure(figsize=(25, 10))
column_shares = data.iloc[:, 2:-2].sum()
column_shares.plot.pie(autopct='%.1f%%')

#### Ocean_Proximity and Median House Value

In [30]:
# Sum of median_house_value within each ocean_proximity category.
by_proximity = data.groupby("ocean_proximity")
ocean_Md = by_proximity["median_house_value"].sum()
ocean_Md

In [31]:

# Bar chart of the summed median house value per ocean_proximity category.
plt.figure(figsize=(10, 5))
plt.title("Ocean Proximity vs price")
plt.ylabel("Median Housing Price")
ocean_Md.plot.bar(color='brown')

### The summed median house value is highest for the <1H OCEAN category

### Ocean Proximity and Population

In [32]:
# Total population within each ocean_proximity category.
by_proximity = data.groupby("ocean_proximity")
ocean_p = by_proximity["population"].sum()
ocean_p

In [33]:

# Bar chart of total population per ocean_proximity category.
plt.figure(figsize=(10,5))
plt.title("Ocean Proximity vs Population")
plt.ylabel("Population")

# Plot the population totals (ocean_p); ocean_Md holds house values and would
# contradict the title and y-label of this figure.
ocean_p.plot(kind='bar',color='green')

### Machine Learning part

In [34]:
# Preview the cleaned frame before encoding and modelling.
data.head()

### Label Encoder

In [35]:
from sklearn.preprocessing import LabelEncoder

# Replace the ocean_proximity categories with integer codes.
lENc = LabelEncoder()
ocean = data["ocean_proximity"].values
encode = lENc.fit(ocean).transform(ocean)
data["ocean_proximity"] = encode


In [36]:
# One 50-bin histogram per column, saved to disk and then displayed.
hist_axes = data.hist(figsize=(20, 15), bins=50)
plt.savefig("attribute_histogram_plots.png")
plt.show()

In [37]:
# Summary statistics after trimming and label encoding.
data.describe()

In [38]:
from sklearn.model_selection import train_test_split

In [39]:
# Plain random 80/20 split with a fixed seed. NOTE(review): train_set/test_set are
# not referenced by any later visible cell — the stratified split is used for
# modelling instead; confirm whether this split is still needed.
train_set,test_set=train_test_split(data,test_size=0.2,random_state=2)


In [40]:
from sklearn.model_selection import StratifiedShuffleSplit

# A single stratified 80/20 shuffle split, stratified on ocean_proximity.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=3)

# n_splits=1, so the splitter yields exactly one (train, test) index pair.
train_idx, test_idx = next(split.split(data, data["ocean_proximity"]))
strata_train = data.iloc[train_idx]
strata_test = data.iloc[test_idx]

In [41]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Preprocessing pipeline: standardise first, then impute with SimpleImputer's defaults.
pipeline_steps = [
    ('std', StandardScaler()),
    ('imputer', SimpleImputer()),
]
my_pipe = Pipeline(pipeline_steps)

In [42]:
# Separate the training target from the training features.
y_train = strata_train["median_house_value"].copy()
X_train = strata_train.drop(columns="median_house_value")
y_train


In [43]:
# Separate the held-out target from the held-out features.
y_test = strata_test["median_house_value"].copy()
X_test = strata_test.drop(columns="median_house_value")


In [44]:
# Fit the preprocessing pipeline on the training features only, then apply the
# already-fitted transform to the held-out features. Without the second step the
# model would be trained on transformed values but evaluated on raw ones.
X_train=my_pipe.fit_transform(X_train)
X_test=my_pipe.transform(X_test)

In [45]:
# Imported here because no earlier visible cell brings RandomForestRegressor into scope.
from sklearn.ensemble import RandomForestRegressor

# Fit a random forest on the preprocessed training matrix. The fixed random_state
# makes the fitted forest reproducible from run to run.
#model=LinearRegression().fit(X_train,y_train)
model=RandomForestRegressor(random_state=42).fit(X_train,y_train)
# Predictions on the training data itself, used for the training-error cell.
y_pred=model.predict(X_train)


In [None]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error of the model's predictions on the training data.
mse = mean_squared_error(y_true=y_train, y_pred=y_pred)
rmse = np.sqrt(mse)
rmse

### Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation on the training data. The scorer returns negated MSE,
# so the sign is flipped before taking the square root of each fold's value.
score = cross_val_score(
    model,
    X_train,
    y_train,
    cv=10,
    scoring="neg_mean_squared_error",
)
mse = -score
rmse = np.sqrt(mse)
rmse

In [None]:
# Predict on the held-out features.
# NOTE(review): the model was fitted on the output of my_pipe.fit_transform — confirm
# X_test has been passed through my_pipe.transform before this call.
y_preds=model.predict(X_test)


In [None]:
# Model score on the training data.
print("Train_score :",model.score(X_train,y_train))

# Model score on the held-out data, measured against the true targets (y_test).
# Scoring against the model's own predictions would always report a perfect fit.
print("Test Score :",model.score(X_test,y_test))