In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
try:
    # scikit-learn >= 0.20 provides SimpleImputer; the old Imputer class was removed in 0.22
    from sklearn.impute import SimpleImputer
    Imputer = SimpleImputer  # keep the legacy name usable by existing cells
except ImportError:
    from sklearn.preprocessing import Imputer
    SimpleImputer = Imputer

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# Any results you write to the current directory are saved as output.

In [None]:
# Load the housing dataset from the Kaggle input directory and preview the first rows.
data_df = pd.read_csv("/kaggle/input/housing.csv")
data_df.head()

In [None]:
data_df["ocean_proximity"].value_counts() # ocean_proximity is a categorical variable: count the rows per category

In [None]:
data_df.info() # total_bedrooms has some null values that will need to be handled

In [None]:
data_df.describe() # shows that 25% of housing_median_age values are lower than 18, while 50% are lower than 29 and 75% are lower than 37

In [None]:
data_df.hist(bins=50, figsize=(20,15)) # draws one histogram per numerical attribute (non-numeric columns are skipped)

In [None]:
def split_train_test(data, test_ratio, seed=42):
    """Randomly split ``data`` into a training set and a test set.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split; rows are selected by position (``iloc``).
    test_ratio : float
        Fraction of rows, between 0 and 1, that goes into the test set.
        The test size is ``int(len(data) * test_ratio)`` (rounded down).
    seed : int, default 42
        Seed for the shuffle, so repeated calls return the same split.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame

    Raises
    ------
    ValueError
        If ``test_ratio`` is outside ``[0, 1]``.
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError("test_ratio must be between 0 and 1, got %r" % (test_ratio,))
    # A private RandomState yields the same permutation as np.random.seed(seed)
    # followed by np.random.permutation, without reseeding the global generator
    # that the rest of the notebook shares.
    rng = np.random.RandomState(seed)
    shuffled_indices = rng.permutation(len(data))
    test_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_size]
    train_indices = shuffled_indices[test_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [None]:
# Hand-rolled random split: 80% of the rows for training, 20% for testing.
train_set, test_set = split_train_test(data_df, 0.2)
print(len(train_set), len(test_set))
# Or split with the scikit-learn API (train_test_split is already imported in the first cell);
# it randomly selects instances/rows for the training and test sets.
train_set, test_set = train_test_split(data_df, test_size=0.2, random_state=42)
print(len(train_set), len(test_set))

In [None]:
""" since median_income seems to be an important attribute to decide median_house_value. So, we decide to divide the numerical values of this attribute \
into categories/strata, and we want to split our data such that each category is proportionally represented, to avoid sampling bias. This is called stratified sampling because we divide the data \
homogeneously such that our data is representative of new cases that we may see in test_set and can generalize better
""" 
# Bucket median_income into strata of width 1.5; np.ceil turns each ratio into a category label.
data_df["income_cat"] = np.ceil(data_df["median_income"]/1.5)
# Only the last expression of a cell is displayed, so print the counts explicitly.
print(data_df["income_cat"].value_counts()) # most of the data falls into categories 2.0 - 5.0
data_df["income_cat"].hist()

In [None]:
# Merge every category at or above 5 into the single category 5.0.
# Series.where keeps the values where the condition holds and replaces the rest with 5.0.
# The result is assigned back to the column: calling where(..., inplace=True) on a column
# selection acts on a temporary object and does not update data_df under pandas copy-on-write.
data_df["income_cat"] = data_df["income_cat"].where(data_df["income_cat"] < 5, 5.0)
# Only the last expression of a cell is displayed, so print the counts explicitly.
print(data_df["income_cat"].value_counts())
data_df["income_cat"].hist()

In [None]:
# Preview the first 10 rows, which now include the income_cat column.
data_df.head(10)

In [None]:
# Stratified shuffle split of the data based on income_cat
def stratified_spliting(data=None, strat_col="income_cat", test_size=0.2, random_state=42):
    """Split a frame into train/test sets, stratified on one column.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame to split. Defaults to the notebook-level ``data_df`` so that
        existing no-argument calls keep working.
    strat_col : str, default "income_cat"
        Column whose value proportions are preserved in both sets.
    test_size : float, default 0.2
        Fraction of rows placed in the test set.
    random_state : int, default 42
        Seed passed to ``StratifiedShuffleSplit`` for a reproducible split.

    Returns
    -------
    (strat_train_set, strat_test_set) : tuple of pandas.DataFrame
    """
    if data is None:
        data = data_df
    split = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    # n_splits=1, so the generator yields exactly one (train, test) pair.
    train_index, test_index = next(split.split(data, data[strat_col]))
    # split() yields positional indices, so select with iloc; loc only gave the
    # same rows because data_df carries the default 0..n-1 index from read_csv.
    strat_train_set = data.iloc[train_index]
    strat_test_set = data.iloc[test_index]
    return strat_train_set, strat_test_set

In [None]:
# Run the stratified split and show the sizes of the two resulting sets.
strat_train_set, strat_test_set = stratified_spliting()
len(strat_train_set), len(strat_test_set)

In [None]:
# Share of each income category in the stratified training set.
strat_train_set["income_cat"].value_counts()/len(strat_train_set)

In [None]:
# Share of each income category in the stratified test set. The proportions should match
# those of the training set, whereas a test set generated by the purely random sampling of
# train_test_split can be skewed.
strat_test_set["income_cat"].value_counts()/len(strat_test_set)

In [None]:
# Drop the helper income_cat column from both sets; it was only needed for stratification.
# Reassigning avoids binding the builtin name `set` to a DataFrame and avoids in-place
# mutation of frames derived from data_df. errors="ignore" makes the cell safe to re-run.
strat_train_set = strat_train_set.drop(columns=["income_cat"], errors="ignore")
strat_test_set = strat_test_set.drop(columns=["income_cat"], errors="ignore")

In [None]:
# Summary statistics of the stratified training set.
strat_train_set.describe()

In [None]:
# Summary statistics of the stratified test set, for comparison with the training set.
strat_test_set.describe()

In [None]:
# Visualize the data.
# Plot the geographical position of every row (longitude vs latitude) for insight.
data_df.plot.scatter(x="longitude", y="latitude", figsize=(10, 5))
plt.show()

In [None]:
# Same map with alpha=0.1, so that areas with a high density of data points stand out.
data_df.plot.scatter(x="longitude", y="latitude", alpha=0.1, figsize=(10, 5))
plt.show()

In [None]:
# Bubble map: the marker size (s) represents the population and the marker colour (c)
# represents median_house_value. On the "jet" colormap blue means a low price and red a high price.
data_df.plot.scatter(
    x="longitude",
    y="latitude",
    s=data_df["population"] / 100,
    c="median_house_value",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
    label="population",
    alpha=0.5,
    figsize=(20, 15),
)
plt.legend()

In [None]:
# Housing prices are closely related to the location (e.g. near the ocean) and to the population density,
# so a clustering algorithm could be used to detect the main clusters and add new features that
# measure the proximity to the cluster centers.
# Looking for correlations.
# Drop the helper column without mutating in place; errors="ignore" keeps the cell re-runnable.
data_df = data_df.drop(columns=["income_cat"], errors="ignore")
# Correlation is only computed on numeric columns: data_df still contains the categorical
# ocean_proximity column, and DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
corr_matrix = data_df.select_dtypes(include=[np.number]).corr()
corr_matrix

In [None]:
# Correlation of every attribute with the target, strongest positive first.
corr_matrix["median_house_value"].sort_values(ascending=False)
# We notice a strong positive correlation between median_house_value and median_income,
# and a small negative correlation between median_house_value and longitude.

In [None]:
# To better visualize the correlation between selected attributes, use pandas.plotting.scatter_matrix,
# which plots every listed attribute against every other one.
attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
scatter_matrix(data_df[attributes], figsize=(12,8))

In [None]:
# Visualize median_house_value as a function of median_income.
data_df.plot.scatter(x="median_income", y="median_house_value", figsize=(15, 10))
# The correlation is very strong: the upward trend is visible and the points are not too dispersed.

In [None]:
# We can see a straight line close to 500000, another at 450000, another at 250000, another at 280000 and so on.
# We could later remove those instances from our data before sending it to our machine learning algorithm,
# for better prediction and to avoid reproducing these quirks.
# As we see a strong positive correlation between households, population, total_rooms and total_bedrooms,
# we derive per-household / per-room ratio features from them.
# NOTE(review): these columns are added to data_df only; strat_train_set was split off earlier and does not receive them.
data_df["rooms_per_household"] = data_df["total_rooms"]/data_df["households"]
data_df["bedrooms_per_rooms"] = data_df["total_bedrooms"]/data_df["total_rooms"]
data_df["population_per_household"] = data_df["population"]/data_df["households"]

In [None]:
# Recompute the correlations now that the ratio features exist.
# Restrict to numeric columns: data_df still contains the categorical ocean_proximity column,
# and DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
corr_matrix = data_df.select_dtypes(include=[np.number]).corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
# bedrooms_per_rooms is more strongly (negatively) correlated with median_house_value than
# total_bedrooms or total_rooms, i.e. a lower bedroom-to-room ratio goes with a higher price.
# Also, rooms_per_household is more strongly correlated than total_rooms or households.
# Separate the predictors and the labels of the stratified training set.
housing_lables = strat_train_set["median_house_value"].copy()
housing = strat_train_set.drop(columns="median_house_value")

In [None]:
# Summary statistics of the predictors (the training set without median_house_value).
housing.describe()

In [None]:
# Summary statistics of the labels (median_house_value of the training set).
housing_lables.describe()

In [None]:
# Data cleaning
# total_bedrooms has missing values, so we can either:
#   - delete the instances with missing values: housing.dropna(subset=["total_bedrooms"])
#   - delete the attribute if it is not that relevant for prediction: housing.drop("total_bedrooms", axis=1)
#   - fill the missing values: median = housing["total_bedrooms"].median()
#                              housing["total_bedrooms"].fillna(median)
#     (save the median so it can also fill missing values in the test set and in new data)
# Or use the imputer from scikit-learn, which computes the median of each numerical attribute.
# SimpleImputer replaces the Imputer class that was removed from sklearn.preprocessing.
imputer = SimpleImputer(strategy="median")
housing_num = housing.drop("ocean_proximity", axis=1) # drop the categorical column: the median only exists for numbers
imputer.fit(housing_num) # computes the median of each attribute and stores it in the statistics_ attribute
imputer.statistics_

In [None]:
# Cross-check: the medians computed by pandas, to compare against imputer.statistics_.
housing_num.median().values

In [None]:
X = imputer.transform(housing_num) # returns a plain NumPy array with the transformed features
# Re-attach both the column labels and the original row index. Without index=..., housing_tr
# would get a fresh 0..n-1 index and no longer line up with housing / housing_lables, whose
# index comes from the shuffled stratified split.
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)

In [None]:
# Column overview of housing_num. This frame is the imputer's input and is not reassigned
# after transform; the imputed values are stored in housing_tr.
housing_num.info()