In [1]:
import numpy as np
import pandas as pd
import os

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# to make this notebook's output identical at every run
np.random.seed(42)

# Ignore useless warnings
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [2]:
df_house = pd.read_csv('housing.csv')
df_house.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [3]:
df_house.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [4]:
df_house.fillna(df_house['total_bedrooms'].mean(), inplace=True)

In [5]:
df_house.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


**Data Cleaning**

Treating missing values of attribute "total_bedrooms"

Three options:

1. Get rid of the corresponding values.

2. Get rid of the whole attribute.

3. Set the values to some value (zero, the mean, the median, etc.)

In [6]:
df_house.dropna(subset=["total_bedrooms"])   #option 1
df_house.drop("total_bedrooms",axis = 1)     #option 2
median = df_house["total_bedrooms"].median() #option 3
df_house["total_bedrooms"].fillna(median, inplace = True)

In [7]:
from sklearn.impute import SimpleImputer

In [8]:
imputer = SimpleImputer(strategy = "median")

In [9]:
df_house_num = df_house.drop("ocean_proximity", axis = 1)

In [10]:
imputer.fit(df_house_num)

SimpleImputer(strategy='median')

In [11]:
imputer.statistics_

array([-1.1849e+02,  3.4260e+01,  2.9000e+01,  2.1270e+03,  4.3800e+02,
        1.1660e+03,  4.0900e+02,  3.5348e+00,  1.7970e+05])

In [12]:
df_house_num.median().values

array([-1.1849e+02,  3.4260e+01,  2.9000e+01,  2.1270e+03,  4.3800e+02,
        1.1660e+03,  4.0900e+02,  3.5348e+00,  1.7970e+05])

In [13]:
#Transforming the training set by replacing missing values by the learned medians
X = imputer.transform(df_house_num)

In [14]:
df_house_tr = pd.DataFrame(X, columns = df_house_num.columns)

**Handling Text and Categorical Attributes**


In [15]:
df_house_cat = df_house[["ocean_proximity"]]

In [16]:
df_house_cat.head(10)

Unnamed: 0,ocean_proximity
0,NEAR BAY
1,NEAR BAY
2,NEAR BAY
3,NEAR BAY
4,NEAR BAY
5,NEAR BAY
6,NEAR BAY
7,NEAR BAY
8,NEAR BAY
9,NEAR BAY


In [17]:
#Converting from texts to numbers

from sklearn.preprocessing import OrdinalEncoder

In [18]:
ordinal_encoder = OrdinalEncoder()

In [19]:
df_house_cat_encoded = ordinal_encoder.fit_transform(df_house_cat)

In [20]:
df_house_cat_encoded[:10]

array([[3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.]])

In [21]:
ordinal_encoder.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

In [22]:
# OneHotEncoder class to convert categorical values into one-hot vectors

from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder()
df_house_cat_1hot = cat_encoder.fit_transform(df_house_cat)
df_house_cat_1hot

<20640x5 sparse matrix of type '<class 'numpy.float64'>'
	with 20640 stored elements in Compressed Sparse Row format>

In [23]:
df_house_cat_1hot.toarray() # If you really want to convert it to a dense NumPy array

array([[0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       ...,
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.]])

In [24]:
cat_encoder.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

In [25]:
#https://www.kaggle.com/suraj99km/predicting-california-house-prices

#https://www.kaggle.com/nahianrifaat/housing-prices-prediction


In [26]:
sns.lmplot('longitude', 'latitude', data=train_df,markers ='.', hue='ocean_proximity', fit_reg=False, height=5)
plt.show()

NameError: name 'sns' is not defined