In [2]:
import pandas as pd
import os
import pathlib
from helper_functions import fetch_cal_housing_data, split_test_train

In [3]:
## Collecting the data
if not os.path.exists(pathlib.Path.cwd() / "Datasets"):
    fetch_cal_housing_data()

In [4]:
housing = pd.read_csv("Datasets/housing_data.csv")

In [5]:
housing.head()

Unnamed: 0,longitude,latitude,housingMedianAge,totalRooms,totalBedrooms,population,households,medianIncome,medianHouseValue
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0


In [6]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   longitude         20640 non-null  float64
 1   latitude          20640 non-null  float64
 2   housingMedianAge  20640 non-null  float64
 3   totalRooms        20640 non-null  float64
 4   totalBedrooms     20640 non-null  float64
 5   population        20640 non-null  float64
 6   households        20640 non-null  float64
 7   medianIncome      20640 non-null  float64
 8   medianHouseValue  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


### Description of the dataset

In [7]:
housing.describe()

Unnamed: 0,longitude,latitude,housingMedianAge,totalRooms,totalBedrooms,population,households,medianIncome,medianHouseValue
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.898014,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.247906,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,295.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [8]:
%matplotlib inline #tells jupyter to set up matplotlib so it uses Jupyter's own backend
import matplotlib.pyplot as plt
housing.hist(bins = 50, figsize = (20,15))
plt.show()

UsageError: unrecognized arguments: #tells jupyter to set up matplotlib so it uses Jupyter's own backend


In [9]:
min(housing["totalBedrooms"])

1.0

### Splitting the dataset

We could do this using our own code

In [10]:
train,test = split_test_train(housing, 0.2)

In [11]:
test

Unnamed: 0,longitude,latitude,housingMedianAge,totalRooms,totalBedrooms,population,households,medianIncome,medianHouseValue
2589,-124.10,40.88,35.0,2987.0,578.0,1581.0,585.0,2.0657,81100.0
11053,-117.85,33.78,23.0,3187.0,870.0,1977.0,852.0,3.3939,212100.0
3045,-119.27,35.51,28.0,1089.0,179.0,544.0,190.0,3.2279,95800.0
8488,-118.31,33.90,28.0,1576.0,400.0,891.0,378.0,2.6312,171300.0
8572,-118.41,33.90,39.0,2311.0,404.0,1044.0,380.0,8.4680,472100.0
...,...,...,...,...,...,...,...,...,...
8039,-118.15,33.84,36.0,2987.0,491.0,1360.0,497.0,4.8013,224100.0
15080,-116.97,32.80,15.0,3927.0,1018.0,2204.0,977.0,2.4367,111400.0
12166,-117.14,33.81,13.0,4496.0,756.0,2044.0,695.0,3.2778,148800.0
1556,-121.96,37.81,12.0,6488.0,778.0,2404.0,765.0,8.3188,403400.0


In [12]:
train.head()

Unnamed: 0,longitude,latitude,housingMedianAge,totalRooms,totalBedrooms,population,households,medianIncome,medianHouseValue
3639,-118.44,34.22,41.0,1030.0,214.0,664.0,223.0,3.8083,183800.0
8445,-118.37,33.91,35.0,1742.0,283.0,812.0,282.0,5.6704,303700.0
8041,-118.16,33.84,36.0,2444.0,432.0,1199.0,424.0,4.1538,218800.0
15220,-117.08,32.97,3.0,17466.0,3336.0,7644.0,2895.0,5.4584,246500.0
15000,-117.04,32.74,33.0,3880.0,770.0,2288.0,805.0,3.6848,140700.0


Or simply using sklearn library

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

In [15]:
train_set.head()

Unnamed: 0,longitude,latitude,housingMedianAge,totalRooms,totalBedrooms,population,households,medianIncome,medianHouseValue
14196,-117.03,32.71,33.0,3126.0,627.0,2300.0,623.0,3.2596,103000.0
8267,-118.16,33.77,49.0,3382.0,787.0,1314.0,756.0,3.8125,382100.0
17445,-120.48,34.66,4.0,1897.0,331.0,915.0,336.0,4.1563,172600.0
14265,-117.11,32.69,36.0,1421.0,367.0,1418.0,355.0,1.9425,93400.0
2271,-119.8,36.78,43.0,2382.0,431.0,874.0,380.0,3.5542,96500.0


In [16]:
test_set.head()

Unnamed: 0,longitude,latitude,housingMedianAge,totalRooms,totalBedrooms,population,households,medianIncome,medianHouseValue
20046,-119.01,36.06,25.0,1505.0,367.0,1392.0,359.0,1.6812,47700.0
3024,-119.46,35.14,30.0,2943.0,697.0,1565.0,584.0,2.5313,45800.0
15663,-122.44,37.8,52.0,3830.0,1142.0,1310.0,963.0,3.4801,500001.0
20484,-118.72,34.28,17.0,3051.0,505.0,1705.0,495.0,5.7376,218600.0
9814,-121.93,36.62,34.0,2351.0,440.0,1063.0,428.0,3.725,278000.0
