In [1]:
from pathlib import Path
import pandas as pd
import tarfile
import urllib.request
import os
import matplotlib as plt

# 1. Get the data

In [2]:
def loadData():
    path = "Data/housing.tgz"
    if not os.path.isfile(path):
        os.mkdir("Data")
        url = "https://github.com/ageron/data/raw/main/housing.tgz"
        urllib.request.urlretrieve(url,path)
        with tarfile.open(path) as housing_tarball:
            housing_tarball.extractall(path="Data")
    return pd.read_csv(Path("Data/housing/housing.csv"))

In [3]:
housing = loadData()

# 2. Explore and visualize the data to gain insights

In [4]:
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [5]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


Notes:  
   **total_bedrooms** : 20433/20640 non-null value  
   **ocean_proximity**: object type  

In [6]:
housing["ocean_proximity"].value_counts()

ocean_proximity
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: count, dtype: int64

## Create a test set  
After all, we have only taken a quick glance at data, and surely you learn a lot more about it before deciding what algorithms to use. But our brain is an amazing pattern detection system, 

In [9]:
import numpy as np

def shuffle_and_split_data(data, test_ratio):
    shuffle_data = np.random.permutation(len(data))
    test_size = len(data)*test_ratio

split_data = shuffle_and_split_data(housing, 0.2)
print(split_data)

[10646  7770 17624 ... 11264  4705  3874]


# 3. Prepare the data for machine learning algorithms

# 4. Select model and train for it

# 5 Fine-tune your model

# 6. Present your solution

# 7. Launch, Monitor and maintain your system