In [1]:
# Import libraries
import pandas as pd

# Limit the number of columns pandas shows when it renders a DataFrame
pd.set_option('display.max_columns', 7)

# Read the data and take a first look at it
df = pd.read_csv('housing.csv')
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,...,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,...,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,...,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,...,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,...,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,...,3.8462,342200.0,NEAR BAY


In [2]:
# Summary information about the data (columns, non-null counts, dtypes)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [4]:
# Access one particular column.
# The text column is selected here; at this point it is still stored
# with dtype object.
df['ocean_proximity']

0        NEAR BAY
1        NEAR BAY
2        NEAR BAY
3        NEAR BAY
4        NEAR BAY
           ...   
20635      INLAND
20636      INLAND
20637      INLAND
20638      INLAND
20639      INLAND
Name: ocean_proximity, Length: 20640, dtype: object

In [None]:
# Display the ocean_proximity column
df['ocean_proximity']

In [5]:
# Change the column from 'object' to 'category' to speed up processing
df = df.assign(ocean_proximity=df['ocean_proximity'].astype('category'))
df['ocean_proximity']

0        NEAR BAY
1        NEAR BAY
2        NEAR BAY
3        NEAR BAY
4        NEAR BAY
           ...   
20635      INLAND
20636      INLAND
20637      INLAND
20638      INLAND
20639      INLAND
Name: ocean_proximity, Length: 20640, dtype: category
Categories (5, object): ['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN']

In [6]:
# Filtering the results: first display the median_house_value column
df['median_house_value']

0        452600.0
1        358500.0
2        352100.0
3        341300.0
4        342200.0
           ...   
20635     78100.0
20636     77100.0
20637     92300.0
20638     84700.0
20639     89400.0
Name: median_house_value, Length: 20640, dtype: float64

In [8]:
# Display the rows where the house value is greater than 100000
df.loc[df['median_house_value'].gt(100000)]

Unnamed: 0,longitude,latitude,housing_median_age,...,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,...,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,...,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,...,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,...,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,...,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...
20629,-121.39,39.12,28.0,...,2.0943,108300.0,INLAND
20630,-121.32,39.29,11.0,...,3.5673,112000.0,INLAND
20631,-121.40,39.33,15.0,...,3.5179,107200.0,INLAND
20632,-121.45,39.26,15.0,...,3.1250,115600.0,INLAND


In [9]:
# Feature engineering: create new data
# from the data that already exists

# Rooms per household = total_rooms divided by households
df['rooms_per_household'] = df['total_rooms'].div(df['households'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   longitude            20640 non-null  float64 
 1   latitude             20640 non-null  float64 
 2   housing_median_age   20640 non-null  float64 
 3   total_rooms          20640 non-null  float64 
 4   total_bedrooms       20433 non-null  float64 
 5   population           20640 non-null  float64 
 6   households           20640 non-null  float64 
 7   median_income        20640 non-null  float64 
 8   median_house_value   20640 non-null  float64 
 9   ocean_proximity      20640 non-null  category
 10  rooms_per_household  20640 non-null  float64 
dtypes: category(1), float64(10)
memory usage: 1.6 MB


In [11]:
df['rooms_per_household']

0        6.984127
1        6.238137
2        8.288136
3        5.817352
4        6.281853
           ...   
20635    5.045455
20636    6.114035
20637    5.205543
20638    5.329513
20639    5.254717
Name: rooms_per_household, Length: 20640, dtype: float64

In [12]:
# Bedrooms per household = total_bedrooms divided by households
df['bedrooms_per_household'] = df['total_bedrooms'].div(df['households'])
df.info()
df['bedrooms_per_household']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   longitude               20640 non-null  float64 
 1   latitude                20640 non-null  float64 
 2   housing_median_age      20640 non-null  float64 
 3   total_rooms             20640 non-null  float64 
 4   total_bedrooms          20433 non-null  float64 
 5   population              20640 non-null  float64 
 6   households              20640 non-null  float64 
 7   median_income           20640 non-null  float64 
 8   median_house_value      20640 non-null  float64 
 9   ocean_proximity         20640 non-null  category
 10  rooms_per_household     20640 non-null  float64 
 11  bedrooms_per_household  20433 non-null  float64 
dtypes: category(1), float64(11)
memory usage: 1.8 MB


0        1.023810
1        0.971880
2        1.073446
3        1.073059
4        1.081081
           ...   
20635    1.133333
20636    1.315789
20637    1.120092
20638    1.171920
20639    1.162264
Name: bedrooms_per_household, Length: 20640, dtype: float64

In [13]:
# Population per household = population divided by households
df['population_per_household'] = df['population'].div(df['households'])
df.info()
df['population_per_household']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   longitude                 20640 non-null  float64 
 1   latitude                  20640 non-null  float64 
 2   housing_median_age        20640 non-null  float64 
 3   total_rooms               20640 non-null  float64 
 4   total_bedrooms            20433 non-null  float64 
 5   population                20640 non-null  float64 
 6   households                20640 non-null  float64 
 7   median_income             20640 non-null  float64 
 8   median_house_value        20640 non-null  float64 
 9   ocean_proximity           20640 non-null  category
 10  rooms_per_household       20640 non-null  float64 
 11  bedrooms_per_household    20433 non-null  float64 
 12  population_per_household  20640 non-null  float64 
dtypes: category(1), float64(12)
memory usage: 1.9 MB

0        2.555556
1        2.109842
2        2.802260
3        2.547945
4        2.181467
           ...   
20635    2.560606
20636    3.122807
20637    2.325635
20638    2.123209
20639    2.616981
Name: population_per_household, Length: 20640, dtype: float64

In [None]:
# Remove the data that will not be used in the analysis

df = df.drop(columns=['latitude'])

In [16]:
# Print the column summary of the DataFrame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   longitude                 20640 non-null  float64 
 1   housing_median_age        20640 non-null  float64 
 2   total_rooms               20640 non-null  float64 
 3   total_bedrooms            20433 non-null  float64 
 4   population                20640 non-null  float64 
 5   households                20640 non-null  float64 
 6   median_income             20640 non-null  float64 
 7   median_house_value        20640 non-null  float64 
 8   ocean_proximity           20640 non-null  category
 9   rooms_per_household       20640 non-null  float64 
 10  bedrooms_per_household    20433 non-null  float64 
 11  population_per_household  20640 non-null  float64 
dtypes: category(1), float64(11)
memory usage: 1.8 MB


In [17]:
# Remove the longitude column, then print the column summary
df = df.drop(columns=['longitude'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   housing_median_age        20640 non-null  float64 
 1   total_rooms               20640 non-null  float64 
 2   total_bedrooms            20433 non-null  float64 
 3   population                20640 non-null  float64 
 4   households                20640 non-null  float64 
 5   median_income             20640 non-null  float64 
 6   median_house_value        20640 non-null  float64 
 7   ocean_proximity           20640 non-null  category
 8   rooms_per_household       20640 non-null  float64 
 9   bedrooms_per_household    20433 non-null  float64 
 10  population_per_household  20640 non-null  float64 
dtypes: category(1), float64(10)
memory usage: 1.6 MB
