In [1]:
import pandas as pd
import zipfile
import requests
from io import BytesIO
import os

In [None]:

# Dataset : https://www.data.gouv.fr/fr/datasets/demandes-de-valeurs-foncieres/
# Notice descriptive des datasets:
# https://www.data.gouv.fr/fr/datasets/r/d573456c-76eb-4276-b91c-e6b9c89d6656 (fichier pdf)

# URL of the CSV file inside a ZIP archive
url = "https://static.data.gouv.fr/resources/demandes-de-valeurs-foncieres/20241008-071041/valeursfoncieres-2023.txt.zip"
url = "https://www.data.gouv.fr/fr/datasets/r/dd516f7a-91bb-4cad-a63c-4b55cd457f4c"

# Directory and file paths
data_dir = "data"
csv_output_path = os.path.join(data_dir, "valeursfoncieres-2023.csv")
cleaned_csv_path = os.path.join(data_dir, "valeursfoncieres-2023-cleaned.csv")

# Create the data directory if it doesn't exist
os.makedirs(data_dir, exist_ok=True)

# Download the ZIP file
response = requests.get(url)
response.raise_for_status()  # Raise an exception for HTTP errors

# Unzip the file
with zipfile.ZipFile(BytesIO(response.content)) as z:
    
    #csv_filename = z.namelist()[0]
    #with z.open(csv_filename) as csv_file:
    #        # Save the CSV file to the data folder
    #        with open(csv_output_path, "wb") as output_file:
    #            output_file.write(csv_file.read())
    
    # Assuming the ZIP contains one file, extract and read it
    csv_filename = z.namelist()[0]
    with z.open(csv_filename) as csv_file:
        df = pd.read_csv(csv_file, sep='|', low_memory=False)  # Assuming '|' is the delimiter
        #df = df[df['Commune'] == 'Montpellier']
        df.dropna(axis=1, how='all', inplace=True)

# Show the first few rows of the DataFrame
df


Unnamed: 0,No disposition,Date mutation,Nature mutation,Valeur fonciere,No voie,B/T/Q,Type de voie,Code voie,Voie,Code postal,...,5eme lot,Surface Carrez du 5eme lot,Nombre de lots,Code type local,Type local,Surface reelle bati,Nombre pieces principales,Nature culture,Nature culture speciale,Surface terrain
0,1,05/01/2023,Vente,107000000,184.0,,ALL,0124,DES HETRES,1630.0,...,,,1,3.0,Dépendance,0.0,0.0,,,
1,1,05/01/2023,Vente,107000000,159.0,,ALL,0124,DES HETRES,1630.0,...,,,1,2.0,Appartement,233.0,8.0,,,
2,1,05/01/2023,Vente,107000000,159.0,,ALL,0124,DES HETRES,1630.0,...,,,1,3.0,Dépendance,0.0,0.0,,,
3,1,03/01/2023,Vente,15220000,2914.0,,RTE,0107,DE PONCIN,1450.0,...,,,0,1.0,Maison,64.0,3.0,S,,988.0
4,1,05/01/2023,Vente,26900000,427.0,T,CHE,0040,DE L'AUBEPIN,1800.0,...,,,0,1.0,Maison,73.0,3.0,S,,835.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3799402,1,05/10/2023,Adjudication,900000000,118.0,,RUE,0499,D ASSAS,75006.0,...,,,0,3.0,Dépendance,0.0,0.0,S,,365.0
3799403,1,05/10/2023,Adjudication,900000000,118.0,,RUE,0499,D ASSAS,75006.0,...,,,0,3.0,Dépendance,0.0,0.0,S,,365.0
3799404,1,25/09/2023,Vente,8000000,6.0,,BD,8208,RICHARD LENOIR,75011.0,...,,,2,3.0,Dépendance,0.0,0.0,,,
3799405,1,25/09/2023,Vente,8000000,6.0,,BD,8208,RICHARD LENOIR,75011.0,...,,,2,2.0,Appartement,10.0,1.0,,,


In [13]:
print("Columns in the DataFrame:")
print(df.columns)

Columns in the DataFrame:
Index(['No disposition', 'Date mutation', 'Nature mutation', 'Valeur fonciere',
       'No voie', 'B/T/Q', 'Type de voie', 'Code voie', 'Voie', 'Code postal',
       'Commune', 'Code departement', 'Code commune', 'Prefixe de section',
       'Section', 'No plan', 'No Volume', '1er lot',
       'Surface Carrez du 1er lot', '2eme lot', 'Surface Carrez du 2eme lot',
       '3eme lot', 'Surface Carrez du 3eme lot', '4eme lot',
       'Surface Carrez du 4eme lot', '5eme lot', 'Surface Carrez du 5eme lot',
       'Nombre de lots', 'Code type local', 'Type local',
       'Surface reelle bati', 'Nombre pieces principales', 'Nature culture',
       'Nature culture speciale', 'Surface terrain'],
      dtype='object')


In [20]:
df.shape[0]

3773749

In [5]:
df.dtypes


No disposition                  int64
Date mutation                  object
Nature mutation                object
Valeur fonciere                object
No voie                       float64
B/T/Q                          object
Type de voie                   object
Code voie                      object
Voie                           object
Code postal                   float64
Commune                        object
Code departement               object
Code commune                    int64
Prefixe de section            float64
Section                        object
No plan                         int64
No Volume                      object
1er lot                        object
Surface Carrez du 1er lot      object
2eme lot                       object
Surface Carrez du 2eme lot     object
3eme lot                       object
Surface Carrez du 3eme lot     object
4eme lot                      float64
Surface Carrez du 4eme lot     object
5eme lot                       object
Surface Carr

In [11]:
# change float to int for code postal
df = df.dropna(subset=['Code postal'])
df['Code postal'] = df['Code postal'].astype(str)

# List of postal codes for Montpellier Méditerranée Métropole
codes_postaux_metropole = [
    "34670", "34130", "34160", "34170", "34830", "34660",
    "34690", "34790", "34990", "34970", "34920", "34980",
    "34000", "34070", "34080", "34090", "34570", "34470",
    "34730", "34680", "34430", "30440", "34740"
]


In [12]:
# List of postal codes for Montpellier Méditerranée Métropole
codes_postaux_metropole = [
    "34670", "34130", "34160", "34170", "34830", "34660",
    "34690", "34790", "34990", "34970", "34920", "34980",
    "34000", "34070", "34080", "34090", "34570", "34470",
    "34730", "34680", "34430", "30440", "34740"
]

# isin() pour filtrage 
code_mask = df['Code postal'].isin(codes_postaux_metropole)
df_metropoleMP =df[code_mask]

In [16]:
df_metropoleMP.head()

Unnamed: 0,No disposition,Date mutation,Nature mutation,Valeur fonciere,No voie,B/T/Q,Type de voie,Code voie,Voie,Code postal,...,5eme lot,Surface Carrez du 5eme lot,Nombre de lots,Code type local,Type local,Surface reelle bati,Nombre pieces principales,Nature culture,Nature culture speciale,Surface terrain
1045204,1,16/01/2023,Vente,20591000,,,,B040,LE VIALA,30440,...,,,0,,,,,L,,76.0
1045205,1,16/01/2023,Vente,20591000,5096.0,,,B040,LE VIALA,30440,...,,,1,2.0,Appartement,14.0,1.0,,,
1045206,1,16/01/2023,Vente,20591000,5097.0,,,B040,LE VIALA,30440,...,,,1,2.0,Appartement,32.0,1.0,,,
1045207,1,16/01/2023,Vente,20591000,5097.0,A,,B040,LE VIALA,30440,...,,,0,1.0,Maison,95.0,4.0,S,,348.0
1045443,1,05/01/2023,Vente,33700000,,,,B048,LA FIGUIERE,30440,...,,,0,,,,,T,,885.0


In [19]:
df_metropoleMP.shape[0]


27216

In [24]:
df_metropoleMP['Date mutation']= pd.to_datetime(df_metropoleMP['Date mutation'], format='%d/%m/%Y', errors='coerce')

# filtre year 
df_metropoleMP['year'] = df_metropoleMP['Date mutation'].dt.year

# min et max year 
df_metropoleMP['year'].min(), df_metropoleMP['year'].max()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_metropoleMP['Date mutation']= pd.to_datetime(df_metropoleMP['Date mutation'], format='%d/%m/%Y', errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_metropoleMP['year'] = df_metropoleMP['Date mutation'].dt.year


(2023, 2023)