# Défis open data : JOP2024 et offre culturelle

In [1]:
import json

import pandas as pd

## Jeu de données : sites de compétition

In [2]:
# Read the raw JSON export of the Paris 2024 competition venues.
with open("datasets/paris-2024-sites-de-competition.json", encoding="utf-8") as json_file:
    data = json.load(json_file)

# Flatten the records into a table; nested objects become dotted column
# names (e.g. `point_geo.lon`).
df = pd.json_normalize(data)

In [22]:
# Peek at 5 random rows. A fixed seed keeps the displayed sample identical
# across "Restart Kernel -> Run All" executions, so the prose below stays in
# sync with what the reader sees.
df.sample(5, random_state=42)

Unnamed: 0,code_site,nom_site,category_id,sports,start_date,end_date,adress,latitude,longitude,point_geo.lon,point_geo.lat
4,CPL,Arena Porte de la Chapelle,venue-paralympic,"Para badminton (PBDM), Para Haltérophilie (PPWL)",2024-08-29,2024-09-08,,488997292,23605141,2.360514,48.899729
3,CPL,Arena Porte de la Chapelle,venue-olympic,"Badminton (BDM), Gymnastique rythmique (GRY)",2024-07-27,2024-08-10,,488997292,23605141,2.360514,48.899729
56,SP4,Arena Paris Sud 4,venue-olympic,Tennis de table (TTE),2024-07-27,2024-08-10,,48830184,2289033,2.289033,48.830184
14,NPA,Arena Paris Nord,venue-paralympic,Volleyball assis (PVBS),2024-08-29,2024-09-07,,489721,25149,2.5149,48.9721
59,STA,Stade de France,venue-paralympic,"Para Athlétisme (PATH), Cérémonie de clôture P...",2024-08-30,2024-09-08,,48924475,2360127,2.360127,48.924475


In [36]:
# List the column labels of the venues DataFrame.
df.columns

Index(['code_site', 'nom_site', 'category_id', 'sports', 'start_date',
       'end_date', 'adress', 'latitude', 'longitude', 'point_geo.lon',
       'point_geo.lat'],
      dtype='object')

In [3]:
# (number of rows, number of columns) of the venues DataFrame.
df.shape

(63, 11)

In [45]:
# Inspect the dtype pandas inferred for each column, to spot columns that
# still need converting (e.g. numbers or dates stored as `object`).
df.dtypes

code_site                object
nom_site                 object
category_id              object
sports                   object
start_date       datetime64[ns]
end_date                 object
adress                   object
latitude                 object
longitude                object
point_geo.lon           float64
point_geo.lat           float64
dtype: object

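`latitude` et `longitude` ne sont pas intéressantes :
* elles font doublon avec `point_geo.lat` et `point_geo.lon` ;
* elles ne sont pas au format numérique : ce sont des chaînes de caractères, avec tantôt une virgule, tantôt un point comme séparateur décimal.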

Les dates ne sont pas au bon format, essayons de les convertir.

In [48]:
# Parse both date columns into pandas datetimes. Re-running this cell is
# harmless: converting an already-datetime column leaves it unchanged.
date_cols = ["start_date", "end_date"]
for col in date_cols:
    df[col] = pd.to_datetime(df[col])

# Confirm the resulting dtypes.
df[date_cols].dtypes

start_date    datetime64[ns]
end_date      datetime64[ns]
dtype: object

La conversion a fonctionné sans erreur.

In [49]:
# Number of distinct values per column. Missing values are not counted by
# default, so a count of 0 means the column holds no non-null value.
df.nunique()

code_site        44
nom_site         44
category_id       2
sports           56
start_date       15
end_date         18
adress            0
latitude         43
longitude        43
point_geo.lon    43
point_geo.lat    43
dtype: int64

In [40]:
# Enumerate the distinct venue categories present in the dataset.
df["category_id"].unique()

array(['venue-olympic', 'venue-paralympic'], dtype=object)

In [30]:
# Count how many rows each site code has; a count above 1 means the same
# site code appears on several rows.
df["code_site"].value_counts()

code_site
ALX    2
BCY    2
CDM    2
CPL    2
CTX    2
DEF    2
EIF    2
GRP    2
VE1    2
SP1    2
RGA    2
NPA    2
LCO    2
SP4    2
SP6    2
STA    2
VN1    2
VER    2
INV    2
CSB    1
NAN    1
MAM    1
HDV    1
ELA    1
TAH    1
VN2    1
AQC    1
PDP    1
LCS    1
LC3    1
LC2    1
LC1    1
TRO    1
VE2    1
MRS    1
LYO    1
LBO    1
BOR    1
LGN    1
LC4    1
LIL    1
NIC    1
STE    1
YDM    1
Name: count, dtype: int64

In [16]:
# Show every row recorded for site code "ALX", one of the codes that
# appears twice.
is_alx = df["code_site"].eq("ALX")
df[is_alx]

Unnamed: 0,code_site,nom_site,category_id,sports,start_date,end_date,adress,latitude,longitude,point_geo.lon,point_geo.lat
0,ALX,Pont Alexandre III,venue-olympic,Cyclisme sur route - arrivée Contre-la-montre ...,2024-07-27,2024-08-09,,488637,23134,2.3134,48.8637
43,ALX,Pont Alexandre III,venue-paralympic,Para Triathlon (PTRI),2024-09-01,2024-09-02,,488637,23134,2.3134,48.8637


In [19]:
# Frequency of each raw `latitude` value (the column has dtype `object`).
df["latitude"].value_counts()

latitude
48,78800979          3
48,83863             2
48,8637              2
48,8997292           2
46,8157              2
48,8954              2
48,85613             2
48,86616355          2
48,832968            2
48,9721              2
48,8531              2
48,86570357457259    2
48,830184            2
48,924475            2
48,845968            2
48.81432266          2
48,8623              2
48,85704803          2
48,829381            2
43,2661              1
48,8563881           1
48,906345            1
48,86223209583829    1
-17,86693            1
48,84156974          1
48,923723            1
47,25593494          1
48,86486788          1
48,86504456          1
48,86640642          1
48,78981063          1
43,2698              1
48,85972558          1
45,76514165          1
48.9372382           1
48,93693402          1
44.8974              1
48.7532              1
48,86573765          1
50,61190661          1
43,7051              1
45,4607              1
48,92934371          1
Name: count, dtype: int64

In [20]:
# Show the rows sharing the most frequent latitude. The comparison is
# against the raw string, since `latitude` is not stored as a number.
has_shared_latitude = df["latitude"].eq("48,78800979")
df[has_shared_latitude]

Unnamed: 0,code_site,nom_site,category_id,sports,start_date,end_date,adress,latitude,longitude,point_geo.lon,point_geo.lat
21,VE1,Vélodrome National de Saint-Quentin-en-Yvelines,venue-paralympic,Para Cyclisme sur piste (PCTR),2024-08-29,2024-09-01,,4878800979,2.03498269,2.034983,48.78801
41,VE1,Vélodrome National de Saint-Quentin-en-Yvelines,venue-olympic,Cyclisme sur piste (CTR),2024-08-05,2024-08-11,,4878800979,2.03498269,2.034983,48.78801
42,VE2,Stade BMX de Saint-Quentin-en-Yvelines,venue-olympic,BMX Racing (BMX),2024-09-01,2024-09-02,,4878800979,2.03498269,2.034983,48.78801


In [34]:
# Earliest and latest start date over all rows.
start_dates = df["start_date"]
start_dates.min(), start_dates.max()

('2024-07-24', '2024-09-08')

In [35]:
# Earliest and latest end date over all rows.
end_dates = df["end_date"]
end_dates.min(), end_dates.max()

('2024-07-28', '2024-09-10')