In [8]:
import requests
import json
from dotenv import load_dotenv
import os


load_dotenv()

def fetch_line_reports():
    # URL API
    url = 'https://api.navitia.io/v1/coverage/fr-idf/line_reports'

    # clé api from env
    api_key = os.getenv('token_navitia')
    headers = {
        'Authorization': api_key
    }
    
    # requete
    response = requests.get(url, headers=headers)

  
    if response.status_code == 200:
       
        data = response.json()
        # ecriture requete positif en hson
        with open('line_reports.json', 'w') as json_file:
            json.dump(data, json_file, indent=4)
        print("Data written to line_reports.json")
    else:
        
        print('Failed to retrieve data: ', response.status_code)

# Appel func
fetch_traffic_reports()


Data written to line_reports.json


In [18]:
import pandas as pd
from datetime import datetime

with open('line_reports.json') as f:
    data = json.load(f)

disruptions_data = []

for disruption in data['disruptions']:
    for impacted_object in disruption['impacted_objects']:
        pt_object = impacted_object['pt_object']
        # Check 'stop_point' key existe 
        
        if 'stop_point' in pt_object:  
            stop_point = pt_object['stop_point']
            location_name = stop_point['name']
            location_label = stop_point['label']
            longitude = stop_point['coord']['lon']
            latitude = stop_point['coord']['lat']
            address_label = stop_point['address']['label']
            fare_zone = stop_point['fare_zone']['name']
            transport_type = pt_object['embedded_type']
        else:
            # none si on trouve pas
            location_name = location_label = longitude = latitude = address_label = fare_zone = transport_type = None
        
        # check pour le mode de transport quand il se trouve dedans
        if 'line' in pt_object and 'physical_modes' in pt_object['line']:
            physical_mode = pt_object['line']['physical_modes'][0]['name']
        else:
            physical_mode = None

        disruption_info = {
            "disruption_id": disruption['id'],
            "status": disruption['status'],
            "cause": disruption['cause'],
            "category": disruption['category'],
            "severity_name": disruption['severity']['name'],
            "severity_effect": disruption['severity']['effect'],
            "severity_color": disruption['severity']['color'],
            "severity_priority": disruption['severity']['priority'],
            "message_text": disruption['messages'][0]['text'],
            "application_period_begin": datetime.strptime(disruption['application_periods'][0]['begin'], '%Y%m%dT%H%M%S'),
            "application_period_end": datetime.strptime(disruption['application_periods'][0]['end'], '%Y%m%dT%H%M%S'),
            "location_name": location_name,
            "location_label": location_label,
            "longitude": longitude,
            "latitude": latitude,
            "address_label": address_label,
            "fare_zone": fare_zone,
            "transport_type": transport_type,
            "physical_mode": physical_mode
        }
        
        disruptions_data.append(disruption_info)

# dataframe gang
df_disruptions = pd.DataFrame(disruptions_data)

df_disruptions

Unnamed: 0,disruption_id,status,cause,category,severity_name,severity_effect,severity_color,severity_priority,message_text,application_period_begin,application_period_end,location_name,location_label,longitude,latitude,address_label,fare_zone,transport_type,physical_mode
0,e9cbe72a-04c5-11ef-9db7-0a58a9feac02,active,travaux,Incidents,bloquante,NO_SERVICE,#FF0000,0,<p>La ligne 272 est déviée : les arrêts situés...,2024-04-17 06:00:00,2024-06-01 03:45:00,Clément Ader,Clément Ader (Argenteuil),2.24347,48.942583,Rue de la Tour Billy (Argenteuil),4,stop_point,
1,e9cbe72a-04c5-11ef-9db7-0a58a9feac02,active,travaux,Incidents,bloquante,NO_SERVICE,#FF0000,0,<p>La ligne 272 est déviée : les arrêts situés...,2024-04-17 06:00:00,2024-06-01 03:45:00,Charles de Gaulle - Henri Barbusse,Charles de Gaulle - Henri Barbusse (Argenteuil),2.245179,48.940498,Avenue du Général de Gaulle (Argenteuil),4,stop_point,
2,e9cbe72a-04c5-11ef-9db7-0a58a9feac02,active,travaux,Incidents,bloquante,NO_SERVICE,#FF0000,0,<p>La ligne 272 est déviée : les arrêts situés...,2024-04-17 06:00:00,2024-06-01 03:45:00,Place du 11 Novembre,Place du 11 Novembre (Argenteuil),2.239094,48.936521,Nouvelle Impasse Duguay (Argenteuil),4,stop_point,
3,e9cbe72a-04c5-11ef-9db7-0a58a9feac02,active,travaux,Incidents,bloquante,NO_SERVICE,#FF0000,0,<p>La ligne 272 est déviée : les arrêts situés...,2024-04-17 06:00:00,2024-06-01 03:45:00,Avenue du Château,Avenue du Château (Argenteuil),2.233204,48.933282,Allée Simone Veil (Argenteuil),4,stop_point,
4,e9cbe72a-04c5-11ef-9db7-0a58a9feac02,active,travaux,Incidents,bloquante,NO_SERVICE,#FF0000,0,<p>La ligne 272 est déviée : les arrêts situés...,2024-04-17 06:00:00,2024-06-01 03:45:00,,,,,,,,Bus
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,f7588214-069a-11ef-98d5-0a58a9feac02,active,perturbation,Incidents,perturbée,SIGNIFICANT_DELAYS,#EF662F,30,"<p><span style=""background-color:rgb(255,255,2...",2024-04-29 17:15:00,2024-04-30 23:59:00,,,,,,,,Bus
100,f7588214-069a-11ef-98d5-0a58a9feac02,active,perturbation,Incidents,perturbée,SIGNIFICANT_DELAYS,#EF662F,30,"<p><span style=""background-color:rgb(255,255,2...",2024-04-29 17:15:00,2024-04-30 23:59:00,,,,,,,,Bus
101,f7588214-069a-11ef-98d5-0a58a9feac02,active,perturbation,Incidents,perturbée,SIGNIFICANT_DELAYS,#EF662F,30,"<p><span style=""background-color:rgb(255,255,2...",2024-04-29 17:15:00,2024-04-30 23:59:00,,,,,,,,Bus
102,f7588214-069a-11ef-98d5-0a58a9feac02,active,perturbation,Incidents,perturbée,SIGNIFICANT_DELAYS,#EF662F,30,"<p><span style=""background-color:rgb(255,255,2...",2024-04-29 17:15:00,2024-04-30 23:59:00,,,,,,,,Bus


In [1]:
from meteofrance_api import MeteoFranceClient

# Create a client instance
client = MeteoFranceClient()

# Search for a location
places = client.search_places('Paris')

# Get the forecast for the first result
forecast = client.get_forecast_for_place(places[0])

# Print the forecast data
print(forecast.daily_forecast)


[{'dt': 1714435200, 'T': {'min': 12.6, 'max': 19.1, 'sea': None}, 'humidity': {'min': 65, 'max': 90}, 'precipitation': {'24h': 0.8}, 'uv': 4, 'weather12H': {'icon': 'p3j', 'desc': 'Très nuageux'}, 'sun': {'rise': 1714451455, 'set': 1714503928}}, {'dt': 1714521600, 'T': {'min': 14.1, 'max': 20.2, 'sea': None}, 'humidity': {'min': 65, 'max': 90}, 'precipitation': {'24h': 8.5}, 'uv': 4, 'weather12H': {'icon': 'p13j', 'desc': 'Pluies éparses'}, 'sun': {'rise': 1714537753, 'set': 1714590415}}, {'dt': 1714608000, 'T': {'min': 12.1, 'max': 14.1, 'sea': None}, 'humidity': {'min': 80, 'max': 95}, 'precipitation': {'24h': 5.5}, 'uv': 3, 'weather12H': {'icon': 'p13j', 'desc': 'Pluies éparses'}, 'sun': {'rise': 1714624052, 'set': 1714676902}}, {'dt': 1714694400, 'T': {'min': 10.5, 'max': 16.6, 'sea': None}, 'humidity': {'min': 65, 'max': 95}, 'precipitation': {'24h': 0}, 'uv': 4, 'weather12H': {'icon': 'p2j', 'desc': 'Eclaircies'}, 'sun': {'rise': 1714710353, 'set': 1714763389}}, {'dt': 1714780800

In [4]:
rain_data = client.get_rain()
print(rain_data.forecast)


TypeError: MeteoFranceClient.get_rain() missing 2 required positional arguments: 'latitude' and 'longitude'

In [6]:
from meteofrance_api import MeteoFranceClient
from datetime import datetime, timedelta

client = MeteoFranceClient()

# Supposons que cette fonction puisse récupérer des données historiques
def fetch_weather_data(place, start_date, end_date):
    current_date = start_date
    weather_data = []
    while current_date <= end_date:
        forecast = client.get_historical_data(place, current_date)
        weather_data.append(forecast)
        current_date += timedelta(days=1)  # Incrément journalier
    return weather_data

# Recherche du lieu
places = client.search_places('Paris')

# Définir les dates de début et de fin pour l'année 2023
start_date = datetime(2023, 1, 1)
end_date = datetime(2023, 12, 31)

# Récupérer les données météo
weather_data = fetch_weather_data(places[0], start_date, end_date)
print(weather_data)


AttributeError: 'MeteoFranceClient' object has no attribute 'get_historical_data'

In [8]:
import pandas as pd

df = pd.read_csv('Q_01_previous-1950-2022_RR-T-Vent.csv', delimiter=';')
df

Unnamed: 0,NUM_POSTE,NOM_USUEL,LAT,LON,ALTI,AAAAMMJJ,RR,QRR,TN,QTN,...,DXI2,QDXI2,HXI2,QHXI2,FXI3S,QFXI3S,DXI3S,QDXI3S,HXI3S,QHXI3S
0,1010001,ANGLEFORT,45.900000,5.766667,280,19500101,0.0,1.0,,,...,,,,,,,,,,
1,1010001,ANGLEFORT,45.900000,5.766667,280,19500102,0.0,1.0,,,...,,,,,,,,,,
2,1010001,ANGLEFORT,45.900000,5.766667,280,19500103,13.8,1.0,,,...,,,,,,,,,,
3,1010001,ANGLEFORT,45.900000,5.766667,280,19500104,5.4,1.0,,,...,,,,,,,,,,
4,1010001,ANGLEFORT,45.900000,5.766667,280,19500105,0.0,1.0,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
955878,1457001,VONNAS,46.218333,4.990000,190,19920327,0.9,1.0,,,...,,,,,,,,,,
955879,1457001,VONNAS,46.218333,4.990000,190,19920328,0.7,1.0,,,...,,,,,,,,,,
955880,1457001,VONNAS,46.218333,4.990000,190,19920329,0.0,1.0,,,...,,,,,,,,,,
955881,1457001,VONNAS,46.218333,4.990000,190,19920330,18.6,1.0,,,...,,,,,,,,,,


In [28]:
import pandas as pd
import os
from collections import defaultdict

# Function to parse descriptive files
def parse_descriptive_file(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()
    field_names = {}
    for line in lines:
        parts = line.strip().split(":")
        if len(parts) == 2:
            field, description = parts
            field_names[field.strip()] = description.strip()
    return field_names

# Base directory for files (update with the correct path to your CSVs)
base_dir = './'

# Descriptive field mappings
desc_fields = {
    'autres-parametres': parse_descriptive_file(os.path.join(base_dir, 'Q_descriptif_champs_autres-parametres.csv')),
    'RR-T-Vent': parse_descriptive_file(os.path.join(base_dir, 'Q_descriptif_champs_RR-T-Vent.csv'))
}

# Function to process and concatenate datasets
def process_and_concatenate_datasets():
    # Dictionaries to store dataframes grouped by year
    dataframes = {
        'autres-parametres': defaultdict(list),
        'RR-T-Vent': defaultdict(list)
    }
    
    # Read and group datasets by year
    for filename in os.listdir(base_dir):
        if filename.endswith('.csv') and 'descriptif' not in filename:
            print(f"Reading {filename}...")
            year_group = filename.split('_')[1]  # Assumes the year is the second element in the filename
            file_type = 'autres-parametres' if 'autres-parametres' in filename else 'RR-T-Vent'
            dataset = pd.read_csv(os.path.join(base_dir, filename), sep=';')
            dataframes[file_type][year_group].append(dataset)
    
    # Concatenate dataframes within the same year and apply descriptive fields
    concatenated_datasets = {}
    for file_type, year_groups in dataframes.items():
        for year, dfs in year_groups.items():
            concatenated_df = pd.concat(dfs, ignore_index=True)
            concatenated_df.rename(columns=desc_fields[file_type], inplace=True)
            concatenated_datasets[f"{file_type}_{year}"] = concatenated_df
            print(f"Processed and concatenated {file_type} for {year}")
    
    return concatenated_datasets

# Execute the function and get the processed datasets
concatenated_datasets = process_and_concatenate_datasets()




Reading Q_75_1816-1949_autres-parametres.csv...
Reading Q_75_1816-1949_RR-T-Vent.csv...
Reading Q_75_latest-2023-2024_autres-parametres.csv...
Reading Q_75_latest-2023-2024_RR-T-Vent.csv...
Reading Q_75_previous-1950-2022_autres-parametres.csv...
Reading Q_75_previous-1950-2022_RR-T-Vent.csv...
Processed and concatenated autres-parametres for 75
Processed and concatenated RR-T-Vent for 75


KeyError: 'Q_75'

In [40]:
import pandas as pd
import numpy as np

# List of strings that are recognized as NaN values
na = ['NaN', 'nan', 'NAN', ' NaN', 'NAN ']

# Load the datasets, converting "NaN" strings to actual NaN values
df1 = pd.read_csv('Q_75_previous-1950-2022_autres-parametres.csv', sep=';', na_values=na)
df2 = pd.read_csv('Q_75_previous-1950-2022_RR-T-Vent.csv', sep=';', na_values=na)

# Merge the datasets on the 'AAAAMMJJ' column using an outer join
merged_df = pd.merge(df1, df2, on='AAAAMMJJ', how='outer')

# Drop columns where all elements are NaN post-merge
merged_df.dropna(axis=1, how='all', inplace=True)

# After merging and dropping fully empty columns, recheck and drop any columns that might have become completely NaN
merged_df.dropna(axis=1, how='all', inplace=True)

# Display the first few rows of the merged DataFrame to verify the results
print(merged_df.head())


   NUM_POSTE_x       NOM_USUEL_x      LAT_x     LON_x  ALTI_x  AAAAMMJJ  \
0     75114001  PARIS-MONTSOURIS  48.821667  2.337833      75  19500101   
1     75114001  PARIS-MONTSOURIS  48.821667  2.337833      75  19500101   
2     75114001  PARIS-MONTSOURIS  48.821667  2.337833      75  19500101   
3     75114001  PARIS-MONTSOURIS  48.821667  2.337833      75  19500101   
4     75114001  PARIS-MONTSOURIS  48.821667  2.337833      75  19500101   

   PMERM  QPMERM  PMERMIN  QPMERMIN  ...  QFXI  DXI  QDXI  HXI  QHXI  FXI3S  \
0    NaN     NaN      NaN       NaN  ...   NaN  NaN   NaN  NaN   NaN    NaN   
1    NaN     NaN      NaN       NaN  ...   NaN  NaN   NaN  NaN   NaN    NaN   
2    NaN     NaN      NaN       NaN  ...   NaN  NaN   NaN  NaN   NaN    NaN   
3    NaN     NaN      NaN       NaN  ...   NaN  NaN   NaN  NaN   NaN    NaN   
4    NaN     NaN      NaN       NaN  ...   NaN  NaN   NaN  NaN   NaN    NaN   

   QFXI3S  QDXI3S  HXI3S  QHXI3S  
0     NaN     NaN    NaN     NaN  
1   

In [41]:
for column in merged_df.columns:
    # Get unique values in each column
    unique_values = merged_df[column].unique()
    # Print the column name and its unique values
    print(f"Unique values in '{column}': {unique_values}\n")

Unique values in 'NUM_POSTE_x': [75114001 75112001 75116003 75104001 75110002 75119002 75116002 75112004
 75113001 75107001 75118001 75120002 75110001 75119004 75105001 75108001
 75114003 75112002 75116001 75101001 75114002 75120001 75106001 75119001
 75115001 75107004 75112003 75120004 75116005 75117001 75113002 75120005
 75120006 75107005 75115003 75116008 75114007]

Unique values in 'NOM_USUEL_x': ['PARIS-MONTSOURIS' 'ILE DE BERCY' 'PASSY' 'TOUR ST-JACQUES' 'ST-LOUIS'
 'VILLETTE' 'BAGATELLE' 'ST-ANTOINE' "PORTE D'IVRY" 'LAENNEC' 'MONTMARTRE'
 'MENIL.RESERVOIR' 'LARIBOISIERE' 'BUTTES RESERV.' 'PLANTES' 'LOUIS XVI'
 'OBS. TERRASSE' 'LA FAISANDERIE' 'AUTEUIL' 'INNOCENTS' 'OBSERVATOIRE'
 'CHARONNE' 'LUXEMBOURG' 'BUTTES CHAUMONT' 'VAUGIRARD' 'CHAMP DE MARS'
 'LEO LAGRANGE' 'BELLEVILLE' 'BATIGNOLLES' 'SALPETRIERE' 'TENON'
 'BELLEVILLE PARC' 'TOUR EIFFEL' 'G. POMPIDOU' 'LONGCHAMP'
 'PARIS-MONTSOURIS-DOUBLE']

Unique values in 'LAT_x': [48.821667 48.831667 48.87     48.858333 48.873333 48.8

In [42]:
merged_df.info

<bound method DataFrame.info of          NUM_POSTE_x       NOM_USUEL_x      LAT_x     LON_x  ALTI_x  AAAAMMJJ  \
0           75114001  PARIS-MONTSOURIS  48.821667  2.337833      75  19500101   
1           75114001  PARIS-MONTSOURIS  48.821667  2.337833      75  19500101   
2           75114001  PARIS-MONTSOURIS  48.821667  2.337833      75  19500101   
3           75114001  PARIS-MONTSOURIS  48.821667  2.337833      75  19500101   
4           75114001  PARIS-MONTSOURIS  48.821667  2.337833      75  19500101   
...              ...               ...        ...       ...     ...       ...   
2719127     75116008         LONGCHAMP  48.854833  2.233667      27  20221231   
2719128     75116008         LONGCHAMP  48.854833  2.233667      27  20221231   
2719129     75116008         LONGCHAMP  48.854833  2.233667      27  20221231   
2719130     75116008         LONGCHAMP  48.854833  2.233667      27  20221231   
2719131     75116008         LONGCHAMP  48.854833  2.233667      27  20221231

In [45]:
filtered_data = merged_df[
    (merged_df['NUM_POSTE_x'] == 75114001) &
    (merged_df['NOM_USUEL_x'] == 'PARIS-MONTSOURIS') &
    (merged_df['AAAAMMJJ'] == 19500101)
]

# Check each row in the filtered dataset
for index, row in filtered_data.iterrows():
    # Filter out all null values for the current row, retaining only non-null columns
    non_null_data = row[row.notna()]
    # Print the non-null columns and their values for the current row
    print(f"Row {index} non-null data:")
    print(non_null_data)
    print("\n")  # Adding a newline for better readability between rows

Row 0 non-null data:
NUM_POSTE_x            75114001
NOM_USUEL_x    PARIS-MONTSOURIS
LAT_x                 48.821667
LON_x                  2.337833
ALTI_x                       75
AAAAMMJJ               19500101
INST                      375.0
QINST                       9.0
SIGMA                      77.0
QSIGMA                      9.0
UN                         70.0
QUN                         1.0
UX                         92.0
QUX                         1.0
QNEIGETOTX                  1.0
NEIG                        0.0
QNEIG                       9.0
ORAG                        0.0
QORAG                       9.0
GRESIL                      0.0
QGRESIL                     9.0
GRELE                       0.0
QGRELE                      9.0
NUM_POSTE_y            75101001
NOM_USUEL_y           INNOCENTS
LAT_y                 48.860667
LON_y                  2.348333
ALTI_y                       37
RR                          0.0
QRR                         1.0
Name: 0, dtype: obj

In [75]:
import pandas as pd

# Assuming filtered_data is already defined as specified
filtered_data = merged_df[
    (merged_df['NUM_POSTE_x'] == 75114001) &
    (merged_df['NOM_USUEL_x'] == 'PARIS-MONTSOURIS') &
    (merged_df['AAAAMMJJ'] == 19500101)
]

non_null_rows = []  # List to store non-null rows

# Iterate over each row in the filtered dataset
for index, row in filtered_data.iterrows():
    # Filter out all null values for the current row, retaining only non-null columns
    non_null_data = row[row.notna()]
    
    # Add the non-null data to the list as a dictionary
    non_null_rows.append(non_null_data.to_dict())

    # Optionally print each non-null row
    # print(f"Row {index} non-null data:")
     #print(non_null_data)
    print("\n")  # Adding a newline for better readability between rows

# Create a new DataFrame from the list of non-null rows
non_null_dataframe = pd.DataFrame(non_null_rows)

# Check the new DataFrame
non_null_dataframe


row_11 = non_null_dataframe.iloc[11]  # Use .iloc to access by integer location

row_11


















































NUM_POSTE_x            75114001
NOM_USUEL_x    PARIS-MONTSOURIS
LAT_x                 48.821667
LON_x                  2.337833
ALTI_x                       75
AAAAMMJJ               19500101
INST                      375.0
QINST                       9.0
SIGMA                      77.0
QSIGMA                      9.0
UN                         70.0
QUN                         1.0
UX                         92.0
QUX                         1.0
QNEIGETOTX                  1.0
NEIG                        0.0
QNEIG                       9.0
ORAG                        0.0
QORAG                       9.0
GRESIL                      0.0
QGRESIL                     9.0
GRELE                       0.0
QGRELE                      9.0
NUM_POSTE_y            75114001
NOM_USUEL_y    PARIS-MONTSOURIS
LAT_y                 48.821667
LON_y                  2.337833
ALTI_y                       75
RR                          0.0
QRR                         1.0
TN                         -1.7
QTN     

In [1]:
import pandas as pd

# Load the datasets directly from the CSV files
df1 = pd.read_csv('Q_75_latest-2023-2024_autres-parametres.csv', sep=';')
df2 = pd.read_csv('Q_75_latest-2023-2024_RR-T-Vent.csv', sep=';')

# Merge the dataframes on 'AAAAMMJJ' and other relevant identifiers
merged_df = pd.merge(df1, df2, on=['AAAAMMJJ', 'NUM_POSTE', 'NOM_USUEL', 'LAT', 'LON'], how='outer')

# Drop columns where all elements are NaN
merged_df.dropna(axis=1, how='all', inplace=True)

# Display the merged DataFrame after dropping NaN-only columns
merged_df.head()

# Optionally, you can save this cleaned DataFrame to a new CSV
merged_df.to_csv('2023-2024-weather_cleaned.csv', index=False)


In [84]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2966 entries, 0 to 2965
Data columns (total 92 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   NUM_POSTE    2966 non-null   int64  
 1   NOM_USUEL    2966 non-null   object 
 2   LAT          2966 non-null   float64
 3   LON          2966 non-null   float64
 4   ALTI_x       2904 non-null   float64
 5   AAAAMMJJ     2966 non-null   int64  
 6   PMERM        484 non-null    float64
 7   QPMERM       484 non-null    float64
 8   PMERMIN      484 non-null    float64
 9   QPMERMIN     484 non-null    float64
 10  INST         968 non-null    float64
 11  QINST        968 non-null    float64
 12  GLOT         968 non-null    float64
 13  QGLOT        968 non-null    float64
 14  SIGMA        968 non-null    float64
 15  QSIGMA       968 non-null    float64
 16  UN           968 non-null    float64
 17  QUN          968 non-null    float64
 18  HUN          968 non-null    float64
 19  QHUN  

In [85]:
unique_dates = merged_df['AAAAMMJJ'].unique()
print(unique_dates)

[20230101 20230102 20230103 20230104 20230105 20230106 20230107 20230108
 20230109 20230110 20230111 20230112 20230113 20230114 20230115 20230116
 20230117 20230118 20230119 20230120 20230121 20230122 20230123 20230124
 20230125 20230126 20230127 20230128 20230129 20230130 20230131 20230201
 20230202 20230203 20230204 20230205 20230206 20230207 20230208 20230209
 20230210 20230211 20230212 20230213 20230214 20230215 20230216 20230217
 20230218 20230219 20230220 20230221 20230222 20230223 20230224 20230225
 20230226 20230227 20230228 20230301 20230302 20230303 20230304 20230305
 20230306 20230307 20230308 20230309 20230310 20230311 20230312 20230313
 20230314 20230315 20230316 20230317 20230318 20230319 20230320 20230321
 20230322 20230323 20230324 20230325 20230326 20230327 20230328 20230329
 20230330 20230331 20230401 20230402 20230403 20230404 20230405 20230406
 20230407 20230408 20230409 20230410 20230411 20230412 20230413 20230414
 20230415 20230416 20230417 20230418 20230419 20230

In [86]:
import pandas as pd

# Load the datasets directly from the CSV files
df1 = pd.read_csv('Q_75_previous-1950-2022_RR-T-Vent.csv', sep=';')
df2 = pd.read_csv('Q_75_previous-1950-2022_autres-parametres.csv', sep=';')

# Assuming 'AAAAMMJJ' is the date column and both dataframes share this common column
# Adjust the column names according to your actual data structure if necessary
merged_df2 = pd.merge(df1, df2, on=['AAAAMMJJ', 'NUM_POSTE', 'NOM_USUEL', 'LAT', 'LON'], how='outer')
# Drop columns where all elements are NaN
merged_df2.dropna(axis=1, how='all', inplace=True)
# Display the merged DataFrame
merged_df2.head()

MemoryError: Unable to allocate 313. MiB for an array with shape (83, 494315) and data type float64

In [87]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Initialize a SparkSession
spark = SparkSession.builder \
    .appName("Merge CSV Data") \
    .getOrCreate()

# Load the datasets
df1 = spark.read.csv('Q_75_previous-1950-2022_RR-T-Vent.csv', header=True, sep=';', inferSchema=True)
df2 = spark.read.csv('Q_75_previous-1950-2022_autres-parametres.csv', header=True, sep=';', inferSchema=True)

# Assuming 'AAAAMMJJ' is the date column and both dataframes share this common column
# Adjust the column names according to your actual data structure if necessary
merged_df = df1.join(df2, ['AAAAMMJJ', 'NUM_POSTE', 'NOM_USUEL', 'LAT', 'LON'], how='outer')

# Drop columns where all elements are NaN
# In PySpark, you'd typically handle this by filtering or using dropna if you know specific columns
# For dropping columns with all nulls, we need to compute this conditionally
for column in merged_df.columns:
    non_null_count = merged_df.filter(col(column).isNotNull()).count()
    if non_null_count == 0:
        merged_df = merged_df.drop(column)

# Show the result
merged_df.show()

ModuleNotFoundError: No module named 'pyspark'