## Importation des données

In [2]:
import pandas as pd

# path = "/home/jovyan/hfactory_magic_folders/shared_storage/data/X_train_Hi5.csv" 
path = "X_train_Hi5.csv" 

df = pd.read_csv(path, low_memory=False)

In [3]:
df.head()

Unnamed: 0,row_index,piezo_station_department_code,piezo_station_update_date,piezo_station_investigation_depth,piezo_station_department_name,piezo_station_commune_code_insee,piezo_station_pe_label,piezo_station_bdlisa_codes,piezo_station_altitude,piezo_station_bss_code,...,prelev_longitude_2,prelev_latitude_2,prelev_commune_code_insee_2,prelev_other_volume_sum,insee_%_agri,insee_pop_commune,insee_med_living_level,insee_%_ind,insee_%_const,piezo_groundwater_level_category
0,0,1,Sun Jul 14 13:00:02 CEST 2024,20.0,Ain,1073,PIEZOMETRE - MARAIS DE LAVOURS (CEYZERIEU - BR...,['712AH37'],232.0,07004X0046/D6-20,...,5.698947,45.725106,1454.0,1793055000.0,11.8,992.0,25250,2.9,16.2,High
1,1,1,Sun Jul 14 13:00:02 CEST 2024,35.6,Ain,1363,PIEZOMETRE - GRAVIERE (ST-JEAN-LE-VIEUX - BRGM...,['712GB05'],247.25,06754X0077/F1,...,5.464933,46.210734,1051.0,1085125000.0,0.6,1786.0,24660,44.5,11.0,Very High
2,2,1,Sun Jul 14 13:00:02 CEST 2024,35.22,Ain,1244,PIEZOMETRE - BORD AUTOROUTE (MEXIMIEUX - BRGM ...,['040AJ43'],218.77,06993X0226/MEXI_2,...,5.08506,45.812828,69266.0,381049200.0,0.0,8085.0,24890,8.4,7.8,High
3,3,1,Sun Jul 14 13:00:02 CEST 2024,34.2,Ain,1288,PIEZOMETRE - GRENY (PERON - BRGM 01) - BSH,"['516AA00', '516AF00']",499.85,06533X0070/F2,...,5.802841,46.366049,39286.0,380091100.0,1.5,2838.0,39700,2.4,5.2,Very High
4,4,1,Sun Jul 14 13:00:02 CEST 2024,37.3,Ain,1422,FORAGE - ENCLOS (TOSSIAT - BRGM 01) - BSH,['507AB00'],260.0,06518X0026/P2,...,5.377265,46.080989,1273.0,19666310.0,0.2,1352.0,26180,21.5,9.8,Very Low


## Preprocessing

In [22]:

def preprocess_cat(df, useful_path: str, output_flag: int=0):
    useful_columns = []

    with open(useful_path, 'r') as file:
        useful_columns = [f[:-1] for f in file.readlines()]

    if output_flag:
        useful_columns.remove('piezo_groundwater_level_category')
    
    df = df[useful_columns]
    
    #Calculer le pourcentage de valeurs manquantes
    missing_percentage = (df.isnull().sum() / len(df)) * 100

    threshold = 70
    columns_to_drop = missing_percentage[missing_percentage > threshold].index

    updated_df = df.drop(columns=columns_to_drop)

    updated_df['piezo_measurement_date'] = pd.to_datetime(updated_df['piezo_measurement_date']).astype(int) // 10**9
    updated_df['hydro_observation_date_elab'] = pd.to_datetime(updated_df['hydro_observation_date_elab']).astype(int) // 10**9
    
    encoding = {
    "Very Low": 1,
    "Low": 2,
    "Average": 3,
    "High": 4,
    "Very High": 5
    }

    if output_flag == 0:
        updated_df['piezo_groundwater_level_category'] = updated_df['piezo_groundwater_level_category'].map(encoding)

    status_encoding = {
        "Donnée brute": 1,
        "Donnée contrôlée niveau 1": 2,
        "Donnée contrôlée niveau 2": 3,
        "Donnée interprétée": 4
    }

    updated_df['piezo_status'] = updated_df['piezo_status'].map(status_encoding)

    #piezo_qualification
    qualification_encoding = {
        "Incorrecte": 1,
        "Incertaine": 2,
        "Correcte": 3,
        "Non qualifié": 4
    }

    updated_df['piezo_qualification'] = updated_df['piezo_qualification'].map(qualification_encoding)
    
    updated_df = pd.get_dummies(updated_df, columns=['hydro_hydro_quantity_elab'], prefix='quantity')
    updated_df[['quantity_QmJ', 'quantity_QmM']] = updated_df[['quantity_QmJ', 'quantity_QmM']].astype(int)

    updated_df['prelev_usage_label_0'] = updated_df['prelev_usage_label_0'].fillna('Non spécifié')
    updated_df = pd.get_dummies(updated_df, columns=['prelev_usage_label_0'], prefix='usage')

    columns_usage = updated_df.filter(like='usage_').columns
    updated_df[columns_usage] = updated_df[columns_usage].astype(int)


    updated_df['prelev_volume_obtention_mode_label_0'] = updated_df['prelev_volume_obtention_mode_label_0'].fillna('Non spécifié')
    updated_df = pd.get_dummies(updated_df, columns=['prelev_volume_obtention_mode_label_0'], prefix='mode')

    columns_usage_2 = updated_df.filter(like='mode_').columns
    updated_df[columns_usage_2] = updated_df[columns_usage_2].astype(int)


    updated_df['insee_%_agri'] = updated_df['insee_%_agri'].replace('N/A - division par 0', None)
    updated_df['insee_%_const'] = updated_df['insee_%_const'].replace('N/A - division par 0', None)
    updated_df['insee_%_ind'] = updated_df['insee_%_ind'].replace('N/A - division par 0', None)


    updated_df['insee_%_agri'] = pd.to_numeric(updated_df['insee_%_agri'], errors='coerce')
    updated_df['insee_%_const'] = pd.to_numeric(updated_df['insee_%_const'], errors='coerce')
    updated_df['insee_%_ind'] = pd.to_numeric(updated_df['insee_%_ind'], errors='coerce')


    updated_df['insee_%_agri'] = updated_df['insee_%_agri'].fillna(updated_df['insee_%_agri'].mean())
    updated_df['insee_%_const'] = updated_df['insee_%_const'].fillna(updated_df['insee_%_const'].mean())
    updated_df['insee_%_ind'] = updated_df['insee_%_ind'].fillna(updated_df['insee_%_ind'].mean())

    
    return updated_df

In [6]:
ud = preprocess_cat(df, 'useful_columns.txt')
ud.head()

Unnamed: 0,piezo_station_investigation_depth,piezo_station_altitude,piezo_station_longitude,piezo_station_latitude,piezo_measurement_date,piezo_groundwater_level_category,piezo_status,piezo_qualification,piezo_continuity_code,meteo_frost_duration,...,usage_ENERGIE,"usage_INDUSTRIE et ACTIVITES ECONOMIQUES (hors irrigation, hors énergie)",usage_IRRIGATION,usage_Non spécifié,mode_Mesure directe,mode_Mesure indirecte,mode_Non spécifié,mode_Volume estimé,mode_Volume forfaitaire,mode_Volume mesuré
0,20.0,232.0,5.748241,45.828148,1577836800,4,3,3,2.0,577.0,...,0,0,0,0,0,1,0,0,0,0
1,35.6,247.25,5.356637,46.028102,1577836800,5,3,3,2.0,1440.0,...,0,0,0,0,0,1,0,0,0,0
2,35.22,218.77,5.220795,45.895734,1577836800,4,3,3,2.0,1228.0,...,0,0,0,0,0,1,0,0,0,0
3,34.2,499.85,5.948977,46.20118,1577836800,5,3,3,2.0,,...,0,0,0,0,0,1,0,0,0,0
4,37.3,260.0,5.313353,46.136402,1577836800,1,3,3,2.0,1103.0,...,0,0,0,0,0,1,0,0,0,0


In [7]:
pd.set_option('display.max_columns', None)
ud.head()

Unnamed: 0,piezo_station_investigation_depth,piezo_station_altitude,piezo_station_longitude,piezo_station_latitude,piezo_measurement_date,piezo_groundwater_level_category,piezo_status,piezo_qualification,piezo_continuity_code,meteo_frost_duration,meteo_humidity_duration_below_40%,meteo_humidity_duration_above_80%,meteo_wind_direction_max_avg,meteo_evapotranspiration_grid,meteo_longitude,meteo_latitude,meteo_rain_height,meteo_amplitude_tn_tx,meteo_temperature_avg,meteo_temperature_avg_threshold,meteo_temperature_min,meteo_temperature_avg_tntm,meteo__pressure_saturation_avg,meteo_temperature_max,meteo_humidity_avg,meteo_humidity_min,meteo_humidity_max,hydro_observation_date_elab,hydro_observation_result_elab,hydro_longitude,hydro_latitude,prelev_volume_0,prelev_longitude_0,prelev_latitude_0,prelev_other_volume_sum,insee_%_agri,insee_pop_commune,insee_%_ind,insee_%_const,quantity_QmJ,quantity_QmM,usage_CANAUX,usage_EAU POTABLE,usage_EAU TURBINEE (barrage),usage_ENERGIE,"usage_INDUSTRIE et ACTIVITES ECONOMIQUES (hors irrigation, hors énergie)",usage_IRRIGATION,usage_Non spécifié,mode_Mesure directe,mode_Mesure indirecte,mode_Non spécifié,mode_Volume estimé,mode_Volume forfaitaire,mode_Volume mesuré
0,20.0,232.0,5.748241,45.828148,1577836800,4,3,3,2.0,577.0,0.0,1440.0,30.0,0.5,45.769333,5.688,0.2,5.5,0.7,1.15,-1.6,1.2,6.0,3.9,94.0,82.0,99.0,1577836800,1833.0,5.685831,45.874843,10743150000.0,5.811394,45.878862,1793055000.0,11.8,992.0,2.9,16.2,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0
1,35.6,247.25,5.356637,46.028102,1577836800,5,3,3,2.0,1440.0,0.0,1440.0,360.0,0.2,45.9765,5.329333,0.0,2.2,-1.2,-1.1,-2.2,-1.1,5.5,0.0,97.0,97.0,98.0,1577836800,93389.0,5.336428,46.047082,12492020000.0,5.405995,45.853165,1085125000.0,0.6,1786.0,44.5,11.0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0
2,35.22,218.77,5.220795,45.895734,1577836800,4,3,3,2.0,1228.0,0.0,1440.0,,0.0,45.833,5.106667,,2.0,-0.6,-0.4,-1.4,-0.4,5.8,0.6,99.0,99.0,99.0,1577836800,108237.0,5.233492,45.906423,12492020000.0,5.405995,45.853165,381049200.0,0.0,8085.0,8.4,7.8,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0
3,34.2,499.85,5.948977,46.20118,1577836800,5,3,3,2.0,,,,,0.9,46.302833,5.843,0.0,10.0,0.4,2.7,-2.3,2.7,4.3,7.7,68.0,,,1577836800,305464.0,5.964334,46.137641,10382890000.0,5.812855,46.052623,380091100.0,1.5,2838.0,2.4,5.2,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0
4,37.3,260.0,5.313353,46.136402,1577836800,1,3,3,2.0,1103.0,,,,0.1,46.204333,5.287667,0.0,2.5,-0.8,-0.85,-2.1,-0.9,,0.4,,,,1577836800,85.0,5.330754,46.143007,2206200000.0,5.421808,46.11202,19666310.0,0.2,1352.0,21.5,9.8,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0


In [None]:
from sklearn.model_selection import train_test_split

def Fill_Reg(df, missing_col, Regressor):
    df_complete = df[df[missing_col].notnull()]
    df_missing = df[df[missing_col].isnull()]
    
    if df_missing.empty:
        return df

    columns_to_train = df.columns.difference([missing_col])
    X_complete = df_complete[columns_to_train]
    y_complete = df_complete[missing_col]
    X_missing = df_missing[columns_to_train]
        
    X_missing = X_missing.reindex(columns=X_complete.columns, fill_value=0)
    
    n = 10_000
    Xc_train, Xc_test, yc_train, yc_test = train_test_split(
        X_complete.iloc[:n], y_complete[:n], test_size=0.1
    )
    Regressor.fit(Xc_train, yc_train)
    
    score = Regressor.score(Xc_test, yc_test)
    print(f"Accuracy: {score}")
    
    if score < 0.7:
        print("Score too low...")
        return
    
    predicted_values = Regressor.predict(X_missing)
    
    return df_missing.index, predicted_values


In [67]:
from sklearn.ensemble import RandomForestRegressor

def preprocess_cont(df):
    missing_percentage_ = (df.isnull().sum() / len(df)) * 100
    
    columns_to_mean = missing_percentage_[missing_percentage_ < 40].index

    for col in columns_to_mean:
        df[col].fillna(df[col].mean(), inplace=True)

    threshold = 40
    missing_percentage = (df.isnull().sum() / len(df)) * 100
    columns_to_train = missing_percentage[missing_percentage > threshold].index
    training_df = df.drop(columns=columns_to_train)
    
    filled_df = df.copy()
    for c in columns_to_train:
        # print(f"################### {c} ####################")
        # couple = Fill_Reg(df, c, RandomForestRegressor())
        # if couple is not None:
        #     i, pv = couple
        #     filled_df.loc[i, c] = pv
        df[c].fillna(df[c].mean(), inplace=True)
    
    return df

In [68]:
fd = preprocess_cont(ud)
fd.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[c].fillna(df[c].mean(), inplace=True)


Unnamed: 0,piezo_station_investigation_depth,piezo_station_altitude,piezo_station_longitude,piezo_station_latitude,piezo_measurement_date,piezo_groundwater_level_category,piezo_status,piezo_qualification,piezo_continuity_code,meteo_frost_duration,meteo_humidity_duration_below_40%,meteo_humidity_duration_above_80%,meteo_wind_direction_max_avg,meteo_evapotranspiration_grid,meteo_longitude,meteo_latitude,meteo_rain_height,meteo_amplitude_tn_tx,meteo_temperature_avg,meteo_temperature_avg_threshold,meteo_temperature_min,meteo_temperature_avg_tntm,meteo__pressure_saturation_avg,meteo_temperature_max,meteo_humidity_avg,meteo_humidity_min,meteo_humidity_max,hydro_observation_date_elab,hydro_observation_result_elab,hydro_longitude,hydro_latitude,prelev_volume_0,prelev_longitude_0,prelev_latitude_0,prelev_other_volume_sum,insee_%_agri,insee_pop_commune,insee_%_ind,insee_%_const,quantity_QmJ,quantity_QmM,usage_CANAUX,usage_EAU POTABLE,usage_EAU TURBINEE (barrage),usage_ENERGIE,"usage_INDUSTRIE et ACTIVITES ECONOMIQUES (hors irrigation, hors énergie)",usage_IRRIGATION,usage_Non spécifié,mode_Mesure directe,mode_Mesure indirecte,mode_Non spécifié,mode_Volume estimé,mode_Volume forfaitaire,mode_Volume mesuré
0,20.0,232.0,5.748241,45.828148,1577836800,4,3,3,2.0,577.0,0.0,1440.0,30.0,0.5,45.769333,5.688,0.2,5.5,0.7,1.15,-1.6,1.2,6.0,3.9,94.0,82.0,99.0,1577836800,1833.0,5.685831,45.874843,10743150000.0,5.811394,45.878862,1793055000.0,11.8,992.0,2.9,16.2,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0
1,35.6,247.25,5.356637,46.028102,1577836800,5,3,3,2.0,1440.0,0.0,1440.0,360.0,0.2,45.9765,5.329333,0.0,2.2,-1.2,-1.1,-2.2,-1.1,5.5,0.0,97.0,97.0,98.0,1577836800,93389.0,5.336428,46.047082,12492020000.0,5.405995,45.853165,1085125000.0,0.6,1786.0,44.5,11.0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0
2,35.22,218.77,5.220795,45.895734,1577836800,4,3,3,2.0,1228.0,0.0,1440.0,198.903444,0.0,45.833,5.106667,2.071651,2.0,-0.6,-0.4,-1.4,-0.4,5.8,0.6,99.0,99.0,99.0,1577836800,108237.0,5.233492,45.906423,12492020000.0,5.405995,45.853165,381049200.0,0.0,8085.0,8.4,7.8,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0
3,34.2,499.85,5.948977,46.20118,1577836800,5,3,3,2.0,60.030968,73.388852,746.414666,198.903444,0.9,46.302833,5.843,0.0,10.0,0.4,2.7,-2.3,2.7,4.3,7.7,68.0,54.440383,93.08505,1577836800,305464.0,5.964334,46.137641,10382890000.0,5.812855,46.052623,380091100.0,1.5,2838.0,2.4,5.2,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0
4,37.3,260.0,5.313353,46.136402,1577836800,1,3,3,2.0,1103.0,73.388852,746.414666,198.903444,0.1,46.204333,5.287667,0.0,2.5,-0.8,-0.85,-2.1,-0.9,10.970493,0.4,76.200736,54.440383,93.08505,1577836800,85.0,5.330754,46.143007,2206200000.0,5.421808,46.11202,19666310.0,0.2,1352.0,21.5,9.8,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0


In [71]:
fd.to_csv('complete_mean.csv')

In [72]:
df_test = pd.read_csv('X_test_Hi5.csv')
df_test.head()

  df_test = pd.read_csv('X_test_Hi5.csv')


Unnamed: 0,row_index,piezo_station_department_code,piezo_station_update_date,piezo_station_investigation_depth,piezo_station_department_name,piezo_station_commune_code_insee,piezo_station_pe_label,piezo_station_bdlisa_codes,piezo_station_altitude,piezo_station_bss_code,piezo_station_commune_name,piezo_station_longitude,piezo_station_latitude,piezo_station_bss_id,piezo_bss_code,piezo_measurement_date,piezo_obtention_mode,piezo_status,piezo_qualification,piezo_continuity_code,piezo_continuity_name,piezo_producer_code,piezo_producer_name,piezo_measure_nature_code,piezo_measure_nature_name,meteo_id,meteo_name,meteo_latitude,meteo_longitude,meteo_altitude,meteo_date,meteo_rain_height,meteo_DRR,meteo_temperature_min,meteo_time_tn,meteo_temperature_max,meteo_time_tx,meteo_temperature_avg,meteo_temperature_avg_threshold,meteo_temperature_min_ground,meteo_temperature_min_50cm,meteo_frost_duration,meteo_amplitude_tn_tx,meteo_temperature_avg_tntm,meteo_pressure_avg,meteo_pression_maxi,meteo_wind_speed_avg_10m,meteo_wind_max,meteo_wind_direction_max_inst,meteo_time_wind_max,meteo_wind_avg,meteo_wind_direction_max_avg,meteo_time_wind_avg,meteo_wind_speed_avg_2m,meteo_wind_max_2m,meteo_wind_direction_max_inst_2m,meteo_time_wind_max_2m,meteo_wind_max_3s,meteo_time_wind_max_3s,meteo_humidity_min,meteo_time_humidity_min,meteo_humidity_max,meteo_time_humidity_max,meteo_humidity_duration_below_40%,meteo_humidity_duration_above_80%,meteo__pressure_saturation_avg,meteo_wetting_duration,meteo_humidity_avg,meteo_sunshine_duration,meteo_radiation,meteo_radiation_direct,meteo_sunshine_%,meteo_radiation_IR,meteo_radiation_UV_max,meteo_cloudiness,meteo_cloudiness_height,meteo_if_snow,meteo_if_fog,meteo_if_thunderstorm,meteo_if_sleet,meteo_if_hail,meteo_if_dew,meteo_if_black_ice,meteo_if_snow_ground,meteo_if_frost,meteo_if_smoke,meteo_if_mist,meteo_if_lightning,meteo_evapotranspiration_Monteith,meteo_evapotranspiration_grid,meteo_radiation_UV,meteo_snow_height,meteo_snow_thickness_max,meteo_snow_thickness_6h,distance_piezo_meteo,hydro_station_code,hydro_observation_date_elab,hydro_observation_result_elab,hydro_status_code,hydro_status_label,hydro_method_code,hydro_method_label,hydro_qualification_code,hydro_qualification_label,hydro_longitude,hydro_latitude,hydro_hydro_quantity_elab,distance_piezo_hydro,prelev_structure_code_0,prelev_volume_0,prelev_usage_label_0,prelev_volume_obtention_mode_label_0,prelev_longitude_0,prelev_latitude_0,prelev_commune_code_insee_0,prelev_structure_code_1,prelev_volume_1,prelev_usage_label_1,prelev_volume_obtention_mode_label_1,prelev_longitude_1,prelev_latitude_1,prelev_commune_code_insee_1,prelev_structure_code_2,prelev_volume_2,prelev_usage_label_2,prelev_volume_obtention_mode_label_2,prelev_longitude_2,prelev_latitude_2,prelev_commune_code_insee_2,prelev_other_volume_sum,insee_%_agri,insee_pop_commune,insee_med_living_level,insee_%_ind,insee_%_const
0,2331795,1,Sun Jul 14 13:00:02 CEST 2024,20.0,Ain,1073,PIEZOMETRE - MARAIS DE LAVOURS (CEYZERIEU - BR...,['712AH37'],232.0,07004X0046/D6-20,Ceyzérieu,5.748241,45.828148,BSS001TTQQ,07004X0046/D6-20,2022-06-01,Valeur mesurée,Donnée contrôlée niveau 2,Correcte,2.0,Point lié au point précédent,196.0,Service Géologique Régional Rhône-Alpes (196),N,Naturel,1034004,BELLEY,5.688,45.769333,330,2022-06-01,7.4,,12.8,335.0,23.2,1556.0,17.1,18.0,12.1,,0.0,10.4,18.0,,,1.3,6.6,100.0,2257.0,3.1,90.0,2301.0,,,,,6.2,2257.0,57.0,1559.0,98.0,1.0,0.0,1044.0,16.5,,85.0,,,,,,,,,,,,,,,,,,,,,,2.7,,,,,8.036207,V142501001,2022-06-01,191.0,16,Donnée validée,12,Interpolation,16,Non qualifiée,5.685831,45.874843,QmM,7.093852,,,,,,,,,,,,,,,,,,,,,,0.0,11.8,992.0,25250,2.9,16.2
1,2331796,1,Sun Jul 14 13:00:02 CEST 2024,35.6,Ain,1363,PIEZOMETRE - GRAVIERE (ST-JEAN-LE-VIEUX - BRGM...,['712GB05'],247.25,06754X0077/F1,Saint-Jean-le-Vieux,5.356637,46.028102,BSS001SCTM,06754X0077/F1,2022-06-01,Valeur mesurée,Donnée contrôlée niveau 2,Correcte,2.0,Point lié au point précédent,196.0,Service Géologique Régional Rhône-Alpes (196),N,Naturel,1089001,AMBERIEU,5.329333,45.9765,250,2022-06-01,3.4,,10.5,429.0,23.1,1716.0,16.8,16.8,9.0,10.5,0.0,12.6,16.8,1015.8,1013.7,1.7,6.7,20.0,1717.0,,,,,,,,,,54.0,1506.0,98.0,115.0,0.0,949.0,15.4,,82.0,234.0,1395.0,,26.0,,,,,0.0,0.0,,,,,,,,0.0,0.0,,2.5,2.5,,,0.0,0.0,6.113148,V271201001,2022-06-01,13346.0,16,Donnée validée,12,Interpolation,20,Bonne,5.336428,46.047082,QmJ,2.624373,,,,,,,,,,,,,,,,,,,,,,0.0,0.6,1786.0,24660,44.5,11.0
2,2331797,1,Sun Jul 14 13:00:02 CEST 2024,35.22,Ain,1244,PIEZOMETRE - BORD AUTOROUTE (MEXIMIEUX - BRGM ...,['040AJ43'],218.77,06993X0226/MEXI_2,Meximieux,5.220795,45.895734,BSS001TRPH,06993X0226/MEXI_2,2022-06-01,Valeur mesurée,Donnée contrôlée niveau 2,Correcte,2.0,Point lié au point précédent,196.0,Service Géologique Régional Rhône-Alpes (196),N,Naturel,1027003,BALAN_AERO,5.106667,45.833,196,2022-06-01,,,12.1,357.0,24.9,1506.0,18.0,18.5,,,0.0,12.8,18.5,,,,,,,,,,,,,,,,55.0,1439.0,100.0,431.0,0.0,956.0,16.9,,83.0,,,,,,,,,,,,,,,,,,,,,,2.8,,,,,11.258567,V294201001,2022-06-01,16168.0,16,Donnée validée,12,Interpolation,20,Bonne,5.233492,45.906423,QmJ,1.542071,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,8085.0,24890,8.4,7.8
3,2331798,1,Sun Jul 14 13:00:02 CEST 2024,34.2,Ain,1288,PIEZOMETRE - GRENY (PERON - BRGM 01) - BSH,"['516AA00', '516AF00']",499.85,06533X0070/F2,Péron,5.948977,46.20118,BSS001RGXM,06533X0070/F2,2022-06-01,Valeur mesurée,Donnée contrôlée niveau 2,Correcte,2.0,Point lié au point précédent,196.0,Service Géologique Régional Rhône-Alpes (196),N,Naturel,39413001,LA PESSE,5.843,46.302833,1133,2022-06-01,12.8,,5.8,311.0,17.1,1625.0,11.9,11.45,,,0.0,11.3,11.5,,,3.1,11.1,230.0,823.0,8.5,200.0,1044.0,,,,,10.4,841.0,59.0,1614.0,97.0,234.0,0.0,840.0,11.5,,83.0,142.0,1061.0,,15.0,,,,,,,,,,,,,,,,,1.7,1.5,,,,,13.93429,V100001002,2022-06-01,333590.0,16,Donnée validée,12,Interpolation,20,Bonne,5.964334,46.137641,QmM,7.16347,,,,,,,,,,,,,,,,,,,,,,0.0,1.5,2838.0,39700,2.4,5.2
4,2331799,1,Sun Jul 14 13:00:02 CEST 2024,37.3,Ain,1422,FORAGE - ENCLOS (TOSSIAT - BRGM 01) - BSH,['507AB00'],260.0,06518X0026/P2,Tossiat,5.313353,46.136402,BSS001RFRV,06518X0026/P2,2022-06-01,Valeur mesurée,Donnée contrôlée niveau 2,Correcte,2.0,Point lié au point précédent,196.0,Service Géologique Régional Rhône-Alpes (196),N,Naturel,1072001,CEYZERIAT_SAPC,5.287667,46.204333,260,2022-06-01,2.2,,9.5,300.0,22.6,1726.0,16.4,16.05,,,,13.1,16.1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,,7.808228,U400000101,2022-06-01,13.0,16,Donnée validée,12,Interpolation,20,Bonne,5.330754,46.143007,QmM,1.528669,,,,,,,,,,,,,,,,,,,,,,0.0,0.2,1352.0,26180,21.5,9.8


In [73]:
fp_test = preprocess_cat(df_test, 'useful_columns.txt', output_flag=1)
fp_test.head()

Unnamed: 0,piezo_station_investigation_depth,piezo_station_altitude,piezo_station_longitude,piezo_station_latitude,piezo_measurement_date,piezo_status,piezo_qualification,piezo_continuity_code,meteo_frost_duration,meteo_humidity_duration_below_40%,meteo_humidity_duration_above_80%,meteo_wind_direction_max_avg,meteo_evapotranspiration_grid,meteo_longitude,meteo_latitude,meteo_rain_height,meteo_amplitude_tn_tx,meteo_temperature_avg,meteo_temperature_avg_threshold,meteo_temperature_min,meteo_temperature_avg_tntm,meteo__pressure_saturation_avg,meteo_temperature_max,meteo_humidity_avg,meteo_humidity_min,meteo_humidity_max,hydro_observation_date_elab,hydro_observation_result_elab,hydro_longitude,hydro_latitude,prelev_volume_0,prelev_longitude_0,prelev_latitude_0,prelev_other_volume_sum,insee_%_agri,insee_pop_commune,insee_%_ind,insee_%_const,quantity_QmJ,quantity_QmM,usage_CANAUX,usage_EAU POTABLE,usage_EAU TURBINEE (barrage),usage_ENERGIE,"usage_INDUSTRIE et ACTIVITES ECONOMIQUES (hors irrigation, hors énergie)",usage_IRRIGATION,usage_Non spécifié,mode_Mesure directe,mode_Non spécifié,mode_Volume estimé,mode_Volume forfaitaire,mode_Volume mesuré
0,20.0,232.0,5.748241,45.828148,1654041600,3,3,2.0,0.0,0.0,1044.0,90.0,2.7,45.769333,5.688,7.4,10.4,17.1,18.0,12.8,18.0,16.5,23.2,85.0,57.0,98.0,1654041600,191.0,5.685831,45.874843,,,,0.0,11.8,992.0,2.9,16.2,0,1,0,0,0,0,0,0,1,0,1,0,0,0
1,35.6,247.25,5.356637,46.028102,1654041600,3,3,2.0,0.0,0.0,949.0,,2.5,45.9765,5.329333,3.4,12.6,16.8,16.8,10.5,16.8,15.4,23.1,82.0,54.0,98.0,1654041600,13346.0,5.336428,46.047082,,,,0.0,0.6,1786.0,44.5,11.0,1,0,0,0,0,0,0,0,1,0,1,0,0,0
2,35.22,218.77,5.220795,45.895734,1654041600,3,3,2.0,0.0,0.0,956.0,,2.8,45.833,5.106667,,12.8,18.0,18.5,12.1,18.5,16.9,24.9,83.0,55.0,100.0,1654041600,16168.0,5.233492,45.906423,,,,0.0,0.0,8085.0,8.4,7.8,1,0,0,0,0,0,0,0,1,0,1,0,0,0
3,34.2,499.85,5.948977,46.20118,1654041600,3,3,2.0,0.0,0.0,840.0,200.0,1.5,46.302833,5.843,12.8,11.3,11.9,11.45,5.8,11.5,11.5,17.1,83.0,59.0,97.0,1654041600,333590.0,5.964334,46.137641,,,,0.0,1.5,2838.0,2.4,5.2,0,1,0,0,0,0,0,0,1,0,1,0,0,0
4,37.3,260.0,5.313353,46.136402,1654041600,3,3,2.0,,,,,2.0,46.204333,5.287667,2.2,13.1,16.4,16.05,9.5,16.1,,22.6,,,,1654041600,13.0,5.330754,46.143007,,,,0.0,0.2,1352.0,21.5,9.8,0,1,0,0,0,0,0,0,1,0,1,0,0,0


In [74]:
sp_test = preprocess_cont(fp_test)
sp_test.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[c].fillna(df[c].mean(), inplace=True)


Unnamed: 0,piezo_station_investigation_depth,piezo_station_altitude,piezo_station_longitude,piezo_station_latitude,piezo_measurement_date,piezo_status,piezo_qualification,piezo_continuity_code,meteo_frost_duration,meteo_humidity_duration_below_40%,meteo_humidity_duration_above_80%,meteo_wind_direction_max_avg,meteo_evapotranspiration_grid,meteo_longitude,meteo_latitude,meteo_rain_height,meteo_amplitude_tn_tx,meteo_temperature_avg,meteo_temperature_avg_threshold,meteo_temperature_min,meteo_temperature_avg_tntm,meteo__pressure_saturation_avg,meteo_temperature_max,meteo_humidity_avg,meteo_humidity_min,meteo_humidity_max,hydro_observation_date_elab,hydro_observation_result_elab,hydro_longitude,hydro_latitude,prelev_volume_0,prelev_longitude_0,prelev_latitude_0,prelev_other_volume_sum,insee_%_agri,insee_pop_commune,insee_%_ind,insee_%_const,quantity_QmJ,quantity_QmM,usage_CANAUX,usage_EAU POTABLE,usage_EAU TURBINEE (barrage),usage_ENERGIE,"usage_INDUSTRIE et ACTIVITES ECONOMIQUES (hors irrigation, hors énergie)",usage_IRRIGATION,usage_Non spécifié,mode_Mesure directe,mode_Non spécifié,mode_Volume estimé,mode_Volume forfaitaire,mode_Volume mesuré
0,20.0,232.0,5.748241,45.828148,1654041600,3,3,2.0,0.0,0.0,1044.0,90.0,2.7,45.769333,5.688,7.4,10.4,17.1,18.0,12.8,18.0,16.5,23.2,85.0,57.0,98.0,1654041600,191.0,5.685831,45.874843,396299900.0,1.387185,46.809041,0.0,11.8,992.0,2.9,16.2,0,1,0,0,0,0,0,0,1,0,1,0,0,0
1,35.6,247.25,5.356637,46.028102,1654041600,3,3,2.0,0.0,0.0,949.0,210.735124,2.5,45.9765,5.329333,3.4,12.6,16.8,16.8,10.5,16.8,15.4,23.1,82.0,54.0,98.0,1654041600,13346.0,5.336428,46.047082,396299900.0,1.387185,46.809041,0.0,0.6,1786.0,44.5,11.0,1,0,0,0,0,0,0,0,1,0,1,0,0,0
2,35.22,218.77,5.220795,45.895734,1654041600,3,3,2.0,0.0,0.0,956.0,210.735124,2.8,45.833,5.106667,1.836326,12.8,18.0,18.5,12.1,18.5,16.9,24.9,83.0,55.0,100.0,1654041600,16168.0,5.233492,45.906423,396299900.0,1.387185,46.809041,0.0,0.0,8085.0,8.4,7.8,1,0,0,0,0,0,0,0,1,0,1,0,0,0
3,34.2,499.85,5.948977,46.20118,1654041600,3,3,2.0,0.0,0.0,840.0,200.0,1.5,46.302833,5.843,12.8,11.3,11.9,11.45,5.8,11.5,11.5,17.1,83.0,59.0,97.0,1654041600,333590.0,5.964334,46.137641,396299900.0,1.387185,46.809041,0.0,1.5,2838.0,2.4,5.2,0,1,0,0,0,0,0,0,1,0,1,0,0,0
4,37.3,260.0,5.313353,46.136402,1654041600,3,3,2.0,0.013083,180.542041,502.205748,210.735124,2.0,46.204333,5.287667,2.2,13.1,16.4,16.05,9.5,16.1,16.073066,22.6,67.394013,41.806136,90.597767,1654041600,13.0,5.330754,46.143007,396299900.0,1.387185,46.809041,0.0,0.2,1352.0,21.5,9.8,0,1,0,0,0,0,0,0,1,0,1,0,0,0


In [75]:
sp_test.to_csv('complete_test_mean.csv')

In [76]:
from sklearn.ensemble import RandomForestClassifier

In [77]:
n = 100_000
fd_less = fd.drop(columns=['mode_Mesure indirecte'])

In [78]:
X, y = fd_less.drop(columns=['piezo_groundwater_level_category']), fd_less['piezo_groundwater_level_category']

In [79]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## Model training n°1

In [None]:
model = RandomForestClassifier().fit(X_train, y_train)
model.score(X_test, y_test)

In [1]:
import pandas as pd

train_df = pd.read_csv('complete_mean.csv')
train_df.head()

Unnamed: 0.1,Unnamed: 0,piezo_station_investigation_depth,piezo_station_altitude,piezo_station_longitude,piezo_station_latitude,piezo_measurement_date,piezo_groundwater_level_category,piezo_status,piezo_qualification,piezo_continuity_code,...,usage_ENERGIE,"usage_INDUSTRIE et ACTIVITES ECONOMIQUES (hors irrigation, hors énergie)",usage_IRRIGATION,usage_Non spécifié,mode_Mesure directe,mode_Mesure indirecte,mode_Non spécifié,mode_Volume estimé,mode_Volume forfaitaire,mode_Volume mesuré
0,0,20.0,232.0,5.748241,45.828148,1577836800,4,3,3,2.0,...,0,0,0,0,0,1,0,0,0,0
1,1,35.6,247.25,5.356637,46.028102,1577836800,5,3,3,2.0,...,0,0,0,0,0,1,0,0,0,0
2,2,35.22,218.77,5.220795,45.895734,1577836800,4,3,3,2.0,...,0,0,0,0,0,1,0,0,0,0
3,3,34.2,499.85,5.948977,46.20118,1577836800,5,3,3,2.0,...,0,0,0,0,0,1,0,0,0,0
4,4,37.3,260.0,5.313353,46.136402,1577836800,1,3,3,2.0,...,0,0,0,0,0,1,0,0,0,0


In [2]:
test_df = pd.read_csv('complete_test_mean.csv')
test_df.head()

Unnamed: 0.1,Unnamed: 0,piezo_station_investigation_depth,piezo_station_altitude,piezo_station_longitude,piezo_station_latitude,piezo_measurement_date,piezo_status,piezo_qualification,piezo_continuity_code,meteo_frost_duration,...,usage_EAU TURBINEE (barrage),usage_ENERGIE,"usage_INDUSTRIE et ACTIVITES ECONOMIQUES (hors irrigation, hors énergie)",usage_IRRIGATION,usage_Non spécifié,mode_Mesure directe,mode_Non spécifié,mode_Volume estimé,mode_Volume forfaitaire,mode_Volume mesuré
0,0,20.0,232.0,5.748241,45.828148,1654041600,3,3,2.0,0.0,...,0,0,0,0,1,0,1,0,0,0
1,1,35.6,247.25,5.356637,46.028102,1654041600,3,3,2.0,0.0,...,0,0,0,0,1,0,1,0,0,0
2,2,35.22,218.77,5.220795,45.895734,1654041600,3,3,2.0,0.0,...,0,0,0,0,1,0,1,0,0,0
3,3,34.2,499.85,5.948977,46.20118,1654041600,3,3,2.0,0.0,...,0,0,0,0,1,0,1,0,0,0
4,4,37.3,260.0,5.313353,46.136402,1654041600,3,3,2.0,0.013083,...,0,0,0,0,1,0,1,0,0,0


## Model training n°2

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

X, y = train_df.drop(columns=['piezo_groundwater_level_category']), train_df['piezo_groundwater_level_category'] - 1
X.drop(columns=['mode_Mesure indirecte'], inplace=True)


continuous_columns = [col for col in X.columns if X[col].dtype in ['float64', 'float32']]
categorical_columns = [col for col in X.columns if X[col].dtype == 'object' or X[col].dtype.name == 'category']
discrete_columns = [col for col in X.columns if X[col].dtype in ['int64', 'int32'] and len(X[col].unique()) < 20]

categorical_columns += discrete_columns


X_continuous = X[continuous_columns]
X_categorical = X[categorical_columns]

scaler = StandardScaler()
X_continuous_scaled = scaler.fit_transform(X_continuous)

X_continuous_scaled = pd.DataFrame(X_continuous_scaled, columns=continuous_columns)


X_processed = pd.concat([X_continuous_scaled, X_categorical.reset_index(drop=True)], axis=1)

print(f"Continuous Columns: {continuous_columns}")
print(f"Categorical Columns: {categorical_columns}")
print(f"Processed Data Shape: {X_processed.shape}")


Continuous Columns: ['piezo_station_investigation_depth', 'piezo_station_altitude', 'piezo_station_longitude', 'piezo_station_latitude', 'piezo_continuity_code', 'meteo_frost_duration', 'meteo_humidity_duration_below_40%', 'meteo_humidity_duration_above_80%', 'meteo_wind_direction_max_avg', 'meteo_evapotranspiration_grid', 'meteo_longitude', 'meteo_latitude', 'meteo_rain_height', 'meteo_amplitude_tn_tx', 'meteo_temperature_avg', 'meteo_temperature_avg_threshold', 'meteo_temperature_min', 'meteo_temperature_avg_tntm', 'meteo__pressure_saturation_avg', 'meteo_temperature_max', 'meteo_humidity_avg', 'meteo_humidity_min', 'meteo_humidity_max', 'hydro_observation_result_elab', 'hydro_longitude', 'hydro_latitude', 'prelev_volume_0', 'prelev_longitude_0', 'prelev_latitude_0', 'prelev_other_volume_sum', 'insee_%_agri', 'insee_pop_commune', 'insee_%_ind', 'insee_%_const']
Categorical Columns: ['piezo_status', 'piezo_qualification', 'quantity_QmJ', 'quantity_QmM', 'usage_CANAUX', 'usage_EAU POTA

In [6]:
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(X_processed, y,  # Subtract 1 to make labels 0-based
    test_size=0.2
)

rfc = RandomForestClassifier(n_estimators=200, n_jobs=-1)
rfc.fit(X_train, y_train)
rfc.score(X_test, y_test)

0.8124205036886288

## Model training n°3

In [None]:

X_train, X_test, y_train, y_test = train_test_split(X_processed, y,  
    test_size=0.2
)


X_train = torch.tensor(X_train.values, dtype=torch.float32)
X_test = torch.tensor(X_test.values, dtype=torch.float32)


y_train = torch.tensor(y_train.values, dtype=torch.long)  # Changed to long (int64)
y_test = torch.tensor(y_test.values, dtype=torch.long)    # Changed to long (int64)

# Debugging dataset
print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}, y_test shape: {y_test.shape}")
print(f"y_train min: {y_train.min()}, y_train max: {y_train.max()}")
print(f"y_test min: {y_test.min()}, y_test max: {y_test.max()}")

# Create DataLoader for batching
batch_size = 1024
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

class WaterLevelModel(nn.Module):
    def __init__(self, input_dim, num_classes=5):
        super(WaterLevelModel, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 256),  # Increased number of neurons
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, num_classes)  # Output layer with 'num_classes' classes
        )
    
    def forward(self, x):
        return self.model(x)


# Create the model
input_dim = X_train.shape[1]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = WaterLevelModel(input_dim).to(device)

# Debugging model
print(f"Model architecture:\n{model}")
print(f"Device being used: {device}")

# Define loss and optimizer
criterion = nn.CrossEntropyLoss()  # This expects target labels as integers
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
epochs = 100
for epoch in range(epochs):
    model.train()
    train_loss = 0.0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

 
        # Forward pass
        outputs = model(X_batch)

  

        loss = criterion(outputs, y_batch)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
    
    train_loss /= len(train_loader)
    print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}")

# Evaluate the model
model.eval()
test_loss = 0.0
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        outputs = model(X_batch)

        loss = criterion(outputs, y_batch)
        test_loss += loss.item()

test_loss /= len(test_loader)
print(f"Test Loss: {test_loss:.4f}")


X_train shape: torch.Size([2264252, 50]), X_test shape: torch.Size([566064, 50])
y_train shape: torch.Size([2264252]), y_test shape: torch.Size([566064])
y_train min: 0, y_train max: 4
y_test min: 0, y_test max: 4
Model architecture:
WaterLevelModel(
  (model): Sequential(
    (0): Linear(in_features=50, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.4, inplace=False)
    (3): Linear(in_features=256, out_features=128, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.3, inplace=False)
    (6): Linear(in_features=128, out_features=64, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.3, inplace=False)
    (9): Linear(in_features=64, out_features=32, bias=True)
    (10): ReLU()
    (11): Linear(in_features=32, out_features=5, bias=True)
  )
)
Device being used: cuda


KeyboardInterrupt: 

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

model.eval()


all_preds = []
all_labels = []


with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        
        outputs = model(X_batch)
        
        
        preds = torch.argmax(outputs, dim=1)
        all_preds.append(preds.cpu())
        all_labels.append(y_batch.cpu())


all_preds = torch.cat(all_preds)
all_labels = torch.cat(all_labels)

accuracy = accuracy_score(all_labels, all_preds)
print(f"Accuracy: {accuracy:.4f}")

# Confusion Matrix
conf_mat = confusion_matrix(all_labels, all_preds)
print("Confusion Matrix:")
print(conf_mat)

# Classification Report
class_report = classification_report(all_labels, all_preds)
print("Classification Report:")
print(class_report)

Accuracy: 0.4912
Confusion Matrix:
[[62262 27691 13230  3148  1140]
 [21312 55303 38445  8835  2032]
 [ 7303 26400 67345 24637  4465]
 [ 3418  8032 39203 49984 15482]
 [ 1948  2939 13146 25196 43168]]
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.58      0.61    107471
           1       0.46      0.44      0.45    125927
           2       0.39      0.52      0.45    130150
           3       0.45      0.43      0.44    116119
           4       0.65      0.50      0.57     86397

    accuracy                           0.49    566064
   macro avg       0.52      0.49      0.50    566064
weighted avg       0.51      0.49      0.49    566064



In [None]:
import pandas as pd
import torch

test_df = pd.read_csv('complete_test_mean.csv')


continuous_columns = [col for col in test_df.columns if test_df[col].dtype in ['float64', 'float32']]
categorical_columns = [col for col in test_df.columns if test_df[col].dtype == 'object' or test_df[col].dtype.name == 'category']
discrete_columns = [col for col in test_df.columns if test_df[col].dtype in ['int64', 'int32'] and len(test_df[col].unique()) < 20]


categorical_columns += discrete_columns


scaler = StandardScaler()  
X_continuous_scaled = scaler.fit_transform(test_df[continuous_columns])

X_continuous_scaled = pd.DataFrame(X_continuous_scaled, columns=continuous_columns)

X_eval_processed = pd.concat([X_continuous_scaled, test_df[categorical_columns].reset_index(drop=True)], axis=1)


X_eval_tensor = torch.tensor(X_eval_processed.values, dtype=torch.float32)

## Prediction

In [None]:

model.eval()


X_eval_tensor = X_eval_tensor.to(device)


with torch.no_grad():
    outputs = model(X_eval_tensor)
    _, predictions = torch.max(outputs, 1)  # Get predicted class indices


predictions_np = predictions.cpu().numpy()

print(predictions_np)

[0 2 2 ... 0 2 0]


## Pipeline

In [9]:
predictions_np = rfc.predict(X_eval_processed)
predictions_np

array([1, 1, 2, ..., 1, 3, 2])

In [8]:
pd.DataFrame(predictions_np).value_counts()

0
0    219583
1    166210
2    109742
3     75800
4     39873
Name: count, dtype: int64

In [10]:
encoding = {
    "Very Low": 0,
    "Low": 1,
    "Average": 2,
    "High": 3,
    "Very High": 4
    }

decoding = {v: k for k, v in encoding.items()}

In [12]:
y_pred_dec = np.array(list(map(lambda x: decoding[x], predictions_np)))
y_pred_dec

array(['Low', 'Low', 'Average', ..., 'Low', 'High', 'Average'],
      dtype='<U9')

In [13]:
y_df = pd.read_csv('y_test_submission_example_Hi5.csv')
y_df.head()

Unnamed: 0,row_index,piezo_groundwater_level_category
0,2331795,My prediction
1,2331796,My prediction
2,2331797,My prediction
3,2331798,My prediction
4,2331799,My prediction


In [14]:
y_df['piezo_groundwater_level_category'] = y_pred_dec
y_df.tail()

Unnamed: 0,row_index,piezo_groundwater_level_category
611203,3610818,Very Low
611204,3610819,Very Low
611205,3610820,Low
611206,3610821,High
611207,3610822,Average


In [15]:
y_df['piezo_groundwater_level_category'].value_counts()

piezo_groundwater_level_category
Very Low     237660
Low          160494
Average      105286
High          70378
Very High     37390
Name: count, dtype: int64

In [16]:
y_df.to_csv('attempt_9.csv', index=False)