### Data Loading and Cleaning

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

# Notebook-wide plotting defaults: seaborn "whitegrid" theme, then a wider default
# figure size of 12x6 inches.
# NOTE(review): the rcParams override is kept after set_theme() on the assumption
# that the theme call may itself adjust rcParams — confirm before reordering.
sns.set_theme(style="whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

In [5]:
# Load the raw measurements from the CSV in the working directory, report the
# (rows, columns) shape, and preview the first five records as a sanity check.
raw_path = 'data.csv'
df = pd.read_csv(raw_path)
print(f"Dataset Shape: {df.shape}")
df.head()

Dataset Shape: (2051, 11)


Unnamed: 0,PAYS,Code,Ville,X,Y,Date,End_of_sampling,Duration(h.min),I_131_(Bq/m3),Cs_134_(Bq/m3),Cs_137_(Bq/m3)
0,SE,1,RISOE,12.07,55.7,86/04/27,24:00:00,24.0,1.0,0.0,0.24
1,SE,1,RISOE,12.07,55.7,86/04/28,24:00:00,24.0,0.0046,0.00054,0.00098
2,SE,1,RISOE,12.07,55.7,86/04/29,12:00,12.0,0.0147,0.0043,0.0074
3,SE,1,RISOE,12.07,55.7,86/04/29,24:00:00,12.0,0.00061,0.0,9e-05
4,SE,1,RISOE,12.07,55.7,86/04/30,24:00:00,24.0,0.00075,0.0001,0.00028


In [6]:
def clean_radionuclide(val):
    """Parse one raw concentration entry into a float.

    The value is stringified, stripped of surrounding whitespace and
    upper-cased before parsing. The flag codes 'L', 'N' and '?' and any
    text that ``float()`` rejects all yield NaN.

    Parameters
    ----------
    val : object
        Raw cell value (number or string).

    Returns
    -------
    float
        The parsed number, or ``np.nan`` when the entry is a flag code or
        is not parseable.
    """
    text = str(val).strip().upper()

    # Flag codes are matched case-insensitively because of the upper() above.
    if text in {'L', 'N', '?'}:
        return np.nan

    try:
        parsed = float(text)
    except ValueError:
        parsed = np.nan
    return parsed

def clean_duration(val, error_code=99.99, tol=1e-6):
    """Convert a duration encoded as ``h.min`` into decimal hours.

    The integer part is taken as hours and the first two decimals as
    minutes, e.g. ``1.30`` -> 1 h 30 min -> ``1.5``.

    Parameters
    ----------
    val : float-like
        Raw value from the ``Duration(h.min)`` column. Numeric strings are
        accepted and converted with ``float()``.
    error_code : float, default 99.99
        Sentinel used in the data to mark an invalid duration.
    tol : float, default 1e-6
        Absolute tolerance used when matching ``error_code``. Exact float
        equality is avoided because a value parsed from text may differ
        from the literal in the last bits.

    Returns
    -------
    float
        Decimal hours. NaN for missing, non-finite or sentinel values.
        When the fractional part is 60 or more, the value is returned
        unchanged.

    Raises
    ------
    ValueError
        If ``val`` is a non-missing value that ``float()`` cannot parse.
    """
    if pd.isna(val):
        return np.nan

    val = float(val)
    # int() cannot represent +/-inf, so treat non-finite input as missing.
    if not np.isfinite(val) or abs(val - error_code) <= tol:
        return np.nan

    hours = int(val)
    # round() absorbs binary float noise such as (1.3 - 1) * 100 == 30.000000000000004.
    minutes = round((val - hours) * 100)

    if minutes < 60:
        return hours + (minutes / 60.0)

    # NOTE(review): a fractional part >= 60 cannot be minutes; the value is passed
    # through as-is, presumably because it is already decimal hours — confirm
    # against the dataset documentation.
    return val

In [7]:
target_cols = ['I_131_(Bq/m3)', 'Cs_134_(Bq/m3)', 'Cs_137_(Bq/m3)']

# For every raw concentration column, add a numeric companion column named after
# its first two underscore-separated tokens (e.g. 'I_131_(Bq/m3)' -> 'Clean_I_131').
for col in target_cols:
    nuclide_tokens = col.split('_')
    clean_col_name = f"Clean_{nuclide_tokens[0]}_{nuclide_tokens[1]}"
    df[clean_col_name] = df[col].apply(clean_radionuclide)

In [8]:
# Parse the two-digit-year 'yy/mm/dd' dates once, then store both the datetime
# column and an ISO 'YYYY-MM-DD' string rendering of it.
parsed_dates = pd.to_datetime(df['Date'], format='%y/%m/%d')
df['Date_Obj'] = parsed_dates
df['Date_Str'] = parsed_dates.dt.strftime('%Y-%m-%d')

# Convert the h.min-encoded sampling duration to decimal hours, row by row.
df['Duration_Hours'] = df['Duration(h.min)'].apply(clean_duration)


In [9]:
print("Data Cleaning Complete.")
# DataFrame.info() writes its report to stdout and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line after the report.
df[['Date_Str', 'Clean_I_131', 'Clean_Cs_137']].info()
df.head()

Data Cleaning Complete.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Date_Str      2051 non-null   object 
 1   Clean_I_131   2009 non-null   float64
 2   Clean_Cs_137  1506 non-null   float64
dtypes: float64(2), object(1)
memory usage: 48.2+ KB
None


Unnamed: 0,PAYS,Code,Ville,X,Y,Date,End_of_sampling,Duration(h.min),I_131_(Bq/m3),Cs_134_(Bq/m3),Cs_137_(Bq/m3),Clean_I_131,Clean_Cs_134,Clean_Cs_137,Date_Obj,Date_Str,Duration_Hours
0,SE,1,RISOE,12.07,55.7,86/04/27,24:00:00,24.0,1.0,0.0,0.24,1.0,0.0,0.24,1986-04-27,1986-04-27,24.0
1,SE,1,RISOE,12.07,55.7,86/04/28,24:00:00,24.0,0.0046,0.00054,0.00098,0.0046,0.00054,0.00098,1986-04-28,1986-04-28,24.0
2,SE,1,RISOE,12.07,55.7,86/04/29,12:00,12.0,0.0147,0.0043,0.0074,0.0147,0.0043,0.0074,1986-04-29,1986-04-29,12.0
3,SE,1,RISOE,12.07,55.7,86/04/29,24:00:00,12.0,0.00061,0.0,9e-05,0.00061,0.0,9e-05,1986-04-29,1986-04-29,12.0
4,SE,1,RISOE,12.07,55.7,86/04/30,24:00:00,24.0,0.00075,0.0001,0.00028,0.00075,0.0001,0.00028,1986-04-30,1986-04-30,24.0


### Exploratory Data Analysis (EDA)