In [1]:
from pathlib import Path
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns
import warnings; warnings.filterwarnings("ignore")

try:
    BASE_DIR = Path(__file__).resolve().parent
except NameError:
    BASE_DIR = Path.cwd()

print("Notebook exécuté depuis :", BASE_DIR)

PROJECT_ROOT   = BASE_DIR.parent

PROC_DATA_DIR  = PROJECT_ROOT / "ProcessedData"
PROC_DATA_DIR2 = PROJECT_ROOT / "ProcessedData2" 
PROC_DATA_DIR2.mkdir(exist_ok=True)

print("PROC_DATA_DIR  :", PROC_DATA_DIR)
print("PROC_DATA_DIR2 :", PROC_DATA_DIR2)


Notebook exécuté depuis : /app/Model/ETL
PROC_DATA_DIR  : /app/Model/ProcessedData
PROC_DATA_DIR2 : /app/Model/ProcessedData2


In [2]:
# importation des bibliotheques necessaires
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# configuration de l'affichage
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# chargement des donnees covid pays 
covid_data = pd.read_csv(PROC_DATA_DIR / 'covid_countries_clean.csv')
covid_data['date'] = pd.to_datetime(covid_data['date'])

# affichage des informations de base
print("dimensions du dataset:", covid_data.shape)
print("\ncolonnes disponibles:")
print(covid_data.columns.tolist())
print("\nplage de dates:", covid_data['date'].min(), "a", covid_data['date'].max())
print("\nnombre de pays uniques:", covid_data['location'].nunique())

# verification des valeurs manquantes pour les colonnes cles
colonnes_cles = ['new_cases', 'total_cases', 'new_deaths', 'total_deaths', 
                 'reproduction_rate', 'mortality_rate', 'population', 'stringency_index']
print("\ntaux de valeurs manquantes pour les colonnes cles:")
for col in colonnes_cles:
    if col in covid_data.columns:
        taux_nan = covid_data[col].isna().sum() / len(covid_data) * 100
        print(f"  {col}: {taux_nan:.1f}%")

dimensions du dataset: (421264, 22)

colonnes disponibles:
['iso_code', 'continent', 'location', 'date', 'new_cases', 'total_cases', 'new_cases_smoothed', 'new_cases_per_million', 'new_cases_smoothed_per_million', 'reproduction_rate', 'new_deaths', 'total_deaths', 'new_deaths_smoothed', 'new_deaths_per_million', 'new_deaths_smoothed_per_million', 'total_deaths_per_million', 'mortality_rate', 'population', 'population_density', 'stringency_index', 'latitude', 'longitude']

plage de dates: 2020-01-01 00:00:00 a 2024-08-14 00:00:00

nombre de pays uniques: 243

taux de valeurs manquantes pour les colonnes cles:
  new_cases: 3.0%
  total_cases: 2.6%
  new_deaths: 2.9%
  total_deaths: 2.6%
  reproduction_rate: 53.6%
  mortality_rate: 9.7%
  population: 0.0%
  stringency_index: 50.8%


In [3]:
# analyse de la distribution des new_cases
print("=== analyse de la distribution des new_cases ===")

# statistiques de base
print("\nstatistiques de base pour new_cases:")
print(covid_data['new_cases'].describe())

# analyse des zeros
total_rows = len(covid_data)
zero_cases = (covid_data['new_cases'] == 0).sum()
nan_cases = covid_data['new_cases'].isna().sum()
positive_cases = (covid_data['new_cases'] > 0).sum()

print(f"\nrepartition des valeurs:")
print(f"  valeurs zero: {zero_cases} ({zero_cases/total_rows*100:.1f}%)")
print(f"  valeurs nan: {nan_cases} ({nan_cases/total_rows*100:.1f}%)")
print(f"  valeurs positives: {positive_cases} ({positive_cases/total_rows*100:.1f}%)")

# analyse par jour de la semaine
covid_data['day_of_week'] = covid_data['date'].dt.day_name()
covid_data['week_number'] = covid_data['date'].dt.isocalendar().week

# moyenne des nouveaux cas par jour de la semaine
print("\n=== analyse par jour de la semaine ===")
day_analysis = covid_data.groupby('day_of_week').agg({
    'new_cases': ['mean', 'median', 'count', lambda x: (x == 0).sum()]
}).round(2)
day_analysis.columns = ['moyenne', 'mediane', 'nombre_observations', 'nombre_zeros']
day_analysis['pct_zeros'] = (day_analysis['nombre_zeros'] / day_analysis['nombre_observations'] * 100).round(1)

# reordonner les jours
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
day_analysis = day_analysis.reindex(day_order)
print(day_analysis)


=== analyse de la distribution des new_cases ===

statistiques de base pour new_cases:
count     408485.000
mean        2080.297
std        84712.729
min            0.000
25%            0.000
50%            0.000
75%            0.000
max     40475477.000
Name: new_cases, dtype: float64

repartition des valeurs:
  valeurs zero: 367445 (87.2%)
  valeurs nan: 12779 (3.0%)
  valeurs positives: 41040 (9.7%)

=== analyse par jour de la semaine ===
              moyenne  mediane  nombre_observations  nombre_zeros  pct_zeros
day_of_week                                                                 
Monday          0.000    0.000                58323         58323    100.000
Tuesday         0.000    0.000                58323         58323    100.000
Wednesday       0.000    0.000                58323         58323    100.000
Thursday        0.000    0.000                58323         58323    100.000
Friday          0.000    0.000                58323         58323    100.000
Saturday       

In [4]:
# analyse du pattern de rapport par pays
print("=== analyse du pattern de rapport par pays ===")

# calculer le pourcentage de zeros par jour de la semaine pour chaque pays
country_patterns = []
for country in covid_data['location'].unique():
    country_data = covid_data[covid_data['location'] == country].copy()
    
    # compter les zeros par jour de la semaine
    pattern = country_data.groupby('day_of_week')['new_cases'].apply(lambda x: (x == 0).mean() * 100)
    pattern = pattern.reindex(day_order)
    
    # identifier le jour de rapport principal (jour avec le moins de zeros)
    report_day = pattern.idxmin()
    min_zero_pct = pattern.min()
    
    country_patterns.append({
        'country': country,
        'report_day': report_day,
        'report_day_zero_pct': min_zero_pct,
        'autres_jours_zero_pct': pattern[pattern.index != report_day].mean()
    })

patterns_df = pd.DataFrame(country_patterns)
print("\npatterns de rapport pour 20 pays (echantillon):")
print(patterns_df)

# distribution des jours de rapport
print("\n=== distribution des jours de rapport principaux ===")
report_day_dist = patterns_df['report_day'].value_counts()
print(report_day_dist)

# analyser la regularite temporelle des rapports pour un pays exemple
print("\n=== analyse de la regularite temporelle (exemple: France) ===")
france_data = covid_data[covid_data['location'] == 'France'].copy()
france_data = france_data.sort_values('date')

# identifier les jours avec des rapports
france_reports = france_data[france_data['new_cases'] > 0][['date', 'new_cases', 'day_of_week']]
france_reports['days_since_last'] = france_reports['date'].diff().dt.days

print(f"nombre de rapports pour la France: {len(france_reports)}")
print(f"intervalle moyen entre rapports: {france_reports['days_since_last'].mean():.1f} jours")
print(f"ecart-type de l'intervalle: {france_reports['days_since_last'].std():.1f} jours")
print("\ndistribution des intervalles entre rapports:")
print(france_reports['days_since_last'].value_counts().head(10))

=== analyse du pattern de rapport par pays ===

patterns de rapport pour 20 pays (echantillon):
               country report_day  report_day_zero_pct  autres_jours_zero_pct
0          Afghanistan     Sunday                9.167                100.000
1              Albania     Sunday               21.250                100.000
2              Algeria     Sunday               13.333                100.000
3       American Samoa     Sunday               75.833                100.000
4              Andorra     Sunday               34.167                100.000
..                 ...        ...                  ...                    ...
238  Wallis and Futuna     Sunday               90.000                100.000
239     Western Sahara  Wednesday                0.000                    NaN
240              Yemen     Sunday               49.167                100.000
241             Zambia     Sunday               19.583                100.000
242           Zimbabwe     Sunday             

In [5]:
# analyse de la completude des donnees hebdomadaires
print("=== analyse de la completude des donnees hebdomadaires ===")

# creer des agregations hebdomadaires
covid_data['year_week'] = covid_data['date'].dt.strftime('%Y-%W')

# analyser la completude par semaine pour un echantillon de pays
sample_countries = ['France', 'Germany', 'Italy', 'Spain', 'United States']
completeness_analysis = []

for country in sample_countries:
    country_data = covid_data[covid_data['location'] == country]
    
    # nombre de semaines avec au moins un rapport
    weeks_with_data = country_data[country_data['new_cases'] > 0].groupby('year_week').size()
    total_weeks = country_data['year_week'].nunique()
    weeks_with_reports = len(weeks_with_data)
    
    # verifier la coherence entre new_cases hebdomadaires et differences de total_cases
    weekly_data = country_data.groupby('year_week').agg({
        'new_cases': 'sum',
        'total_cases': ['first', 'last'],
        'date': ['min', 'max']
    })
    weekly_data.columns = ['new_cases_sum', 'total_cases_start', 'total_cases_end', 'date_start', 'date_end']
    weekly_data['total_cases_diff'] = weekly_data['total_cases_end'] - weekly_data['total_cases_start']
    
    # calculer la correlation entre somme hebdomadaire et difference de total_cases
    valid_weeks = weekly_data.dropna()
    if len(valid_weeks) > 0:
        correlation = valid_weeks['new_cases_sum'].corr(valid_weeks['total_cases_diff'])
    else:
        correlation = np.nan
    
    completeness_analysis.append({
        'country': country,
        'total_weeks': total_weeks,
        'weeks_with_reports': weeks_with_reports,
        'completeness_pct': weeks_with_reports / total_weeks * 100,
        'correlation_new_vs_total': correlation
    })

completeness_df = pd.DataFrame(completeness_analysis)
print(completeness_df)

# exemple detaille pour la France
print("\n=== exemple detaille pour la France (10 premieres semaines avec donnees) ===")
france_weekly = covid_data[covid_data['location'] == 'France'].groupby('year_week').agg({
    'new_cases': 'sum',
    'new_deaths': 'sum',
    'total_cases': ['first', 'last'],
    'total_deaths': ['first', 'last'],
    'date': ['min', 'max']
})
france_weekly.columns = ['new_cases_sum', 'new_deaths_sum', 'total_cases_start', 'total_cases_end', 
                        'total_deaths_start', 'total_deaths_end', 'date_start', 'date_end']
france_weekly['cases_diff'] = france_weekly['total_cases_end'] - france_weekly['total_cases_start']
france_weekly['deaths_diff'] = france_weekly['total_deaths_end'] - france_weekly['total_deaths_start']

# afficher les premieres semaines avec des cas
france_with_cases = france_weekly[france_weekly['new_cases_sum'] > 0].head(10)
print(france_with_cases[['date_start', 'date_end', 'new_cases_sum', 'cases_diff', 'new_deaths_sum', 'deaths_diff']])

# verifier la coherence
print("\n=== verification de la coherence des donnees ===")
discrepancies = france_weekly[(france_weekly['new_cases_sum'] != france_weekly['cases_diff']) & 
                              (france_weekly['new_cases_sum'] > 0)]
print(f"nombre de semaines avec incoherence entre new_cases et total_cases: {len(discrepancies)}")
if len(discrepancies) > 0:
    print("exemples d'incoherences:")
    print(discrepancies[['date_start', 'new_cases_sum', 'cases_diff']].head())

=== analyse de la completude des donnees hebdomadaires ===
         country  total_weeks  weeks_with_reports  completeness_pct  \
0         France          243                 167            68.724   
1        Germany          243                 181            74.486   
2          Italy          244                 234            95.902   
3          Spain          243                 181            74.486   
4  United States          243                 173            71.193   

   correlation_new_vs_total  
0                     1.000  
1                     0.999  
2                     0.999  
3                     1.000  
4                     0.999  

=== exemple detaille pour la France (10 premieres semaines avec donnees) ===
          date_start   date_end  new_cases_sum  cases_diff  new_deaths_sum  \
year_week                                                                    
2020-03   2020-01-20 2020-01-26          3.000       3.000           0.000   
2020-04   2020-01-27 2

In [6]:
# strategie d'agregation hebdomadaire
print("=== creation des donnees hebdomadaires agregees ===")

def create_weekly_aggregates(df):
    """
    agregation hebdomadaire des donnees covid avec calcul des metriques cibles
    """
    # trier par location et date
    df = df.sort_values(['location', 'date'])
    
    # creer l'identifiant de semaine iso
    df['year_week'] = df['date'].dt.isocalendar().year.astype(str) + '-' + \
                      df['date'].dt.isocalendar().week.astype(str).str.zfill(2)
    
    # agregations hebdomadaires
    weekly_agg = df.groupby(['location', 'year_week']).agg({
        # dates
        'date': ['min', 'max'],
        
        # cas et deces
        'new_cases': 'sum',
        'new_deaths': 'sum',
        'total_cases': ['first', 'last'],
        'total_deaths': ['first', 'last'],
        
        # donnees lissees
        'new_cases_smoothed': 'mean',
        'new_deaths_smoothed': 'mean',
        
        # metriques par million
        'new_cases_per_million': 'mean',
        'new_deaths_per_million': 'mean',
        
        # autres metriques
        'reproduction_rate': 'mean',
        'mortality_rate': 'last',
        'stringency_index': 'mean',
        
        # donnees demographiques (constantes)
        'population': 'first',
        'population_density': 'first',
        'continent': 'first',
        'iso_code': 'first',
        'latitude': 'first',
        'longitude': 'first'
    })
    
    # aplatir les noms de colonnes
    weekly_agg.columns = ['_'.join(col).strip() for col in weekly_agg.columns.values]
    weekly_agg = weekly_agg.reset_index()
    
    # renommer les colonnes pour plus de clarte
    column_mapping = {
        'date_min': 'week_start',
        'date_max': 'week_end',
        'new_cases_sum': 'weekly_new_cases',
        'new_deaths_sum': 'weekly_new_deaths',
        'total_cases_first': 'total_cases_start',
        'total_cases_last': 'total_cases_end',
        'total_deaths_first': 'total_deaths_start',
        'total_deaths_last': 'total_deaths_end',
        'new_cases_smoothed_mean': 'avg_daily_cases_smoothed',
        'new_deaths_smoothed_mean': 'avg_daily_deaths_smoothed',
        'new_cases_per_million_mean': 'avg_cases_per_million',
        'new_deaths_per_million_mean': 'avg_deaths_per_million',
        'reproduction_rate_mean': 'avg_reproduction_rate',
        'mortality_rate_last': 'mortality_rate',
        'stringency_index_mean': 'avg_stringency_index',
        'population_first': 'population',
        'population_density_first': 'population_density',
        'continent_first': 'continent',
        'iso_code_first': 'iso_code',
        'latitude_first': 'latitude',
        'longitude_first': 'longitude'
    }
    weekly_agg = weekly_agg.rename(columns=column_mapping)
    
    return weekly_agg

# appliquer l'agregation
covid_weekly = create_weekly_aggregates(covid_data)

print(f"dimensions des donnees hebdomadaires: {covid_weekly.shape}")
print(f"nombre de semaines uniques: {covid_weekly['year_week'].nunique()}")
print(f"plage de semaines: {covid_weekly['year_week'].min()} a {covid_weekly['year_week'].max()}")

# verification de la qualite
print("\n=== qualite des donnees hebdomadaires ===")
print("nombre de lignes avec weekly_new_cases > 0:", (covid_weekly['weekly_new_cases'] > 0).sum())
print("pourcentage de donnees utiles:", (covid_weekly['weekly_new_cases'] > 0).sum() / len(covid_weekly) * 100, "%")

# exemple de donnees hebdomadaires
print("\n=== exemple de donnees hebdomadaires (France, 5 premieres semaines avec cas) ===")
france_weekly = covid_weekly[(covid_weekly['location'] == 'France') & 
                            (covid_weekly['weekly_new_cases'] > 0)].head()
cols_to_show = ['location', 'year_week', 'week_start', 'weekly_new_cases', 'weekly_new_deaths', 
                'avg_reproduction_rate', 'mortality_rate', 'avg_stringency_index']
print(france_weekly[cols_to_show])

=== creation des donnees hebdomadaires agregees ===
dimensions des donnees hebdomadaires: (57569, 23)
nombre de semaines uniques: 242
plage de semaines: 2020-01 a 2024-33

=== qualite des donnees hebdomadaires ===
nombre de lignes avec weekly_new_cases > 0: 38929
pourcentage de donnees utiles: 67.62146294012402 %

=== exemple de donnees hebdomadaires (France, 5 premieres semaines avec cas) ===
      location year_week week_start  weekly_new_cases  weekly_new_deaths  \
17482   France   2020-04 2020-01-20             3.000              0.000   
17483   France   2020-05 2020-01-27             3.000              0.000   
17484   France   2020-06 2020-02-03             6.000              0.000   
17486   France   2020-08 2020-02-17             4.000              0.000   
17498   France   2020-20 2020-05-11           677.000            810.000   

       avg_reproduction_rate  mortality_rate  avg_stringency_index  
17482                    NaN           0.000                 3.177  
17483   

In [7]:
# verification complete des donnees covid
print("verification complete des donnees covid")
print("=" * 50)

# 1. verification des dimensions et structure
print("\n1. structure des donnees")
print(f"   dimensions: {covid_data.shape}")
print(f"   nombre de pays: {covid_data['location'].nunique()}")
print(f"   plage temporelle: {covid_data['date'].min()} a {covid_data['date'].max()}")
print(f"   nombre de jours: {(covid_data['date'].max() - covid_data['date'].min()).days + 1}")

# 2. verification de la coherence temporelle
print("\n2. coherence temporelle")
expected_rows = covid_data['location'].nunique() * ((covid_data['date'].max() - covid_data['date'].min()).days + 1)
print(f"   nombre de lignes attendues: {expected_rows}")
print(f"   nombre de lignes reelles: {len(covid_data)}")
print(f"   lignes manquantes: {expected_rows - len(covid_data)}")

# identifier les dates manquantes par pays
missing_dates = []
for country in covid_data['location'].unique():
    country_data = covid_data[covid_data['location'] == country]
    date_range = pd.date_range(start=covid_data['date'].min(), end=covid_data['date'].max())
    existing_dates = set(country_data['date'])
    missing = set(date_range) - existing_dates
    if missing:
        missing_dates.append({'country': country, 'missing_count': len(missing)})

if missing_dates:
    missing_df = pd.DataFrame(missing_dates).sort_values('missing_count', ascending=False)
    print(f"\n   pays avec dates manquantes: {len(missing_df)}")
    print("   top 5 pays avec le plus de dates manquantes:")
    print(missing_df.head().to_string(index=False))

# 3. verification de la coherence des donnees cumulatives
print("\n3. coherence des donnees cumulatives")
coherence_issues = []

for country in covid_data['location'].unique():
    country_data = covid_data[covid_data['location'] == country].sort_values('date')
    
    # verifier que total_cases est croissant
    cases_decreases = (country_data['total_cases'].diff() < 0).sum()
    deaths_decreases = (country_data['total_deaths'].diff() < 0).sum()
    
    # verifier la coherence entre new_cases et total_cases
    country_data['calculated_new_cases'] = country_data['total_cases'].diff()
    discrepancies = country_data[
        (country_data['new_cases'].notna()) & 
        (country_data['calculated_new_cases'].notna()) &
        (abs(country_data['new_cases'] - country_data['calculated_new_cases']) > 1)
    ]
    
    if cases_decreases > 0 or deaths_decreases > 0 or len(discrepancies) > 0:
        coherence_issues.append({
            'country': country,
            'cases_decreases': cases_decreases,
            'deaths_decreases': deaths_decreases,
            'new_vs_total_discrepancies': len(discrepancies)
        })

coherence_df = pd.DataFrame(coherence_issues)
print(f"   pays avec problemes de coherence: {len(coherence_df)}")
if len(coherence_df) > 0:
    print("   top 10 pays avec le plus de problemes:")
    coherence_df['total_issues'] = coherence_df['cases_decreases'] + coherence_df['deaths_decreases'] + coherence_df['new_vs_total_discrepancies']
    print(coherence_df.sort_values('total_issues', ascending=False).head(10).to_string(index=False))

# 4. verification des valeurs aberrantes
print("\n4. detection des valeurs aberrantes")

# valeurs negatives
negative_cases = (covid_data['new_cases'] < 0).sum()
negative_deaths = (covid_data['new_deaths'] < 0).sum()
print(f"   new_cases negatifs: {negative_cases}")
print(f"   new_deaths negatifs: {negative_deaths}")

# valeurs extremes (plus de 3 ecarts-types)
for col in ['new_cases', 'new_deaths']:
    if col in covid_data.columns:
        data = covid_data[covid_data[col].notna() & (covid_data[col] > 0)][col]
        if len(data) > 0:
            mean = data.mean()
            std = data.std()
            threshold = mean + 3 * std
            extremes = (covid_data[col] > threshold).sum()
            print(f"   {col} extremes (>3 std): {extremes} valeurs")
            
            # afficher les 5 plus grandes valeurs
            top_values = covid_data.nlargest(5, col)[['location', 'date', col]]
            print(f"   top 5 {col}:")
            for _, row in top_values.iterrows():
                print(f"      {row['location']} ({row['date'].strftime('%Y-%m-%d')}): {row[col]:,.0f}")

# 5. verification des taux de valeurs manquantes
print("\n5. analyse des valeurs manquantes")
missing_summary = []
for col in covid_data.columns:
    missing_count = covid_data[col].isna().sum()
    missing_pct = missing_count / len(covid_data) * 100
    if missing_pct > 0:
        missing_summary.append({
            'column': col,
            'missing_count': missing_count,
            'missing_pct': round(missing_pct, 2)
        })

missing_df = pd.DataFrame(missing_summary).sort_values('missing_pct', ascending=False)
print("   colonnes avec valeurs manquantes:")
print(missing_df.to_string(index=False))

# 6. verification des patterns de rapport hebdomadaire
print("\n6. analyse des patterns de rapport")
sunday_reporting = covid_data[covid_data['day_of_week'] == 'Sunday']['new_cases']
other_days_reporting = covid_data[covid_data['day_of_week'] != 'Sunday']['new_cases']

print(f"   dimanche - cas > 0: {(sunday_reporting > 0).sum()} ({(sunday_reporting > 0).sum() / len(sunday_reporting) * 100:.1f}%)")
print(f"   autres jours - cas > 0: {(other_days_reporting > 0).sum()} ({(other_days_reporting > 0).sum() / len(other_days_reporting) * 100:.1f}%)")

# 7. verification de la coherence des metadonnees
print("\n7. coherence des metadonnees")
metadata_issues = []

for country in covid_data['location'].unique():
    country_data = covid_data[covid_data['location'] == country]
    
    # verifier que les metadonnees sont constantes
    unique_populations = country_data['population'].nunique()
    unique_densities = country_data['population_density'].nunique()
    unique_continents = country_data['continent'].nunique()
    
    if unique_populations > 1 or unique_densities > 1 or unique_continents > 1:
        metadata_issues.append({
            'country': country,
            'unique_populations': unique_populations,
            'unique_densities': unique_densities,
            'unique_continents': unique_continents
        })

if metadata_issues:
    print(f"   pays avec metadonnees incoherentes: {len(metadata_issues)}")
    print("   exemples:")
    for issue in metadata_issues[:5]:
        print(f"      {issue}")

# resume final
print("\n" + "=" * 50)
print("resume de la verification")
print("=" * 50)
print(f"donnees analysees: {len(covid_data):,} lignes")
print(f"problemes detectes:")
print(f"  - dates manquantes: {'oui' if missing_dates else 'non'}")
print(f"  - incoherences cumulatives: {'oui' if coherence_issues else 'non'}")
print(f"  - valeurs negatives: {'oui' if negative_cases > 0 or negative_deaths > 0 else 'non'}")
print(f"  - metadonnees incoherentes: {'oui' if metadata_issues else 'non'}")
print(f"  - pattern dimanche confirme: oui")

verification complete des donnees covid

1. structure des donnees
   dimensions: (421264, 25)
   nombre de pays: 243
   plage temporelle: 2020-01-01 00:00:00 a 2024-08-14 00:00:00
   nombre de jours: 1688

2. coherence temporelle
   nombre de lignes attendues: 410184
   nombre de lignes reelles: 421264
   lignes manquantes: -11080

   pays avec dates manquantes: 243
   top 5 pays avec le plus de dates manquantes:
        country  missing_count
 Western Sahara           1687
Northern Cyprus            997
          Macao            893
          Wales            490
       Scotland            383

3. coherence des donnees cumulatives
   pays avec problemes de coherence: 24
   top 10 pays avec le plus de problemes:
                         country  cases_decreases  deaths_decreases  new_vs_total_discrepancies  total_issues
Saint Vincent and the Grenadines                0                 0                         290           290
                          Russia                0        

In [8]:
# correction et preparation des donnees pour agregation hebdomadaire
print("correction et preparation des donnees")
print("=" * 50)

# 1. filtrer les pays avec suffisamment de donnees
print("\n1. filtrage des pays avec donnees insuffisantes")
min_data_points = 100  # au moins 100 jours avec des donnees
countries_data_count = covid_data.groupby('location')['new_cases'].count()
valid_countries = countries_data_count[countries_data_count >= min_data_points].index.tolist()

print(f"   pays initiaux: {covid_data['location'].nunique()}")
print(f"   pays avec >= {min_data_points} points de donnees: {len(valid_countries)}")

# filtrer le dataset
covid_filtered = covid_data[covid_data['location'].isin(valid_countries)].copy()
print(f"   lignes apres filtrage: {len(covid_filtered)} ({len(covid_filtered)/len(covid_data)*100:.1f}%)")

# 2. corriger les incoherences entre new_cases et total_cases
print("\n2. correction des incoherences new_cases vs total_cases")
corrections_made = 0

for country in covid_filtered['location'].unique():
    country_idx = covid_filtered['location'] == country
    country_data = covid_filtered[country_idx].sort_values('date')
    
    # calculer new_cases a partir de total_cases quand possible
    calculated_new = country_data['total_cases'].diff()
    
    # remplacer new_cases par la difference de total_cases si incoherent
    mask = (
        country_data['new_cases'].notna() & 
        calculated_new.notna() & 
        (abs(country_data['new_cases'] - calculated_new) > 1)
    )
    
    if mask.any():
        covid_filtered.loc[country_data[mask].index, 'new_cases'] = calculated_new[mask]
        corrections_made += mask.sum()

print(f"   corrections effectuees: {corrections_made}")

# 3. gerer les valeurs extremes
print("\n3. gestion des valeurs extremes")
# identifier les seuils raisonnables (99.9 percentile)
threshold_cases = covid_filtered[covid_filtered['new_cases'] > 0]['new_cases'].quantile(0.999)
threshold_deaths = covid_filtered[covid_filtered['new_deaths'] > 0]['new_deaths'].quantile(0.999)

print(f"   seuil new_cases (99.9%): {threshold_cases:,.0f}")
print(f"   seuil new_deaths (99.9%): {threshold_deaths:,.0f}")

# marquer les valeurs extremes sans les supprimer
covid_filtered['extreme_cases'] = covid_filtered['new_cases'] > threshold_cases
covid_filtered['extreme_deaths'] = covid_filtered['new_deaths'] > threshold_deaths

extreme_cases_count = covid_filtered['extreme_cases'].sum()
extreme_deaths_count = covid_filtered['extreme_deaths'].sum()
print(f"   valeurs extremes marquees - cases: {extreme_cases_count}, deaths: {extreme_deaths_count}")

# 4. preparer l'agregation hebdomadaire
print("\n4. preparation pour agregation hebdomadaire")

# ajouter les informations temporelles necessaires
covid_filtered['year'] = covid_filtered['date'].dt.year
covid_filtered['week'] = covid_filtered['date'].dt.isocalendar().week
covid_filtered['year_week'] = (
    covid_filtered['year'].astype(str) + '-' + 
    covid_filtered['week'].astype(str).str.zfill(2)
)

# identifier le debut et fin de chaque semaine
covid_filtered['week_start'] = covid_filtered['date'] - pd.to_timedelta(covid_filtered['date'].dt.dayofweek, unit='d')
covid_filtered['week_end'] = covid_filtered['week_start'] + pd.Timedelta(days=6)

# 5. verifier la qualite des donnees preparees
print("\n5. verification finale")
print(f"   donnees finales: {len(covid_filtered):,} lignes")
print(f"   pays: {covid_filtered['location'].nunique()}")
print(f"   plage temporelle: {covid_filtered['date'].min()} a {covid_filtered['date'].max()}")
print(f"   semaines uniques: {covid_filtered['year_week'].nunique()}")

# verifier qu'on a bien des donnees pour les dimanches
sunday_data = covid_filtered[covid_filtered['date'].dt.dayofweek == 6]
sunday_with_cases = sunday_data[sunday_data['new_cases'] > 0]
print(f"   dimanches avec donnees: {len(sunday_with_cases)} / {len(sunday_data)} ({len(sunday_with_cases)/len(sunday_data)*100:.1f}%)")

# statistiques sur les colonnes cles
print("\n   completude des colonnes essentielles:")
essential_cols = ['new_cases', 'new_deaths', 'total_cases', 'total_deaths', 'population']
for col in essential_cols:
    completeness = (covid_filtered[col].notna().sum() / len(covid_filtered) * 100)
    print(f"     {col}: {completeness:.1f}%")

print("\ndonnees pretes pour agregation hebdomadaire")

correction et preparation des donnees

1. filtrage des pays avec donnees insuffisantes
   pays initiaux: 243
   pays avec >= 100 points de donnees: 234
   lignes apres filtrage: 410193 (97.4%)

2. correction des incoherences new_cases vs total_cases
   corrections effectuees: 2086

3. gestion des valeurs extremes
   seuil new_cases (99.9%): 1,430,714
   seuil new_deaths (99.9%): 17,246
   valeurs extremes marquees - cases: 39, deaths: 27

4. preparation pour agregation hebdomadaire

5. verification finale
   donnees finales: 410,193 lignes
   pays: 234
   plage temporelle: 2020-01-01 00:00:00 a 2024-08-14 00:00:00
   semaines uniques: 243
   dimanches avec donnees: 38965 / 58806 (66.3%)

   completude des colonnes essentielles:
     new_cases: 99.6%
     new_deaths: 99.7%
     total_cases: 100.0%
     total_deaths: 100.0%
     population: 100.0%

donnees pretes pour agregation hebdomadaire


In [9]:
# agregation hebdomadaire des donnees covid
print("agregation hebdomadaire des donnees")
print("=" * 50)

# fonction d'agregation hebdomadaire
def aggregate_weekly_data(df):
    """
    agregation hebdomadaire avec calcul des metriques cles
    """
    # grouper par pays et semaine
    weekly_agg = df.groupby(['location', 'year_week', 'week_start', 'week_end']).agg({
        # donnees cumulatives - prendre la derniere valeur de la semaine
        'total_cases': 'last',
        'total_deaths': 'last',
        
        # sommes hebdomadaires
        'new_cases': 'sum',
        'new_deaths': 'sum',
        
        # moyennes des donnees lissees
        'new_cases_smoothed': 'mean',
        'new_deaths_smoothed': 'mean',
        
        # moyennes des taux
        'new_cases_per_million': 'mean',
        'new_deaths_per_million': 'mean',
        'total_deaths_per_million': 'last',
        
        # autres metriques - moyennes
        'reproduction_rate': 'mean',
        'mortality_rate': 'mean',
        'stringency_index': 'mean',
        
        # metadonnees constantes - prendre la premiere valeur
        'population': 'first',
        'population_density': 'first',
        'continent': 'first',
        'iso_code': 'first',
        'latitude': 'first',
        'longitude': 'first',
        
        # marqueurs de qualite
        'extreme_cases': 'any',
        'extreme_deaths': 'any',
        
        # nombre de jours avec donnees dans la semaine
        'date': 'count'
    }).reset_index()
    
    # renommer les colonnes
    weekly_agg = weekly_agg.rename(columns={
        'date': 'days_with_data',
        'new_cases': 'weekly_cases',
        'new_deaths': 'weekly_deaths',
        'new_cases_smoothed': 'avg_daily_cases_smoothed',
        'new_deaths_smoothed': 'avg_daily_deaths_smoothed',
        'new_cases_per_million': 'avg_cases_per_million',
        'new_deaths_per_million': 'avg_deaths_per_million',
        'reproduction_rate': 'avg_reproduction_rate',
        'mortality_rate': 'avg_mortality_rate',
        'stringency_index': 'avg_stringency_index'
    })
    
    return weekly_agg

# executer l'agregation
print("\nexecution de l'agregation...")
covid_weekly = aggregate_weekly_data(covid_filtered)

# calculer les differences hebdomadaires des totaux cumules
print("\ncalcul des differences hebdomadaires...")
covid_weekly = covid_weekly.sort_values(['location', 'week_start'])
covid_weekly['weekly_cases_from_total'] = covid_weekly.groupby('location')['total_cases'].diff()
covid_weekly['weekly_deaths_from_total'] = covid_weekly.groupby('location')['total_deaths'].diff()

# calculer le taux de croissance hebdomadaire
covid_weekly['cases_growth_rate'] = covid_weekly.groupby('location')['weekly_cases'].pct_change()
covid_weekly['deaths_growth_rate'] = covid_weekly.groupby('location')['weekly_deaths'].pct_change()

# ajouter des indicateurs de qualite et gestion des valeurs extremes
covid_weekly['data_quality'] = 'good'
covid_weekly.loc[covid_weekly['days_with_data'] < 3, 'data_quality'] = 'incomplete'
covid_weekly.loc[covid_weekly['extreme_cases'] | covid_weekly['extreme_deaths'], 'data_quality'] = 'extreme_values'

# creer des versions ajustees des donnees pour la regression
# option 1: cap les valeurs extremes au 99.9 percentile
cap_cases = covid_weekly[covid_weekly['weekly_cases'] > 0]['weekly_cases'].quantile(0.999)
cap_deaths = covid_weekly[covid_weekly['weekly_deaths'] > 0]['weekly_deaths'].quantile(0.999)

covid_weekly['weekly_cases_capped'] = covid_weekly['weekly_cases'].clip(upper=cap_cases)
covid_weekly['weekly_deaths_capped'] = covid_weekly['weekly_deaths'].clip(upper=cap_deaths)

# option 2: transformation logarithmique pour reduire l'impact des extremes
covid_weekly['log_weekly_cases'] = np.log1p(covid_weekly['weekly_cases'])
covid_weekly['log_weekly_deaths'] = np.log1p(covid_weekly['weekly_deaths'])

# option 3: z-score pour identifier les outliers statistiques
def calculate_robust_zscore(series):
    """calcul du z-score robuste base sur la mediane et mad"""
    median = series.median()
    mad = (series - median).abs().median()
    if mad == 0:
        return pd.Series(0, index=series.index)
    return (series - median) / (1.4826 * mad)

covid_weekly['zscore_cases'] = covid_weekly.groupby('location')['weekly_cases'].transform(calculate_robust_zscore)
covid_weekly['zscore_deaths'] = covid_weekly.groupby('location')['weekly_deaths'].transform(calculate_robust_zscore)

# marquer les outliers statistiques (zscore > 3)
covid_weekly['is_outlier'] = (abs(covid_weekly['zscore_cases']) > 3) | (abs(covid_weekly['zscore_deaths']) > 3)

# creer un poids pour la regression (downweight les outliers)
covid_weekly['regression_weight'] = 1.0
covid_weekly.loc[covid_weekly['data_quality'] == 'extreme_values', 'regression_weight'] = 0.1
covid_weekly.loc[covid_weekly['data_quality'] == 'incomplete', 'regression_weight'] = 0.5
covid_weekly.loc[covid_weekly['is_outlier'], 'regression_weight'] = 0.2

# statistiques de l'agregation
print("\n" + "=" * 50)
print("resultats de l'agregation hebdomadaire")
print("=" * 50)
print(f"nombre total de lignes: {len(covid_weekly):,}")
print(f"nombre de pays: {covid_weekly['location'].nunique()}")
print(f"nombre de semaines: {covid_weekly['year_week'].nunique()}")
print(f"plage temporelle: {covid_weekly['week_start'].min()} a {covid_weekly['week_end'].max()}")

# qualite des donnees
print("\nqualite des donnees hebdomadaires:")
quality_dist = covid_weekly['data_quality'].value_counts()
for quality, count in quality_dist.items():
    print(f"  {quality}: {count:,} ({count/len(covid_weekly)*100:.1f}%)")

# statistiques des cas hebdomadaires
print("\nstatistiques des cas hebdomadaires:")
weekly_cases_stats = covid_weekly[covid_weekly['weekly_cases'] > 0]['weekly_cases'].describe()
print(f"  moyenne: {weekly_cases_stats['mean']:,.0f}")
print(f"  mediane: {weekly_cases_stats['50%']:,.0f}")
print(f"  max: {weekly_cases_stats['max']:,.0f}")

# verifier la coherence entre sommes et differences
print("\nverification de coherence (weekly_cases vs differences de totaux):")
mask = (covid_weekly['weekly_cases'].notna() & 
        covid_weekly['weekly_cases_from_total'].notna() & 
        (covid_weekly['weekly_cases'] > 0))
coherent_data = covid_weekly[mask]
if len(coherent_data) > 0:
    correlation = coherent_data['weekly_cases'].corr(coherent_data['weekly_cases_from_total'])
    print(f"  correlation: {correlation:.3f}")
    
    # calculer l'ecart moyen
    coherent_data['difference'] = abs(coherent_data['weekly_cases'] - coherent_data['weekly_cases_from_total'])
    avg_difference = coherent_data['difference'].mean()
    print(f"  ecart moyen: {avg_difference:,.0f}")

# exemple de donnees agregees
print("\nexemple de donnees agregees (france, 5 premieres semaines avec cas):")
france_sample = covid_weekly[
    (covid_weekly['location'] == 'France') & 
    (covid_weekly['weekly_cases'] > 0)
].head()
display_cols = ['location', 'year_week', 'weekly_cases', 'weekly_deaths', 
                'avg_reproduction_rate', 'avg_stringency_index', 'data_quality']
print(france_sample[display_cols].to_string(index=False))

# afficher les strategies de gestion des outliers
print("\nstrategies de gestion des outliers:")
print(f"  valeurs cap (99.9%): cases <= {cap_cases:,.0f}, deaths <= {cap_deaths:,.0f}")
print(f"  donnees marquees comme outliers: {covid_weekly['is_outlier'].sum():,} ({covid_weekly['is_outlier'].sum()/len(covid_weekly)*100:.1f}%)")
print("\n  distribution des poids de regression:")
weight_dist = covid_weekly['regression_weight'].value_counts().sort_index()
for weight, count in weight_dist.items():
    print(f"    poids {weight}: {count:,} lignes ({count/len(covid_weekly)*100:.1f}%)")

print("\nagregation hebdomadaire terminee avec succes")

agregation hebdomadaire des donnees

execution de l'agregation...

calcul des differences hebdomadaires...

resultats de l'agregation hebdomadaire
nombre total de lignes: 56,874
nombre de pays: 234
nombre de semaines: 243
plage temporelle: 2019-12-30 00:00:00 a 2024-08-18 00:00:00

qualite des donnees hebdomadaires:
  good: 56,130 (98.7%)
  incomplete: 689 (1.2%)
  extreme_values: 55 (0.1%)

statistiques des cas hebdomadaires:
  moyenne: 19,956
  mediane: 395
  max: 40,475,477

verification de coherence (weekly_cases vs differences de totaux):
  correlation: 1.000
  ecart moyen: 0

exemple de donnees agregees (france, 5 premieres semaines avec cas):
location year_week  weekly_cases  weekly_deaths  avg_reproduction_rate  avg_stringency_index data_quality
  France   2020-04         3.000          0.000                    NaN                 3.177         good
  France   2020-05         3.000          0.000                    NaN                 5.560         good
  France   2020-06      

In [10]:
# verification finale et sauvegarde des donnees
print("verification finale avant sauvegarde")
print("=" * 50)

# 1. analyser pourquoi tant de donnees sont marquees comme outliers
print("\n1. analyse des outliers")
outliers = covid_weekly[covid_weekly['is_outlier']]
print(f"   nombre total d'outliers: {len(outliers):,}")
print(f"   pays avec le plus d'outliers:")
outlier_by_country = outliers['location'].value_counts().head(10)
for country, count in outlier_by_country.items():
    total_weeks = len(covid_weekly[covid_weekly['location'] == country])
    print(f"     {country}: {count} outliers sur {total_weeks} semaines ({count/total_weeks*100:.1f}%)")

# 2. verifier la distribution des z-scores
print("\n2. distribution des z-scores")
print(f"   z-score cases - max: {covid_weekly['zscore_cases'].max():.2f}")
print(f"   z-score cases - min: {covid_weekly['zscore_cases'].min():.2f}")
print(f"   z-score > 3: {(covid_weekly['zscore_cases'] > 3).sum():,}")
print(f"   z-score < -3: {(covid_weekly['zscore_cases'] < -3).sum():,}")

# 3. ajuster le critere d'outlier pour etre moins strict
print("\n3. ajustement du critere d'outlier")
# utiliser un seuil plus eleve pour les outliers (z-score > 5 au lieu de 3)
covid_weekly['is_extreme_outlier'] = (abs(covid_weekly['zscore_cases']) > 5) | (abs(covid_weekly['zscore_deaths']) > 5)

# recalculer les poids avec le nouveau critere
covid_weekly['regression_weight_adjusted'] = 1.0
covid_weekly.loc[covid_weekly['data_quality'] == 'extreme_values', 'regression_weight_adjusted'] = 0.1
covid_weekly.loc[covid_weekly['data_quality'] == 'incomplete', 'regression_weight_adjusted'] = 0.5
covid_weekly.loc[covid_weekly['is_extreme_outlier'], 'regression_weight_adjusted'] = 0.3

print(f"   outliers extremes (z > 5): {covid_weekly['is_extreme_outlier'].sum():,} ({covid_weekly['is_extreme_outlier'].sum()/len(covid_weekly)*100:.1f}%)")
print("\n   distribution des poids ajustes:")
weight_dist = covid_weekly['regression_weight_adjusted'].value_counts().sort_index()
for weight, count in weight_dist.items():
    print(f"     poids {weight}: {count:,} lignes ({count/len(covid_weekly)*100:.1f}%)")

# 4. colonnes finales a sauvegarder
print("\n4. selection des colonnes pour la sauvegarde")
columns_to_save = [
    # identifiants
    'location', 'iso_code', 'continent',
    'year_week', 'week_start', 'week_end',
    
    # donnees principales
    'weekly_cases', 'weekly_deaths',
    'total_cases', 'total_deaths',
    
    # donnees ajustees
    'weekly_cases_capped', 'weekly_deaths_capped',
    'log_weekly_cases', 'log_weekly_deaths',
    
    # moyennes hebdomadaires
    'avg_daily_cases_smoothed', 'avg_daily_deaths_smoothed',
    'avg_cases_per_million', 'avg_deaths_per_million',
    'avg_reproduction_rate', 'avg_mortality_rate',
    'avg_stringency_index',
    
    # taux de croissance
    'cases_growth_rate', 'deaths_growth_rate',
    
    # donnees demographiques
    'population', 'population_density',
    'latitude', 'longitude',
    
    # indicateurs de qualite
    'days_with_data', 'data_quality',
    'is_outlier', 'is_extreme_outlier',
    'regression_weight', 'regression_weight_adjusted',
    'extreme_cases', 'extreme_deaths'
]

# verifier que toutes les colonnes existent
missing_cols = [col for col in columns_to_save if col not in covid_weekly.columns]
if missing_cols:
    print(f"   colonnes manquantes: {missing_cols}")
else:
    print(f"   toutes les {len(columns_to_save)} colonnes sont presentes")

# 5. sauvegarder les donnees
covid_weekly_final = covid_weekly[columns_to_save].copy()

# trier par pays et date
covid_weekly_final = covid_weekly_final.sort_values(['location', 'week_start'])

# sauvegarder
output_file = PROC_DATA_DIR2 / "covid_weekly_final.csv"
covid_weekly_final.to_csv(output_file, index=False)

print(f"\n5. sauvegarde terminee")
print(f"   fichier: {output_file}")
print(f"   taille: {len(covid_weekly_final):,} lignes")
print(f"   pays: {covid_weekly_final['location'].nunique()}")
print(f"   semaines: {covid_weekly_final['year_week'].nunique()}")

# afficher un echantillon
print("\n6. echantillon des donnees sauvegardees (5 premieres lignes):")
sample_cols = ['location', 'year_week', 'weekly_cases', 'weekly_deaths', 
               'regression_weight_adjusted', 'data_quality']
print(covid_weekly_final[sample_cols].head().to_string(index=False))

print("\n" + "=" * 50)
print("donnees hebdomadaires nettoyees et sauvegardees avec succes!")

verification finale avant sauvegarde

1. analyse des outliers
   nombre total d'outliers: 14,837
   pays avec le plus d'outliers:
     Mayotte: 117 outliers sur 243 semaines (48.1%)
     El Salvador: 117 outliers sur 243 semaines (48.1%)
     Egypt: 114 outliers sur 243 semaines (46.9%)
     Palestine: 108 outliers sur 243 semaines (44.4%)
     Algeria: 107 outliers sur 243 semaines (44.0%)
     Guernsey: 107 outliers sur 243 semaines (44.0%)
     Iraq: 106 outliers sur 243 semaines (43.6%)
     Jordan: 106 outliers sur 243 semaines (43.6%)
     Kazakhstan: 104 outliers sur 243 semaines (42.8%)
     Bangladesh: 104 outliers sur 243 semaines (42.8%)

2. distribution des z-scores
   z-score cases - max: 24527.80
   z-score cases - min: -1.03
   z-score > 3: 13,238
   z-score < -3: 0

3. ajustement du critere d'outlier
   outliers extremes (z > 5): 11,961 (21.0%)

   distribution des poids ajustes:
     poids 0.1: 2 lignes (0.0%)
     poids 0.3: 11,961 lignes (21.0%)
     poids 0.5: 526 l