In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import seaborn as sns
#import missingno as msno

# matplotlib
from matplotlib import pyplot as plt
from matplotlib.dates import date2num, num2date
from matplotlib import dates as mdates
from matplotlib import ticker
from matplotlib.colors import ListedColormap
from matplotlib.patches import Patch

# scipy specifics
from scipy import stats as sps
from scipy.interpolate import interp1d

In [2]:
try:
    from urllib.request import Request, urlopen  # Python 3
except ImportError:
    from urllib2 import Request, urlopen  # Python 2

req = Request('https://cloud.minsa.gob.pe/s/AC2adyLkHCKjmfm/download')
req.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0')
content = urlopen(req)

test = pd.read_csv(content , sep=";", na_values='EN INVESTIGACIÓN', encoding='utf-8')
print(test)

         FECHA_CORTE DEPARTAMENTO PROVINCIA              DISTRITO METODODX  \
0           20220101         LIMA      LIMA  SAN MARTIN DE PORRES       PR   
1           20220101          ICA     PISCO                 PISCO       PR   
2           20220101      HUANUCO   HUANUCO               HUANUCO       PR   
3           20220101       ANCASH     SANTA                 SANTA       AG   
4           20220101       ANCASH     SANTA        NUEVO CHIMBOTE       AG   
...              ...          ...       ...                   ...      ...   
2302673     20220101  LA LIBERTAD  TRUJILLO              TRUJILLO       AG   
2302674     20220101         LIMA      LIMA                  LIMA      PCR   
2302675     20220101       CALLAO    CALLAO            BELLAVISTA      PCR   
2302676     20220101         LIMA      LIMA           EL AGUSTINO       PR   
2302677     20220101       CALLAO    CALLAO            BELLAVISTA      PCR   

         EDAD       SEXO  FECHA_RESULTADO    UBIGEO  id_persona

In [3]:
test['FECHA_RESULTADO'] = pd.to_datetime(test['FECHA_RESULTADO'].astype(str), format='%Y%m%d')
test.tail()

Unnamed: 0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona
2302673,20220101,LA LIBERTAD,TRUJILLO,TRUJILLO,AG,36.0,MASCULINO,2021-11-18,130101.0,
2302674,20220101,LIMA,LIMA,LIMA,PCR,47.0,MASCULINO,2021-05-12,150101.0,
2302675,20220101,CALLAO,CALLAO,BELLAVISTA,PCR,27.0,FEMENINO,2021-04-01,70102.0,
2302676,20220101,LIMA,LIMA,EL AGUSTINO,PR,22.0,FEMENINO,2020-11-03,150111.0,
2302677,20220101,CALLAO,CALLAO,BELLAVISTA,PCR,17.0,MASCULINO,2021-03-29,70102.0,


In [4]:
test.isnull().sum()

FECHA_CORTE             0
DEPARTAMENTO            0
PROVINCIA          112475
DISTRITO           112475
METODODX                0
EDAD                  347
SEXO                    1
FECHA_RESULTADO      2023
UBIGEO             112475
id_persona          40362
dtype: int64

In [5]:
indice_departamento = pd.read_csv('https://raw.githubusercontent.com/annaabsi/git-scraper-covid19/main/resultados/positivos_por_departamentos.csv')
indice_departamento

Unnamed: 0,DEPARTAMENTO,METODODX,POBLACION,INDICE
0,AMAZONAS,33050,426806,7744
1,ANCASH,81710,1180638,6921
2,APURIMAC,26865,430736,6237
3,AREQUIPA,118364,1497438,7904
4,AYACUCHO,35200,668213,5268
5,CAJAMARCA,67727,1453711,4659
6,CALLAO,104658,1129854,9263
7,CUSCO,76462,1357075,5634
8,HUANCAVELICA,16669,365317,4563
9,HUANUCO,37044,760267,4872


In [6]:
test = pd.merge(test, indice_departamento,
                        how="left", on=["DEPARTAMENTO"])
test

Unnamed: 0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX_x,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona,METODODX_y,POBLACION,INDICE
0,20220101,LIMA,LIMA,SAN MARTIN DE PORRES,PR,25.0,MASCULINO,2020-12-17,150135.0,24662153.0,1015986,10628470,9559
1,20220101,ICA,PISCO,PISCO,PR,20.0,FEMENINO,2020-08-22,110501.0,24662175.0,61802,975182,6337
2,20220101,HUANUCO,HUANUCO,HUANUCO,PR,22.0,FEMENINO,2020-07-29,100101.0,24662197.0,37044,760267,4872
3,20220101,ANCASH,SANTA,SANTA,AG,18.0,FEMENINO,2021-06-30,21808.0,24662204.0,81710,1180638,6921
4,20220101,ANCASH,SANTA,NUEVO CHIMBOTE,AG,17.0,MASCULINO,2021-04-04,21809.0,24662207.0,81710,1180638,6921
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2302673,20220101,LA LIBERTAD,TRUJILLO,TRUJILLO,AG,36.0,MASCULINO,2021-11-18,130101.0,,94150,2016771,4668
2302674,20220101,LIMA,LIMA,LIMA,PCR,47.0,MASCULINO,2021-05-12,150101.0,,1015986,10628470,9559
2302675,20220101,CALLAO,CALLAO,BELLAVISTA,PCR,27.0,FEMENINO,2021-04-01,70102.0,,104658,1129854,9263
2302676,20220101,LIMA,LIMA,EL AGUSTINO,PR,22.0,FEMENINO,2020-11-03,150111.0,,1015986,10628470,9559


In [7]:
poblacion_csv = pd.read_csv('poblacion_provincia.csv')
poblacion_csv

Unnamed: 0,UBIGEO,PROVINCIA,POBLACION
0,10100,CHACHAPOYAS,63188
1,10200,BAGUA,84672
2,10300,BONGARA,26830
3,10400,CONDORCANQUI,51344
4,10500,LUYA,47827
...,...,...,...
191,240300,ZARUMILLA,56038
192,250100,CORONEL PORTILLO,447733
193,250200,ATALAYA,61049
194,250300,PADRE ABAD,77044


In [8]:
poblacion_dict = poblacion_csv.to_dict('split')
poblacion_dict['data']

[[10100, 'CHACHAPOYAS', 63188],
 [10200, 'BAGUA', 84672],
 [10300, 'BONGARA', 26830],
 [10400, 'CONDORCANQUI', 51344],
 [10500, 'LUYA', 47827],
 [10600, 'RODRIGUEZ DE MENDOZA', 33651],
 [10700, 'UTCUBAMBA', 119294],
 [20100, 'HUARAZ', 185276],
 [20200, 'AIJA', 6433],
 [20300, 'ANTONIO RAYMONDI', 13950],
 [20400, 'ASUNCION', 7710],
 [20500, 'BOLOGNESI', 24012],
 [20600, 'CARHUAZ', 50007],
 [20700, 'CARLOS FERMIN FITZCARRALD', 18496],
 [20800, 'CASMA', 57256],
 [20900, 'CORONGO', 8017],
 [21000, 'HUARI', 63264],
 [21100, 'HUARMEY', 33066],
 [21200, 'HUAYLAS', 56557],
 [21300, 'MARISCAL LUZURIAGA', 21787],
 [21400, 'OCROS', 7224],
 [21500, 'PALLASCA', 24371],
 [21600, 'POMABAMBA', 26675],
 [21700, 'RECUAY', 18085],
 [21800, 'SANTA', 474053],
 [21900, 'SIHUAS', 28630],
 [22000, 'YUNGAY', 55769],
 [30100, 'ABANCAY', 120116],
 [30200, 'ANDAHUAYLAS', 150758],
 [30300, 'ANTABAMBA', 11781],
 [30400, 'AYMARAES', 24570],
 [30500, 'COTABAMBAS', 55208],
 [30600, 'CHINCHEROS', 46544],
 [30700, 'GRAU

In [9]:
poblacion =[]
for x in range(len(poblacion_dict['data'])):
    array = poblacion_dict['data'][x][2]
    poblacion.append(array)
    
poblacion

[63188,
 84672,
 26830,
 51344,
 47827,
 33651,
 119294,
 185276,
 6433,
 13950,
 7710,
 24012,
 50007,
 18496,
 57256,
 8017,
 63264,
 33066,
 56557,
 21787,
 7224,
 24371,
 26675,
 18085,
 474053,
 28630,
 55769,
 120116,
 150758,
 11781,
 24570,
 55208,
 46544,
 21759,
 1175765,
 61708,
 43690,
 34743,
 97458,
 16426,
 54851,
 12797,
 317801,
 32482,
 8341,
 97205,
 75277,
 51838,
 29139,
 9909,
 9292,
 19866,
 17063,
 388170,
 83167,
 83916,
 151714,
 29357,
 123948,
 83913,
 203724,
 145770,
 51678,
 47114,
 22638,
 38602,
 1129854,
 511019,
 24000,
 63131,
 71582,
 34754,
 106476,
 70143,
 62059,
 167910,
 26644,
 47579,
 101735,
 70043,
 121265,
 37503,
 53901,
 14588,
 33883,
 18182,
 85995,
 315799,
 53247,
 32427,
 16372,
 52095,
 138275,
 29160,
 50086,
 36987,
 17114,
 18705,
 445752,
 262110,
 78472,
 14832,
 174016,
 595183,
 59138,
 167385,
 88405,
 22757,
 239105,
 91849,
 40041,
 57604,
 1118724,
 123480,
 15982,
 86411,
 30987,
 85091,
 112970,
 85092,
 168670,
 55868

In [10]:
provincia =[]
for x in range(len(poblacion_dict['data'])):
    array = poblacion_dict['data'][x][1]
    provincia.append(array)
    
provincia

['CHACHAPOYAS',
 'BAGUA',
 'BONGARA',
 'CONDORCANQUI',
 'LUYA',
 'RODRIGUEZ DE MENDOZA',
 'UTCUBAMBA',
 'HUARAZ',
 'AIJA',
 'ANTONIO RAYMONDI',
 'ASUNCION',
 'BOLOGNESI',
 'CARHUAZ',
 'CARLOS FERMIN FITZCARRALD',
 'CASMA',
 'CORONGO',
 'HUARI',
 'HUARMEY',
 'HUAYLAS',
 'MARISCAL LUZURIAGA',
 'OCROS',
 'PALLASCA',
 'POMABAMBA',
 'RECUAY',
 'SANTA',
 'SIHUAS',
 'YUNGAY',
 'ABANCAY',
 'ANDAHUAYLAS',
 'ANTABAMBA',
 'AYMARAES',
 'COTABAMBAS',
 'CHINCHEROS',
 'GRAU',
 'AREQUIPA',
 'CAMANA',
 'CARAVELI',
 'CASTILLA',
 'CAYLLOMA',
 'CONDESUYOS',
 'ISLAY',
 'LA UNION',
 'HUAMANGA',
 'CANGALLO',
 'HUANCA SANCOS',
 'HUANTA',
 'LA MAR',
 'LUCANAS',
 'PARINACOCHAS',
 'PAUCAR DEL SARA SARA',
 'SUCRE',
 'VICTOR FAJARDO',
 'VILCAS HUAMAN',
 'CAJAMARCA',
 'CAJABAMBA',
 'CELENDIN',
 'CHOTA',
 'CONTUMAZA',
 'CUTERVO',
 'HUALGAYOC',
 'JAEN',
 'SAN IGNACIO',
 'SAN MARCOS',
 'SAN MIGUEL',
 'SAN PABLO',
 'SANTA CRUZ',
 'CALLAO',
 'CUSCO',
 'ACOMAYO',
 'ANTA',
 'CALCA',
 'CANAS',
 'CANCHIS',
 'CHUMBIVILCAS',


In [11]:
res = {provincia[i]: poblacion[i] for i in range(len(provincia))}
res

{'CHACHAPOYAS': 63188,
 'BAGUA': 84672,
 'BONGARA': 26830,
 'CONDORCANQUI': 51344,
 'LUYA': 47827,
 'RODRIGUEZ DE MENDOZA': 33651,
 'UTCUBAMBA': 119294,
 'HUARAZ': 185276,
 'AIJA': 6433,
 'ANTONIO RAYMONDI': 13950,
 'ASUNCION': 7710,
 'BOLOGNESI': 24012,
 'CARHUAZ': 50007,
 'CARLOS FERMIN FITZCARRALD': 18496,
 'CASMA': 57256,
 'CORONGO': 8017,
 'HUARI': 63264,
 'HUARMEY': 33066,
 'HUAYLAS': 56557,
 'MARISCAL LUZURIAGA': 21787,
 'OCROS': 7224,
 'PALLASCA': 24371,
 'POMABAMBA': 26675,
 'RECUAY': 18085,
 'SANTA': 474053,
 'SIHUAS': 28630,
 'YUNGAY': 55769,
 'ABANCAY': 120116,
 'ANDAHUAYLAS': 150758,
 'ANTABAMBA': 11781,
 'AYMARAES': 24570,
 'COTABAMBAS': 55208,
 'CHINCHEROS': 46544,
 'GRAU': 21759,
 'AREQUIPA': 1175765,
 'CAMANA': 61708,
 'CARAVELI': 43690,
 'CASTILLA': 34743,
 'CAYLLOMA': 97458,
 'CONDESUYOS': 16426,
 'ISLAY': 54851,
 'LA UNION': 12797,
 'HUAMANGA': 317801,
 'CANGALLO': 32482,
 'HUANCA SANCOS': 8341,
 'HUANTA': 97205,
 'LA MAR': 75277,
 'LUCANAS': 51838,
 'PARINACOCHAS':

In [12]:
test['POBLACION'] = test['PROVINCIA'].map(res)

In [13]:
null_columns = test.columns[test.isnull().any()]
test[test["SEXO"].isnull()][null_columns]

Unnamed: 0,PROVINCIA,DISTRITO,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona,POBLACION
1695533,LIMA,LIMA,0.0,,2021-03-13,150101.0,10881464.0,9674755.0


In [31]:
find_provincia = test['PROVINCIA'] == 'UCAYALI'
find_distrito = test['DISTRITO'] == 'LORETO'
find_fecha = test['FECHA_RESULTADO'] == '2021-12-19'

data_exploratoria = test[find_provincia & find_distrito & find_fecha]
data_exploratoria

Unnamed: 0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX_x,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona,METODODX_y,POBLACION,INDICE,POBLACION_CIENMIL


In [15]:
data_exploratoria.groupby("SEXO").count()

Unnamed: 0_level_0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX_x,EDAD,FECHA_RESULTADO,UBIGEO,id_persona,METODODX_y,POBLACION,INDICE
SEXO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1


In [16]:
test['SEXO'] = test['SEXO'].fillna('FEMENINO')

In [17]:
null_columns = test.columns[test.isnull().any()]
test[test["PROVINCIA"].isnull()][null_columns]

Unnamed: 0,PROVINCIA,DISTRITO,EDAD,FECHA_RESULTADO,UBIGEO,id_persona,POBLACION
16,,,35.0,2021-02-06,,24662423.0,
22,,,25.0,2020-08-04,,24662547.0,
31,,,37.0,2020-08-07,,24662598.0,
50,,,27.0,2021-02-26,,24769312.0,
69,,,39.0,2020-07-24,,24833739.0,
...,...,...,...,...,...,...,...
2302463,,,40.0,2021-03-30,,,
2302465,,,36.0,2021-03-15,,,
2302487,,,70.0,2020-12-05,,,
2302534,,,50.0,2021-03-16,,,


In [18]:
test['FECHA_RESULTADO'] = pd.to_datetime(test['FECHA_RESULTADO'], errors='coerce', dayfirst=True)
test['FECHA_RESULTADO']

0         2020-12-17
1         2020-08-22
2         2020-07-29
3         2021-06-30
4         2021-04-04
             ...    
2302673   2021-11-18
2302674   2021-05-12
2302675   2021-04-01
2302676   2020-11-03
2302677   2021-03-29
Name: FECHA_RESULTADO, Length: 2302678, dtype: datetime64[ns]

In [19]:
test['POBLACION_CIENMIL'] = test['POBLACION']/100000

#test = test.drop(labels="POBLACION_CIENMIL", axis=1)
test

Unnamed: 0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX_x,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona,METODODX_y,POBLACION,INDICE,POBLACION_CIENMIL
0,20220101,LIMA,LIMA,SAN MARTIN DE PORRES,PR,25.0,MASCULINO,2020-12-17,150135.0,24662153.0,1015986,9674755.0,9559,96.74755
1,20220101,ICA,PISCO,PISCO,PR,20.0,FEMENINO,2020-08-22,110501.0,24662175.0,61802,174016.0,6337,1.74016
2,20220101,HUANUCO,HUANUCO,HUANUCO,PR,22.0,FEMENINO,2020-07-29,100101.0,24662197.0,37044,315799.0,4872,3.15799
3,20220101,ANCASH,SANTA,SANTA,AG,18.0,FEMENINO,2021-06-30,21808.0,24662204.0,81710,474053.0,6921,4.74053
4,20220101,ANCASH,SANTA,NUEVO CHIMBOTE,AG,17.0,MASCULINO,2021-04-04,21809.0,24662207.0,81710,474053.0,6921,4.74053
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2302673,20220101,LA LIBERTAD,TRUJILLO,TRUJILLO,AG,36.0,MASCULINO,2021-11-18,130101.0,,94150,1118724.0,4668,11.18724
2302674,20220101,LIMA,LIMA,LIMA,PCR,47.0,MASCULINO,2021-05-12,150101.0,,1015986,9674755.0,9559,96.74755
2302675,20220101,CALLAO,CALLAO,BELLAVISTA,PCR,27.0,FEMENINO,2021-04-01,70102.0,,104658,1129854.0,9263,11.29854
2302676,20220101,LIMA,LIMA,EL AGUSTINO,PR,22.0,FEMENINO,2020-11-03,150111.0,,1015986,9674755.0,9559,96.74755


In [32]:
salidasxsemanas = test.sort_values(by = 'FECHA_RESULTADO')
start_date = "2021-11-15"
end_date = "2022-01-02"

after_start_date = salidasxsemanas["FECHA_RESULTADO"] >= start_date
before_end_date = salidasxsemanas["FECHA_RESULTADO"] <= end_date
between_two_dates = after_start_date & before_end_date

filtered_dates = salidasxsemanas.loc[between_two_dates]

filtered_dates

Unnamed: 0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX_x,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona,METODODX_y,POBLACION,INDICE,POBLACION_CIENMIL
1440791,20220101,PIURA,SULLANA,SULLANA,PCR,15.0,FEMENINO,2021-11-15,200601.0,6509276.0,95473,341490.0,4662,3.41490
1286530,20220101,LIMA,LIMA,ATE,AG,72.0,FEMENINO,2021-11-15,150103.0,5140842.0,1015986,9674755.0,9559,96.74755
60392,20220101,LIMA,LIMA,SAN JUAN DE LURIGANCHO,AG,38.0,MASCULINO,2021-11-15,150132.0,19734878.0,1015986,9674755.0,9559,96.74755
199848,20220101,LIMA,LIMA,LIMA,AG,29.0,FEMENINO,2021-11-15,150101.0,21742465.0,1015986,9674755.0,9559,96.74755
202270,20220101,AREQUIPA,AREQUIPA,AREQUIPA,PCR,43.0,FEMENINO,2021-11-15,40101.0,22494828.0,118364,1175765.0,7904,11.75765
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
267528,20220101,LIMA,LIMA,SANTIAGO DE SURCO,PCR,25.0,FEMENINO,2022-01-01,150140.0,25457136.0,1015986,9674755.0,9559,96.74755
1222787,20220101,LIMA,LIMA,LURIGANCHO,PCR,36.0,FEMENINO,2022-01-01,150118.0,4181630.0,1015986,9674755.0,9559,96.74755
1927577,20220101,CAJAMARCA,JAEN,JAEN,PCR,94.0,FEMENINO,2022-01-01,60801.0,14151859.0,67727,203724.0,4659,2.03724
1568528,20220101,LIMA,LIMA,CHORRILLOS,PCR,23.0,FEMENINO,2022-01-01,150108.0,8978636.0,1015986,9674755.0,9559,96.74755


In [33]:
filtered_dates['PROVINCIA'] = filtered_dates['PROVINCIA'].replace('EN INVESTIGACIÃN', np.nan)
filtered_dates

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_dates['PROVINCIA'] = filtered_dates['PROVINCIA'].replace('EN INVESTIGACIÃN', np.nan)


Unnamed: 0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX_x,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona,METODODX_y,POBLACION,INDICE,POBLACION_CIENMIL
1440791,20220101,PIURA,SULLANA,SULLANA,PCR,15.0,FEMENINO,2021-11-15,200601.0,6509276.0,95473,341490.0,4662,3.41490
1286530,20220101,LIMA,LIMA,ATE,AG,72.0,FEMENINO,2021-11-15,150103.0,5140842.0,1015986,9674755.0,9559,96.74755
60392,20220101,LIMA,LIMA,SAN JUAN DE LURIGANCHO,AG,38.0,MASCULINO,2021-11-15,150132.0,19734878.0,1015986,9674755.0,9559,96.74755
199848,20220101,LIMA,LIMA,LIMA,AG,29.0,FEMENINO,2021-11-15,150101.0,21742465.0,1015986,9674755.0,9559,96.74755
202270,20220101,AREQUIPA,AREQUIPA,AREQUIPA,PCR,43.0,FEMENINO,2021-11-15,40101.0,22494828.0,118364,1175765.0,7904,11.75765
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
267528,20220101,LIMA,LIMA,SANTIAGO DE SURCO,PCR,25.0,FEMENINO,2022-01-01,150140.0,25457136.0,1015986,9674755.0,9559,96.74755
1222787,20220101,LIMA,LIMA,LURIGANCHO,PCR,36.0,FEMENINO,2022-01-01,150118.0,4181630.0,1015986,9674755.0,9559,96.74755
1927577,20220101,CAJAMARCA,JAEN,JAEN,PCR,94.0,FEMENINO,2022-01-01,60801.0,14151859.0,67727,203724.0,4659,2.03724
1568528,20220101,LIMA,LIMA,CHORRILLOS,PCR,23.0,FEMENINO,2022-01-01,150108.0,8978636.0,1015986,9674755.0,9559,96.74755


In [34]:
filtered_dates.isnull().sum()

FECHA_CORTE             0
DEPARTAMENTO            0
PROVINCIA            1852
DISTRITO             1852
METODODX_x              0
EDAD                    0
SEXO                    0
FECHA_RESULTADO         0
UBIGEO               1852
id_persona           2295
METODODX_y              0
POBLACION            1967
INDICE                  0
POBLACION_CIENMIL    1967
dtype: int64

In [35]:
weekly_sales = filtered_dates.groupby(["SEXO","DEPARTAMENTO","PROVINCIA", "POBLACION", "POBLACION_CIENMIL", "INDICE", pd.Grouper(key="FECHA_RESULTADO",freq="W-SUN")]).size()
weekly_sales = weekly_sales.unstack(0).fillna(0)
weekly_sales.sort_values(by=['FECHA_RESULTADO'], inplace=True, ascending=True)
weekly_sales.loc[:,'TOTAL'] = weekly_sales.sum(numeric_only=True, axis=1)

weekly_sales = weekly_sales.reset_index()
weekly_sales

SEXO,DEPARTAMENTO,PROVINCIA,POBLACION,POBLACION_CIENMIL,INDICE,FECHA_RESULTADO,FEMENINO,MASCULINO,TOTAL
0,AMAZONAS,BAGUA,84672.0,0.84672,7744,2021-11-21,27.0,25.0,52.0
1,HUANCAVELICA,CHURCAMPA,33883.0,0.33883,4563,2021-11-21,1.0,2.0,3.0
2,HUANCAVELICA,CASTROVIRREYNA,14588.0,0.14588,4563,2021-11-21,1.0,0.0,1.0
3,HUANCAVELICA,ANGARAES,53901.0,0.53901,4563,2021-11-21,3.0,3.0,6.0
4,HUANCAVELICA,ACOBAMBA,37503.0,0.37503,4563,2021-11-21,1.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...
1134,HUANUCO,HUANUCO,315799.0,3.15799,4872,2022-01-02,40.0,33.0,73.0
1135,HUANUCO,HUAMALIES,52095.0,0.52095,4872,2022-01-02,1.0,0.0,1.0
1136,PUNO,EL COLLAO,66287.0,0.66287,3425,2022-01-02,4.0,9.0,13.0
1137,ANCASH,CASMA,57256.0,0.57256,6921,2022-01-02,2.0,1.0,3.0


In [36]:
weekly_sales.groupby("FECHA_RESULTADO").count()

SEXO,DEPARTAMENTO,PROVINCIA,POBLACION,POBLACION_CIENMIL,INDICE,FEMENINO,MASCULINO,TOTAL
FECHA_RESULTADO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-11-21,166,166,166,166,166,166,166,166
2021-11-28,161,161,161,161,161,161,161,161
2021-12-05,163,163,163,163,163,163,163,163
2021-12-12,164,164,164,164,164,164,164,164
2021-12-19,159,159,159,159,159,159,159,159
2021-12-26,159,159,159,159,159,159,159,159
2022-01-02,167,167,167,167,167,167,167,167


In [25]:
cero_cases = weekly_sales['TOTAL']==0
data_cero = weekly_sales[cero_cases]
data_cero

SEXO,DEPARTAMENTO,PROVINCIA,POBLACION,POBLACION_CIENMIL,INDICE,FECHA_RESULTADO,FEMENINO,MASCULINO,TOTAL


In [26]:
weekly_sales[['FEMENINO','MASCULINO', 'TOTAL']] = weekly_sales[['FEMENINO','MASCULINO', 'TOTAL']].div(weekly_sales['POBLACION_CIENMIL'].values,axis=0)
weekly_sales

SEXO,DEPARTAMENTO,PROVINCIA,POBLACION,POBLACION_CIENMIL,INDICE,FECHA_RESULTADO,FEMENINO,MASCULINO,TOTAL
0,AMAZONAS,BAGUA,84672.0,0.84672,7744,2021-11-14,14.172336,37.792895,51.965231
1,APURIMAC,COTABAMBAS,55208.0,0.55208,6237,2021-11-14,9.056658,14.490654,23.547312
2,LIMA,HUAROCHIRI,62381.0,0.62381,9559,2021-11-14,30.457992,20.839679,51.297671
3,LIMA,HUARAL,194375.0,1.94375,9559,2021-11-14,13.376206,15.948553,29.324759
4,APURIMAC,GRAU,21759.0,0.21759,6237,2021-11-14,9.191599,13.787398,22.978997
...,...,...,...,...,...,...,...,...,...
1296,LA LIBERTAD,BOLIVAR,15982.0,0.15982,4668,2022-01-02,12.514078,0.000000,12.514078
1297,LA LIBERTAD,ASCOPE,123480.0,1.23480,4668,2022-01-02,4.049239,4.049239,8.098477
1298,PUNO,MELGAR,69693.0,0.69693,3425,2022-01-02,7.174322,4.304593,11.478915
1299,AMAZONAS,UTCUBAMBA,119294.0,1.19294,7744,2022-01-02,13.412242,19.280098,32.692340


In [27]:
weekly_sales.FEMENINO = weekly_sales.FEMENINO.round()
weekly_sales.MASCULINO = weekly_sales.MASCULINO.round()
weekly_sales.TOTAL = weekly_sales.FEMENINO + weekly_sales.MASCULINO
weekly_sales.FECHA_RESULTADO = weekly_sales.FECHA_RESULTADO.dt.strftime('%Y-%m-%d')
weekly_sales

SEXO,DEPARTAMENTO,PROVINCIA,POBLACION,POBLACION_CIENMIL,INDICE,FECHA_RESULTADO,FEMENINO,MASCULINO,TOTAL
0,AMAZONAS,BAGUA,84672.0,0.84672,7744,2021-11-14,14.0,38.0,52.0
1,APURIMAC,COTABAMBAS,55208.0,0.55208,6237,2021-11-14,9.0,14.0,23.0
2,LIMA,HUAROCHIRI,62381.0,0.62381,9559,2021-11-14,30.0,21.0,51.0
3,LIMA,HUARAL,194375.0,1.94375,9559,2021-11-14,13.0,16.0,29.0
4,APURIMAC,GRAU,21759.0,0.21759,6237,2021-11-14,9.0,14.0,23.0
...,...,...,...,...,...,...,...,...,...
1296,LA LIBERTAD,BOLIVAR,15982.0,0.15982,4668,2022-01-02,13.0,0.0,13.0
1297,LA LIBERTAD,ASCOPE,123480.0,1.23480,4668,2022-01-02,4.0,4.0,8.0
1298,PUNO,MELGAR,69693.0,0.69693,3425,2022-01-02,7.0,4.0,11.0
1299,AMAZONAS,UTCUBAMBA,119294.0,1.19294,7744,2022-01-02,13.0,19.0,32.0


In [28]:
weekly_sales.to_csv('dataset_covid_total.csv' , index=False)

In [29]:
weekly_sales.to_json('dataset_covid_total.json', orient="table")