In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import seaborn as sns
#import missingno as msno

# matplotlib
from matplotlib import pyplot as plt
from matplotlib.dates import date2num, num2date
from matplotlib import dates as mdates
from matplotlib import ticker
from matplotlib.colors import ListedColormap
from matplotlib.patches import Patch

# scipy specifics
from scipy import stats as sps
from scipy.interpolate import interp1d

In [2]:
try:
    from urllib.request import Request, urlopen  # Python 3
except ImportError:
    from urllib2 import Request, urlopen  # Python 2

req = Request('https://cloud.minsa.gob.pe/s/AC2adyLkHCKjmfm/download')
req.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0')
content = urlopen(req)

test = pd.read_csv(content , sep=";", na_values='EN INVESTIGACIÓN', encoding='utf-8')
print(test)

         FECHA_CORTE DEPARTAMENTO PROVINCIA              DISTRITO METODODX  \
0           20211016         LIMA      LIMA  SAN MARTIN DE PORRES       PR   
1           20211016          ICA     PISCO                 PISCO       PR   
2           20211016      HUANUCO   HUANUCO               HUANUCO       PR   
3           20211016       ANCASH     SANTA                 SANTA       AG   
4           20211016       ANCASH     SANTA        NUEVO CHIMBOTE       AG   
...              ...          ...       ...                   ...      ...   
2190004     20211016         LIMA       NaN                   NaN       AG   
2190005     20211016      HUANUCO   HUANUCO              AMARILIS      PCR   
2190006     20211016       TUMBES    TUMBES                TUMBES       PR   
2190007     20211016   LAMBAYEQUE  CHICLAYO   JOSE LEONARDO ORTIZ       PR   
2190008     20211016  LA LIBERTAD  TRUJILLO          LA ESPERANZA       PR   

         EDAD       SEXO  FECHA_RESULTADO    UBIGEO  id_persona

In [3]:
test['FECHA_RESULTADO'] = pd.to_datetime(test['FECHA_RESULTADO'].astype(str), format='%Y%m%d')
test.tail()

Unnamed: 0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona
2190004,20211016,LIMA,,,AG,49.0,MASCULINO,2021-09-18,,
2190005,20211016,HUANUCO,HUANUCO,AMARILIS,PCR,32.0,FEMENINO,2021-01-17,100102.0,
2190006,20211016,TUMBES,TUMBES,TUMBES,PR,28.0,FEMENINO,2020-06-20,240101.0,
2190007,20211016,LAMBAYEQUE,CHICLAYO,JOSE LEONARDO ORTIZ,PR,56.0,FEMENINO,2020-05-20,140105.0,
2190008,20211016,LA LIBERTAD,TRUJILLO,LA ESPERANZA,PR,33.0,FEMENINO,2020-08-10,130105.0,


In [4]:
test.isnull().sum()

FECHA_CORTE             0
DEPARTAMENTO            0
PROVINCIA          108979
DISTRITO           108979
METODODX                0
EDAD                  347
SEXO                    1
FECHA_RESULTADO      2023
UBIGEO             108979
id_persona          37195
dtype: int64

In [5]:
indice_departamento = pd.read_csv('https://raw.githubusercontent.com/annaabsi/git-scraper-covid19/main/resultados/positivos_por_departamentos.csv')
indice_departamento

Unnamed: 0,DEPARTAMENTO,METODODX,POBLACION,INDICE
0,AMAZONAS,31705,426806,7428
1,ANCASH,77531,1180638,6567
2,APURIMAC,26213,430736,6086
3,AREQUIPA,115113,1497438,7687
4,AYACUCHO,33928,668213,5077
5,CAJAMARCA,65881,1453711,4532
6,CALLAO,101042,1129854,8943
7,CUSCO,73745,1357075,5434
8,HUANCAVELICA,16201,365317,4435
9,HUANUCO,35606,760267,4683


In [6]:
test = pd.merge(test, indice_departamento,
                        how="left", on=["DEPARTAMENTO"])
test

Unnamed: 0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX_x,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona,METODODX_y,POBLACION,INDICE
0,20211016,LIMA,LIMA,SAN MARTIN DE PORRES,PR,25.0,MASCULINO,2020-12-17,150135.0,24662153.0,955159,10628470,8987
1,20211016,ICA,PISCO,PISCO,PR,20.0,FEMENINO,2020-08-22,110501.0,24662175.0,59004,975182,6051
2,20211016,HUANUCO,HUANUCO,HUANUCO,PR,22.0,FEMENINO,2020-07-29,100101.0,24662197.0,35606,760267,4683
3,20211016,ANCASH,SANTA,SANTA,AG,18.0,FEMENINO,2021-06-30,21808.0,24662204.0,77531,1180638,6567
4,20211016,ANCASH,SANTA,NUEVO CHIMBOTE,AG,17.0,MASCULINO,2021-04-04,21809.0,24662207.0,77531,1180638,6567
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2190004,20211016,LIMA,,,AG,49.0,MASCULINO,2021-09-18,,,955159,10628470,8987
2190005,20211016,HUANUCO,HUANUCO,AMARILIS,PCR,32.0,FEMENINO,2021-01-17,100102.0,,35606,760267,4683
2190006,20211016,TUMBES,TUMBES,TUMBES,PR,28.0,FEMENINO,2020-06-20,240101.0,,18841,251521,7491
2190007,20211016,LAMBAYEQUE,CHICLAYO,JOSE LEONARDO ORTIZ,PR,56.0,FEMENINO,2020-05-20,140105.0,,61664,1310785,4704


In [7]:
poblacion_csv = pd.read_csv('poblacion_provincia.csv')
poblacion_csv

Unnamed: 0,UBIGEO,PROVINCIA,POBLACION
0,10100,CHACHAPOYAS,63188
1,10200,BAGUA,84672
2,10300,BONGARA,26830
3,10400,CONDORCANQUI,51344
4,10500,LUYA,47827
...,...,...,...
191,240300,ZARUMILLA,56038
192,250100,CORONEL PORTILLO,447733
193,250200,ATALAYA,61049
194,250300,PADRE ABAD,77044


In [8]:
poblacion_dict = poblacion_csv.to_dict('split')
poblacion_dict['data']

[[10100, 'CHACHAPOYAS', 63188],
 [10200, 'BAGUA', 84672],
 [10300, 'BONGARA', 26830],
 [10400, 'CONDORCANQUI', 51344],
 [10500, 'LUYA', 47827],
 [10600, 'RODRIGUEZ DE MENDOZA', 33651],
 [10700, 'UTCUBAMBA', 119294],
 [20100, 'HUARAZ', 185276],
 [20200, 'AIJA', 6433],
 [20300, 'ANTONIO RAYMONDI', 13950],
 [20400, 'ASUNCION', 7710],
 [20500, 'BOLOGNESI', 24012],
 [20600, 'CARHUAZ', 50007],
 [20700, 'CARLOS FERMIN FITZCARRALD', 18496],
 [20800, 'CASMA', 57256],
 [20900, 'CORONGO', 8017],
 [21000, 'HUARI', 63264],
 [21100, 'HUARMEY', 33066],
 [21200, 'HUAYLAS', 56557],
 [21300, 'MARISCAL LUZURIAGA', 21787],
 [21400, 'OCROS', 7224],
 [21500, 'PALLASCA', 24371],
 [21600, 'POMABAMBA', 26675],
 [21700, 'RECUAY', 18085],
 [21800, 'SANTA', 474053],
 [21900, 'SIHUAS', 28630],
 [22000, 'YUNGAY', 55769],
 [30100, 'ABANCAY', 120116],
 [30200, 'ANDAHUAYLAS', 150758],
 [30300, 'ANTABAMBA', 11781],
 [30400, 'AYMARAES', 24570],
 [30500, 'COTABAMBAS', 55208],
 [30600, 'CHINCHEROS', 46544],
 [30700, 'GRAU

In [9]:
poblacion =[]
for x in range(len(poblacion_dict['data'])):
    array = poblacion_dict['data'][x][2]
    poblacion.append(array)
    
poblacion

[63188,
 84672,
 26830,
 51344,
 47827,
 33651,
 119294,
 185276,
 6433,
 13950,
 7710,
 24012,
 50007,
 18496,
 57256,
 8017,
 63264,
 33066,
 56557,
 21787,
 7224,
 24371,
 26675,
 18085,
 474053,
 28630,
 55769,
 120116,
 150758,
 11781,
 24570,
 55208,
 46544,
 21759,
 1175765,
 61708,
 43690,
 34743,
 97458,
 16426,
 54851,
 12797,
 317801,
 32482,
 8341,
 97205,
 75277,
 51838,
 29139,
 9909,
 9292,
 19866,
 17063,
 388170,
 83167,
 83916,
 151714,
 29357,
 123948,
 83913,
 203724,
 145770,
 51678,
 47114,
 22638,
 38602,
 1129854,
 511019,
 24000,
 63131,
 71582,
 34754,
 106476,
 70143,
 62059,
 167910,
 26644,
 47579,
 101735,
 70043,
 121265,
 37503,
 53901,
 14588,
 33883,
 18182,
 85995,
 315799,
 53247,
 32427,
 16372,
 52095,
 138275,
 29160,
 50086,
 36987,
 17114,
 18705,
 445752,
 262110,
 78472,
 14832,
 174016,
 595183,
 59138,
 167385,
 88405,
 22757,
 239105,
 91849,
 40041,
 57604,
 1118724,
 123480,
 15982,
 86411,
 30987,
 85091,
 112970,
 85092,
 168670,
 55868

In [10]:
provincia =[]
for x in range(len(poblacion_dict['data'])):
    array = poblacion_dict['data'][x][1]
    provincia.append(array)
    
provincia

['CHACHAPOYAS',
 'BAGUA',
 'BONGARA',
 'CONDORCANQUI',
 'LUYA',
 'RODRIGUEZ DE MENDOZA',
 'UTCUBAMBA',
 'HUARAZ',
 'AIJA',
 'ANTONIO RAYMONDI',
 'ASUNCION',
 'BOLOGNESI',
 'CARHUAZ',
 'CARLOS FERMIN FITZCARRALD',
 'CASMA',
 'CORONGO',
 'HUARI',
 'HUARMEY',
 'HUAYLAS',
 'MARISCAL LUZURIAGA',
 'OCROS',
 'PALLASCA',
 'POMABAMBA',
 'RECUAY',
 'SANTA',
 'SIHUAS',
 'YUNGAY',
 'ABANCAY',
 'ANDAHUAYLAS',
 'ANTABAMBA',
 'AYMARAES',
 'COTABAMBAS',
 'CHINCHEROS',
 'GRAU',
 'AREQUIPA',
 'CAMANA',
 'CARAVELI',
 'CASTILLA',
 'CAYLLOMA',
 'CONDESUYOS',
 'ISLAY',
 'LA UNION',
 'HUAMANGA',
 'CANGALLO',
 'HUANCA SANCOS',
 'HUANTA',
 'LA MAR',
 'LUCANAS',
 'PARINACOCHAS',
 'PAUCAR DEL SARA SARA',
 'SUCRE',
 'VICTOR FAJARDO',
 'VILCAS HUAMAN',
 'CAJAMARCA',
 'CAJABAMBA',
 'CELENDIN',
 'CHOTA',
 'CONTUMAZA',
 'CUTERVO',
 'HUALGAYOC',
 'JAEN',
 'SAN IGNACIO',
 'SAN MARCOS',
 'SAN MIGUEL',
 'SAN PABLO',
 'SANTA CRUZ',
 'CALLAO',
 'CUSCO',
 'ACOMAYO',
 'ANTA',
 'CALCA',
 'CANAS',
 'CANCHIS',
 'CHUMBIVILCAS',


In [11]:
res = {provincia[i]: poblacion[i] for i in range(len(provincia))}
res

{'CHACHAPOYAS': 63188,
 'BAGUA': 84672,
 'BONGARA': 26830,
 'CONDORCANQUI': 51344,
 'LUYA': 47827,
 'RODRIGUEZ DE MENDOZA': 33651,
 'UTCUBAMBA': 119294,
 'HUARAZ': 185276,
 'AIJA': 6433,
 'ANTONIO RAYMONDI': 13950,
 'ASUNCION': 7710,
 'BOLOGNESI': 24012,
 'CARHUAZ': 50007,
 'CARLOS FERMIN FITZCARRALD': 18496,
 'CASMA': 57256,
 'CORONGO': 8017,
 'HUARI': 63264,
 'HUARMEY': 33066,
 'HUAYLAS': 56557,
 'MARISCAL LUZURIAGA': 21787,
 'OCROS': 7224,
 'PALLASCA': 24371,
 'POMABAMBA': 26675,
 'RECUAY': 18085,
 'SANTA': 474053,
 'SIHUAS': 28630,
 'YUNGAY': 55769,
 'ABANCAY': 120116,
 'ANDAHUAYLAS': 150758,
 'ANTABAMBA': 11781,
 'AYMARAES': 24570,
 'COTABAMBAS': 55208,
 'CHINCHEROS': 46544,
 'GRAU': 21759,
 'AREQUIPA': 1175765,
 'CAMANA': 61708,
 'CARAVELI': 43690,
 'CASTILLA': 34743,
 'CAYLLOMA': 97458,
 'CONDESUYOS': 16426,
 'ISLAY': 54851,
 'LA UNION': 12797,
 'HUAMANGA': 317801,
 'CANGALLO': 32482,
 'HUANCA SANCOS': 8341,
 'HUANTA': 97205,
 'LA MAR': 75277,
 'LUCANAS': 51838,
 'PARINACOCHAS':

In [12]:
test['POBLACION'] = test['PROVINCIA'].map(res)

In [13]:
null_columns = test.columns[test.isnull().any()]
test[test["SEXO"].isnull()][null_columns]

Unnamed: 0,PROVINCIA,DISTRITO,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona,POBLACION
1607688,LIMA,LIMA,0.0,,2021-03-13,150101.0,10881464.0,9674755.0


In [14]:
find_provincia = test['PROVINCIA'] == 'LIMA'
find_distrito = test['DISTRITO'] == 'LIMA'
find_fecha = test['FECHA_RESULTADO'] == '2021-03-13'

data_exploratoria = test[find_provincia & find_distrito & find_fecha]
data_exploratoria

Unnamed: 0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX_x,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona,METODODX_y,POBLACION,INDICE
5301,20211016,LIMA,LIMA,LIMA,AG,32.0,MASCULINO,2021-03-13,150101.0,24632508.0,955159,9674755.0,8987
27569,20211016,LIMA,LIMA,LIMA,AG,25.0,FEMENINO,2021-03-13,150101.0,25824674.0,955159,9674755.0,8987
30633,20211016,LIMA,LIMA,LIMA,AG,37.0,MASCULINO,2021-03-13,150101.0,25323679.0,955159,9674755.0,8987
34818,20211016,LIMA,LIMA,LIMA,AG,35.0,FEMENINO,2021-03-13,150101.0,19505698.0,955159,9674755.0,8987
36590,20211016,LIMA,LIMA,LIMA,AG,21.0,MASCULINO,2021-03-13,150101.0,19533597.0,955159,9674755.0,8987
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2125966,20211016,LIMA,LIMA,LIMA,AG,47.0,MASCULINO,2021-03-13,150101.0,19199255.0,955159,9674755.0,8987
2153624,20211016,LIMA,LIMA,LIMA,PCR,0.0,FEMENINO,2021-03-13,150101.0,,955159,9674755.0,8987
2160459,20211016,LIMA,LIMA,LIMA,PCR,38.0,FEMENINO,2021-03-13,150101.0,,955159,9674755.0,8987
2167220,20211016,LIMA,LIMA,LIMA,PCR,25.0,MASCULINO,2021-03-13,150101.0,,955159,9674755.0,8987


In [15]:
data_exploratoria.groupby("SEXO").count()

Unnamed: 0_level_0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX_x,EDAD,FECHA_RESULTADO,UBIGEO,id_persona,METODODX_y,POBLACION,INDICE
SEXO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
FEMENINO,88,88,88,88,88,88,88,88,85,88,88,88
MASCULINO,87,87,87,87,87,87,87,87,86,87,87,87


In [16]:
test['SEXO'] = test['SEXO'].fillna('FEMENINO')

In [17]:
null_columns = test.columns[test.isnull().any()]
test[test["PROVINCIA"].isnull()][null_columns]

Unnamed: 0,PROVINCIA,DISTRITO,EDAD,FECHA_RESULTADO,UBIGEO,id_persona,POBLACION
16,,,35.0,2021-02-06,,24662423.0,
31,,,36.0,2020-09-12,,24692780.0,
50,,,27.0,2021-02-26,,24769312.0,
66,,,25.0,2020-08-04,,24662547.0,
75,,,37.0,2020-08-07,,24662598.0,
...,...,...,...,...,...,...,...
2189774,,,24.0,2021-08-24,,,
2189805,,,63.0,2021-08-23,,,
2189917,,,62.0,2021-09-17,,,
2189992,,,37.0,2021-05-10,,,


In [18]:
test['FECHA_RESULTADO'] = pd.to_datetime(test['FECHA_RESULTADO'], errors='coerce', dayfirst=True)
test['FECHA_RESULTADO']

0         2020-12-17
1         2020-08-22
2         2020-07-29
3         2021-06-30
4         2021-04-04
             ...    
2190004   2021-09-18
2190005   2021-01-17
2190006   2020-06-20
2190007   2020-05-20
2190008   2020-08-10
Name: FECHA_RESULTADO, Length: 2190009, dtype: datetime64[ns]

In [19]:
test['POBLACION_CIENMIL'] = test['POBLACION']/100000

#test = test.drop(labels="POBLACION_CIENMIL", axis=1)
test

Unnamed: 0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX_x,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona,METODODX_y,POBLACION,INDICE,POBLACION_CIENMIL
0,20211016,LIMA,LIMA,SAN MARTIN DE PORRES,PR,25.0,MASCULINO,2020-12-17,150135.0,24662153.0,955159,9674755.0,8987,96.74755
1,20211016,ICA,PISCO,PISCO,PR,20.0,FEMENINO,2020-08-22,110501.0,24662175.0,59004,174016.0,6051,1.74016
2,20211016,HUANUCO,HUANUCO,HUANUCO,PR,22.0,FEMENINO,2020-07-29,100101.0,24662197.0,35606,315799.0,4683,3.15799
3,20211016,ANCASH,SANTA,SANTA,AG,18.0,FEMENINO,2021-06-30,21808.0,24662204.0,77531,474053.0,6567,4.74053
4,20211016,ANCASH,SANTA,NUEVO CHIMBOTE,AG,17.0,MASCULINO,2021-04-04,21809.0,24662207.0,77531,474053.0,6567,4.74053
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2190004,20211016,LIMA,,,AG,49.0,MASCULINO,2021-09-18,,,955159,,8987,
2190005,20211016,HUANUCO,HUANUCO,AMARILIS,PCR,32.0,FEMENINO,2021-01-17,100102.0,,35606,315799.0,4683,3.15799
2190006,20211016,TUMBES,TUMBES,TUMBES,PR,28.0,FEMENINO,2020-06-20,240101.0,,18841,171356.0,7491,1.71356
2190007,20211016,LAMBAYEQUE,CHICLAYO,JOSE LEONARDO ORTIZ,PR,56.0,FEMENINO,2020-05-20,140105.0,,61664,862709.0,4704,8.62709


In [37]:
salidasxsemanas = test.sort_values(by = 'FECHA_RESULTADO')
start_date = "2021-08-23"
end_date = "2021-10-17"

after_start_date = salidasxsemanas["FECHA_RESULTADO"] >= start_date
before_end_date = salidasxsemanas["FECHA_RESULTADO"] <= end_date
between_two_dates = after_start_date & before_end_date

filtered_dates = salidasxsemanas.loc[between_two_dates]

filtered_dates

Unnamed: 0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX_x,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona,METODODX_y,POBLACION,INDICE,POBLACION_CIENMIL
1914726,20211016,AREQUIPA,AREQUIPA,PAUCARPATA,AG,40.0,MASCULINO,2021-08-23,40112.0,15846210.0,115113,1175765.0,7687,11.75765
2129446,20211016,CAJAMARCA,CHOTA,CHOTA,AG,44.0,FEMENINO,2021-08-23,60401.0,19249150.0,65881,151714.0,4532,1.51714
941162,20211016,LIMA,LIMA,LIMA,PCR,47.0,MASCULINO,2021-08-23,150101.0,1699213.0,955159,9674755.0,8987,96.74755
54589,20211016,HUANUCO,AMBO,CONCHAMARCA,AG,38.0,MASCULINO,2021-08-23,100204.0,19800088.0,35606,53247.0,4683,0.53247
433103,20211016,LIMA,LIMA,RIMAC,AG,56.0,FEMENINO,2021-08-23,150128.0,28920321.0,955159,9674755.0,8987,96.74755
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
526428,20211016,LA LIBERTAD,GRAN CHIMU,CASCAS,PCR,84.0,MASCULINO,2021-10-16,131101.0,30385621.0,88526,28290.0,4389,0.28290
708514,20211016,LIMA,LIMA,SAN JUAN DE LURIGANCHO,AG,13.0,FEMENINO,2021-10-16,150132.0,35987941.0,955159,9674755.0,8987,96.74755
1691017,20211016,LAMBAYEQUE,CHICLAYO,CHICLAYO,PCR,50.0,FEMENINO,2021-10-16,140101.0,12167411.0,61664,862709.0,4704,8.62709
1343355,20211016,LA LIBERTAD,OTUZCO,CHARAT,AG,86.0,FEMENINO,2021-10-16,130604.0,6255396.0,88526,85091.0,4389,0.85091


In [38]:
filtered_dates['PROVINCIA'] = filtered_dates['PROVINCIA'].replace('EN INVESTIGACIÃN', np.nan)
filtered_dates

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_dates['PROVINCIA'] = filtered_dates['PROVINCIA'].replace('EN INVESTIGACIÃN', np.nan)


Unnamed: 0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX_x,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona,METODODX_y,POBLACION,INDICE,POBLACION_CIENMIL
1914726,20211016,AREQUIPA,AREQUIPA,PAUCARPATA,AG,40.0,MASCULINO,2021-08-23,40112.0,15846210.0,115113,1175765.0,7687,11.75765
2129446,20211016,CAJAMARCA,CHOTA,CHOTA,AG,44.0,FEMENINO,2021-08-23,60401.0,19249150.0,65881,151714.0,4532,1.51714
941162,20211016,LIMA,LIMA,LIMA,PCR,47.0,MASCULINO,2021-08-23,150101.0,1699213.0,955159,9674755.0,8987,96.74755
54589,20211016,HUANUCO,AMBO,CONCHAMARCA,AG,38.0,MASCULINO,2021-08-23,100204.0,19800088.0,35606,53247.0,4683,0.53247
433103,20211016,LIMA,LIMA,RIMAC,AG,56.0,FEMENINO,2021-08-23,150128.0,28920321.0,955159,9674755.0,8987,96.74755
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
526428,20211016,LA LIBERTAD,GRAN CHIMU,CASCAS,PCR,84.0,MASCULINO,2021-10-16,131101.0,30385621.0,88526,28290.0,4389,0.28290
708514,20211016,LIMA,LIMA,SAN JUAN DE LURIGANCHO,AG,13.0,FEMENINO,2021-10-16,150132.0,35987941.0,955159,9674755.0,8987,96.74755
1691017,20211016,LAMBAYEQUE,CHICLAYO,CHICLAYO,PCR,50.0,FEMENINO,2021-10-16,140101.0,12167411.0,61664,862709.0,4704,8.62709
1343355,20211016,LA LIBERTAD,OTUZCO,CHARAT,AG,86.0,FEMENINO,2021-10-16,130604.0,6255396.0,88526,85091.0,4389,0.85091


In [39]:
filtered_dates.isnull().sum()

FECHA_CORTE             0
DEPARTAMENTO            0
PROVINCIA            1265
DISTRITO             1265
METODODX_x              0
EDAD                    0
SEXO                    0
FECHA_RESULTADO         0
UBIGEO               1265
id_persona           1414
METODODX_y              0
POBLACION            1290
INDICE                  0
POBLACION_CIENMIL    1290
dtype: int64

In [40]:
weekly_sales = filtered_dates.groupby(["SEXO","DEPARTAMENTO","PROVINCIA", "POBLACION", "POBLACION_CIENMIL", "INDICE", pd.Grouper(key="FECHA_RESULTADO",freq="W-SUN")]).size()
weekly_sales = weekly_sales.unstack(0).fillna(0)
weekly_sales.sort_values(by=['FECHA_RESULTADO'], inplace=True, ascending=True)
weekly_sales.loc[:,'TOTAL'] = weekly_sales.sum(numeric_only=True, axis=1)

weekly_sales = weekly_sales.reset_index()
weekly_sales

SEXO,DEPARTAMENTO,PROVINCIA,POBLACION,POBLACION_CIENMIL,INDICE,FECHA_RESULTADO,FEMENINO,MASCULINO,TOTAL
0,AMAZONAS,BAGUA,84672.0,0.84672,7428,2021-08-29,8.0,6.0,14.0
1,HUANCAVELICA,CASTROVIRREYNA,14588.0,0.14588,4435,2021-08-29,1.0,0.0,1.0
2,HUANCAVELICA,ANGARAES,53901.0,0.53901,4435,2021-08-29,1.0,1.0,2.0
3,HUANCAVELICA,ACOBAMBA,37503.0,0.37503,4435,2021-08-29,0.0,1.0,1.0
4,CUSCO,URUBAMBA,70043.0,0.70043,5434,2021-08-29,5.0,3.0,8.0
...,...,...,...,...,...,...,...,...,...
1252,ANCASH,ASUNCION,7710.0,0.07710,6567,2021-10-17,0.0,1.0,1.0
1253,PUNO,SAN ANTONIO DE PUTINA,34734.0,0.34734,3262,2021-10-17,0.0,2.0,2.0
1254,LAMBAYEQUE,LAMBAYEQUE,340835.0,3.40835,4704,2021-10-17,6.0,7.0,13.0
1255,CAJAMARCA,JAEN,203724.0,2.03724,4532,2021-10-17,3.0,2.0,5.0


In [41]:
weekly_sales.groupby("FECHA_RESULTADO").count()

SEXO,DEPARTAMENTO,PROVINCIA,POBLACION,POBLACION_CIENMIL,INDICE,FEMENINO,MASCULINO,TOTAL
FECHA_RESULTADO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-08-29,170,170,170,170,170,170,170,170
2021-09-05,161,161,161,161,161,161,161,161
2021-09-12,159,159,159,159,159,159,159,159
2021-09-19,156,156,156,156,156,156,156,156
2021-09-26,160,160,160,160,160,160,160,160
2021-10-03,151,151,151,151,151,151,151,151
2021-10-10,153,153,153,153,153,153,153,153
2021-10-17,147,147,147,147,147,147,147,147


In [42]:
cero_cases = weekly_sales['TOTAL']==0
data_cero = weekly_sales[cero_cases]
data_cero

SEXO,DEPARTAMENTO,PROVINCIA,POBLACION,POBLACION_CIENMIL,INDICE,FECHA_RESULTADO,FEMENINO,MASCULINO,TOTAL


In [43]:
weekly_sales[['FEMENINO','MASCULINO', 'TOTAL']] = weekly_sales[['FEMENINO','MASCULINO', 'TOTAL']].div(weekly_sales['POBLACION_CIENMIL'].values,axis=0)
weekly_sales

SEXO,DEPARTAMENTO,PROVINCIA,POBLACION,POBLACION_CIENMIL,INDICE,FECHA_RESULTADO,FEMENINO,MASCULINO,TOTAL
0,AMAZONAS,BAGUA,84672.0,0.84672,7428,2021-08-29,9.448224,7.086168,16.534392
1,HUANCAVELICA,CASTROVIRREYNA,14588.0,0.14588,4435,2021-08-29,6.854949,0.000000,6.854949
2,HUANCAVELICA,ANGARAES,53901.0,0.53901,4435,2021-08-29,1.855253,1.855253,3.710506
3,HUANCAVELICA,ACOBAMBA,37503.0,0.37503,4435,2021-08-29,0.000000,2.666453,2.666453
4,CUSCO,URUBAMBA,70043.0,0.70043,5434,2021-08-29,7.138472,4.283083,11.421555
...,...,...,...,...,...,...,...,...,...
1252,ANCASH,ASUNCION,7710.0,0.07710,6567,2021-10-17,0.000000,12.970169,12.970169
1253,PUNO,SAN ANTONIO DE PUTINA,34734.0,0.34734,3262,2021-10-17,0.000000,5.758047,5.758047
1254,LAMBAYEQUE,LAMBAYEQUE,340835.0,3.40835,4704,2021-10-17,1.760383,2.053780,3.814162
1255,CAJAMARCA,JAEN,203724.0,2.03724,4532,2021-10-17,1.472581,0.981720,2.454301


In [44]:
weekly_sales.FEMENINO = weekly_sales.FEMENINO.round()
weekly_sales.MASCULINO = weekly_sales.MASCULINO.round()
weekly_sales.TOTAL = weekly_sales.FEMENINO + weekly_sales.MASCULINO
weekly_sales.FECHA_RESULTADO = weekly_sales.FECHA_RESULTADO.dt.strftime('%Y-%m-%d')
weekly_sales

SEXO,DEPARTAMENTO,PROVINCIA,POBLACION,POBLACION_CIENMIL,INDICE,FECHA_RESULTADO,FEMENINO,MASCULINO,TOTAL
0,AMAZONAS,BAGUA,84672.0,0.84672,7428,2021-08-29,9.0,7.0,16.0
1,HUANCAVELICA,CASTROVIRREYNA,14588.0,0.14588,4435,2021-08-29,7.0,0.0,7.0
2,HUANCAVELICA,ANGARAES,53901.0,0.53901,4435,2021-08-29,2.0,2.0,4.0
3,HUANCAVELICA,ACOBAMBA,37503.0,0.37503,4435,2021-08-29,0.0,3.0,3.0
4,CUSCO,URUBAMBA,70043.0,0.70043,5434,2021-08-29,7.0,4.0,11.0
...,...,...,...,...,...,...,...,...,...
1252,ANCASH,ASUNCION,7710.0,0.07710,6567,2021-10-17,0.0,13.0,13.0
1253,PUNO,SAN ANTONIO DE PUTINA,34734.0,0.34734,3262,2021-10-17,0.0,6.0,6.0
1254,LAMBAYEQUE,LAMBAYEQUE,340835.0,3.40835,4704,2021-10-17,2.0,2.0,4.0
1255,CAJAMARCA,JAEN,203724.0,2.03724,4532,2021-10-17,1.0,1.0,2.0


In [45]:
weekly_sales.to_csv('dataset_covid_total.csv' , index=False)

In [46]:
weekly_sales.to_json('dataset_covid_total.json', orient="table")