In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import seaborn as sns
import urllib.request
from io import StringIO
import requests
#import missingno as msno

# matplotlib
from matplotlib import pyplot as plt
from matplotlib.dates import date2num, num2date
from matplotlib import dates as mdates
from matplotlib import ticker
from matplotlib.colors import ListedColormap
from matplotlib.patches import Patch

# scipy specifics
from scipy import stats as sps
from scipy.interpolate import interp1d

In [2]:
try:
    from urllib.request import Request, urlopen  # Python 3
except ImportError:
    from urllib2 import Request, urlopen  # Python 2

req = Request('https://files.minsa.gob.pe/s/eRqxR35ZCxrzNgr/download')
req.add_header('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:66.0) Gecko/20100101 Firefox/66.0')
content = urlopen(req)

test = pd.read_csv(content, sep=";", na_values='EN INVESTIGACIÓN', encoding='utf-8')
test

Unnamed: 0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona
0,20220410,AREQUIPA,AREQUIPA,CAYMA,AG,48.0,FEMENINO,20210601.0,40103.0,13877833.0
1,20220410,AMAZONAS,BAGUA,ARAMANGO,PR,47.0,FEMENINO,20200904.0,10202.0,13877858.0
2,20220410,HUANUCO,,,AG,52.0,MASCULINO,20210706.0,,13877907.0
3,20220410,LIMA,LIMA,LINCE,PR,50.0,MASCULINO,20210304.0,150116.0,13877982.0
4,20220410,CUSCO,CUSCO,CUSCO,PCR,69.0,FEMENINO,20220115.0,80101.0,13878105.0
...,...,...,...,...,...,...,...,...,...,...
3551892,20220410,LIMA,LIMA,LIMA,PCR,81.0,MASCULINO,20220126.0,150101.0,
3551893,20220410,LIMA,LIMA,CHORRILLOS,PCR,32.0,FEMENINO,20220118.0,150108.0,
3551894,20220410,LIMA,LIMA,CHORRILLOS,PCR,32.0,FEMENINO,20220118.0,150108.0,
3551895,20220410,LIMA,LIMA,LIMA,PCR,51.0,FEMENINO,20220127.0,150101.0,


In [3]:
#content= "positivos_covid.csv"

#test = pd.read_csv(content, sep=";", na_values='EN INVESTIGACIÓN', encoding='utf-8')
#test

In [4]:
test['FECHA_RESULTADO'] = pd.to_datetime(test['FECHA_RESULTADO'].astype(str), format='%Y%m%d')
test.tail()

Unnamed: 0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona
3551892,20220410,LIMA,LIMA,LIMA,PCR,81.0,MASCULINO,2022-01-26,150101.0,
3551893,20220410,LIMA,LIMA,CHORRILLOS,PCR,32.0,FEMENINO,2022-01-18,150108.0,
3551894,20220410,LIMA,LIMA,CHORRILLOS,PCR,32.0,FEMENINO,2022-01-18,150108.0,
3551895,20220410,LIMA,LIMA,LIMA,PCR,51.0,FEMENINO,2022-01-27,150101.0,
3551896,20220410,LIMA,LIMA,LIMA,PCR,33.0,FEMENINO,2022-01-26,150101.0,


In [5]:
test.isnull().sum()

FECHA_CORTE             0
DEPARTAMENTO          321
PROVINCIA          168810
DISTRITO           168810
METODODX                0
EDAD                  347
SEXO                    1
FECHA_RESULTADO      2023
UBIGEO             168810
id_persona          63324
dtype: int64

In [6]:
indice_departamento = pd.read_csv('https://raw.githubusercontent.com/annaabsi/git-scraper-covid19/main/resultados/positivos_por_departamentos.csv')
indice_departamento

Unnamed: 0,DEPARTAMENTO,METODODX,POBLACION,INDICE
0,AMAZONAS,44052,452125,9743
1,ANCASH,126942,1189403,10673
2,APURIMAC,38999,440629,8851
3,AREQUIPA,212835,1488247,14301
4,AYACUCHO,48118,658081,7312
5,CAJAMARCA,95842,1528904,6269
6,CALLAO,146517,1090990,13430
7,CUSCO,116800,1392648,8387
8,HUANCAVELICA,25726,414882,6201
9,HUANUCO,52129,823560,6330


In [7]:
test = pd.merge(test, indice_departamento,
                        how="left", on=["DEPARTAMENTO"])
test

Unnamed: 0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX_x,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona,METODODX_y,POBLACION,INDICE
0,20220410,AREQUIPA,AREQUIPA,CAYMA,AG,48.0,FEMENINO,2021-06-01,40103.0,13877833.0,212835.0,1488247.0,14301.0
1,20220410,AMAZONAS,BAGUA,ARAMANGO,PR,47.0,FEMENINO,2020-09-04,10202.0,13877858.0,44052.0,452125.0,9743.0
2,20220410,HUANUCO,,,AG,52.0,MASCULINO,2021-07-06,,13877907.0,52129.0,823560.0,6330.0
3,20220410,LIMA,LIMA,LINCE,PR,50.0,MASCULINO,2021-03-04,150116.0,13877982.0,1607248.0,10741923.0,14962.0
4,20220410,CUSCO,CUSCO,CUSCO,PCR,69.0,FEMENINO,2022-01-15,80101.0,13878105.0,116800.0,1392648.0,8387.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3551892,20220410,LIMA,LIMA,LIMA,PCR,81.0,MASCULINO,2022-01-26,150101.0,,1607248.0,10741923.0,14962.0
3551893,20220410,LIMA,LIMA,CHORRILLOS,PCR,32.0,FEMENINO,2022-01-18,150108.0,,1607248.0,10741923.0,14962.0
3551894,20220410,LIMA,LIMA,CHORRILLOS,PCR,32.0,FEMENINO,2022-01-18,150108.0,,1607248.0,10741923.0,14962.0
3551895,20220410,LIMA,LIMA,LIMA,PCR,51.0,FEMENINO,2022-01-27,150101.0,,1607248.0,10741923.0,14962.0


In [8]:
poblacion_csv = pd.read_csv('poblacion_provincia.csv')
poblacion_csv

Unnamed: 0,UBIGEO,PROVINCIA,POBLACION
0,10100,CHACHAPOYAS,63188
1,10200,BAGUA,84672
2,10300,BONGARA,26830
3,10400,CONDORCANQUI,51344
4,10500,LUYA,47827
...,...,...,...
191,240300,ZARUMILLA,56038
192,250100,CORONEL PORTILLO,447733
193,250200,ATALAYA,61049
194,250300,PADRE ABAD,77044


In [9]:
poblacion_dict = poblacion_csv.to_dict('split')
poblacion_dict['data']

[[10100, 'CHACHAPOYAS', 63188],
 [10200, 'BAGUA', 84672],
 [10300, 'BONGARA', 26830],
 [10400, 'CONDORCANQUI', 51344],
 [10500, 'LUYA', 47827],
 [10600, 'RODRIGUEZ DE MENDOZA', 33651],
 [10700, 'UTCUBAMBA', 119294],
 [20100, 'HUARAZ', 185276],
 [20200, 'AIJA', 6433],
 [20300, 'ANTONIO RAYMONDI', 13950],
 [20400, 'ASUNCION', 7710],
 [20500, 'BOLOGNESI', 24012],
 [20600, 'CARHUAZ', 50007],
 [20700, 'CARLOS FERMIN FITZCARRALD', 18496],
 [20800, 'CASMA', 57256],
 [20900, 'CORONGO', 8017],
 [21000, 'HUARI', 63264],
 [21100, 'HUARMEY', 33066],
 [21200, 'HUAYLAS', 56557],
 [21300, 'MARISCAL LUZURIAGA', 21787],
 [21400, 'OCROS', 7224],
 [21500, 'PALLASCA', 24371],
 [21600, 'POMABAMBA', 26675],
 [21700, 'RECUAY', 18085],
 [21800, 'SANTA', 474053],
 [21900, 'SIHUAS', 28630],
 [22000, 'YUNGAY', 55769],
 [30100, 'ABANCAY', 120116],
 [30200, 'ANDAHUAYLAS', 150758],
 [30300, 'ANTABAMBA', 11781],
 [30400, 'AYMARAES', 24570],
 [30500, 'COTABAMBAS', 55208],
 [30600, 'CHINCHEROS', 46544],
 [30700, 'GRAU

In [10]:
poblacion =[]
for x in range(len(poblacion_dict['data'])):
    array = poblacion_dict['data'][x][2]
    poblacion.append(array)
    
poblacion

[63188,
 84672,
 26830,
 51344,
 47827,
 33651,
 119294,
 185276,
 6433,
 13950,
 7710,
 24012,
 50007,
 18496,
 57256,
 8017,
 63264,
 33066,
 56557,
 21787,
 7224,
 24371,
 26675,
 18085,
 474053,
 28630,
 55769,
 120116,
 150758,
 11781,
 24570,
 55208,
 46544,
 21759,
 1175765,
 61708,
 43690,
 34743,
 97458,
 16426,
 54851,
 12797,
 317801,
 32482,
 8341,
 97205,
 75277,
 51838,
 29139,
 9909,
 9292,
 19866,
 17063,
 388170,
 83167,
 83916,
 151714,
 29357,
 123948,
 83913,
 203724,
 145770,
 51678,
 47114,
 22638,
 38602,
 1129854,
 511019,
 24000,
 63131,
 71582,
 34754,
 106476,
 70143,
 62059,
 167910,
 26644,
 47579,
 101735,
 70043,
 121265,
 37503,
 53901,
 14588,
 33883,
 18182,
 85995,
 315799,
 53247,
 32427,
 16372,
 52095,
 138275,
 29160,
 50086,
 36987,
 17114,
 18705,
 445752,
 262110,
 78472,
 14832,
 174016,
 595183,
 59138,
 167385,
 88405,
 22757,
 239105,
 91849,
 40041,
 57604,
 1118724,
 123480,
 15982,
 86411,
 30987,
 85091,
 112970,
 85092,
 168670,
 55868

In [11]:
provincia =[]
for x in range(len(poblacion_dict['data'])):
    array = poblacion_dict['data'][x][1]
    provincia.append(array)
    
provincia

['CHACHAPOYAS',
 'BAGUA',
 'BONGARA',
 'CONDORCANQUI',
 'LUYA',
 'RODRIGUEZ DE MENDOZA',
 'UTCUBAMBA',
 'HUARAZ',
 'AIJA',
 'ANTONIO RAYMONDI',
 'ASUNCION',
 'BOLOGNESI',
 'CARHUAZ',
 'CARLOS FERMIN FITZCARRALD',
 'CASMA',
 'CORONGO',
 'HUARI',
 'HUARMEY',
 'HUAYLAS',
 'MARISCAL LUZURIAGA',
 'OCROS',
 'PALLASCA',
 'POMABAMBA',
 'RECUAY',
 'SANTA',
 'SIHUAS',
 'YUNGAY',
 'ABANCAY',
 'ANDAHUAYLAS',
 'ANTABAMBA',
 'AYMARAES',
 'COTABAMBAS',
 'CHINCHEROS',
 'GRAU',
 'AREQUIPA',
 'CAMANA',
 'CARAVELI',
 'CASTILLA',
 'CAYLLOMA',
 'CONDESUYOS',
 'ISLAY',
 'LA UNION',
 'HUAMANGA',
 'CANGALLO',
 'HUANCA SANCOS',
 'HUANTA',
 'LA MAR',
 'LUCANAS',
 'PARINACOCHAS',
 'PAUCAR DEL SARA SARA',
 'SUCRE',
 'VICTOR FAJARDO',
 'VILCAS HUAMAN',
 'CAJAMARCA',
 'CAJABAMBA',
 'CELENDIN',
 'CHOTA',
 'CONTUMAZA',
 'CUTERVO',
 'HUALGAYOC',
 'JAEN',
 'SAN IGNACIO',
 'SAN MARCOS',
 'SAN MIGUEL',
 'SAN PABLO',
 'SANTA CRUZ',
 'CALLAO',
 'CUSCO',
 'ACOMAYO',
 'ANTA',
 'CALCA',
 'CANAS',
 'CANCHIS',
 'CHUMBIVILCAS',


In [12]:
res = {provincia[i]: poblacion[i] for i in range(len(provincia))}
res

{'CHACHAPOYAS': 63188,
 'BAGUA': 84672,
 'BONGARA': 26830,
 'CONDORCANQUI': 51344,
 'LUYA': 47827,
 'RODRIGUEZ DE MENDOZA': 33651,
 'UTCUBAMBA': 119294,
 'HUARAZ': 185276,
 'AIJA': 6433,
 'ANTONIO RAYMONDI': 13950,
 'ASUNCION': 7710,
 'BOLOGNESI': 24012,
 'CARHUAZ': 50007,
 'CARLOS FERMIN FITZCARRALD': 18496,
 'CASMA': 57256,
 'CORONGO': 8017,
 'HUARI': 63264,
 'HUARMEY': 33066,
 'HUAYLAS': 56557,
 'MARISCAL LUZURIAGA': 21787,
 'OCROS': 7224,
 'PALLASCA': 24371,
 'POMABAMBA': 26675,
 'RECUAY': 18085,
 'SANTA': 474053,
 'SIHUAS': 28630,
 'YUNGAY': 55769,
 'ABANCAY': 120116,
 'ANDAHUAYLAS': 150758,
 'ANTABAMBA': 11781,
 'AYMARAES': 24570,
 'COTABAMBAS': 55208,
 'CHINCHEROS': 46544,
 'GRAU': 21759,
 'AREQUIPA': 1175765,
 'CAMANA': 61708,
 'CARAVELI': 43690,
 'CASTILLA': 34743,
 'CAYLLOMA': 97458,
 'CONDESUYOS': 16426,
 'ISLAY': 54851,
 'LA UNION': 12797,
 'HUAMANGA': 317801,
 'CANGALLO': 32482,
 'HUANCA SANCOS': 8341,
 'HUANTA': 97205,
 'LA MAR': 75277,
 'LUCANAS': 51838,
 'PARINACOCHAS':

In [13]:
test['POBLACION'] = test['PROVINCIA'].map(res)

In [14]:
null_columns = test.columns[test.isnull().any()]
test[test["SEXO"].isnull()][null_columns]

Unnamed: 0,DEPARTAMENTO,PROVINCIA,DISTRITO,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona,METODODX_y,POBLACION,INDICE
2975937,LIMA,LIMA,LIMA,0.0,,2021-03-13,150101.0,10881464.0,1607248.0,9674755.0,14962.0


In [15]:
#find_provincia = test['PROVINCIA'] == 'UCAYALI'
find_distrito = test['DEPARTAMENTO'] == 'LORETO'
find_fecha = test['FECHA_RESULTADO'] == '2022-01-02'

data_exploratoria = test[find_distrito & find_fecha]
data_exploratoria

Unnamed: 0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX_x,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona,METODODX_y,POBLACION,INDICE
618134,20220410,LORETO,MAYNAS,SAN JUAN BAUTISTA,PR,32.0,FEMENINO,2022-01-02,160113.0,20925980.0,57904.0,550551.0,5281.0
758549,20220410,LORETO,MAYNAS,PUNCHANA,PR,31.0,MASCULINO,2022-01-02,160108.0,22425039.0,57904.0,550551.0,5281.0
1670626,20220410,LORETO,MAYNAS,IQUITOS,AG,24.0,MASCULINO,2022-01-02,160101.0,34702609.0,57904.0,550551.0,5281.0
1711302,20220410,LORETO,MAYNAS,PUNCHANA,AG,27.0,MASCULINO,2022-01-02,160108.0,19990922.0,57904.0,550551.0,5281.0
2130098,20220410,LORETO,MAYNAS,IQUITOS,AG,69.0,FEMENINO,2022-01-02,160101.0,3428514.0,57904.0,550551.0,5281.0


In [16]:
data_exploratoria.groupby("SEXO").count()

Unnamed: 0_level_0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX_x,EDAD,FECHA_RESULTADO,UBIGEO,id_persona,METODODX_y,POBLACION,INDICE
SEXO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
FEMENINO,2,2,2,2,2,2,2,2,2,2,2,2
MASCULINO,3,3,3,3,3,3,3,3,3,3,3,3


In [17]:
test['SEXO'] = test['SEXO'].fillna('FEMENINO')

In [18]:
null_columns = test.columns[test.isnull().any()]
test[test["PROVINCIA"].isnull()][null_columns]

Unnamed: 0,DEPARTAMENTO,PROVINCIA,DISTRITO,EDAD,FECHA_RESULTADO,UBIGEO,id_persona,METODODX_y,POBLACION,INDICE
2,HUANUCO,,,52.0,2021-07-06,,13877907.0,52129.0,,6330.0
24,LIMA,,,39.0,2021-06-23,,13866581.0,1607248.0,,14962.0
120,LIMA,,,18.0,2022-01-18,,13937253.0,1607248.0,,14962.0
139,AREQUIPA,,,55.0,2021-05-24,,13878044.0,212835.0,,14301.0
170,LIMA,,,45.0,2021-04-26,,13866494.0,1607248.0,,14962.0
...,...,...,...,...,...,...,...,...,...,...
3551686,HUANUCO,,,30.0,2022-01-25,,,52129.0,,6330.0
3551777,LIMA,,,29.0,2022-02-09,,,1607248.0,,14962.0
3551804,PIURA,,,50.0,2022-02-03,,,158286.0,,8048.0
3551807,LIMA,,,49.0,2022-02-02,,,1607248.0,,14962.0


In [19]:
test['FECHA_RESULTADO'] = pd.to_datetime(test['FECHA_RESULTADO'], errors='coerce', dayfirst=True)
test['FECHA_RESULTADO']

0         2021-06-01
1         2020-09-04
2         2021-07-06
3         2021-03-04
4         2022-01-15
             ...    
3551892   2022-01-26
3551893   2022-01-18
3551894   2022-01-18
3551895   2022-01-27
3551896   2022-01-26
Name: FECHA_RESULTADO, Length: 3551897, dtype: datetime64[ns]

In [20]:
test['POBLACION_CIENMIL'] = test['POBLACION']/100000

#test = test.drop(labels="POBLACION_CIENMIL", axis=1)
test

Unnamed: 0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX_x,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona,METODODX_y,POBLACION,INDICE,POBLACION_CIENMIL
0,20220410,AREQUIPA,AREQUIPA,CAYMA,AG,48.0,FEMENINO,2021-06-01,40103.0,13877833.0,212835.0,1175765.0,14301.0,11.75765
1,20220410,AMAZONAS,BAGUA,ARAMANGO,PR,47.0,FEMENINO,2020-09-04,10202.0,13877858.0,44052.0,84672.0,9743.0,0.84672
2,20220410,HUANUCO,,,AG,52.0,MASCULINO,2021-07-06,,13877907.0,52129.0,,6330.0,
3,20220410,LIMA,LIMA,LINCE,PR,50.0,MASCULINO,2021-03-04,150116.0,13877982.0,1607248.0,9674755.0,14962.0,96.74755
4,20220410,CUSCO,CUSCO,CUSCO,PCR,69.0,FEMENINO,2022-01-15,80101.0,13878105.0,116800.0,511019.0,8387.0,5.11019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3551892,20220410,LIMA,LIMA,LIMA,PCR,81.0,MASCULINO,2022-01-26,150101.0,,1607248.0,9674755.0,14962.0,96.74755
3551893,20220410,LIMA,LIMA,CHORRILLOS,PCR,32.0,FEMENINO,2022-01-18,150108.0,,1607248.0,9674755.0,14962.0,96.74755
3551894,20220410,LIMA,LIMA,CHORRILLOS,PCR,32.0,FEMENINO,2022-01-18,150108.0,,1607248.0,9674755.0,14962.0,96.74755
3551895,20220410,LIMA,LIMA,LIMA,PCR,51.0,FEMENINO,2022-01-27,150101.0,,1607248.0,9674755.0,14962.0,96.74755


In [21]:
salidasxsemanas = test.sort_values(by = 'FECHA_RESULTADO')
start_date = "2022-02-20"
end_date = "2022-04-10"

after_start_date = salidasxsemanas["FECHA_RESULTADO"] >= start_date
before_end_date = salidasxsemanas["FECHA_RESULTADO"] <= end_date
between_two_dates = after_start_date & before_end_date

filtered_dates = salidasxsemanas.loc[between_two_dates]

filtered_dates

Unnamed: 0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX_x,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona,METODODX_y,POBLACION,INDICE,POBLACION_CIENMIL
2851024,20220410,LIMA,LIMA,LURIN,PCR,71.0,FEMENINO,2022-02-20,150119.0,9421874.0,1607248.0,9674755.0,14962.0,96.74755
2600759,20220410,HUANCAVELICA,HUANCAVELICA,HUANCAVELICA,PCR,37.0,FEMENINO,2022-02-20,90101.0,6869051.0,25726.0,121265.0,6201.0,1.21265
1685399,20220410,AMAZONAS,BAGUA,EL PARCO,AG,4.0,MASCULINO,2022-02-20,10204.0,19716411.0,44052.0,84672.0,9743.0,0.84672
1503464,20220410,LIMA,LIMA,SAN JUAN DE LURIGANCHO,PCR,25.0,FEMENINO,2022-02-20,150132.0,31966712.0,1607248.0,9674755.0,14962.0,96.74755
928569,20220410,HUANCAVELICA,HUANCAVELICA,YAULI,AG,23.0,MASCULINO,2022-02-20,90117.0,24200000.0,25726.0,121265.0,6201.0,1.21265
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2985946,20220410,LIMA,LIMA,MIRAFLORES,PCR,42.0,MASCULINO,2022-04-10,150122.0,11115053.0,1607248.0,9674755.0,14962.0,96.74755
1020149,20220410,LIMA,LIMA,SAN ISIDRO,PCR,22.0,FEMENINO,2022-04-10,150131.0,25268515.0,1607248.0,9674755.0,14962.0,96.74755
2292745,20220410,LIMA,LIMA,LIMA,PCR,71.0,MASCULINO,2022-04-10,150101.0,4671759.0,1607248.0,9674755.0,14962.0,96.74755
1994230,20220410,LIMA,LIMA,MIRAFLORES,PCR,68.0,MASCULINO,2022-04-10,150122.0,2162160.0,1607248.0,9674755.0,14962.0,96.74755


In [22]:
filtered_dates['PROVINCIA'] = filtered_dates['PROVINCIA'].replace('EN INVESTIGACIÃN', np.nan)
filtered_dates

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_dates['PROVINCIA'] = filtered_dates['PROVINCIA'].replace('EN INVESTIGACIÃN', np.nan)


Unnamed: 0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX_x,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona,METODODX_y,POBLACION,INDICE,POBLACION_CIENMIL
2851024,20220410,LIMA,LIMA,LURIN,PCR,71.0,FEMENINO,2022-02-20,150119.0,9421874.0,1607248.0,9674755.0,14962.0,96.74755
2600759,20220410,HUANCAVELICA,HUANCAVELICA,HUANCAVELICA,PCR,37.0,FEMENINO,2022-02-20,90101.0,6869051.0,25726.0,121265.0,6201.0,1.21265
1685399,20220410,AMAZONAS,BAGUA,EL PARCO,AG,4.0,MASCULINO,2022-02-20,10204.0,19716411.0,44052.0,84672.0,9743.0,0.84672
1503464,20220410,LIMA,LIMA,SAN JUAN DE LURIGANCHO,PCR,25.0,FEMENINO,2022-02-20,150132.0,31966712.0,1607248.0,9674755.0,14962.0,96.74755
928569,20220410,HUANCAVELICA,HUANCAVELICA,YAULI,AG,23.0,MASCULINO,2022-02-20,90117.0,24200000.0,25726.0,121265.0,6201.0,1.21265
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2985946,20220410,LIMA,LIMA,MIRAFLORES,PCR,42.0,MASCULINO,2022-04-10,150122.0,11115053.0,1607248.0,9674755.0,14962.0,96.74755
1020149,20220410,LIMA,LIMA,SAN ISIDRO,PCR,22.0,FEMENINO,2022-04-10,150131.0,25268515.0,1607248.0,9674755.0,14962.0,96.74755
2292745,20220410,LIMA,LIMA,LIMA,PCR,71.0,MASCULINO,2022-04-10,150101.0,4671759.0,1607248.0,9674755.0,14962.0,96.74755
1994230,20220410,LIMA,LIMA,MIRAFLORES,PCR,68.0,MASCULINO,2022-04-10,150122.0,2162160.0,1607248.0,9674755.0,14962.0,96.74755


In [23]:
filtered_dates.isnull().sum()

FECHA_CORTE             0
DEPARTAMENTO            9
PROVINCIA            1594
DISTRITO             1594
METODODX_x              0
EDAD                    0
SEXO                    0
FECHA_RESULTADO         0
UBIGEO               1594
id_persona           1410
METODODX_y              9
POBLACION            1640
INDICE                  9
POBLACION_CIENMIL    1640
dtype: int64

In [24]:
weekly_sales = filtered_dates.groupby(["SEXO","DEPARTAMENTO","PROVINCIA", "POBLACION", "POBLACION_CIENMIL", "INDICE", pd.Grouper(key="FECHA_RESULTADO",freq="W-SUN")]).size()
weekly_sales = weekly_sales.unstack(0).fillna(0)
weekly_sales.sort_values(by=['FECHA_RESULTADO'], inplace=True, ascending=True)
weekly_sales.loc[:,'TOTAL'] = weekly_sales.sum(numeric_only=True, axis=1)

weekly_sales = weekly_sales.reset_index()
weekly_sales

SEXO,DEPARTAMENTO,PROVINCIA,POBLACION,POBLACION_CIENMIL,INDICE,FECHA_RESULTADO,FEMENINO,MASCULINO,TOTAL
0,AMAZONAS,BAGUA,84672.0,0.84672,9743.0,2022-02-20,4.0,3.0,7.0
1,JUNIN,HUANCAYO,595183.0,5.95183,9402.0,2022-02-20,33.0,20.0,53.0
2,ANCASH,POMABAMBA,26675.0,0.26675,10673.0,2022-02-20,1.0,2.0,3.0
3,JUNIN,CONCEPCION,59138.0,0.59138,9402.0,2022-02-20,3.0,3.0,6.0
4,JUNIN,CHUPACA,57604.0,0.57604,9402.0,2022-02-20,1.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...
1195,CUSCO,ANTA,63131.0,0.63131,8387.0,2022-04-10,1.0,0.0,1.0
1196,MOQUEGUA,ILO,82686.0,0.82686,25384.0,2022-04-10,5.0,3.0,8.0
1197,JUNIN,SATIPO,239105.0,2.39105,9402.0,2022-04-10,26.0,27.0,53.0
1198,ANCASH,HUAYLAS,56557.0,0.56557,10673.0,2022-04-10,29.0,23.0,52.0


In [25]:
weekly_sales.groupby("FECHA_RESULTADO").count()

SEXO,DEPARTAMENTO,PROVINCIA,POBLACION,POBLACION_CIENMIL,INDICE,FEMENINO,MASCULINO,TOTAL
FECHA_RESULTADO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2022-02-20,118,118,118,118,118,118,118,118
2022-02-27,180,180,180,180,180,180,180,180
2022-03-06,174,174,174,174,174,174,174,174
2022-03-13,167,167,167,167,167,167,167,167
2022-03-20,161,161,161,161,161,161,161,161
2022-03-27,148,148,148,148,148,148,148,148
2022-04-03,135,135,135,135,135,135,135,135
2022-04-10,117,117,117,117,117,117,117,117


In [26]:
cero_cases = weekly_sales['PROVINCIA']== "ATALAYA"
data_cero = weekly_sales[cero_cases]
data_cero

SEXO,DEPARTAMENTO,PROVINCIA,POBLACION,POBLACION_CIENMIL,INDICE,FECHA_RESULTADO,FEMENINO,MASCULINO,TOTAL


In [27]:
weekly_sales[['FEMENINO','MASCULINO', 'TOTAL']] = weekly_sales[['FEMENINO','MASCULINO', 'TOTAL']].div(weekly_sales['POBLACION_CIENMIL'].values,axis=0)
weekly_sales

SEXO,DEPARTAMENTO,PROVINCIA,POBLACION,POBLACION_CIENMIL,INDICE,FECHA_RESULTADO,FEMENINO,MASCULINO,TOTAL
0,AMAZONAS,BAGUA,84672.0,0.84672,9743.0,2022-02-20,4.724112,3.543084,8.267196
1,JUNIN,HUANCAYO,595183.0,5.95183,9402.0,2022-02-20,5.544513,3.360311,8.904824
2,ANCASH,POMABAMBA,26675.0,0.26675,10673.0,2022-02-20,3.748828,7.497657,11.246485
3,JUNIN,CONCEPCION,59138.0,0.59138,9402.0,2022-02-20,5.072880,5.072880,10.145761
4,JUNIN,CHUPACA,57604.0,0.57604,9402.0,2022-02-20,1.735991,1.735991,3.471981
...,...,...,...,...,...,...,...,...,...
1195,CUSCO,ANTA,63131.0,0.63131,8387.0,2022-04-10,1.584008,0.000000,1.584008
1196,MOQUEGUA,ILO,82686.0,0.82686,25384.0,2022-04-10,6.046973,3.628184,9.675157
1197,JUNIN,SATIPO,239105.0,2.39105,9402.0,2022-04-10,10.873884,11.292110,22.165994
1198,ANCASH,HUAYLAS,56557.0,0.56557,10673.0,2022-04-10,51.275704,40.666938,91.942642


In [28]:
weekly_sales.FEMENINO = weekly_sales.FEMENINO.round()
weekly_sales.MASCULINO = weekly_sales.MASCULINO.round()
weekly_sales.TOTAL = weekly_sales.FEMENINO + weekly_sales.MASCULINO
weekly_sales.FECHA_RESULTADO = weekly_sales.FECHA_RESULTADO.dt.strftime('%Y-%m-%d')
weekly_sales

SEXO,DEPARTAMENTO,PROVINCIA,POBLACION,POBLACION_CIENMIL,INDICE,FECHA_RESULTADO,FEMENINO,MASCULINO,TOTAL
0,AMAZONAS,BAGUA,84672.0,0.84672,9743.0,2022-02-20,5.0,4.0,9.0
1,JUNIN,HUANCAYO,595183.0,5.95183,9402.0,2022-02-20,6.0,3.0,9.0
2,ANCASH,POMABAMBA,26675.0,0.26675,10673.0,2022-02-20,4.0,7.0,11.0
3,JUNIN,CONCEPCION,59138.0,0.59138,9402.0,2022-02-20,5.0,5.0,10.0
4,JUNIN,CHUPACA,57604.0,0.57604,9402.0,2022-02-20,2.0,2.0,4.0
...,...,...,...,...,...,...,...,...,...
1195,CUSCO,ANTA,63131.0,0.63131,8387.0,2022-04-10,2.0,0.0,2.0
1196,MOQUEGUA,ILO,82686.0,0.82686,25384.0,2022-04-10,6.0,4.0,10.0
1197,JUNIN,SATIPO,239105.0,2.39105,9402.0,2022-04-10,11.0,11.0,22.0
1198,ANCASH,HUAYLAS,56557.0,0.56557,10673.0,2022-04-10,51.0,41.0,92.0


In [29]:
weekly_sales.to_csv('dataset_covid_total.csv' , index=False)

In [30]:
weekly_sales.to_json('dataset_covid_total.json', orient="table")