In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import seaborn as sns
import urllib.request
from io import StringIO
import requests
#import missingno as msno

# matplotlib
from matplotlib import pyplot as plt
from matplotlib.dates import date2num, num2date
from matplotlib import dates as mdates
from matplotlib import ticker
from matplotlib.colors import ListedColormap
from matplotlib.patches import Patch

# scipy specifics
from scipy import stats as sps
from scipy.interpolate import interp1d

In [2]:
try:
    from urllib.request import Request, urlopen  # Python 3
except ImportError:
    from urllib2 import Request, urlopen  # Python 2

req = Request('https://files.minsa.gob.pe/s/eRqxR35ZCxrzNgr/download')
req.add_header('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:66.0) Gecko/20100101 Firefox/66.0')
content = urlopen(req)

test = pd.read_csv(content, sep=";", na_values='EN INVESTIGACIÓN', encoding='utf-8')
test

Unnamed: 0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona
0,20220406,LIMA,LIMA,JESUS MARIA,AG,20.0,FEMENINO,20210419.0,150113.0,13866369.0
1,20220406,JUNIN,HUANCAYO,EL TAMBO,AG,39.0,MASCULINO,20210429.0,120114.0,13866556.0
2,20220406,LIMA,,,AG,39.0,FEMENINO,20210702.0,,13866581.0
3,20220406,CAJAMARCA,JAEN,PUCARA,AG,38.0,FEMENINO,20220126.0,60808.0,13866692.0
4,20220406,LIMA,LIMA,JESUS MARIA,AG,53.0,FEMENINO,20210831.0,150113.0,13865937.0
...,...,...,...,...,...,...,...,...,...,...
3550235,20220406,LIMA,LIMA,LIMA,PCR,81.0,MASCULINO,20220126.0,150101.0,
3550236,20220406,LIMA,LIMA,CHORRILLOS,PCR,32.0,FEMENINO,20220118.0,150108.0,
3550237,20220406,LIMA,LIMA,CHORRILLOS,PCR,32.0,FEMENINO,20220118.0,150108.0,
3550238,20220406,LIMA,LIMA,LIMA,PCR,51.0,FEMENINO,20220127.0,150101.0,


In [3]:
#content= "positivos_covid.csv"

#test = pd.read_csv(content, sep=";", na_values='EN INVESTIGACIÓN', encoding='utf-8')
#test

In [4]:
test['FECHA_RESULTADO'] = pd.to_datetime(test['FECHA_RESULTADO'].astype(str), format='%Y%m%d')
test.tail()

Unnamed: 0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona
3550235,20220406,LIMA,LIMA,LIMA,PCR,81.0,MASCULINO,2022-01-26,150101.0,
3550236,20220406,LIMA,LIMA,CHORRILLOS,PCR,32.0,FEMENINO,2022-01-18,150108.0,
3550237,20220406,LIMA,LIMA,CHORRILLOS,PCR,32.0,FEMENINO,2022-01-18,150108.0,
3550238,20220406,LIMA,LIMA,LIMA,PCR,51.0,FEMENINO,2022-01-27,150101.0,
3550239,20220406,LIMA,LIMA,LIMA,PCR,33.0,FEMENINO,2022-01-26,150101.0,


In [5]:
test.isnull().sum()

FECHA_CORTE             0
DEPARTAMENTO          321
PROVINCIA          168765
DISTRITO           168765
METODODX                0
EDAD                  347
SEXO                    1
FECHA_RESULTADO      2023
UBIGEO             168765
id_persona          63195
dtype: int64

In [6]:
indice_departamento = pd.read_csv('https://raw.githubusercontent.com/annaabsi/git-scraper-covid19/main/resultados/positivos_por_departamentos.csv')
indice_departamento

Unnamed: 0,DEPARTAMENTO,METODODX,POBLACION,INDICE
0,AMAZONAS,44040,452125,9741
1,ANCASH,126753,1189403,10657
2,APURIMAC,38985,440629,8848
3,AREQUIPA,212768,1488247,14297
4,AYACUCHO,48112,658081,7311
5,CAJAMARCA,95823,1528904,6267
6,CALLAO,146472,1090990,13426
7,CUSCO,116758,1392648,8384
8,HUANCAVELICA,25714,414882,6198
9,HUANUCO,52114,823560,6328


In [7]:
test = pd.merge(test, indice_departamento,
                        how="left", on=["DEPARTAMENTO"])
test

Unnamed: 0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX_x,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona,METODODX_y,POBLACION,INDICE
0,20220406,LIMA,LIMA,JESUS MARIA,AG,20.0,FEMENINO,2021-04-19,150113.0,13866369.0,1606321.0,10741923.0,14954.0
1,20220406,JUNIN,HUANCAYO,EL TAMBO,AG,39.0,MASCULINO,2021-04-29,120114.0,13866556.0,125928.0,1340064.0,9397.0
2,20220406,LIMA,,,AG,39.0,FEMENINO,2021-07-02,,13866581.0,1606321.0,10741923.0,14954.0
3,20220406,CAJAMARCA,JAEN,PUCARA,AG,38.0,FEMENINO,2022-01-26,60808.0,13866692.0,95823.0,1528904.0,6267.0
4,20220406,LIMA,LIMA,JESUS MARIA,AG,53.0,FEMENINO,2021-08-31,150113.0,13865937.0,1606321.0,10741923.0,14954.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3550235,20220406,LIMA,LIMA,LIMA,PCR,81.0,MASCULINO,2022-01-26,150101.0,,1606321.0,10741923.0,14954.0
3550236,20220406,LIMA,LIMA,CHORRILLOS,PCR,32.0,FEMENINO,2022-01-18,150108.0,,1606321.0,10741923.0,14954.0
3550237,20220406,LIMA,LIMA,CHORRILLOS,PCR,32.0,FEMENINO,2022-01-18,150108.0,,1606321.0,10741923.0,14954.0
3550238,20220406,LIMA,LIMA,LIMA,PCR,51.0,FEMENINO,2022-01-27,150101.0,,1606321.0,10741923.0,14954.0


In [8]:
poblacion_csv = pd.read_csv('poblacion_provincia.csv')
poblacion_csv

Unnamed: 0,UBIGEO,PROVINCIA,POBLACION
0,10100,CHACHAPOYAS,63188
1,10200,BAGUA,84672
2,10300,BONGARA,26830
3,10400,CONDORCANQUI,51344
4,10500,LUYA,47827
...,...,...,...
191,240300,ZARUMILLA,56038
192,250100,CORONEL PORTILLO,447733
193,250200,ATALAYA,61049
194,250300,PADRE ABAD,77044


In [9]:
poblacion_dict = poblacion_csv.to_dict('split')
poblacion_dict['data']

[[10100, 'CHACHAPOYAS', 63188],
 [10200, 'BAGUA', 84672],
 [10300, 'BONGARA', 26830],
 [10400, 'CONDORCANQUI', 51344],
 [10500, 'LUYA', 47827],
 [10600, 'RODRIGUEZ DE MENDOZA', 33651],
 [10700, 'UTCUBAMBA', 119294],
 [20100, 'HUARAZ', 185276],
 [20200, 'AIJA', 6433],
 [20300, 'ANTONIO RAYMONDI', 13950],
 [20400, 'ASUNCION', 7710],
 [20500, 'BOLOGNESI', 24012],
 [20600, 'CARHUAZ', 50007],
 [20700, 'CARLOS FERMIN FITZCARRALD', 18496],
 [20800, 'CASMA', 57256],
 [20900, 'CORONGO', 8017],
 [21000, 'HUARI', 63264],
 [21100, 'HUARMEY', 33066],
 [21200, 'HUAYLAS', 56557],
 [21300, 'MARISCAL LUZURIAGA', 21787],
 [21400, 'OCROS', 7224],
 [21500, 'PALLASCA', 24371],
 [21600, 'POMABAMBA', 26675],
 [21700, 'RECUAY', 18085],
 [21800, 'SANTA', 474053],
 [21900, 'SIHUAS', 28630],
 [22000, 'YUNGAY', 55769],
 [30100, 'ABANCAY', 120116],
 [30200, 'ANDAHUAYLAS', 150758],
 [30300, 'ANTABAMBA', 11781],
 [30400, 'AYMARAES', 24570],
 [30500, 'COTABAMBAS', 55208],
 [30600, 'CHINCHEROS', 46544],
 [30700, 'GRAU

In [10]:
poblacion =[]
for x in range(len(poblacion_dict['data'])):
    array = poblacion_dict['data'][x][2]
    poblacion.append(array)
    
poblacion

[63188,
 84672,
 26830,
 51344,
 47827,
 33651,
 119294,
 185276,
 6433,
 13950,
 7710,
 24012,
 50007,
 18496,
 57256,
 8017,
 63264,
 33066,
 56557,
 21787,
 7224,
 24371,
 26675,
 18085,
 474053,
 28630,
 55769,
 120116,
 150758,
 11781,
 24570,
 55208,
 46544,
 21759,
 1175765,
 61708,
 43690,
 34743,
 97458,
 16426,
 54851,
 12797,
 317801,
 32482,
 8341,
 97205,
 75277,
 51838,
 29139,
 9909,
 9292,
 19866,
 17063,
 388170,
 83167,
 83916,
 151714,
 29357,
 123948,
 83913,
 203724,
 145770,
 51678,
 47114,
 22638,
 38602,
 1129854,
 511019,
 24000,
 63131,
 71582,
 34754,
 106476,
 70143,
 62059,
 167910,
 26644,
 47579,
 101735,
 70043,
 121265,
 37503,
 53901,
 14588,
 33883,
 18182,
 85995,
 315799,
 53247,
 32427,
 16372,
 52095,
 138275,
 29160,
 50086,
 36987,
 17114,
 18705,
 445752,
 262110,
 78472,
 14832,
 174016,
 595183,
 59138,
 167385,
 88405,
 22757,
 239105,
 91849,
 40041,
 57604,
 1118724,
 123480,
 15982,
 86411,
 30987,
 85091,
 112970,
 85092,
 168670,
 55868

In [11]:
provincia =[]
for x in range(len(poblacion_dict['data'])):
    array = poblacion_dict['data'][x][1]
    provincia.append(array)
    
provincia

['CHACHAPOYAS',
 'BAGUA',
 'BONGARA',
 'CONDORCANQUI',
 'LUYA',
 'RODRIGUEZ DE MENDOZA',
 'UTCUBAMBA',
 'HUARAZ',
 'AIJA',
 'ANTONIO RAYMONDI',
 'ASUNCION',
 'BOLOGNESI',
 'CARHUAZ',
 'CARLOS FERMIN FITZCARRALD',
 'CASMA',
 'CORONGO',
 'HUARI',
 'HUARMEY',
 'HUAYLAS',
 'MARISCAL LUZURIAGA',
 'OCROS',
 'PALLASCA',
 'POMABAMBA',
 'RECUAY',
 'SANTA',
 'SIHUAS',
 'YUNGAY',
 'ABANCAY',
 'ANDAHUAYLAS',
 'ANTABAMBA',
 'AYMARAES',
 'COTABAMBAS',
 'CHINCHEROS',
 'GRAU',
 'AREQUIPA',
 'CAMANA',
 'CARAVELI',
 'CASTILLA',
 'CAYLLOMA',
 'CONDESUYOS',
 'ISLAY',
 'LA UNION',
 'HUAMANGA',
 'CANGALLO',
 'HUANCA SANCOS',
 'HUANTA',
 'LA MAR',
 'LUCANAS',
 'PARINACOCHAS',
 'PAUCAR DEL SARA SARA',
 'SUCRE',
 'VICTOR FAJARDO',
 'VILCAS HUAMAN',
 'CAJAMARCA',
 'CAJABAMBA',
 'CELENDIN',
 'CHOTA',
 'CONTUMAZA',
 'CUTERVO',
 'HUALGAYOC',
 'JAEN',
 'SAN IGNACIO',
 'SAN MARCOS',
 'SAN MIGUEL',
 'SAN PABLO',
 'SANTA CRUZ',
 'CALLAO',
 'CUSCO',
 'ACOMAYO',
 'ANTA',
 'CALCA',
 'CANAS',
 'CANCHIS',
 'CHUMBIVILCAS',


In [12]:
res = {provincia[i]: poblacion[i] for i in range(len(provincia))}
res

{'CHACHAPOYAS': 63188,
 'BAGUA': 84672,
 'BONGARA': 26830,
 'CONDORCANQUI': 51344,
 'LUYA': 47827,
 'RODRIGUEZ DE MENDOZA': 33651,
 'UTCUBAMBA': 119294,
 'HUARAZ': 185276,
 'AIJA': 6433,
 'ANTONIO RAYMONDI': 13950,
 'ASUNCION': 7710,
 'BOLOGNESI': 24012,
 'CARHUAZ': 50007,
 'CARLOS FERMIN FITZCARRALD': 18496,
 'CASMA': 57256,
 'CORONGO': 8017,
 'HUARI': 63264,
 'HUARMEY': 33066,
 'HUAYLAS': 56557,
 'MARISCAL LUZURIAGA': 21787,
 'OCROS': 7224,
 'PALLASCA': 24371,
 'POMABAMBA': 26675,
 'RECUAY': 18085,
 'SANTA': 474053,
 'SIHUAS': 28630,
 'YUNGAY': 55769,
 'ABANCAY': 120116,
 'ANDAHUAYLAS': 150758,
 'ANTABAMBA': 11781,
 'AYMARAES': 24570,
 'COTABAMBAS': 55208,
 'CHINCHEROS': 46544,
 'GRAU': 21759,
 'AREQUIPA': 1175765,
 'CAMANA': 61708,
 'CARAVELI': 43690,
 'CASTILLA': 34743,
 'CAYLLOMA': 97458,
 'CONDESUYOS': 16426,
 'ISLAY': 54851,
 'LA UNION': 12797,
 'HUAMANGA': 317801,
 'CANGALLO': 32482,
 'HUANCA SANCOS': 8341,
 'HUANTA': 97205,
 'LA MAR': 75277,
 'LUCANAS': 51838,
 'PARINACOCHAS':

In [13]:
test['POBLACION'] = test['PROVINCIA'].map(res)

In [14]:
null_columns = test.columns[test.isnull().any()]
test[test["SEXO"].isnull()][null_columns]

Unnamed: 0,DEPARTAMENTO,PROVINCIA,DISTRITO,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona,METODODX_y,POBLACION,INDICE
2989611,LIMA,LIMA,LIMA,0.0,,2021-03-13,150101.0,10881464.0,1606321.0,9674755.0,14954.0


In [15]:
#find_provincia = test['PROVINCIA'] == 'UCAYALI'
find_distrito = test['DEPARTAMENTO'] == 'LORETO'
find_fecha = test['FECHA_RESULTADO'] == '2022-01-02'

data_exploratoria = test[find_distrito & find_fecha]
data_exploratoria

Unnamed: 0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX_x,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona,METODODX_y,POBLACION,INDICE
623953,20220406,LORETO,MAYNAS,SAN JUAN BAUTISTA,PR,32.0,FEMENINO,2022-01-02,160113.0,20925980.0,57889.0,550551.0,5280.0
818151,20220406,LORETO,MAYNAS,PUNCHANA,PR,31.0,MASCULINO,2022-01-02,160108.0,22425039.0,57889.0,550551.0,5280.0
1628679,20220406,LORETO,MAYNAS,IQUITOS,AG,24.0,MASCULINO,2022-01-02,160101.0,34702609.0,57889.0,550551.0,5280.0
1740579,20220406,LORETO,MAYNAS,PUNCHANA,AG,27.0,MASCULINO,2022-01-02,160108.0,19990922.0,57889.0,550551.0,5280.0
2190092,20220406,LORETO,MAYNAS,IQUITOS,AG,69.0,FEMENINO,2022-01-02,160101.0,3428514.0,57889.0,550551.0,5280.0


In [16]:
data_exploratoria.groupby("SEXO").count()

Unnamed: 0_level_0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX_x,EDAD,FECHA_RESULTADO,UBIGEO,id_persona,METODODX_y,POBLACION,INDICE
SEXO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
FEMENINO,2,2,2,2,2,2,2,2,2,2,2,2
MASCULINO,3,3,3,3,3,3,3,3,3,3,3,3


In [17]:
test['SEXO'] = test['SEXO'].fillna('FEMENINO')

In [18]:
null_columns = test.columns[test.isnull().any()]
test[test["PROVINCIA"].isnull()][null_columns]

Unnamed: 0,DEPARTAMENTO,PROVINCIA,DISTRITO,EDAD,FECHA_RESULTADO,UBIGEO,id_persona,METODODX_y,POBLACION,INDICE
2,LIMA,,,39.0,2021-07-02,,13866581.0,1606321.0,,14954.0
68,HUANUCO,,,52.0,2021-07-06,,13877907.0,52114.0,,6328.0
183,LIMA,,,18.0,2022-01-18,,13937253.0,1606321.0,,14954.0
212,LIMA,,,68.0,2021-03-01,,13925493.0,1606321.0,,14954.0
257,LIMA,,,45.0,2021-04-26,,13866494.0,1606321.0,,14954.0
...,...,...,...,...,...,...,...,...,...,...
3550085,LIMA,,,46.0,2022-01-24,,,1606321.0,,14954.0
3550086,ICA,,,29.0,2022-01-31,,,103613.0,,11643.0
3550087,ICA,,,29.0,2022-01-31,,,103613.0,,11643.0
3550088,ICA,,,24.0,2022-01-25,,,103613.0,,11643.0


In [19]:
test['FECHA_RESULTADO'] = pd.to_datetime(test['FECHA_RESULTADO'], errors='coerce', dayfirst=True)
test['FECHA_RESULTADO']

0         2021-04-19
1         2021-04-29
2         2021-07-02
3         2022-01-26
4         2021-08-31
             ...    
3550235   2022-01-26
3550236   2022-01-18
3550237   2022-01-18
3550238   2022-01-27
3550239   2022-01-26
Name: FECHA_RESULTADO, Length: 3550240, dtype: datetime64[ns]

In [20]:
test['POBLACION_CIENMIL'] = test['POBLACION']/100000

#test = test.drop(labels="POBLACION_CIENMIL", axis=1)
test

Unnamed: 0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX_x,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona,METODODX_y,POBLACION,INDICE,POBLACION_CIENMIL
0,20220406,LIMA,LIMA,JESUS MARIA,AG,20.0,FEMENINO,2021-04-19,150113.0,13866369.0,1606321.0,9674755.0,14954.0,96.74755
1,20220406,JUNIN,HUANCAYO,EL TAMBO,AG,39.0,MASCULINO,2021-04-29,120114.0,13866556.0,125928.0,595183.0,9397.0,5.95183
2,20220406,LIMA,,,AG,39.0,FEMENINO,2021-07-02,,13866581.0,1606321.0,,14954.0,
3,20220406,CAJAMARCA,JAEN,PUCARA,AG,38.0,FEMENINO,2022-01-26,60808.0,13866692.0,95823.0,203724.0,6267.0,2.03724
4,20220406,LIMA,LIMA,JESUS MARIA,AG,53.0,FEMENINO,2021-08-31,150113.0,13865937.0,1606321.0,9674755.0,14954.0,96.74755
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3550235,20220406,LIMA,LIMA,LIMA,PCR,81.0,MASCULINO,2022-01-26,150101.0,,1606321.0,9674755.0,14954.0,96.74755
3550236,20220406,LIMA,LIMA,CHORRILLOS,PCR,32.0,FEMENINO,2022-01-18,150108.0,,1606321.0,9674755.0,14954.0,96.74755
3550237,20220406,LIMA,LIMA,CHORRILLOS,PCR,32.0,FEMENINO,2022-01-18,150108.0,,1606321.0,9674755.0,14954.0,96.74755
3550238,20220406,LIMA,LIMA,LIMA,PCR,51.0,FEMENINO,2022-01-27,150101.0,,1606321.0,9674755.0,14954.0,96.74755


In [21]:
salidasxsemanas = test.sort_values(by = 'FECHA_RESULTADO')
start_date = "2022-02-13"
end_date = "2022-04-03"

after_start_date = salidasxsemanas["FECHA_RESULTADO"] >= start_date
before_end_date = salidasxsemanas["FECHA_RESULTADO"] <= end_date
between_two_dates = after_start_date & before_end_date

filtered_dates = salidasxsemanas.loc[between_two_dates]

filtered_dates

Unnamed: 0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX_x,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona,METODODX_y,POBLACION,INDICE,POBLACION_CIENMIL
2469064,20220406,LIMA,LIMA,SAN BORJA,AG,49.0,MASCULINO,2022-02-13,150130.0,5937772.0,1606321.0,9674755.0,14954.0,96.74755
2941365,20220406,LA LIBERTAD,TRUJILLO,FLORENCIA DE MORA,AG,4.0,MASCULINO,2022-02-13,130103.0,10237179.0,146566.0,1118724.0,7493.0,11.18724
3478847,20220406,PASCO,PASCO,YANACANCHA,AG,0.0,FEMENINO,2022-02-13,190113.0,38522808.0,23960.0,125164.0,8438.0,1.25164
478945,20220406,LIMA,LIMA,LIMA,AG,56.0,FEMENINO,2022-02-13,150101.0,18381860.0,1606321.0,9674755.0,14954.0,96.74755
329330,20220406,LIMA,LIMA,JESUS MARIA,AG,30.0,FEMENINO,2022-02-13,150113.0,17298715.0,1606321.0,9674755.0,14954.0,96.74755
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1318682,20220406,AREQUIPA,AREQUIPA,MARIANO MELGAR,AG,7.0,FEMENINO,2022-04-03,40109.0,29270417.0,212768.0,1175765.0,14297.0,11.75765
1108682,20220406,ANCASH,POMABAMBA,POMABAMBA,AG,26.0,FEMENINO,2022-04-03,21601.0,26297858.0,126753.0,26675.0,10657.0,0.26675
181391,20220406,ANCASH,HUARAZ,HUARAZ,AG,24.0,FEMENINO,2022-04-03,20101.0,15236325.0,126753.0,185276.0,10657.0,1.85276
1923173,20220406,LIMA,LIMA,RIMAC,PCR,80.0,MASCULINO,2022-04-03,150128.0,1760415.0,1606321.0,9674755.0,14954.0,96.74755


In [22]:
filtered_dates['PROVINCIA'] = filtered_dates['PROVINCIA'].replace('EN INVESTIGACIÃN', np.nan)
filtered_dates

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_dates['PROVINCIA'] = filtered_dates['PROVINCIA'].replace('EN INVESTIGACIÃN', np.nan)


Unnamed: 0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX_x,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona,METODODX_y,POBLACION,INDICE,POBLACION_CIENMIL
2469064,20220406,LIMA,LIMA,SAN BORJA,AG,49.0,MASCULINO,2022-02-13,150130.0,5937772.0,1606321.0,9674755.0,14954.0,96.74755
2941365,20220406,LA LIBERTAD,TRUJILLO,FLORENCIA DE MORA,AG,4.0,MASCULINO,2022-02-13,130103.0,10237179.0,146566.0,1118724.0,7493.0,11.18724
3478847,20220406,PASCO,PASCO,YANACANCHA,AG,0.0,FEMENINO,2022-02-13,190113.0,38522808.0,23960.0,125164.0,8438.0,1.25164
478945,20220406,LIMA,LIMA,LIMA,AG,56.0,FEMENINO,2022-02-13,150101.0,18381860.0,1606321.0,9674755.0,14954.0,96.74755
329330,20220406,LIMA,LIMA,JESUS MARIA,AG,30.0,FEMENINO,2022-02-13,150113.0,17298715.0,1606321.0,9674755.0,14954.0,96.74755
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1318682,20220406,AREQUIPA,AREQUIPA,MARIANO MELGAR,AG,7.0,FEMENINO,2022-04-03,40109.0,29270417.0,212768.0,1175765.0,14297.0,11.75765
1108682,20220406,ANCASH,POMABAMBA,POMABAMBA,AG,26.0,FEMENINO,2022-04-03,21601.0,26297858.0,126753.0,26675.0,10657.0,0.26675
181391,20220406,ANCASH,HUARAZ,HUARAZ,AG,24.0,FEMENINO,2022-04-03,20101.0,15236325.0,126753.0,185276.0,10657.0,1.85276
1923173,20220406,LIMA,LIMA,RIMAC,PCR,80.0,MASCULINO,2022-04-03,150128.0,1760415.0,1606321.0,9674755.0,14954.0,96.74755


In [23]:
filtered_dates.isnull().sum()

FECHA_CORTE             0
DEPARTAMENTO            9
PROVINCIA            2689
DISTRITO             2689
METODODX_x              0
EDAD                    0
SEXO                    0
FECHA_RESULTADO         0
UBIGEO               2689
id_persona           2278
METODODX_y              9
POBLACION            2771
INDICE                  9
POBLACION_CIENMIL    2771
dtype: int64

In [24]:
weekly_sales = filtered_dates.groupby(["SEXO","DEPARTAMENTO","PROVINCIA", "POBLACION", "POBLACION_CIENMIL", "INDICE", pd.Grouper(key="FECHA_RESULTADO",freq="W-SUN")]).size()
weekly_sales = weekly_sales.unstack(0).fillna(0)
weekly_sales.sort_values(by=['FECHA_RESULTADO'], inplace=True, ascending=True)
weekly_sales.loc[:,'TOTAL'] = weekly_sales.sum(numeric_only=True, axis=1)

weekly_sales = weekly_sales.reset_index()
weekly_sales

SEXO,DEPARTAMENTO,PROVINCIA,POBLACION,POBLACION_CIENMIL,INDICE,FECHA_RESULTADO,FEMENINO,MASCULINO,TOTAL
0,AMAZONAS,BAGUA,84672.0,0.84672,9741.0,2022-02-13,1.0,5.0,6.0
1,AYACUCHO,HUAMANGA,317801.0,3.17801,7311.0,2022-02-13,2.0,4.0,6.0
2,PIURA,PIURA,894847.0,8.94847,8045.0,2022-02-13,33.0,30.0,63.0
3,AYACUCHO,HUANTA,97205.0,0.97205,7311.0,2022-02-13,2.0,1.0,3.0
4,PIURA,PAITA,148289.0,1.48289,8045.0,2022-02-13,4.0,1.0,5.0
...,...,...,...,...,...,...,...,...,...
1288,PUNO,PUNO,230219.0,2.30219,5378.0,2022-04-03,25.0,42.0,67.0
1289,CUSCO,ESPINAR,62059.0,0.62059,8384.0,2022-04-03,8.0,4.0,12.0
1290,ANCASH,SANTA,474053.0,4.74053,10657.0,2022-04-03,66.0,44.0,110.0
1291,CUSCO,LA CONVENCION,167910.0,1.67910,8384.0,2022-04-03,2.0,2.0,4.0


In [25]:
weekly_sales.groupby("FECHA_RESULTADO").count()

SEXO,DEPARTAMENTO,PROVINCIA,POBLACION,POBLACION_CIENMIL,INDICE,FEMENINO,MASCULINO,TOTAL
FECHA_RESULTADO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2022-02-13,138,138,138,138,138,138,138,138
2022-02-20,190,190,190,190,190,190,190,190
2022-02-27,180,180,180,180,180,180,180,180
2022-03-06,174,174,174,174,174,174,174,174
2022-03-13,167,167,167,167,167,167,167,167
2022-03-20,161,161,161,161,161,161,161,161
2022-03-27,148,148,148,148,148,148,148,148
2022-04-03,135,135,135,135,135,135,135,135


In [26]:
cero_cases = weekly_sales['PROVINCIA']== "ATALAYA"
data_cero = weekly_sales[cero_cases]
data_cero

SEXO,DEPARTAMENTO,PROVINCIA,POBLACION,POBLACION_CIENMIL,INDICE,FECHA_RESULTADO,FEMENINO,MASCULINO,TOTAL
327,UCAYALI,ATALAYA,61049.0,0.61049,6606.0,2022-02-20,1.0,1.0,2.0


In [27]:
weekly_sales[['FEMENINO','MASCULINO', 'TOTAL']] = weekly_sales[['FEMENINO','MASCULINO', 'TOTAL']].div(weekly_sales['POBLACION_CIENMIL'].values,axis=0)
weekly_sales

SEXO,DEPARTAMENTO,PROVINCIA,POBLACION,POBLACION_CIENMIL,INDICE,FECHA_RESULTADO,FEMENINO,MASCULINO,TOTAL
0,AMAZONAS,BAGUA,84672.0,0.84672,9741.0,2022-02-13,1.181028,5.905140,7.086168
1,AYACUCHO,HUAMANGA,317801.0,3.17801,7311.0,2022-02-13,0.629325,1.258649,1.887974
2,PIURA,PIURA,894847.0,8.94847,8045.0,2022-02-13,3.687781,3.352528,7.040310
3,AYACUCHO,HUANTA,97205.0,0.97205,7311.0,2022-02-13,2.057507,1.028754,3.086261
4,PIURA,PAITA,148289.0,1.48289,8045.0,2022-02-13,2.697435,0.674359,3.371794
...,...,...,...,...,...,...,...,...,...
1288,PUNO,PUNO,230219.0,2.30219,5378.0,2022-04-03,10.859225,18.243499,29.102724
1289,CUSCO,ESPINAR,62059.0,0.62059,8384.0,2022-04-03,12.890959,6.445479,19.336438
1290,ANCASH,SANTA,474053.0,4.74053,10657.0,2022-04-03,13.922494,9.281663,23.204156
1291,CUSCO,LA CONVENCION,167910.0,1.67910,8384.0,2022-04-03,1.191114,1.191114,2.382229


In [28]:
weekly_sales.FEMENINO = weekly_sales.FEMENINO.round()
weekly_sales.MASCULINO = weekly_sales.MASCULINO.round()
weekly_sales.TOTAL = weekly_sales.FEMENINO + weekly_sales.MASCULINO
weekly_sales.FECHA_RESULTADO = weekly_sales.FECHA_RESULTADO.dt.strftime('%Y-%m-%d')
weekly_sales

SEXO,DEPARTAMENTO,PROVINCIA,POBLACION,POBLACION_CIENMIL,INDICE,FECHA_RESULTADO,FEMENINO,MASCULINO,TOTAL
0,AMAZONAS,BAGUA,84672.0,0.84672,9741.0,2022-02-13,1.0,6.0,7.0
1,AYACUCHO,HUAMANGA,317801.0,3.17801,7311.0,2022-02-13,1.0,1.0,2.0
2,PIURA,PIURA,894847.0,8.94847,8045.0,2022-02-13,4.0,3.0,7.0
3,AYACUCHO,HUANTA,97205.0,0.97205,7311.0,2022-02-13,2.0,1.0,3.0
4,PIURA,PAITA,148289.0,1.48289,8045.0,2022-02-13,3.0,1.0,4.0
...,...,...,...,...,...,...,...,...,...
1288,PUNO,PUNO,230219.0,2.30219,5378.0,2022-04-03,11.0,18.0,29.0
1289,CUSCO,ESPINAR,62059.0,0.62059,8384.0,2022-04-03,13.0,6.0,19.0
1290,ANCASH,SANTA,474053.0,4.74053,10657.0,2022-04-03,14.0,9.0,23.0
1291,CUSCO,LA CONVENCION,167910.0,1.67910,8384.0,2022-04-03,1.0,1.0,2.0


In [29]:
weekly_sales.to_csv('dataset_covid_total.csv' , index=False)

In [30]:
weekly_sales.to_json('dataset_covid_total.json', orient="table")