In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import seaborn as sns
import urllib.request
from io import StringIO
import requests
#import missingno as msno

# matplotlib
from matplotlib import pyplot as plt
from matplotlib.dates import date2num, num2date
from matplotlib import dates as mdates
from matplotlib import ticker
from matplotlib.colors import ListedColormap
from matplotlib.patches import Patch

# scipy specifics
from scipy import stats as sps
from scipy.interpolate import interp1d

In [2]:
try:
    from urllib.request import Request, urlopen  # Python 3
except ImportError:
    from urllib2 import Request, urlopen  # Python 2

req = Request('https://files.minsa.gob.pe/s/eRqxR35ZCxrzNgr/download')
req.add_header('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:66.0) Gecko/20100101 Firefox/66.0')
content = urlopen(req)

test = pd.read_csv(content, sep=";", na_values='EN INVESTIGACIÓN', encoding='utf-8')
test

Unnamed: 0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona
0,20220612,LIMA,LIMA,JESUS MARIA,AG,20.0,FEMENINO,20210419.0,150113.0,13866369.0
1,20220612,JUNIN,HUANCAYO,EL TAMBO,AG,39.0,MASCULINO,20210429.0,120114.0,13866556.0
2,20220612,LIMA,,,AG,39.0,FEMENINO,20210702.0,,13866581.0
3,20220612,CAJAMARCA,JAEN,PUCARA,PR,37.0,FEMENINO,20200630.0,60808.0,13866692.0
4,20220612,LIMA,LIMA,JESUS MARIA,AG,53.0,FEMENINO,20210831.0,150113.0,13865937.0
...,...,...,...,...,...,...,...,...,...,...
3591810,20220612,LIMA,LIMA,LIMA,PCR,81.0,MASCULINO,20220126.0,150101.0,
3591811,20220612,LIMA,LIMA,CHORRILLOS,PCR,32.0,FEMENINO,20220118.0,150108.0,
3591812,20220612,LIMA,LIMA,CHORRILLOS,PCR,32.0,FEMENINO,20220118.0,150108.0,
3591813,20220612,LIMA,LIMA,LIMA,PCR,51.0,FEMENINO,20220127.0,150101.0,


In [3]:
#content= "positivos_covid.csv"

#test = pd.read_csv(content, sep=";", na_values='EN INVESTIGACIÓN', encoding='utf-8')
#test

In [4]:
test['FECHA_RESULTADO'] = pd.to_datetime(test['FECHA_RESULTADO'].astype(str), format='%Y%m%d')
test.tail()

Unnamed: 0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona
3591810,20220612,LIMA,LIMA,LIMA,PCR,81.0,MASCULINO,2022-01-26,150101.0,
3591811,20220612,LIMA,LIMA,CHORRILLOS,PCR,32.0,FEMENINO,2022-01-18,150108.0,
3591812,20220612,LIMA,LIMA,CHORRILLOS,PCR,32.0,FEMENINO,2022-01-18,150108.0,
3591813,20220612,LIMA,LIMA,LIMA,PCR,51.0,FEMENINO,2022-01-27,150101.0,
3591814,20220612,LIMA,LIMA,LIMA,PCR,33.0,FEMENINO,2022-01-26,150101.0,


In [5]:
test.isnull().sum()

FECHA_CORTE             0
DEPARTAMENTO          339
PROVINCIA          169854
DISTRITO           169854
METODODX                0
EDAD                  347
SEXO                    1
FECHA_RESULTADO      2023
UBIGEO             169854
id_persona          64976
dtype: int64

In [6]:
indice_departamento = pd.read_csv('https://raw.githubusercontent.com/annaabsi/git-scraper-covid19/main/resultados/positivos_por_departamentos.csv')
indice_departamento

Unnamed: 0,DEPARTAMENTO,METODODX,POBLACION,INDICE
0,AMAZONAS,44236,452125,9784
1,ANCASH,129244,1189403,10866
2,APURIMAC,39119,440629,8878
3,AREQUIPA,216682,1488247,14560
4,AYACUCHO,48251,658081,7332
5,CAJAMARCA,96278,1528904,6297
6,CALLAO,147594,1090990,13528
7,CUSCO,117548,1392648,8441
8,HUANCAVELICA,25829,414882,6226
9,HUANUCO,52270,823560,6347


In [7]:
test = pd.merge(test, indice_departamento,
                        how="left", on=["DEPARTAMENTO"])
test

Unnamed: 0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX_x,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona,METODODX_y,POBLACION,INDICE
0,20220612,LIMA,LIMA,JESUS MARIA,AG,20.0,FEMENINO,2021-04-19,150113.0,13866369.0,1633051.0,10741923.0,15203.0
1,20220612,JUNIN,HUANCAYO,EL TAMBO,AG,39.0,MASCULINO,2021-04-29,120114.0,13866556.0,126561.0,1340064.0,9444.0
2,20220612,LIMA,,,AG,39.0,FEMENINO,2021-07-02,,13866581.0,1633051.0,10741923.0,15203.0
3,20220612,CAJAMARCA,JAEN,PUCARA,PR,37.0,FEMENINO,2020-06-30,60808.0,13866692.0,96278.0,1528904.0,6297.0
4,20220612,LIMA,LIMA,JESUS MARIA,AG,53.0,FEMENINO,2021-08-31,150113.0,13865937.0,1633051.0,10741923.0,15203.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3591810,20220612,LIMA,LIMA,LIMA,PCR,81.0,MASCULINO,2022-01-26,150101.0,,1633051.0,10741923.0,15203.0
3591811,20220612,LIMA,LIMA,CHORRILLOS,PCR,32.0,FEMENINO,2022-01-18,150108.0,,1633051.0,10741923.0,15203.0
3591812,20220612,LIMA,LIMA,CHORRILLOS,PCR,32.0,FEMENINO,2022-01-18,150108.0,,1633051.0,10741923.0,15203.0
3591813,20220612,LIMA,LIMA,LIMA,PCR,51.0,FEMENINO,2022-01-27,150101.0,,1633051.0,10741923.0,15203.0


In [8]:
poblacion_csv = pd.read_csv('poblacion_provincia.csv')
poblacion_csv

Unnamed: 0,UBIGEO,PROVINCIA,POBLACION
0,10100,CHACHAPOYAS,63188
1,10200,BAGUA,84672
2,10300,BONGARA,26830
3,10400,CONDORCANQUI,51344
4,10500,LUYA,47827
...,...,...,...
191,240300,ZARUMILLA,56038
192,250100,CORONEL PORTILLO,447733
193,250200,ATALAYA,61049
194,250300,PADRE ABAD,77044


In [9]:
poblacion_dict = poblacion_csv.to_dict('split')
poblacion_dict['data']

[[10100, 'CHACHAPOYAS', 63188],
 [10200, 'BAGUA', 84672],
 [10300, 'BONGARA', 26830],
 [10400, 'CONDORCANQUI', 51344],
 [10500, 'LUYA', 47827],
 [10600, 'RODRIGUEZ DE MENDOZA', 33651],
 [10700, 'UTCUBAMBA', 119294],
 [20100, 'HUARAZ', 185276],
 [20200, 'AIJA', 6433],
 [20300, 'ANTONIO RAYMONDI', 13950],
 [20400, 'ASUNCION', 7710],
 [20500, 'BOLOGNESI', 24012],
 [20600, 'CARHUAZ', 50007],
 [20700, 'CARLOS FERMIN FITZCARRALD', 18496],
 [20800, 'CASMA', 57256],
 [20900, 'CORONGO', 8017],
 [21000, 'HUARI', 63264],
 [21100, 'HUARMEY', 33066],
 [21200, 'HUAYLAS', 56557],
 [21300, 'MARISCAL LUZURIAGA', 21787],
 [21400, 'OCROS', 7224],
 [21500, 'PALLASCA', 24371],
 [21600, 'POMABAMBA', 26675],
 [21700, 'RECUAY', 18085],
 [21800, 'SANTA', 474053],
 [21900, 'SIHUAS', 28630],
 [22000, 'YUNGAY', 55769],
 [30100, 'ABANCAY', 120116],
 [30200, 'ANDAHUAYLAS', 150758],
 [30300, 'ANTABAMBA', 11781],
 [30400, 'AYMARAES', 24570],
 [30500, 'COTABAMBAS', 55208],
 [30600, 'CHINCHEROS', 46544],
 [30700, 'GRAU

In [10]:
poblacion =[]
for x in range(len(poblacion_dict['data'])):
    array = poblacion_dict['data'][x][2]
    poblacion.append(array)
    
poblacion

[63188,
 84672,
 26830,
 51344,
 47827,
 33651,
 119294,
 185276,
 6433,
 13950,
 7710,
 24012,
 50007,
 18496,
 57256,
 8017,
 63264,
 33066,
 56557,
 21787,
 7224,
 24371,
 26675,
 18085,
 474053,
 28630,
 55769,
 120116,
 150758,
 11781,
 24570,
 55208,
 46544,
 21759,
 1175765,
 61708,
 43690,
 34743,
 97458,
 16426,
 54851,
 12797,
 317801,
 32482,
 8341,
 97205,
 75277,
 51838,
 29139,
 9909,
 9292,
 19866,
 17063,
 388170,
 83167,
 83916,
 151714,
 29357,
 123948,
 83913,
 203724,
 145770,
 51678,
 47114,
 22638,
 38602,
 1129854,
 511019,
 24000,
 63131,
 71582,
 34754,
 106476,
 70143,
 62059,
 167910,
 26644,
 47579,
 101735,
 70043,
 121265,
 37503,
 53901,
 14588,
 33883,
 18182,
 85995,
 315799,
 53247,
 32427,
 16372,
 52095,
 138275,
 29160,
 50086,
 36987,
 17114,
 18705,
 445752,
 262110,
 78472,
 14832,
 174016,
 595183,
 59138,
 167385,
 88405,
 22757,
 239105,
 91849,
 40041,
 57604,
 1118724,
 123480,
 15982,
 86411,
 30987,
 85091,
 112970,
 85092,
 168670,
 55868

In [11]:
provincia =[]
for x in range(len(poblacion_dict['data'])):
    array = poblacion_dict['data'][x][1]
    provincia.append(array)
    
provincia

['CHACHAPOYAS',
 'BAGUA',
 'BONGARA',
 'CONDORCANQUI',
 'LUYA',
 'RODRIGUEZ DE MENDOZA',
 'UTCUBAMBA',
 'HUARAZ',
 'AIJA',
 'ANTONIO RAYMONDI',
 'ASUNCION',
 'BOLOGNESI',
 'CARHUAZ',
 'CARLOS FERMIN FITZCARRALD',
 'CASMA',
 'CORONGO',
 'HUARI',
 'HUARMEY',
 'HUAYLAS',
 'MARISCAL LUZURIAGA',
 'OCROS',
 'PALLASCA',
 'POMABAMBA',
 'RECUAY',
 'SANTA',
 'SIHUAS',
 'YUNGAY',
 'ABANCAY',
 'ANDAHUAYLAS',
 'ANTABAMBA',
 'AYMARAES',
 'COTABAMBAS',
 'CHINCHEROS',
 'GRAU',
 'AREQUIPA',
 'CAMANA',
 'CARAVELI',
 'CASTILLA',
 'CAYLLOMA',
 'CONDESUYOS',
 'ISLAY',
 'LA UNION',
 'HUAMANGA',
 'CANGALLO',
 'HUANCA SANCOS',
 'HUANTA',
 'LA MAR',
 'LUCANAS',
 'PARINACOCHAS',
 'PAUCAR DEL SARA SARA',
 'SUCRE',
 'VICTOR FAJARDO',
 'VILCAS HUAMAN',
 'CAJAMARCA',
 'CAJABAMBA',
 'CELENDIN',
 'CHOTA',
 'CONTUMAZA',
 'CUTERVO',
 'HUALGAYOC',
 'JAEN',
 'SAN IGNACIO',
 'SAN MARCOS',
 'SAN MIGUEL',
 'SAN PABLO',
 'SANTA CRUZ',
 'CALLAO',
 'CUSCO',
 'ACOMAYO',
 'ANTA',
 'CALCA',
 'CANAS',
 'CANCHIS',
 'CHUMBIVILCAS',


In [12]:
res = {provincia[i]: poblacion[i] for i in range(len(provincia))}
res

{'CHACHAPOYAS': 63188,
 'BAGUA': 84672,
 'BONGARA': 26830,
 'CONDORCANQUI': 51344,
 'LUYA': 47827,
 'RODRIGUEZ DE MENDOZA': 33651,
 'UTCUBAMBA': 119294,
 'HUARAZ': 185276,
 'AIJA': 6433,
 'ANTONIO RAYMONDI': 13950,
 'ASUNCION': 7710,
 'BOLOGNESI': 24012,
 'CARHUAZ': 50007,
 'CARLOS FERMIN FITZCARRALD': 18496,
 'CASMA': 57256,
 'CORONGO': 8017,
 'HUARI': 63264,
 'HUARMEY': 33066,
 'HUAYLAS': 56557,
 'MARISCAL LUZURIAGA': 21787,
 'OCROS': 7224,
 'PALLASCA': 24371,
 'POMABAMBA': 26675,
 'RECUAY': 18085,
 'SANTA': 474053,
 'SIHUAS': 28630,
 'YUNGAY': 55769,
 'ABANCAY': 120116,
 'ANDAHUAYLAS': 150758,
 'ANTABAMBA': 11781,
 'AYMARAES': 24570,
 'COTABAMBAS': 55208,
 'CHINCHEROS': 46544,
 'GRAU': 21759,
 'AREQUIPA': 1175765,
 'CAMANA': 61708,
 'CARAVELI': 43690,
 'CASTILLA': 34743,
 'CAYLLOMA': 97458,
 'CONDESUYOS': 16426,
 'ISLAY': 54851,
 'LA UNION': 12797,
 'HUAMANGA': 317801,
 'CANGALLO': 32482,
 'HUANCA SANCOS': 8341,
 'HUANTA': 97205,
 'LA MAR': 75277,
 'LUCANAS': 51838,
 'PARINACOCHAS':

In [13]:
test['POBLACION'] = test['PROVINCIA'].map(res)

In [14]:
null_columns = test.columns[test.isnull().any()]
test[test["SEXO"].isnull()][null_columns]

Unnamed: 0,DEPARTAMENTO,PROVINCIA,DISTRITO,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona,METODODX_y,POBLACION,INDICE
3009260,LIMA,LIMA,LIMA,0.0,,2021-03-13,150101.0,10881464.0,1633051.0,9674755.0,15203.0


In [15]:
#find_provincia = test['PROVINCIA'] == 'UCAYALI'
find_distrito = test['DEPARTAMENTO'] == 'LORETO'
find_fecha = test['FECHA_RESULTADO'] == '2022-01-02'

data_exploratoria = test[find_distrito & find_fecha]
data_exploratoria

Unnamed: 0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX_x,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona,METODODX_y,POBLACION,INDICE
627425,20220612,LORETO,MAYNAS,SAN JUAN BAUTISTA,PR,32.0,FEMENINO,2022-01-02,160113.0,20925980.0,58013.0,550551.0,5291.0
789766,20220612,LORETO,MAYNAS,PUNCHANA,PR,31.0,MASCULINO,2022-01-02,160108.0,22425039.0,58013.0,550551.0,5291.0
1674501,20220612,LORETO,MAYNAS,IQUITOS,AG,24.0,MASCULINO,2022-01-02,160101.0,34702609.0,58013.0,550551.0,5291.0
1739718,20220612,LORETO,MAYNAS,PUNCHANA,AG,27.0,MASCULINO,2022-01-02,160108.0,19990922.0,58013.0,550551.0,5291.0
2170360,20220612,LORETO,MAYNAS,IQUITOS,AG,69.0,FEMENINO,2022-01-02,160101.0,3428514.0,58013.0,550551.0,5291.0


In [16]:
data_exploratoria.groupby("SEXO").count()

Unnamed: 0_level_0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX_x,EDAD,FECHA_RESULTADO,UBIGEO,id_persona,METODODX_y,POBLACION,INDICE
SEXO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
FEMENINO,2,2,2,2,2,2,2,2,2,2,2,2
MASCULINO,3,3,3,3,3,3,3,3,3,3,3,3


In [17]:
test['SEXO'] = test['SEXO'].fillna('FEMENINO')

In [18]:
null_columns = test.columns[test.isnull().any()]
test[test["PROVINCIA"].isnull()][null_columns]

Unnamed: 0,DEPARTAMENTO,PROVINCIA,DISTRITO,EDAD,FECHA_RESULTADO,UBIGEO,id_persona,METODODX_y,POBLACION,INDICE
2,LIMA,,,39.0,2021-07-02,,13866581.0,1633051.0,,15203.0
70,HUANUCO,,,52.0,2021-07-06,,13877907.0,52270.0,,6347.0
78,LIMA,,,26.0,2021-07-10,,13878571.0,1633051.0,,15203.0
134,LIMA,,,77.0,2022-02-08,,13925672.0,1633051.0,,15203.0
147,LIMA,,,68.0,2021-03-01,,13925493.0,1633051.0,,15203.0
...,...,...,...,...,...,...,...,...,...,...
3591687,LIMA,,,47.0,2021-06-03,,,1633051.0,,15203.0
3591688,LIMA,,,35.0,2021-06-09,,,1633051.0,,15203.0
3591689,LIMA,,,30.0,2021-05-30,,,1633051.0,,15203.0
3591746,CALLAO,,,17.0,2022-02-15,,,147594.0,,13528.0


In [19]:
test['FECHA_RESULTADO'] = pd.to_datetime(test['FECHA_RESULTADO'], errors='coerce', dayfirst=True)
test['FECHA_RESULTADO']

0         2021-04-19
1         2021-04-29
2         2021-07-02
3         2020-06-30
4         2021-08-31
             ...    
3591810   2022-01-26
3591811   2022-01-18
3591812   2022-01-18
3591813   2022-01-27
3591814   2022-01-26
Name: FECHA_RESULTADO, Length: 3591815, dtype: datetime64[ns]

In [20]:
test['POBLACION_CIENMIL'] = test['POBLACION']/100000

#test = test.drop(labels="POBLACION_CIENMIL", axis=1)
test

Unnamed: 0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX_x,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona,METODODX_y,POBLACION,INDICE,POBLACION_CIENMIL
0,20220612,LIMA,LIMA,JESUS MARIA,AG,20.0,FEMENINO,2021-04-19,150113.0,13866369.0,1633051.0,9674755.0,15203.0,96.74755
1,20220612,JUNIN,HUANCAYO,EL TAMBO,AG,39.0,MASCULINO,2021-04-29,120114.0,13866556.0,126561.0,595183.0,9444.0,5.95183
2,20220612,LIMA,,,AG,39.0,FEMENINO,2021-07-02,,13866581.0,1633051.0,,15203.0,
3,20220612,CAJAMARCA,JAEN,PUCARA,PR,37.0,FEMENINO,2020-06-30,60808.0,13866692.0,96278.0,203724.0,6297.0,2.03724
4,20220612,LIMA,LIMA,JESUS MARIA,AG,53.0,FEMENINO,2021-08-31,150113.0,13865937.0,1633051.0,9674755.0,15203.0,96.74755
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3591810,20220612,LIMA,LIMA,LIMA,PCR,81.0,MASCULINO,2022-01-26,150101.0,,1633051.0,9674755.0,15203.0,96.74755
3591811,20220612,LIMA,LIMA,CHORRILLOS,PCR,32.0,FEMENINO,2022-01-18,150108.0,,1633051.0,9674755.0,15203.0,96.74755
3591812,20220612,LIMA,LIMA,CHORRILLOS,PCR,32.0,FEMENINO,2022-01-18,150108.0,,1633051.0,9674755.0,15203.0,96.74755
3591813,20220612,LIMA,LIMA,LIMA,PCR,51.0,FEMENINO,2022-01-27,150101.0,,1633051.0,9674755.0,15203.0,96.74755


In [21]:
salidasxsemanas = test.sort_values(by = 'FECHA_RESULTADO')
start_date = "2022-04-24"
end_date = "2022-06-12"

after_start_date = salidasxsemanas["FECHA_RESULTADO"] >= start_date
before_end_date = salidasxsemanas["FECHA_RESULTADO"] <= end_date
between_two_dates = after_start_date & before_end_date

filtered_dates = salidasxsemanas.loc[between_two_dates]

filtered_dates

Unnamed: 0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX_x,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona,METODODX_y,POBLACION,INDICE,POBLACION_CIENMIL
1973702,20220612,LIMA,LIMA,SAN MIGUEL,AG,63.0,FEMENINO,2022-04-24,150136.0,1929629.0,1633051.0,9674755.0,15203.0,96.74755
1275570,20220612,LIMA,LIMA,SAN JUAN DE LURIGANCHO,PCR,28.0,FEMENINO,2022-04-24,150132.0,28452502.0,1633051.0,9674755.0,15203.0,96.74755
1372099,20220612,ANCASH,HUARAZ,HUARAZ,AG,48.0,MASCULINO,2022-04-24,20101.0,29980276.0,129244.0,185276.0,10866.0,1.85276
176346,20220612,LAMBAYEQUE,CHICLAYO,CHICLAYO,PCR,75.0,FEMENINO,2022-04-24,140101.0,15571834.0,105854.0,862709.0,7987.0,8.62709
1114302,20220612,LIMA,LIMA,ATE,PCR,25.0,MASCULINO,2022-04-24,150103.0,26397276.0,1633051.0,9674755.0,15203.0,96.74755
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83777,20220612,LIMA,LIMA,SANTIAGO DE SURCO,AG,19.0,MASCULINO,2022-06-12,150140.0,14696291.0,1633051.0,9674755.0,15203.0,96.74755
3203592,20220612,LIMA,LIMA,SAN BORJA,PCR,49.0,MASCULINO,2022-06-12,150130.0,12602451.0,1633051.0,9674755.0,15203.0,96.74755
1316366,20220612,LIMA,LIMA,SAN MIGUEL,PCR,22.0,FEMENINO,2022-06-12,150136.0,28947689.0,1633051.0,9674755.0,15203.0,96.74755
2677083,20220612,LIMA,LIMA,MAGDALENA DEL MAR,PCR,35.0,MASCULINO,2022-06-12,150120.0,7319182.0,1633051.0,9674755.0,15203.0,96.74755


In [22]:
filtered_dates['PROVINCIA'] = filtered_dates['PROVINCIA'].replace('EN INVESTIGACIÃN', np.nan)
filtered_dates

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_dates['PROVINCIA'] = filtered_dates['PROVINCIA'].replace('EN INVESTIGACIÃN', np.nan)


Unnamed: 0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX_x,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona,METODODX_y,POBLACION,INDICE,POBLACION_CIENMIL
1973702,20220612,LIMA,LIMA,SAN MIGUEL,AG,63.0,FEMENINO,2022-04-24,150136.0,1929629.0,1633051.0,9674755.0,15203.0,96.74755
1275570,20220612,LIMA,LIMA,SAN JUAN DE LURIGANCHO,PCR,28.0,FEMENINO,2022-04-24,150132.0,28452502.0,1633051.0,9674755.0,15203.0,96.74755
1372099,20220612,ANCASH,HUARAZ,HUARAZ,AG,48.0,MASCULINO,2022-04-24,20101.0,29980276.0,129244.0,185276.0,10866.0,1.85276
176346,20220612,LAMBAYEQUE,CHICLAYO,CHICLAYO,PCR,75.0,FEMENINO,2022-04-24,140101.0,15571834.0,105854.0,862709.0,7987.0,8.62709
1114302,20220612,LIMA,LIMA,ATE,PCR,25.0,MASCULINO,2022-04-24,150103.0,26397276.0,1633051.0,9674755.0,15203.0,96.74755
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83777,20220612,LIMA,LIMA,SANTIAGO DE SURCO,AG,19.0,MASCULINO,2022-06-12,150140.0,14696291.0,1633051.0,9674755.0,15203.0,96.74755
3203592,20220612,LIMA,LIMA,SAN BORJA,PCR,49.0,MASCULINO,2022-06-12,150130.0,12602451.0,1633051.0,9674755.0,15203.0,96.74755
1316366,20220612,LIMA,LIMA,SAN MIGUEL,PCR,22.0,FEMENINO,2022-06-12,150136.0,28947689.0,1633051.0,9674755.0,15203.0,96.74755
2677083,20220612,LIMA,LIMA,MAGDALENA DEL MAR,PCR,35.0,MASCULINO,2022-06-12,150120.0,7319182.0,1633051.0,9674755.0,15203.0,96.74755


In [23]:
filtered_dates.isnull().sum()

FECHA_CORTE             0
DEPARTAMENTO           18
PROVINCIA             853
DISTRITO              853
METODODX_x              0
EDAD                    0
SEXO                    0
FECHA_RESULTADO         0
UBIGEO                853
id_persona           1201
METODODX_y             18
POBLACION             866
INDICE                 18
POBLACION_CIENMIL     866
dtype: int64

In [24]:
weekly_sales = filtered_dates.groupby(["SEXO","DEPARTAMENTO","PROVINCIA", "POBLACION", "POBLACION_CIENMIL", "INDICE", pd.Grouper(key="FECHA_RESULTADO",freq="W-SUN")]).size()
weekly_sales = weekly_sales.unstack(0).fillna(0)
weekly_sales.sort_values(by=['FECHA_RESULTADO'], inplace=True, ascending=True)
weekly_sales.loc[:,'TOTAL'] = weekly_sales.sum(numeric_only=True, axis=1)

weekly_sales = weekly_sales.reset_index()
weekly_sales

SEXO,DEPARTAMENTO,PROVINCIA,POBLACION,POBLACION_CIENMIL,INDICE,FECHA_RESULTADO,FEMENINO,MASCULINO,TOTAL
0,PUNO,MELGAR,69693.0,0.69693,5438.0,2022-04-24,0.0,1.0,1.0
1,PUNO,CHUCUITO,88112.0,0.88112,5438.0,2022-04-24,1.0,0.0,1.0
2,LIMA,HUAROCHIRI,62381.0,0.62381,15203.0,2022-04-24,0.0,1.0,1.0
3,ANCASH,HUARMEY,33066.0,0.33066,10866.0,2022-04-24,1.0,0.0,1.0
4,CAJAMARCA,CAJAMARCA,388170.0,3.88170,6297.0,2022-04-24,1.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...
673,LA LIBERTAD,ASCOPE,123480.0,1.23480,7525.0,2022-06-12,1.0,0.0,1.0
674,ANCASH,HUARAZ,185276.0,1.85276,10866.0,2022-06-12,64.0,39.0,103.0
675,PUNO,CHUCUITO,88112.0,0.88112,5438.0,2022-06-12,2.0,3.0,5.0
676,PUNO,HUANCANE,58957.0,0.58957,5438.0,2022-06-12,0.0,1.0,1.0


In [25]:
weekly_sales.groupby("FECHA_RESULTADO").count()

SEXO,DEPARTAMENTO,PROVINCIA,POBLACION,POBLACION_CIENMIL,INDICE,FEMENINO,MASCULINO,TOTAL
FECHA_RESULTADO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2022-04-24,21,21,21,21,21,21,21,21
2022-05-01,100,100,100,100,100,100,100,100
2022-05-08,86,86,86,86,86,86,86,86
2022-05-15,91,91,91,91,91,91,91,91
2022-05-22,91,91,91,91,91,91,91,91
2022-05-29,87,87,87,87,87,87,87,87
2022-06-05,100,100,100,100,100,100,100,100
2022-06-12,102,102,102,102,102,102,102,102


In [26]:
cero_cases = weekly_sales['PROVINCIA']== "ATALAYA"
data_cero = weekly_sales[cero_cases]
data_cero

SEXO,DEPARTAMENTO,PROVINCIA,POBLACION,POBLACION_CIENMIL,INDICE,FECHA_RESULTADO,FEMENINO,MASCULINO,TOTAL


In [27]:
weekly_sales[['FEMENINO','MASCULINO', 'TOTAL']] = weekly_sales[['FEMENINO','MASCULINO', 'TOTAL']].div(weekly_sales['POBLACION_CIENMIL'].values,axis=0)
weekly_sales

SEXO,DEPARTAMENTO,PROVINCIA,POBLACION,POBLACION_CIENMIL,INDICE,FECHA_RESULTADO,FEMENINO,MASCULINO,TOTAL
0,PUNO,MELGAR,69693.0,0.69693,5438.0,2022-04-24,0.000000,1.434864,1.434864
1,PUNO,CHUCUITO,88112.0,0.88112,5438.0,2022-04-24,1.134919,0.000000,1.134919
2,LIMA,HUAROCHIRI,62381.0,0.62381,15203.0,2022-04-24,0.000000,1.603052,1.603052
3,ANCASH,HUARMEY,33066.0,0.33066,10866.0,2022-04-24,3.024255,0.000000,3.024255
4,CAJAMARCA,CAJAMARCA,388170.0,3.88170,6297.0,2022-04-24,0.257619,0.257619,0.515238
...,...,...,...,...,...,...,...,...,...
673,LA LIBERTAD,ASCOPE,123480.0,1.23480,7525.0,2022-06-12,0.809848,0.000000,0.809848
674,ANCASH,HUARAZ,185276.0,1.85276,10866.0,2022-06-12,34.543060,21.049677,55.592737
675,PUNO,CHUCUITO,88112.0,0.88112,5438.0,2022-06-12,2.269838,3.404758,5.674596
676,PUNO,HUANCANE,58957.0,0.58957,5438.0,2022-06-12,0.000000,1.696151,1.696151


In [28]:
weekly_sales.FEMENINO = weekly_sales.FEMENINO.round()
weekly_sales.MASCULINO = weekly_sales.MASCULINO.round()
weekly_sales.TOTAL = weekly_sales.FEMENINO + weekly_sales.MASCULINO
weekly_sales.FECHA_RESULTADO = weekly_sales.FECHA_RESULTADO.dt.strftime('%Y-%m-%d')
weekly_sales

SEXO,DEPARTAMENTO,PROVINCIA,POBLACION,POBLACION_CIENMIL,INDICE,FECHA_RESULTADO,FEMENINO,MASCULINO,TOTAL
0,PUNO,MELGAR,69693.0,0.69693,5438.0,2022-04-24,0.0,1.0,1.0
1,PUNO,CHUCUITO,88112.0,0.88112,5438.0,2022-04-24,1.0,0.0,1.0
2,LIMA,HUAROCHIRI,62381.0,0.62381,15203.0,2022-04-24,0.0,2.0,2.0
3,ANCASH,HUARMEY,33066.0,0.33066,10866.0,2022-04-24,3.0,0.0,3.0
4,CAJAMARCA,CAJAMARCA,388170.0,3.88170,6297.0,2022-04-24,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
673,LA LIBERTAD,ASCOPE,123480.0,1.23480,7525.0,2022-06-12,1.0,0.0,1.0
674,ANCASH,HUARAZ,185276.0,1.85276,10866.0,2022-06-12,35.0,21.0,56.0
675,PUNO,CHUCUITO,88112.0,0.88112,5438.0,2022-06-12,2.0,3.0,5.0
676,PUNO,HUANCANE,58957.0,0.58957,5438.0,2022-06-12,0.0,2.0,2.0


In [29]:
weekly_sales.to_csv('dataset_covid_total.csv' , index=False)

In [30]:
weekly_sales.to_json('dataset_covid_total.json', orient="table")