In [2]:
import numpy as np
import pandas as pd
import datetime as dt
import seaborn as sns
#import missingno as msno

# matplotlib
from matplotlib import pyplot as plt
from matplotlib.dates import date2num, num2date
from matplotlib import dates as mdates
from matplotlib import ticker
from matplotlib.colors import ListedColormap
from matplotlib.patches import Patch

# scipy specifics
from scipy import stats as sps
from scipy.interpolate import interp1d

In [3]:
try:
    from urllib.request import Request, urlopen  # Python 3
except ImportError:
    from urllib2 import Request, urlopen  # Python 2

req = Request('https://cloud.minsa.gob.pe/s/AC2adyLkHCKjmfm/download')
req.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0')
content = urlopen(req)

test = pd.read_csv(content , sep=";", na_values='EN INVESTIGACIÓN', encoding='utf-8')
print(test)

         FECHA_CORTE DEPARTAMENTO PROVINCIA           DISTRITO METODODX  EDAD  \
0           20210823   LAMBAYEQUE  CHICLAYO           CHICLAYO       PR  28.0   
1           20210823     AREQUIPA  AREQUIPA     CERRO COLORADO      PCR  28.0   
2           20210823         PUNO      PUNO               PUNO      PCR  27.0   
3           20210823   SAN MARTIN     RIOJA             AWAJUN       PR  32.0   
4           20210823         LIMA      LIMA         CHORRILLOS      PCR  28.0   
...              ...          ...       ...                ...      ...   ...   
2143686     20210823         LIMA    HUARAL            CHANCAY      PCR   NaN   
2143687     20210823        JUNIN  HUANCAYO             CHILCA       PR  24.0   
2143688     20210823        PIURA    TALARA            MANCORA       AG  26.0   
2143689     20210823         LIMA      LIMA  VILLA EL SALVADOR       AG  43.0   
2143690     20210823         LIMA      LIMA               LIMA      PCR  44.0   

              SEXO  FECHA_R

In [4]:
test['FECHA_RESULTADO'] = pd.to_datetime(test['FECHA_RESULTADO'].astype(str), format='%Y%m%d')
test.tail()

Unnamed: 0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona
2143686,20210823,LIMA,HUARAL,CHANCAY,PCR,,FEMENINO,2021-01-10,150605.0,
2143687,20210823,JUNIN,HUANCAYO,CHILCA,PR,24.0,FEMENINO,2020-08-14,120107.0,
2143688,20210823,PIURA,TALARA,MANCORA,AG,26.0,FEMENINO,2021-03-23,200706.0,
2143689,20210823,LIMA,LIMA,VILLA EL SALVADOR,AG,43.0,MASCULINO,2021-03-30,150142.0,
2143690,20210823,LIMA,LIMA,LIMA,PCR,44.0,FEMENINO,2020-05-15,150101.0,


In [5]:
test.isnull().sum()

FECHA_CORTE             0
DEPARTAMENTO            0
PROVINCIA          107707
DISTRITO           107707
METODODX                0
EDAD                  347
SEXO                    1
FECHA_RESULTADO      2023
UBIGEO             107707
id_persona          35735
dtype: int64

In [6]:
indice_departamento = pd.read_csv('https://raw.githubusercontent.com/annaabsi/git-scraper-covid19/main/resultados/positivos_por_departamentos.csv')
indice_departamento

Unnamed: 0,DEPARTAMENTO,METODODX,POBLACION,INDICE
0,AMAZONAS,31112,426806,7289
1,ANCASH,75727,1180638,6414
2,APURIMAC,25775,430736,5984
3,AREQUIPA,112997,1497438,7546
4,AYACUCHO,32768,668213,4904
5,CAJAMARCA,64767,1453711,4455
6,CALLAO,99236,1129854,8783
7,CUSCO,71810,1357075,5292
8,HUANCAVELICA,15918,365317,4357
9,HUANUCO,35109,760267,4618


In [7]:
test = pd.merge(test, indice_departamento,
                        how="left", on=["DEPARTAMENTO"])
test

Unnamed: 0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX_x,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona,METODODX_y,POBLACION,INDICE
0,20210823,LAMBAYEQUE,CHICLAYO,CHICLAYO,PR,28.0,FEMENINO,2021-06-04,140101.0,24654388.0,60538,1310785,4618
1,20210823,AREQUIPA,AREQUIPA,CERRO COLORADO,PCR,28.0,MASCULINO,2020-10-06,40104.0,24654406.0,112997,1497438,7546
2,20210823,PUNO,PUNO,PUNO,PCR,27.0,FEMENINO,2020-06-26,210101.0,24654422.0,38347,1237997,3098
3,20210823,SAN MARTIN,RIOJA,AWAJUN,PR,32.0,FEMENINO,2020-10-28,220802.0,24654429.0,48065,899648,5343
4,20210823,LIMA,LIMA,CHORRILLOS,PCR,28.0,MASCULINO,2021-04-22,150108.0,24654438.0,935243,10628470,8799
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2143686,20210823,LIMA,HUARAL,CHANCAY,PCR,,FEMENINO,2021-01-10,150605.0,,935243,10628470,8799
2143687,20210823,JUNIN,HUANCAYO,CHILCA,PR,24.0,FEMENINO,2020-08-14,120107.0,,83585,1361467,6139
2143688,20210823,PIURA,TALARA,MANCORA,AG,26.0,FEMENINO,2021-03-23,200706.0,,85613,2047954,4180
2143689,20210823,LIMA,LIMA,VILLA EL SALVADOR,AG,43.0,MASCULINO,2021-03-30,150142.0,,935243,10628470,8799


In [8]:
poblacion_csv = pd.read_csv('poblacion_provincia.csv')
poblacion_csv

Unnamed: 0,UBIGEO,PROVINCIA,POBLACION
0,10100,CHACHAPOYAS,63188
1,10200,BAGUA,84672
2,10300,BONGARA,26830
3,10400,CONDORCANQUI,51344
4,10500,LUYA,47827
...,...,...,...
191,240300,ZARUMILLA,56038
192,250100,CORONEL PORTILLO,447733
193,250200,ATALAYA,61049
194,250300,PADRE ABAD,77044


In [9]:
poblacion_dict = poblacion_csv.to_dict('split')
poblacion_dict['data']

[[10100, 'CHACHAPOYAS', 63188],
 [10200, 'BAGUA', 84672],
 [10300, 'BONGARA', 26830],
 [10400, 'CONDORCANQUI', 51344],
 [10500, 'LUYA', 47827],
 [10600, 'RODRIGUEZ DE MENDOZA', 33651],
 [10700, 'UTCUBAMBA', 119294],
 [20100, 'HUARAZ', 185276],
 [20200, 'AIJA', 6433],
 [20300, 'ANTONIO RAYMONDI', 13950],
 [20400, 'ASUNCION', 7710],
 [20500, 'BOLOGNESI', 24012],
 [20600, 'CARHUAZ', 50007],
 [20700, 'CARLOS FERMIN FITZCARRALD', 18496],
 [20800, 'CASMA', 57256],
 [20900, 'CORONGO', 8017],
 [21000, 'HUARI', 63264],
 [21100, 'HUARMEY', 33066],
 [21200, 'HUAYLAS', 56557],
 [21300, 'MARISCAL LUZURIAGA', 21787],
 [21400, 'OCROS', 7224],
 [21500, 'PALLASCA', 24371],
 [21600, 'POMABAMBA', 26675],
 [21700, 'RECUAY', 18085],
 [21800, 'SANTA', 474053],
 [21900, 'SIHUAS', 28630],
 [22000, 'YUNGAY', 55769],
 [30100, 'ABANCAY', 120116],
 [30200, 'ANDAHUAYLAS', 150758],
 [30300, 'ANTABAMBA', 11781],
 [30400, 'AYMARAES', 24570],
 [30500, 'COTABAMBAS', 55208],
 [30600, 'CHINCHEROS', 46544],
 [30700, 'GRAU

In [10]:
poblacion =[]
for x in range(len(poblacion_dict['data'])):
    array = poblacion_dict['data'][x][2]
    poblacion.append(array)
    
poblacion

[63188,
 84672,
 26830,
 51344,
 47827,
 33651,
 119294,
 185276,
 6433,
 13950,
 7710,
 24012,
 50007,
 18496,
 57256,
 8017,
 63264,
 33066,
 56557,
 21787,
 7224,
 24371,
 26675,
 18085,
 474053,
 28630,
 55769,
 120116,
 150758,
 11781,
 24570,
 55208,
 46544,
 21759,
 1175765,
 61708,
 43690,
 34743,
 97458,
 16426,
 54851,
 12797,
 317801,
 32482,
 8341,
 97205,
 75277,
 51838,
 29139,
 9909,
 9292,
 19866,
 17063,
 388170,
 83167,
 83916,
 151714,
 29357,
 123948,
 83913,
 203724,
 145770,
 51678,
 47114,
 22638,
 38602,
 1129854,
 511019,
 24000,
 63131,
 71582,
 34754,
 106476,
 70143,
 62059,
 167910,
 26644,
 47579,
 101735,
 70043,
 121265,
 37503,
 53901,
 14588,
 33883,
 18182,
 85995,
 315799,
 53247,
 32427,
 16372,
 52095,
 138275,
 29160,
 50086,
 36987,
 17114,
 18705,
 445752,
 262110,
 78472,
 14832,
 174016,
 595183,
 59138,
 167385,
 88405,
 22757,
 239105,
 91849,
 40041,
 57604,
 1118724,
 123480,
 15982,
 86411,
 30987,
 85091,
 112970,
 85092,
 168670,
 55868

In [11]:
provincia =[]
for x in range(len(poblacion_dict['data'])):
    array = poblacion_dict['data'][x][1]
    provincia.append(array)
    
provincia

['CHACHAPOYAS',
 'BAGUA',
 'BONGARA',
 'CONDORCANQUI',
 'LUYA',
 'RODRIGUEZ DE MENDOZA',
 'UTCUBAMBA',
 'HUARAZ',
 'AIJA',
 'ANTONIO RAYMONDI',
 'ASUNCION',
 'BOLOGNESI',
 'CARHUAZ',
 'CARLOS FERMIN FITZCARRALD',
 'CASMA',
 'CORONGO',
 'HUARI',
 'HUARMEY',
 'HUAYLAS',
 'MARISCAL LUZURIAGA',
 'OCROS',
 'PALLASCA',
 'POMABAMBA',
 'RECUAY',
 'SANTA',
 'SIHUAS',
 'YUNGAY',
 'ABANCAY',
 'ANDAHUAYLAS',
 'ANTABAMBA',
 'AYMARAES',
 'COTABAMBAS',
 'CHINCHEROS',
 'GRAU',
 'AREQUIPA',
 'CAMANA',
 'CARAVELI',
 'CASTILLA',
 'CAYLLOMA',
 'CONDESUYOS',
 'ISLAY',
 'LA UNION',
 'HUAMANGA',
 'CANGALLO',
 'HUANCA SANCOS',
 'HUANTA',
 'LA MAR',
 'LUCANAS',
 'PARINACOCHAS',
 'PAUCAR DEL SARA SARA',
 'SUCRE',
 'VICTOR FAJARDO',
 'VILCAS HUAMAN',
 'CAJAMARCA',
 'CAJABAMBA',
 'CELENDIN',
 'CHOTA',
 'CONTUMAZA',
 'CUTERVO',
 'HUALGAYOC',
 'JAEN',
 'SAN IGNACIO',
 'SAN MARCOS',
 'SAN MIGUEL',
 'SAN PABLO',
 'SANTA CRUZ',
 'CALLAO',
 'CUSCO',
 'ACOMAYO',
 'ANTA',
 'CALCA',
 'CANAS',
 'CANCHIS',
 'CHUMBIVILCAS',


In [12]:
res = {provincia[i]: poblacion[i] for i in range(len(provincia))}
res

{'CHACHAPOYAS': 63188,
 'BAGUA': 84672,
 'BONGARA': 26830,
 'CONDORCANQUI': 51344,
 'LUYA': 47827,
 'RODRIGUEZ DE MENDOZA': 33651,
 'UTCUBAMBA': 119294,
 'HUARAZ': 185276,
 'AIJA': 6433,
 'ANTONIO RAYMONDI': 13950,
 'ASUNCION': 7710,
 'BOLOGNESI': 24012,
 'CARHUAZ': 50007,
 'CARLOS FERMIN FITZCARRALD': 18496,
 'CASMA': 57256,
 'CORONGO': 8017,
 'HUARI': 63264,
 'HUARMEY': 33066,
 'HUAYLAS': 56557,
 'MARISCAL LUZURIAGA': 21787,
 'OCROS': 7224,
 'PALLASCA': 24371,
 'POMABAMBA': 26675,
 'RECUAY': 18085,
 'SANTA': 474053,
 'SIHUAS': 28630,
 'YUNGAY': 55769,
 'ABANCAY': 120116,
 'ANDAHUAYLAS': 150758,
 'ANTABAMBA': 11781,
 'AYMARAES': 24570,
 'COTABAMBAS': 55208,
 'CHINCHEROS': 46544,
 'GRAU': 21759,
 'AREQUIPA': 1175765,
 'CAMANA': 61708,
 'CARAVELI': 43690,
 'CASTILLA': 34743,
 'CAYLLOMA': 97458,
 'CONDESUYOS': 16426,
 'ISLAY': 54851,
 'LA UNION': 12797,
 'HUAMANGA': 317801,
 'CANGALLO': 32482,
 'HUANCA SANCOS': 8341,
 'HUANTA': 97205,
 'LA MAR': 75277,
 'LUCANAS': 51838,
 'PARINACOCHAS':

In [13]:
test['POBLACION'] = test['PROVINCIA'].map(res)

In [14]:
null_columns = test.columns[test.isnull().any()]
test[test["SEXO"].isnull()][null_columns]

Unnamed: 0,PROVINCIA,DISTRITO,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona,POBLACION
1577655,LIMA,LIMA,0.0,,2021-03-13,150101.0,10881464.0,9674755.0


In [15]:
find_provincia = test['PROVINCIA'] == 'LIMA'
find_distrito = test['DISTRITO'] == 'LIMA'
find_fecha = test['FECHA_RESULTADO'] == '2021-03-13'

data_exploratoria = test[find_provincia & find_distrito & find_fecha]
data_exploratoria

Unnamed: 0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX_x,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona,METODODX_y,POBLACION,INDICE
4689,20210823,LIMA,LIMA,LIMA,AG,32.0,MASCULINO,2021-03-13,150101.0,24632508.0,935243,9674755.0,8799
22959,20210823,LIMA,LIMA,LIMA,AG,25.0,FEMENINO,2021-03-13,150101.0,25824674.0,935243,9674755.0,8799
23591,20210823,LIMA,LIMA,LIMA,AG,37.0,MASCULINO,2021-03-13,150101.0,25323679.0,935243,9674755.0,8799
34711,20210823,LIMA,LIMA,LIMA,AG,35.0,FEMENINO,2021-03-13,150101.0,19505698.0,935243,9674755.0,8799
41998,20210823,LIMA,LIMA,LIMA,AG,30.0,FEMENINO,2021-03-13,150101.0,19628861.0,935243,9674755.0,8799
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2082044,20210823,LIMA,LIMA,LIMA,AG,39.0,MASCULINO,2021-03-13,150101.0,18910640.0,935243,9674755.0,8799
2110457,20210823,LIMA,LIMA,LIMA,PCR,0.0,FEMENINO,2021-03-13,150101.0,,935243,9674755.0,8799
2115212,20210823,LIMA,LIMA,LIMA,PCR,38.0,FEMENINO,2021-03-13,150101.0,,935243,9674755.0,8799
2120313,20210823,LIMA,LIMA,LIMA,PCR,25.0,MASCULINO,2021-03-13,150101.0,,935243,9674755.0,8799


In [16]:
data_exploratoria.groupby("SEXO").count()

Unnamed: 0_level_0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX_x,EDAD,FECHA_RESULTADO,UBIGEO,id_persona,METODODX_y,POBLACION,INDICE
SEXO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
FEMENINO,88,88,88,88,88,88,88,88,85,88,88,88
MASCULINO,87,87,87,87,87,87,87,87,86,87,87,87


In [17]:
test['SEXO'] = test['SEXO'].fillna('FEMENINO')

In [18]:
null_columns = test.columns[test.isnull().any()]
test[test["PROVINCIA"].isnull()][null_columns]

Unnamed: 0,PROVINCIA,DISTRITO,EDAD,FECHA_RESULTADO,UBIGEO,id_persona,POBLACION
25,,,42.0,2021-01-21,,25651572.0,
51,,,29.0,2021-04-07,,24853561.0,
65,,,27.0,2021-02-26,,24769312.0,
96,,,35.0,2021-02-06,,24662423.0,
101,,,20.0,2020-07-07,,24653321.0,
...,...,...,...,...,...,...,...
2143599,,,24.0,2021-04-23,,,
2143604,,,42.0,2021-02-20,,,
2143606,,,45.0,2021-03-25,,,
2143656,,,46.0,2021-02-13,,,


In [19]:
test['FECHA_RESULTADO'] = pd.to_datetime(test['FECHA_RESULTADO'], errors='coerce', dayfirst=True)
test['FECHA_RESULTADO']

0         2021-06-04
1         2020-10-06
2         2020-06-26
3         2020-10-28
4         2021-04-22
             ...    
2143686   2021-01-10
2143687   2020-08-14
2143688   2021-03-23
2143689   2021-03-30
2143690   2020-05-15
Name: FECHA_RESULTADO, Length: 2143691, dtype: datetime64[ns]

In [20]:
test['POBLACION_CIENMIL'] = test['POBLACION']/100000

#test = test.drop(labels="POBLACION_CIENMIL", axis=1)
test

Unnamed: 0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX_x,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona,METODODX_y,POBLACION,INDICE,POBLACION_CIENMIL
0,20210823,LAMBAYEQUE,CHICLAYO,CHICLAYO,PR,28.0,FEMENINO,2021-06-04,140101.0,24654388.0,60538,862709.0,4618,8.62709
1,20210823,AREQUIPA,AREQUIPA,CERRO COLORADO,PCR,28.0,MASCULINO,2020-10-06,40104.0,24654406.0,112997,1175765.0,7546,11.75765
2,20210823,PUNO,PUNO,PUNO,PCR,27.0,FEMENINO,2020-06-26,210101.0,24654422.0,38347,230219.0,3098,2.30219
3,20210823,SAN MARTIN,RIOJA,AWAJUN,PR,32.0,FEMENINO,2020-10-28,220802.0,24654429.0,48065,131651.0,5343,1.31651
4,20210823,LIMA,LIMA,CHORRILLOS,PCR,28.0,MASCULINO,2021-04-22,150108.0,24654438.0,935243,9674755.0,8799,96.74755
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2143686,20210823,LIMA,HUARAL,CHANCAY,PCR,,FEMENINO,2021-01-10,150605.0,,935243,194375.0,8799,1.94375
2143687,20210823,JUNIN,HUANCAYO,CHILCA,PR,24.0,FEMENINO,2020-08-14,120107.0,,83585,595183.0,6139,5.95183
2143688,20210823,PIURA,TALARA,MANCORA,AG,26.0,FEMENINO,2021-03-23,200706.0,,85613,154268.0,4180,1.54268
2143689,20210823,LIMA,LIMA,VILLA EL SALVADOR,AG,43.0,MASCULINO,2021-03-30,150142.0,,935243,9674755.0,8799,96.74755


In [84]:
salidasxsemanas = test.sort_values(by = 'FECHA_RESULTADO')
start_date = "2021-06-28"
end_date = "2021-08-22"

after_start_date = salidasxsemanas["FECHA_RESULTADO"] >= start_date
before_end_date = salidasxsemanas["FECHA_RESULTADO"] <= end_date
between_two_dates = after_start_date & before_end_date

filtered_dates = salidasxsemanas.loc[between_two_dates]

filtered_dates

Unnamed: 0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX_x,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona,METODODX_y,POBLACION,INDICE,POBLACION_CIENMIL
1806767,20210823,LAMBAYEQUE,CHICLAYO,CHICLAYO,AG,19.0,MASCULINO,2021-06-28,140101.0,14150339.0,60538,862709.0,4618,8.62709
507948,20210823,PUNO,PUNO,PUNO,AG,16.0,MASCULINO,2021-06-28,210101.0,30727642.0,38347,230219.0,3098,2.30219
308747,20210823,JUNIN,HUANCAYO,SAN AGUSTIN,AG,30.0,MASCULINO,2021-06-28,120129.0,24424956.0,83585,595183.0,6139,5.95183
1443156,20210823,LIMA,LIMA,MAGDALENA DEL MAR,PCR,48.0,MASCULINO,2021-06-28,150120.0,8795323.0,935243,9674755.0,8799,96.74755
102086,20210823,TACNA,TACNA,CIUDAD NUEVA,AG,43.0,MASCULINO,2021-06-28,230104.0,20665543.0,29499,346192.0,7952,3.46192
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2083482,20210823,CUSCO,CHUMBIVILCAS,SANTO TOMAS,AG,38.0,FEMENINO,2021-08-22,80701.0,18848231.0,71810,70143.0,5292,0.70143
526293,20210823,HUANCAVELICA,HUANCAVELICA,ASCENSION,AG,18.0,FEMENINO,2021-08-22,90118.0,31723217.0,15918,121265.0,4357,1.21265
857451,20210823,MOQUEGUA,MARISCAL NIETO,SAMEGUA,AG,52.0,FEMENINO,2021-08-22,180104.0,1226654.0,29161,95551.0,15130,0.95551
1853582,20210823,LIMA,LIMA,SANTA ANITA,PCR,23.0,FEMENINO,2021-08-22,150137.0,15147742.0,935243,9674755.0,8799,96.74755


In [85]:
filtered_dates['PROVINCIA'] = filtered_dates['PROVINCIA'].replace('EN INVESTIGACIÃN', np.nan)
filtered_dates

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_dates['PROVINCIA'] = filtered_dates['PROVINCIA'].replace('EN INVESTIGACIÃN', np.nan)


Unnamed: 0,FECHA_CORTE,DEPARTAMENTO,PROVINCIA,DISTRITO,METODODX_x,EDAD,SEXO,FECHA_RESULTADO,UBIGEO,id_persona,METODODX_y,POBLACION,INDICE,POBLACION_CIENMIL
1806767,20210823,LAMBAYEQUE,CHICLAYO,CHICLAYO,AG,19.0,MASCULINO,2021-06-28,140101.0,14150339.0,60538,862709.0,4618,8.62709
507948,20210823,PUNO,PUNO,PUNO,AG,16.0,MASCULINO,2021-06-28,210101.0,30727642.0,38347,230219.0,3098,2.30219
308747,20210823,JUNIN,HUANCAYO,SAN AGUSTIN,AG,30.0,MASCULINO,2021-06-28,120129.0,24424956.0,83585,595183.0,6139,5.95183
1443156,20210823,LIMA,LIMA,MAGDALENA DEL MAR,PCR,48.0,MASCULINO,2021-06-28,150120.0,8795323.0,935243,9674755.0,8799,96.74755
102086,20210823,TACNA,TACNA,CIUDAD NUEVA,AG,43.0,MASCULINO,2021-06-28,230104.0,20665543.0,29499,346192.0,7952,3.46192
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2083482,20210823,CUSCO,CHUMBIVILCAS,SANTO TOMAS,AG,38.0,FEMENINO,2021-08-22,80701.0,18848231.0,71810,70143.0,5292,0.70143
526293,20210823,HUANCAVELICA,HUANCAVELICA,ASCENSION,AG,18.0,FEMENINO,2021-08-22,90118.0,31723217.0,15918,121265.0,4357,1.21265
857451,20210823,MOQUEGUA,MARISCAL NIETO,SAMEGUA,AG,52.0,FEMENINO,2021-08-22,180104.0,1226654.0,29161,95551.0,15130,0.95551
1853582,20210823,LIMA,LIMA,SANTA ANITA,PCR,23.0,FEMENINO,2021-08-22,150137.0,15147742.0,935243,9674755.0,8799,96.74755


In [86]:
filtered_dates.isnull().sum()

FECHA_CORTE             0
DEPARTAMENTO            0
PROVINCIA            2220
DISTRITO             2220
METODODX_x              0
EDAD                    2
SEXO                    0
FECHA_RESULTADO         0
UBIGEO               2220
id_persona           1613
METODODX_y              0
POBLACION            2409
INDICE                  0
POBLACION_CIENMIL    2409
dtype: int64

In [87]:
weekly_sales = filtered_dates.groupby(["SEXO","DEPARTAMENTO","PROVINCIA", "POBLACION", "POBLACION_CIENMIL", "INDICE", pd.Grouper(key="FECHA_RESULTADO",freq="W-SUN")]).size()
weekly_sales = weekly_sales.unstack(0).fillna(0)
weekly_sales.sort_values(by=['FECHA_RESULTADO'], inplace=True, ascending=True)
weekly_sales.loc[:,'TOTAL'] = weekly_sales.sum(numeric_only=True, axis=1)

weekly_sales = weekly_sales.reset_index()
weekly_sales

SEXO,DEPARTAMENTO,PROVINCIA,POBLACION,POBLACION_CIENMIL,INDICE,FECHA_RESULTADO,FEMENINO,MASCULINO,TOTAL
0,AMAZONAS,BAGUA,84672.0,0.84672,7289,2021-07-04,31.0,27.0,58.0
1,HUANCAVELICA,ANGARAES,53901.0,0.53901,4357,2021-07-04,1.0,2.0,3.0
2,PIURA,PIURA,894847.0,8.94847,4180,2021-07-04,310.0,326.0,636.0
3,SAN MARTIN,TOCACHE,76450.0,0.76450,5343,2021-07-04,6.0,3.0,9.0
4,ANCASH,YUNGAY,55769.0,0.55769,6414,2021-07-04,15.0,12.0,27.0
...,...,...,...,...,...,...,...,...,...
1426,SAN MARTIN,BELLAVISTA,60893.0,0.60893,5343,2021-08-22,0.0,2.0,2.0
1427,HUANUCO,PUERTO INCA,36987.0,0.36987,4618,2021-08-22,8.0,3.0,11.0
1428,AYACUCHO,CANGALLO,32482.0,0.32482,4904,2021-08-22,1.0,1.0,2.0
1429,AYACUCHO,LA MAR,75277.0,0.75277,4904,2021-08-22,23.0,15.0,38.0


In [88]:
weekly_sales.groupby("FECHA_RESULTADO").count()

SEXO,DEPARTAMENTO,PROVINCIA,POBLACION,POBLACION_CIENMIL,INDICE,FEMENINO,MASCULINO,TOTAL
FECHA_RESULTADO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-07-04,183,183,183,183,183,183,183,183
2021-07-11,183,183,183,183,183,183,183,183
2021-07-18,184,184,184,184,184,184,184,184
2021-07-25,180,180,180,180,180,180,180,180
2021-08-01,179,179,179,179,179,179,179,179
2021-08-08,177,177,177,177,177,177,177,177
2021-08-15,176,176,176,176,176,176,176,176
2021-08-22,169,169,169,169,169,169,169,169


In [89]:
cero_cases = weekly_sales['TOTAL']==0
data_cero = weekly_sales[cero_cases]
data_cero

SEXO,DEPARTAMENTO,PROVINCIA,POBLACION,POBLACION_CIENMIL,INDICE,FECHA_RESULTADO,FEMENINO,MASCULINO,TOTAL


In [90]:
weekly_sales[['FEMENINO','MASCULINO', 'TOTAL']] = weekly_sales[['FEMENINO','MASCULINO', 'TOTAL']].div(weekly_sales['POBLACION_CIENMIL'].values,axis=0)
weekly_sales

SEXO,DEPARTAMENTO,PROVINCIA,POBLACION,POBLACION_CIENMIL,INDICE,FECHA_RESULTADO,FEMENINO,MASCULINO,TOTAL
0,AMAZONAS,BAGUA,84672.0,0.84672,7289,2021-07-04,36.611867,31.887755,68.499622
1,HUANCAVELICA,ANGARAES,53901.0,0.53901,4357,2021-07-04,1.855253,3.710506,5.565759
2,PIURA,PIURA,894847.0,8.94847,4180,2021-07-04,34.642794,36.430809,71.073603
3,SAN MARTIN,TOCACHE,76450.0,0.76450,5343,2021-07-04,7.848267,3.924133,11.772400
4,ANCASH,YUNGAY,55769.0,0.55769,6414,2021-07-04,26.896663,21.517330,48.413993
...,...,...,...,...,...,...,...,...,...
1426,SAN MARTIN,BELLAVISTA,60893.0,0.60893,5343,2021-08-22,0.000000,3.284450,3.284450
1427,HUANUCO,PUERTO INCA,36987.0,0.36987,4618,2021-08-22,21.629221,8.110958,29.740179
1428,AYACUCHO,CANGALLO,32482.0,0.32482,4904,2021-08-22,3.078628,3.078628,6.157256
1429,AYACUCHO,LA MAR,75277.0,0.75277,4904,2021-08-22,30.553821,19.926405,50.480226


In [91]:
weekly_sales.FEMENINO = weekly_sales.FEMENINO.round()
weekly_sales.MASCULINO = weekly_sales.MASCULINO.round()
weekly_sales.TOTAL = weekly_sales.FEMENINO + weekly_sales.MASCULINO
weekly_sales.FECHA_RESULTADO = weekly_sales.FECHA_RESULTADO.dt.strftime('%Y-%m-%d')
weekly_sales

SEXO,DEPARTAMENTO,PROVINCIA,POBLACION,POBLACION_CIENMIL,INDICE,FECHA_RESULTADO,FEMENINO,MASCULINO,TOTAL
0,AMAZONAS,BAGUA,84672.0,0.84672,7289,2021-07-04,37.0,32.0,69.0
1,HUANCAVELICA,ANGARAES,53901.0,0.53901,4357,2021-07-04,2.0,4.0,6.0
2,PIURA,PIURA,894847.0,8.94847,4180,2021-07-04,35.0,36.0,71.0
3,SAN MARTIN,TOCACHE,76450.0,0.76450,5343,2021-07-04,8.0,4.0,12.0
4,ANCASH,YUNGAY,55769.0,0.55769,6414,2021-07-04,27.0,22.0,49.0
...,...,...,...,...,...,...,...,...,...
1426,SAN MARTIN,BELLAVISTA,60893.0,0.60893,5343,2021-08-22,0.0,3.0,3.0
1427,HUANUCO,PUERTO INCA,36987.0,0.36987,4618,2021-08-22,22.0,8.0,30.0
1428,AYACUCHO,CANGALLO,32482.0,0.32482,4904,2021-08-22,3.0,3.0,6.0
1429,AYACUCHO,LA MAR,75277.0,0.75277,4904,2021-08-22,31.0,20.0,51.0


In [92]:
weekly_sales.to_csv('dataset_covid_total.csv' , index=False)

In [93]:
weekly_sales.to_json('dataset_covid_total.json', orient="table")