# SARS Data
23 March 2020

Cecilia García López de Munain

## Packages

In [1]:
import os
import codecs 
import json
import pandas as pd

## Parameters

Source: https://github.com/aute/SARS_data/blob/master/Infected%20InChina.json

In [2]:
# Folders are built with os.path.join so the separator matches the host OS
# (a hard-coded backslash only forms a nested path on Windows).
PATH_INPUT_DATA = os.path.join('data', 'input')
PATH_OUTPUT_DATA = os.path.join('data', 'interim')

# File names of the raw JSON source and of the CSV written by the export step.
INPUT_DATA_NAME = 'Infected InChina.json'
OUTPUT_DATA_NAME = 'sars_data.csv'

## Import

In [3]:
# Load the raw JSON file. The encoding is stated explicitly so the read does
# not depend on the platform default (the city names are Chinese text).
input_path = os.path.join(PATH_INPUT_DATA, INPUT_DATA_NAME)
with open(input_path, 'r', encoding='utf-8') as data_file:
    data = json.load(data_file)

## Process Data

In [4]:
# Flatten the {date: [register, ...]} mapping loaded from JSON into one row
# per register, keeping the date it was reported under.
fields = ['city', 'cumulative', 'cured', 'deaths', 'uncertain']

# Collect all rows first and build the frame once: DataFrame.append copied the
# whole frame on every call and was removed in pandas 2.0.
records = []
for date, registers in data.items():
    for register in registers:
        row = {'date': date}
        for field in fields:
            # .get() leaves a missing key as None instead of raising.
            row[field] = register.get(field)
        records.append(row)

# Passing the column list keeps the column order even when there are no rows.
sars_data = pd.DataFrame(records, columns=['date'] + fields)

# Assign the column directly: on pandas >= 2.0, `.loc[:, 'date'] = ...` writes
# in place and can leave the column with object dtype.
sars_data['date'] = pd.to_datetime(sars_data['date'])
sars_data.set_index('date', inplace=True)

sars_data.head()

date,city,cumulative,cured,deaths,uncertain
2003-04-21,广东,1317,1136.0,48,
2003-04-21,北京,482,43.0,25,610.0
2003-04-21,山西,120,6.0,7,61.0
2003-04-21,内蒙古,30,,6,48.0
2003-04-21,四川,8,3.0,2,7.0


In [5]:
# Give each distinct value of the city column an integer id, numbered in order
# of first appearance. Every distinct value gets an id, including the empty
# string and None shown in the output below.
# NOTE(review): '总计' and '合计' look like aggregate "total" rows rather than
# real regions; confirm whether they should be kept.
distinct_cities = sars_data.city.unique()
dict_cities = {
    city_name: city_id
    for city_id, city_name in enumerate(distinct_cities)
}
dict_cities

{'广东': 0,
 '北京': 1,
 '山西': 2,
 '内蒙古': 3,
 '四川': 4,
 '广西': 5,
 '河南': 6,
 '宁夏': 7,
 '吉林': 8,
 '浙江': 9,
 '辽宁': 10,
 '甘肃': 11,
 '陕西': 12,
 '湖南': 13,
 '上海': 14,
 '河北': 15,
 '天津': 16,
 '新疆': 17,
 '重庆': 18,
 '总计': 19,
 '山东': 20,
 '湖北': 21,
 '福建': 22,
 '江苏': 23,
 '贵州': 24,
 '江西': 25,
 '安徽': 26,
 '黑龙江': 27,
 '合计': 28,
 '': 29,
 None: 30}

In [6]:
# Add the integer id of each row's city as a new column.
sars_data['id_city'] = sars_data['city'].map(dict_cities)

In [7]:
# Preview the first rows to check the id_city column.
sars_data.head()

date,city,cumulative,cured,deaths,uncertain,id_city
2003-04-21,广东,1317,1136.0,48,,0
2003-04-21,北京,482,43.0,25,610.0,1
2003-04-21,山西,120,6.0,7,61.0,2
2003-04-21,内蒙古,30,,6,48.0,3
2003-04-21,四川,8,3.0,2,7.0,4


## Export

In [8]:
# Create the output folder if it is missing; to_csv raises an error when the
# target directory does not exist.
os.makedirs(PATH_OUTPUT_DATA, exist_ok=True)
sars_data.to_csv(os.path.join(PATH_OUTPUT_DATA, OUTPUT_DATA_NAME))