# Queensland COVID-19 statistics

The COVID-19 data comes from the [Queensland Government](https://www.qld.gov.au/health/conditions/health-alerts/coronavirus-covid-19/current-status/statistics) statistics page.

In [None]:
import pandas as pd
import requests
import re
from lxml import html
import time
import matplotlib.pyplot as plt
from IPython.core.display import display, HTML

In [None]:
# set statistic page url (reused below for both the table scrape and the raw HTML request)
url = 'https://www.qld.gov.au/health/conditions/health-alerts/coronavirus-covid-19/current-status/statistics'

# use pandas.read_html to read tables from html
# read_html fetches the page at `url` and returns a list of DataFrames, one per parsed <table>
data = pd.read_html(url)
# Show the number of tables
print(f'Extracted {len(data)} table/s')

In [None]:
# Case summary
# Shows the first table (index 0) that pd.read_html parsed from the statistics page.
# NOTE(review): which table sits at which index depends on the page layout at scrape time - confirm.
data[0]

In [None]:
# Cases by Hospital and Health Service
# Shows the table at index 1 of the list parsed from the statistics page.
# NOTE(review): which table sits at which index depends on the page layout at scrape time - confirm.
data[1]

In [None]:
# Likely source of infection
# Shows the table at index 2 of the list parsed from the statistics page.
# NOTE(review): which table sits at which index depends on the page layout at scrape time - confirm.
data[2]

In [None]:
# Cases by Local Government Area and likely source of infection
# C = Council, RC = Regional Council, S = Shire
# Shows the table at index 3 of the list parsed from the statistics page.
# NOTE(review): which table sits at which index depends on the page layout at scrape time - confirm.
data[3]

In [None]:
# Age and gender of cases
# Shows the table at index 4 of the list parsed from the statistics page.
# NOTE(review): which table sits at which index depends on the page layout at scrape time - confirm.
data[4]

In [None]:
# Testing number
# Shows the table at index 5 of the list parsed from the statistics page.
# NOTE(review): which table sits at which index depends on the page layout at scrape time - confirm.
data[5]

In [None]:
# Self-quarantine notices - Queensland
# Shows the table at index 6 of the list parsed from the statistics page.
# NOTE(review): which table sits at which index depends on the page layout at scrape time - confirm.
data[6]

In [None]:
# Self-quarantine notices
# Shows the table at index 7 of the list parsed from the statistics page.
# NOTE(review): which table sits at which index depends on the page layout at scrape time - confirm.
data[7]

**The method below for getting the total cases for each day can only be used before 31/12/2020.**

In [None]:
# parse the html and return the whole HTML file in a nice tree structure
# - requests has no default timeout, so one is set to stop the cell hanging on a stalled connection
# - raise_for_status() fails fast on an HTTP error instead of silently parsing an error page
page = requests.get(url, timeout=30)
page.raise_for_status()
tree = html.fromstring(page.content)

In [None]:
# XPath selecting the text of the first <script> child of the element whose id is
# "qg-primary-content"; that script is expected to carry the total cases by day
script_xpath = '//*[@id="qg-primary-content"]/script[1]/text()'

# evaluate the XPath against the parsed page tree
totalCases = tree.xpath(script_xpath)

# print the raw result so the script text can be inspected
print(totalCases)

In [None]:
# clean the list of total cases: split the script text into lines,
# accepting both "\r\n" and bare "\n" line endings
# NOTE: this rebinds totalCases, so re-run the XPath cell before re-running this one
totalCases = re.split(r"\r?\n", totalCases[0])

# find date and numbers in the list and put them in a DataFrame
# line index 2 is expected to hold the dates and line index 3 the counts
# (positions depend on the page's script layout)
# [A-Za-z] is used because [A-z] also matches the characters [ \ ] ^ _ `
date = re.findall(r"[0-9]+\s[A-Za-z]+", totalCases[2])
numbers = re.findall(r"[0-9]+", totalCases[3])

# building both columns at once raises ValueError if the two lists differ in length
case_by_day = pd.DataFrame({"Date": date, "Number": numbers})

In [None]:
# show the info of total cases DataFrame
# (both columns were filled from regex string matches, so they are not numeric yet)
case_by_day.info()

In [None]:
# add a new column to store DateTime
# the scraped dates carry no year, so "2020 " is prepended before parsing;
# pd.to_datetime yields a real datetime column instead of time.struct_time objects
case_by_day["DateTime"] = pd.to_datetime("2020 " + case_by_day["Date"], format="%Y %d %b")

# convert Number from regex-matched strings to integers
case_by_day["Number"] = case_by_day["Number"].astype(int)
case_by_day.info()

In [None]:
# plot bar chart of the daily case numbers
# explicit figure/axes handles instead of the implicit pyplot state machine
fig, ax = plt.subplots(figsize=(28, 10))

# hide the top and right frame lines
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

dates = case_by_day['Date']
cases = case_by_day['Number']
ax.bar(dates, cases)

ax.set_title('Daily Confirmed covid-19 cases in Queensland', fontsize=30)
# y-label spelling fixed ("confirmeed" -> "confirmed")
ax.set_ylabel('Number of confirmed covid-19 cases', fontsize=24)

# put a tick on every third bar only, with vertical labels, to keep the axis readable
ax.set_xticks(range(0, len(dates), 3))
ax.tick_params(axis='x', labelrotation=90, labelsize=14)
ax.tick_params(axis='y', labelsize=18)

# "linear" is matplotlib's default scale; kept explicit as in the original cell
ax.set_yscale("linear")
plt.show()

In [None]:
# plot line chart of the daily case numbers
# explicit figure/axes handles instead of the implicit pyplot state machine
fig, ax = plt.subplots(figsize=(28, 10))

# hide the top and right frame lines
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

dates = case_by_day['Date']
cases = case_by_day['Number']
ax.plot(dates, cases, marker='.')

ax.set_title('Daily Confirmed covid-19 cases in Queensland', fontsize=30)
# y-label spelling fixed ("confirmeed" -> "confirmed")
ax.set_ylabel('Number of confirmed covid-19 cases', fontsize=24)

# put a tick on every third point only, with vertical labels, to keep the axis readable
ax.set_xticks(range(0, len(dates), 3))
ax.tick_params(axis='x', labelrotation=90, labelsize=14)
ax.tick_params(axis='y', labelsize=18)

# 'linear' is matplotlib's default scale; kept explicit as in the original cell
ax.set_yscale('linear')
plt.show()