In [1]:
import influxdb_client
import os
import time
import pandas as pd
from influxdb_client.client.write_api import SYNCHRONOUS
from influxdb_client.client.query_api import TableList

In [2]:
token = "MyInitialAdminToken0=="
# token = os.environ.get("INFLUXDB_TOKEN")
# export your token into the environment variable INFLUXDB_TOKEN first
# or you can just replace "os.environ.get("INFLUXDB_TOKEN")" with your token
# it is not recommended to hardcode your token in the code, simple but unsafe
url = "http://10.10.10.250:8086"
# replace url with your server address
# if your server is on the same machine, use "http://localhost:8086"

client = influxdb_client.InfluxDBClient(url=url, token=token)

In [3]:
# In this tutorial we will use publicly available dataset COVID-19 in schools
# The State of Connecticut provides numbers for the School COVID-19 cases through 2020-2022 in multiple formats,
# among other useful information about the pandemic.
# download link 
# - https://data.ct.gov/Health-and-Human-Services/COVID-19-Cases-in-CT-Schools-By-School-2021-2022-S/8xd9-2eym/data_preview
# - https://data.ct.gov/Health-and-Human-Services/COVID-19-Cases-in-CT-Schools-By-School-2020-2021-S/u8jq-fxc2/about_data


data_2021 = pd.read_csv("./covid-19_cases_in_ct_schooles_2021-2022.csv", index_col=False)
data_2020 = pd.read_csv("./covid-19_cases_in_ct_schooles_2020-2021.csv", index_col=False)

In [4]:
data_2021.head()

Unnamed: 0,District,School Name,City,Report Period,Total Cases,Academic Year,Date Updated
0,Bridgeport School District,A Child's World,Bridgeport,08/26/2021 - 09/01/2021,0,2021-2022,06/30/2022
1,Guilford School District,A. Baldwin Middle School,Guilford,08/26/2021 - 09/01/2021,0,2021-2022,06/30/2022
2,Connecticut Technical Education and Career System,A. I. Prince Technical High School,Hartford,08/26/2021 - 09/01/2021,0,2021-2022,06/30/2022
3,Guilford School District,A. W. Cox School,Guilford,08/26/2021 - 09/01/2021,0,2021-2022,06/30/2022
4,Suffield School District,A. Ward Spaulding School,Suffield,08/26/2021 - 09/01/2021,0,2021-2022,06/30/2022


In [5]:
data_2020.head()

Unnamed: 0,District,School ID,School name,City,School total,Report period,Date updated
0,Andover School District,1402,Andover Elementary School,Andover,0,10/08/2020 - 10/14/2020,06/23/2021
1,Andover School District,1402,Andover Elementary School,Andover,0,10/15/2020 - 10/21/2020,06/23/2021
2,Andover School District,1402,Andover Elementary School,Andover,0,10/22/2020 - 10/28/2020,06/23/2021
3,Andover School District,1402,Andover Elementary School,Andover,0,10/29/2020 - 11/04/2020,06/23/2021
4,Andover School District,1402,Andover Elementary School,Andover,0,11/05/2020 - 11/11/2020,06/23/2021


In [6]:
# align these two data

# for data_2020:
# - remove "School ID" column
# - change column name "School total" to "Total Cases"
# - add column "Academic Year" = "2020-2021"

data_2020.drop(columns = ["School ID"], inplace=True)
data_2020.rename(columns = {"School total": "Total Cases"}, inplace = True)
data_2020["Academic Year"] = "2020-2021"

# lowercase all column names
data_2020.columns = data_2020.columns.str.lower()
data_2021.columns = data_2021.columns.str.lower()

In [7]:
# we take a look at the data schema
data = pd.concat([data_2020, data_2021], ignore_index=True)
data.head()

Unnamed: 0,district,school name,city,total cases,report period,date updated,academic year
0,Andover School District,Andover Elementary School,Andover,0,10/08/2020 - 10/14/2020,06/23/2021,2020-2021
1,Andover School District,Andover Elementary School,Andover,0,10/15/2020 - 10/21/2020,06/23/2021,2020-2021
2,Andover School District,Andover Elementary School,Andover,0,10/22/2020 - 10/28/2020,06/23/2021,2020-2021
3,Andover School District,Andover Elementary School,Andover,0,10/29/2020 - 11/04/2020,06/23/2021,2020-2021
4,Andover School District,Andover Elementary School,Andover,0,11/05/2020 - 11/11/2020,06/23/2021,2020-2021


In [8]:
# Normalize the data before inserting into InfluxDB
# However, this preprocessing part is unrelated the InfluxDB, so we will skip the mining process 
# directly give the conclusion and the final data schema


# - The report period (report period): 
# we could explode this measurement for the time period, but select the first date of the selected range
# This means we will see updates for each town, every 7 days.

# - The school total cases (total cases):
# there is "<6" value, we convert it to 5.
# total cases should be a numerical data, we consider whose data can not be converted to numerical as missing data
# and filter out the rows with missing data


# - The meta data (school name, city, district). 
# In the case of influxdb this constitutes a 'data point' and once defined it cannot be changed.

data['report period'] = data['report period'].str.split(" - ").str[0]

data.dropna(inplace=True)
data['total cases'] = data['total cases'].apply(lambda x: '5' if x == "<6" else x)
# others value all use string


# convert total cases to integer
data['total cases'] = data['total cases'].astype(int)


# convert report perid to timestamp
data['report period'] = pd.to_datetime(data['report period'])


In [9]:
# InfluxDB schema design for COVID-19 cases in schools dataset.
# - The bucket, we set 'covid-19-schools'

# - The Measurements (equivalent to a table in a RDBMS), we set 'cases'

# - The Tags, a combination of keys and values used to annotate data (think of them as metadata)
#  'school name', 'district', 'city'

# - Fields: the actual data being stored. Fields are typed and can be one of :int, float(default), or string
# Though the value can changeover time, their type is fixed and must be consistent over time.
# In our example, 'total cases' is a field.

# - Time: the fabric of our data. Each Point requires a timestamp.
# We derive the timestamp from the 'report period' column.

In [10]:
BUCKET_NAME = "covid-schools"
DEFAULT_ORG = "docs"
bucket = client.buckets_api().find_bucket_by_name(bucket_name=BUCKET_NAME)
if bucket:
    # bucket exist. reset it.
    print(f"Delete existing bucket {BUCKET_NAME}")
    client.buckets_api().delete_bucket(bucket)

bucket = client.buckets_api().create_bucket(bucket_name=BUCKET_NAME, org_id=DEFAULT_ORG)
if bucket:
    print(f"Create new bucket {BUCKET_NAME}")

Create new bucket covid-schools


In [11]:
# insert these data into InfluxDB
write_api = client.write_api(write_options=SYNCHRONOUS)
MEASUREMENT = "cases"
write_api.write(
    bucket=BUCKET_NAME,
    org=DEFAULT_ORG,
    record = data,
    data_frame_measurement_name =  MEASUREMENT,
    data_frame_tag_columns = ["school name", "district", "city", "academic year", "date updated"],
    data_frame_field_columns = ["total cases"],
    data_frame_timestamp_column = "report period",
)


In [12]:
query_api = client.query_api()
query = """from(bucket: "covid-schools")
|> range(start: -3y)
|> filter(fn: (r) => r.city == "Greenwich")
|> yield() """

# FluxQL query
# retrieve the last 3 years data for the city of Greenwich
tables: TableList = query_api.query(query, org="docs")

# [Suggestion]: better to execute this query in the InfluxDB UI, which will visualize the data for you.

# to_value() will convert the result to a list of record Dict 
# we check the head of the result
tables.to_values()[:5]

[dict_values(['_result', 0, datetime.datetime(2022, 2, 14, 20, 8, 6, 578051, tzinfo=tzlocal()), datetime.datetime(2025, 2, 14, 14, 8, 6, 578051, tzinfo=tzlocal()), datetime.datetime(2022, 5, 5, 0, 0, tzinfo=tzlocal()), 0, 'total cases', 'cases', '2021-2022', 'Greenwich', '06/16/2022', 'Greenwich School District', 'Abilis']),
 dict_values(['_result', 0, datetime.datetime(2022, 2, 14, 20, 8, 6, 578051, tzinfo=tzlocal()), datetime.datetime(2025, 2, 14, 14, 8, 6, 578051, tzinfo=tzlocal()), datetime.datetime(2022, 5, 12, 0, 0, tzinfo=tzlocal()), 0, 'total cases', 'cases', '2021-2022', 'Greenwich', '06/16/2022', 'Greenwich School District', 'Abilis']),
 dict_values(['_result', 0, datetime.datetime(2022, 2, 14, 20, 8, 6, 578051, tzinfo=tzlocal()), datetime.datetime(2025, 2, 14, 14, 8, 6, 578051, tzinfo=tzlocal()), datetime.datetime(2022, 5, 19, 0, 0, tzinfo=tzlocal()), 0, 'total cases', 'cases', '2021-2022', 'Greenwich', '06/16/2022', 'Greenwich School District', 'Abilis']),
 dict_values(['_r

In [13]:
query = """from(bucket:"covid-schools")
|> range(start: -3y)
|> filter(fn: (r) => r._measurement == "cases" and r._field == "total cases")
|> group(columns: ["city"])
|> drop(columns: ["_start", "_stop"])
|> sum()
"""
# FluxQL query
# group the data by city and collect the total cases for each city

tables:TableList = query_api.query(query, org="docs")

tables.to_values()[:5]

[dict_values(['_result', 0, 'Andover', 63]),
 dict_values(['_result', 1, 'Ansonia', 122]),
 dict_values(['_result', 2, 'Ashford', 93]),
 dict_values(['_result', 3, 'Avon', 1052]),
 dict_values(['_result', 4, 'Barkhamsted', 50])]

In [14]:
# we can directly get the DataFrame result when quering using query_data_frame()
df = query_api.query_data_frame(query, org="docs")
df.head()

# following the warning msg to adjust your query message to get the optimal pandas.DateFrame result.


The result will not be shaped to optimal processing by pandas.DataFrame. Use the pivot() function by:

    from(bucket:"covid-schools")
|> range(start: -3y)
|> filter(fn: (r) => r._measurement == "cases" and r._field == "total cases")
|> group(columns: ["city"])
|> drop(columns: ["_start", "_stop"])
|> sum()
 |> pivot(rowKey:["_time"], columnKey: ["_field"], valueColumn: "_value")



For more info see:
    - https://docs.influxdata.com/resources/videos/pivots-in-flux/
    - https://docs.influxdata.com/flux/latest/stdlib/universe/pivot/
    - https://docs.influxdata.com/flux/latest/stdlib/influxdata/influxdb/schema/fieldsascols/



Unnamed: 0,result,table,city,_value
0,_result,0,Andover,63
1,_result,1,Ansonia,122
2,_result,2,Ashford,93
3,_result,3,Avon,1052
4,_result,4,Barkhamsted,50


In [15]:
query = """from(bucket:"covid-schools")
|> range(start: -3y)
|> filter(fn: (r) => r._measurement == "cases" and r._field == "total cases")
|> group(columns: ["city"])
|> drop(columns: ["_start", "_stop"])
|> sum()
|> filter(fn: (r) => r._value > 700)
"""
# Flux is a functional language, and you can keep piping the result to other filter.
# Here we filter out the city with total cases less than 700


tables:TableList = query_api.query(query, org="docs")
print(f"there are {len(tables)} cities with total cases more than 700")
tables.to_values()[:5]

there are 48 cities with total cases more than 700


[dict_values(['_result', 0, 'Avon', 1052]),
 dict_values(['_result', 1, 'Bloomfield', 718]),
 dict_values(['_result', 2, 'Bridgeport', 1056]),
 dict_values(['_result', 3, 'Cheshire', 1582]),
 dict_values(['_result', 4, 'Danbury', 2053])]