In [1]:
import influxdb_client
import os
import time
from datetime import datetime
import pandas as pd
from influxdb_client.client.write_api import SYNCHRONOUS,ASYNCHRONOUS
from influxdb_client.client.query_api import TableList
from influxdb_client.client.write_api import Point
import s2cell

In [2]:
# token = os.environ.get("INFLUXDB_TOKEN")
token = "267rK84Nww4rPYmhqRyPAVXgOsZSyL07kzbexhKGMf6nRMHfOfa6KNEMxCVX08QWQvxzBZRVU0DWJazUEoNolg=="
# export your token into the environment variable INFLUXDB_TOKEN first
url = "http://10.10.10.247:8086"
# replace url with your server address
# if your server is on the same machine, use "http://localhost:8086"

# set time 30s, the default is 10s may not be enough in this tutorial
timeout = 60 * 1000
client = influxdb_client.InfluxDBClient(url=url, token=token, timeout=timeout)

In [3]:
# In this tutoriakl we will investigate the data from Hartford Police Department
# This historical dataset reflects reported incidents of crime (with the execption of sexual assaults)
# that occurred in the City of Hartford from January 1, 2005 to May 18, 2021.

# download link
# - https://data.hartford.gov/datasets/hartfordgis::police-incidents-01012005-to-05182021/about
data = pd.read_csv("./Police_Incidents_01012005_to_05182021.csv", index_col=False)

In [4]:
data.head()

Unnamed: 0,X,Y,OBJECTID,Case_Number,Date,Time_24HR,Address,UCR_1_Category,UCR_1_Description,UCR_1_Code,UCR_2_Category,UCR_2_Description,UCR_2_Code,Neighborhood,PRIMARY_KEY
0,1018388.0,836860.062479,1,5001564,2005/01/11 00:00:00+00,1630,161 WASHINGTON ST,19* - CRIMES AGAINST THE PUBLIC,BREACH-PEACE,1901,,,0,FROG HOLLOW,CIRS-5001564-0
1,1014889.0,853267.562472,2,5001565,2005/01/11 00:00:00+00,1613,587 BLUE HILLS AV,51* - MISC. MANAGEMENT INFO.,COMM TENSION;COMM-SERVICE,5104,,,0,BLUE HILLS,CIRS-5001565-0
2,1019379.0,829663.7501,3,5001567,2005/01/10 00:00:00+00,2000,29 DOUGLAS ST,19* - CRIMES AGAINST THE PUBLIC,DOMESTIC,1904,19* - CRIMES AGAINST THE PUBLIC,DOMESTIC,1904,SOUTHEND,CIRS-5001567-0
3,1017282.0,838990.000148,4,5001569,2005/01/11 00:00:00+00,1645,BROAD ST & CAPITOL AV,29* - FOUND PERSON/PROPERTY,M-V-S-O-T-R-L,2905,,,0,FROG HOLLOW,CIRS-5001569-0
4,1014887.0,839367.143423,5,5000476,2005/01/04 00:00:00+00,830,30 HAWTHORN ST,35* - MISC. CRIMES AGAINST PROPERTY,CR MISCHIEF 3,3503,,,0,ASYLUM HILL,CIRS-5000476-0


In [5]:
# if your machine can not hold the data, you can use the following code to select data in a specific time range
data = data[(data["Date"] >= "2019-01-01")]
print(data.shape)

(90322, 15)


In [6]:
# preprocess the data heuristically
# we skip the data investigation step and directly give the conclusion and preprocess

# 1. lowercase all columns
data.columns = data.columns.str.lower()


# 2. X(longitutde), Y(altitue) -> combine by s2cell as location feature
S2_LEVEL = 10
combine_xy = lambda x: s2cell.lat_lon_to_token(x["x"], x["y"], S2_LEVEL)
data["location"] = data[data["x"].notna() & data["y"].notna()].apply(combine_xy, axis=1)



# 3. combine the Date and Time_24HR columns into a single datetime column
candidates = zip(pd.to_datetime(data["date"]).to_list(), data["time_24hr"].to_list())
datetime_values = []
for (dts, hms) in candidates:
    if hms < 60 :
        hr = 0
        m = hms
    else:
        hms_ = str(hms)
        hr, m = int(hms_[:-2]), int(hms_[-2:])
    datetime_values.append(datetime.fromtimestamp(dts.timestamp() + hr * 3600 + m * 60))
data["datetime"] = pd.to_datetime(datetime_values)


# 4. remove the duplicated columns OBJECTID, Case_Number
# these columns have high cardinality and are duplicated with PrimaryKey
data = data.drop(columns=["objectid", "case_number","date", "time_24hr"]) 


In [7]:
for column in data.columns:
    if data[column].isna().any():
        print(f"column {column} has NaN values")

column ucr_2_category has NaN values
column ucr_2_description has NaN values


In [8]:
data.head()

Unnamed: 0,x,y,address,ucr_1_category,ucr_1_description,ucr_1_code,ucr_2_category,ucr_2_description,ucr_2_code,neighborhood,primary_key,location,datetime
588689,1018476.0,836659.236765,172 WASHINGTON ST,06* - LARCENY,LARC4-MISCELL,685,,,0,SOUTH GREEN,CIRS-19037202-0,136fe1,2019-11-16 13:16:00
588690,1014989.0,849540.060085,237 RIDGEFIELD ST,06* - LARCENY,LARC4-BUILDING,665,,,0,BLUE HILLS,CIRS-19037207-0,344b5b,2019-11-14 00:00:00
588691,1014402.0,850350.199869,351 BLUE HILLS AV,52* - SHOTS FIRED,SHOTS FIRED - UNCONFIRMED,5211,52* - SHOTS FIRED,SHOTS SPOTTER,5212,BLUE HILLS,CIRS-19037210-0,b05f05,2019-11-16 15:00:00
588692,1019147.0,848978.169804,170 WESTLAND ST,05* - BURGLARY,BURG1-RES-DAY,502,,,0,NORTHEAST,CIRS-19037218-0,2f0535,2019-11-16 10:45:00
588693,1012601.0,848372.560123,114 SHARON ST,52* - SHOTS FIRED,SHOTS FIRED - CONFIRMED,5210,52* - SHOTS FIRED,SHOTS SPOTTER,5212,BLUE HILLS,CIRS-19037232-0,a50c89,2019-11-16 19:24:00


In [9]:
# After normalizations, we inserted these data into the influxDB
# InfluxDB data schema design like this:
# - Bucket: Hartford
# - Measurement: police_incidents
# - Tags: 
#   - location: S2 cell token
#   - neighborhood: neighborhood name
#   - address: address
#   - ucr_1_code
#   - ucr_2_code
#   - ucr_1_category
#   - ucr_2_category
#   - ucr_1_description
#   - ucr_2_description
# - Fields:
#   - primary_key
#   - lon
#   - lat
# - Time: datetime

# In order to compare the difference of field and tag in query performance.
# We create another bucket

In [10]:
BucketName = "Hartford"
DEFAULT_ORG = "docs"
bucket = client.buckets_api().find_bucket_by_name(bucket_name=BucketName)
if bucket:
    # bucket exist , reset it
    print("bucket exist, delete it")
    client.buckets_api().delete_bucket(bucket)

# create bucket
bucket = client.buckets_api().create_bucket(bucket_name=BucketName, org_id=DEFAULT_ORG)
if bucket:
    print(f"bucket {BucketName} created")

bucket exist, delete it
bucket Hartford created


In [11]:
# insert these data into InfluxDB
# [WARNING] before inserting through pd.Dataframe, we should guarantee there is no NaN value in the dataframe
# OtherWise, the client will raise an error
# @lingze: plz check above statement.
 
# write_api = client.write_api(write_options=SYNCHRONOUS)
# measurement = "police_incidents"
# start = time.time()
# batch_size = 2**15

# for i in range(0, len(data), batch_size):
#     data_batch = data.iloc[i:i+batch_size]
#     write_api.write(
#         bucket = BucketName,
#         org = DEFAULT_ORG,
#         record = data_batch,
#         data_frame_measurement_name = measurement,
#         data_frame_tag_columns = [
#             "location", 
#             "neighborhood", 
#             "address", 
#             "ucr_1_code", 
#             "ucr_2_code", 
#             "ucr_1_category",
#             "ucr_2_category", 
#             "ucr_1_description", 
#             "ucr_2_description"
#         ],
#         data_frame_field_columns = ["primary_key"],
#         data_frame_time_index = "datetime"
#     )
#     print(f"{i}/{len(data)} inserted")
# print("==> finished")

In [12]:
# first we divide the data into two parts, with NaN and w/o Nan
nan_mask = data.isna().any(axis = 1)
data_with_nan = data[nan_mask]
data_without_nan = data[~nan_mask]

In [13]:
write_api = client.write_api(write_options=SYNCHRONOUS)
measurement = "police_incidents"

In [14]:
# for data without Nan, we insert through DataFrame easily
start = time.time()
batch_size = 2**15
for i in range(0, len(data_without_nan), batch_size):
    batch_data = data_without_nan.iloc[i:i+batch_size]
    write_api.write(
            bucket = BucketName,
            org = DEFAULT_ORG,
            record = batch_data,
            data_frame_measurement_name = measurement,
            data_frame_tag_columns = [
                "location", 
                "neighborhood", 
                "address", 
                "ucr_1_code", 
                "ucr_2_code", 
                "ucr_1_category",
                "ucr_2_category", 
                "ucr_1_description", 
                "ucr_2_description"
            ],
            data_frame_field_columns = [
                "primary_key",
                "x",
                "y"
            ],
            data_frame_timestamp_column = "datetime"
        )
    print(f"{i}/{len(data_without_nan)} inserted")
end = time.time()
print(f"==> finished in {end - start} seconds")

0/46318 inserted
32768/46318 inserted
==> finished in 1.8253858089447021 seconds


In [15]:
# for data with Nan, we insert by constructing Point
nan_mask = data_with_nan.isna()
records = data_with_nan.to_dict(orient="records")
record_key_mask = data.isna().to_dict(orient="records")

tags_columns = [
    "location", 
    "neighborhood", 
    "address", 
    "ucr_1_code", 
    "ucr_2_code", 
    "ucr_1_category",
    "ucr_2_category", 
    "ucr_1_description", 
    "ucr_2_description"
]

fields_columns = [
    "primary_key",
    "x",
    "y"
]

time_column = "datetime"

In [16]:
points = []
batch_size = 2**15
start = time.time()
for idx,(record, key_mask) in enumerate(zip(records, record_key_mask)):
    point_dict = {
        "measurement": measurement
    }
    record_without_nan = {k: v for k, v in record.items() if not key_mask[k]}
    point_dict['tags']={k: v for k, v in record_without_nan.items() if k in tags_columns}
    point_dict['fields'] = {k: v for k, v in record_without_nan.items() if k in fields_columns}
    point_dict['time'] = record_without_nan[time_column]
    points.append(Point.from_dict(dictionary=point_dict))
    
    if (idx + 1) % batch_size == 0 or (idx + 1) == len(records):
        write_api.write(bucket=BucketName, org=DEFAULT_ORG, record=points)
        print(f"{idx}/{len(records)} inserted")
        points = []

end = time.time()
print(f"==> finished in {end - start} seconds")


32767/44004 inserted
44003/44004 inserted
==> finished in 2.4716796875 seconds


In [17]:
query_api = client.query_api()

In [18]:
# First Query
# Question: Top-10 most common types of polices cases in all time
query = """
import "influxdata/influxdb/v1"
option v = {timeRangeStart:1970-01-01T00:00:00Z , timeRangeStop: now()}

from(bucket: "Hartford")
    |> range(start: v.timeRangeStart, stop: v.timeRangeStop)
    |> filter(fn: (r) => r["_measurement"] == "police_incidents" and r["_field"] == "primary_key")
    |> group(columns: ["ucr_1_code", "ucr_1_category", "ucr_1_description"])
    |> count(column: "_value") // count the number of records per group
    |> group() // Ungroup to allow sorting across all groups
    |> sort(columns: ["_value"], desc: true) // Sort by count in descending order
    |> limit(n:10)
    |> keep(columns: ["ucr_1_code", "_value", "ucr_1_category", "ucr_1_description"]) // Specify columns to retain
"""
start = time.time()
tables = query_api.query(query=query, org=DEFAULT_ORG)
end = time.time()
print(f"==> query finished in {end - start} seconds")
tables.to_values()


==> query finished in 25.178399801254272 seconds


[dict_values(['_result', 0, 7249, '1901', '19* - CRIMES AGAINST THE PUBLIC', 'BREACH-PEACE             ']),
 dict_values(['_result', 0, 4370, '5104', '51* - MISC. MANAGEMENT INFO.', 'COMM TENSION;COMM-SERVICE']),
 dict_values(['_result', 0, 3751, '3224', '32* - PROPERTY DAMAGE ACCIDENT', 'PROP DAM ACC             ']),
 dict_values(['_result', 0, 3657, '2903', '29* - FOUND PERSON/PROPERTY', 'ABANDONED M/V            ']),
 dict_values(['_result', 0, 3522, '3221', '32* - PROPERTY DAMAGE ACCIDENT', 'PROP DAM ACC             ']),
 dict_values(['_result', 0, 3334, '2090', '20* - RADIO SIGNAL', 'RADIO SIGNAL             ']),
 dict_values(['_result', 0, 3180, '3503', '35* - MISC. CRIMES AGAINST PROPERTY', 'CR MISCHIEF 3            ']),
 dict_values(['_result', 0, 3026, '2331', '23* - DRIVING LAWS', 'PARKING VIOLATION        ']),
 dict_values(['_result', 0, 2695, '5211', '52* - SHOTS FIRED', 'SHOTS FIRED - UNCONFIRMED']),
 dict_values(['_result', 0, 2395, '801', '08* - SIMPLE ASSAULT', 'ASSAULT

In [19]:
# Second Query
# Question: Number of cases with code "1901" over time grouped by week
query = """
import "influxdata/influxdb/v1"
option v = {timeRangeStart:1970-01-01T00:00:00Z , timeRangeStop: now()}

from(bucket: "Hartford")
    |> range(start: v.timeRangeStart, stop: v.timeRangeStop)
    |> filter(fn: (r) => r["_measurement"] == "police_incidents" and r["_field"] == "primary_key")
    |> filter(fn: (r) => r["ucr_1_code"] == "1901")
    |> truncateTimeColumn(unit: 1w) // Truncate time to week
    |> group(columns: ["_time"])
    |> count(column: "_value")
    |> group()
"""
start = time.time()
tables = query_api.query(query=query, org=DEFAULT_ORG)
end = time.time()
print(f"==> query finished in {end - start} seconds")
tables.to_values()
# better to execute above result in the InfluxDB UI
# it will visualize the result in timeline graph, more intuitive

==> query finished in 3.6595773696899414 seconds


[dict_values(['_result', 0, datetime.datetime(2018, 12, 27, 0, 0, tzinfo=tzlocal()), 14]),
 dict_values(['_result', 0, datetime.datetime(2019, 1, 3, 0, 0, tzinfo=tzlocal()), 36]),
 dict_values(['_result', 0, datetime.datetime(2019, 1, 10, 0, 0, tzinfo=tzlocal()), 42]),
 dict_values(['_result', 0, datetime.datetime(2019, 1, 17, 0, 0, tzinfo=tzlocal()), 46]),
 dict_values(['_result', 0, datetime.datetime(2019, 1, 24, 0, 0, tzinfo=tzlocal()), 39]),
 dict_values(['_result', 0, datetime.datetime(2019, 1, 31, 0, 0, tzinfo=tzlocal()), 48]),
 dict_values(['_result', 0, datetime.datetime(2019, 2, 7, 0, 0, tzinfo=tzlocal()), 52]),
 dict_values(['_result', 0, datetime.datetime(2019, 2, 14, 0, 0, tzinfo=tzlocal()), 40]),
 dict_values(['_result', 0, datetime.datetime(2019, 2, 21, 0, 0, tzinfo=tzlocal()), 47]),
 dict_values(['_result', 0, datetime.datetime(2019, 2, 28, 0, 0, tzinfo=tzlocal()), 36]),
 dict_values(['_result', 0, datetime.datetime(2019, 3, 7, 0, 0, tzinfo=tzlocal()), 48]),
 dict_values

In [20]:
# Third Query

# Question: the latest incident (most recent time) for the top ten ucr_1_code from police_incidents measurement 

# we need to utilize the result of Query 1
query = """
import "influxdata/influxdb/v1"
option v = {timeRangeStart:1970-01-01T00:00:00Z , timeRangeStop: now()}
codeset = ["1901", "5104", "3224", "2903", "3221", "2090", "3503", "2331", "5211", "801"]
from(bucket: "Hartford")
    |> range(start: v.timeRangeStart, stop: v.timeRangeStop)
    |> filter(fn: (r) => r["_measurement"] == "police_incidents")
    |> filter(fn: (r) => r["_field"] == "primary_key")
    |> filter(fn: (r) => contains(value: r.ucr_1_code, set:codeset) ) 
    |> group(columns: ["ucr_1_code"])  // Group by ucr_1_code
    |> sort(columns: ["_time"], desc: true)  // Sort by time, most recent first
    |> limit(n: 10)  // Limit to the top 10 most recent incidents
    |> last()  // Get the most recent incident for each ucr_1_code
    |> keep(columns: ["ucr_1_code", "_value", "ucr_1_category", "ucr_1_description", "_time", "address"])
    |> yield(name: "Latest Incident by ucr_1_code")
"""
start = time.time()
tables = query_api.query(query=query, org=DEFAULT_ORG)
end = time.time()
print(f"==> query finished in {end - start} seconds")
tables.to_values()

==> query finished in 26.107426166534424 seconds


[dict_values(['Latest Incident by ucr_1_code', 0, datetime.datetime(2021, 5, 17, 2, 31, tzinfo=tzlocal()), 'CIRS-21014480-0    ', '60 CAMPFIELD AV', '19* - CRIMES AGAINST THE PUBLIC', '1901', 'BREACH-PEACE             ']),
 dict_values(['Latest Incident by ucr_1_code', 1, datetime.datetime(2021, 5, 16, 17, 45, tzinfo=tzlocal()), 'CIRS-21014436-0    ', '24 MERRILL ST', '20* - RADIO SIGNAL', '2090', 'RADIO SIGNAL             ']),
 dict_values(['Latest Incident by ucr_1_code', 2, datetime.datetime(2021, 5, 14, 13, 33, tzinfo=tzlocal()), 'CIRS-21014211-0    ', '203 TRUMBULL ST', '23* - DRIVING LAWS', '2331', 'PARKING VIOLATION        ']),
 dict_values(['Latest Incident by ucr_1_code', 3, datetime.datetime(2021, 5, 12, 17, 50, tzinfo=tzlocal()), 'CIRS-21014004-0    ', '69 CURTISS ST', '29* - FOUND PERSON/PROPERTY', '2903', 'ABANDONED M/V            ']),
 dict_values(['Latest Incident by ucr_1_code', 4, datetime.datetime(2021, 5, 14, 20, 15, tzinfo=tzlocal()), 'CIRS-21014250-0    ', 'COGSWEL

In [21]:
# Fourth Query

# Question: Number of crime cases within a radius of Hartford, CT

# first, we can get the HartFord CI coordinates is (lat, lon) = (X, Y) = (30.04, 31.23)
# 30 miles is 48.2803 km

x_mean = data["x"].mean()
y_mean = data["y"].mean()
x_std = data["x"].std()
y_std = data["y"].std()
radius = (x_std**2 + y_std**2)**(1/2)
print(f"x_mean: {x_mean}, y_mean: {y_mean}, x_std: {x_std}, y_std: {y_std}, radius: {radius}")

query = """
import "influxdata/influxdb/v1"
import "experimental/geo"

option v = {timeRangeStart:1970-01-01T00:00:00Z , timeRangeStop: now()}

from(bucket: "Hartford")
    |> range(start: v.timeRangeStart, stop: v.timeRangeStop)
    |> filter(fn: (r) => r._measurement == "policeincidents" and r._field == "x" or r.field == "y")
    |> geo.shapeData(latField: "x", lonField: "y", level:10)
    |> geo.filterRows(region: {lat: 1017189, lon: 839065, radius: 7500}, strict: false)  
    |> group()
"""
start = time.time()
tables = query_api.query(query=query, org=DEFAULT_ORG)
end = time.time()
print(f"==> query finished in {end - start} seconds")
tables.to_values()

x_mean: 1017189.1215648074, y_mean: 839065.748621916, x_std: 3500.0514335853445, y_std: 6721.366808089502, radius: 7578.069134458335


ApiException: (404)
Reason: Not Found
HTTP response headers: HTTPHeaderDict({'Content-Type': 'application/json; charset=utf-8', 'Vary': 'Accept-Encoding', 'X-Influxdb-Build': 'OSS', 'X-Influxdb-Version': 'v2.7.11', 'X-Platform-Error-Code': 'not found', 'Date': 'Thu, 09 Jan 2025 14:36:09 GMT', 'Transfer-Encoding': 'chunked'})
HTTP response body: b'{"code":"not found","message":"error calling function \\"filterRows\\" @11:8-11:88: error calling function \\"gridFilter\\" @experimental/geo/geo.flux|865:24-871:22: error calling function \\"tableFind\\" @experimental/geo/geo.flux|505:16-505:61: no table found"}'
