In [None]:
from opensearchpy import OpenSearch
import pandas as pd
import json
import time
from tqdm.notebook import tqdm
from tqdm import tqdm
from datetime import datetime

In [None]:
# Raw CSV column name -> snake_case name used for the rest of the notebook.
new_names = {
    'Incident Datetime': 'datetime',
    'Row ID': 'row_id',
    'Report Type Description': 'report_type_description',
    'Incident Description': 'incident_description',
    'Incident Category': 'incident_category',
    'Incident Subcategory': 'incident_subcategory',
    'Police District': 'police_district',
    'Latitude': 'latitude',
    'Longitude': 'longitude',
}

# Load only the columns listed above (decoded as ISO-8859-1), rename them,
# then keep the rows that carry a usable location.
crimes = (
    pd.read_csv(
        'data/Police_Department_Incident_Reports__2018_to_Present.csv',
        encoding='ISO-8859-1',
        usecols=list(new_names),
    )
    .rename(columns=new_names)
    # Remove bad geolocation data: a 0 latitude or longitude is treated as invalid.
    .loc[lambda frame: (frame['latitude'] != 0) & (frame['longitude'] != 0)]
    # Rows with a missing latitude or longitude are dropped as well.
    .dropna(subset=['latitude', 'longitude'])
)
def parse_date(date_str):
    """Reformat a 12-hour timestamp string as an ISO-8601 style string.

    Args:
        date_str: Timestamp text in the form 'YYYY/MM/DD hh:mm:ss AM' (or PM).

    Returns:
        The same instant written as 'YYYY-MM-DDTHH:MM:SS' using a 24-hour clock.

    Raises:
        ValueError: If ``date_str`` does not match the expected input format.
    """
    parsed = datetime.strptime(date_str, '%Y/%m/%d %I:%M:%S %p')
    # The parsed value has no timezone attached, so the trailing %z renders
    # as an empty string.
    return parsed.strftime('%Y-%m-%dT%H:%M:%S%z')
# Rewrite every timestamp in the 'datetime' column with parse_date.
crimes['datetime'] = crimes['datetime'].map(parse_date)

def combine_lat_long(df):
    """Return a new frame with latitude/longitude merged into 'coordinate'.

    The 'coordinate' column holds the text '<latitude>,<longitude>' built from
    the string form of each value. The input frame is left untouched, so the
    function can be called again on the same frame without a KeyError from
    the already-removed columns.

    Args:
        df: DataFrame containing 'latitude' and 'longitude' columns.

    Returns:
        A new DataFrame without 'latitude' and 'longitude' and with
        'coordinate' appended as the last column.

    Raises:
        KeyError: If 'latitude' or 'longitude' is missing from ``df``.
    """
    coordinate = df['latitude'].astype(str) + ',' + df['longitude'].astype(str)
    # drop() and assign() both return new frames, so the caller's df is not mutated.
    return df.drop(columns=['latitude', 'longitude']).assign(coordinate=coordinate)
# Swap the separate latitude/longitude columns for a single 'coordinate' column.
crimes = combine_lat_long(crimes)
# Display the prepared frame as the cell output.
crimes

In [None]:
# Turn every row into a plain dict (one document per incident).
# Missing values are converted to None: pandas represents them as float NaN,
# which json.dumps writes as the bare token NaN. That is not valid JSON, so a
# document with any empty field could be rejected when it is sent for indexing.
# None is serialised as null instead.
crimes_documents = [
    {field: (None if pd.isna(value) else value) for field, value in record.items()}
    for record in crimes.to_dict(orient="records")
]
# Show the first document as a sanity check.
crimes_documents[0]

In [None]:
# Connection settings for the OpenSearch node on localhost:9200.
# TLS is on, but certificate and hostname checks (and the related warnings)
# are switched off - presumably because the node uses a self-signed
# certificate; confirm before pointing this at anything but a local instance.
connection_options = dict(
    hosts=[{"host": "localhost", "port": 9200}],
    http_auth=("admin", "admin"),
    use_ssl=True,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)
client = OpenSearch(**connection_options)

# Query the cluster info; the result is shown as the cell output.
client.info()

In [None]:
# Start from a clean slate: remove the crimes index if a previous run left one.
# The existence check keeps this cell from raising on a fresh cluster, so the
# notebook still works under Restart & Run All.
if client.indices.exists(index="crimes"):
    client.indices.delete(index="crimes")

In [None]:
# Create the crimes index with an explicit mapping, unless it is already there.
if client.indices.exists(index="crimes"):
    print("The crimes index already exists!")
else:
    # Field types: free-text fields are "text", identifiers and categorical
    # fields are "keyword", and the combined "lat,lon" string is a "geo_point".
    mapping = {
        "mappings": {
            "properties": {
                "datetime": {"type": "date"},
                "row_id": {"type": "keyword"},
                "report_type_description": {"type": "text"},
                "incident_category": {"type": "keyword"},
                "incident_subcategory": {"type": "keyword"},
                "incident_description": {"type": "text"},
                "police_district": {"type": "keyword"},
                "coordinate": {"type": "geo_point"},
            },
        },
    }
    client.indices.create(index="crimes", body=mapping)
    print("Created crimes index.")

In [None]:
chunk_size = 1000 # Number of documents in a chunk.
chunks = [crimes_documents[index:index + chunk_size] for index in range(0, len(crimes_documents), chunk_size)]

response = None    # Stays None when there are no documents, so later cells can still reference it.
failed_items = []  # Per-document results that the bulk API reported as errors.

# Process each chunk
for chunk in tqdm(chunks, desc = 'Processing documents', unit = 'chunk'):
    body_lines = [] # Newline-delimited JSON lines for this chunk's bulk request.

    # Each document contributes two lines: the action metadata, then the document itself.
    for doc in chunk:
        body_lines.append(json.dumps({'index': {'_index': 'crimes', '_id': doc['row_id']}}))
        body_lines.append(json.dumps(doc))

    # The bulk body must end with a newline.
    response = client.bulk(body='\n'.join(body_lines) + '\n') # Commence bulk inserts

    # A bulk request can succeed as a whole while individual documents fail;
    # the top-level 'errors' flag says whether any item needs a closer look.
    if response.get('errors'):
        failed_items.extend(
            item['index'] for item in response.get('items', [])
            if 'error' in item.get('index', {})
        )

# Report failures instead of silently dropping them (indexing stays best-effort).
if failed_items:
    print(f"{len(failed_items)} documents failed to index. First error: {failed_items[0]['error']}")

In [None]:
# Display the response from the most recent bulk request (the final chunk).
response