In [1]:
import requests
import json
import pandas as pd
import collections
import logging

## 1. List the URL 

### Generate timestamps at 15-minute intervals

In [2]:
from datetime import datetime, timedelta

def get_time_string_list(start,end,interval):
    """Return timestamp strings spaced ``interval`` minutes apart.

    Parameters
    ----------
    start, end : str
        Timestamps formatted as ``"%Y-%m-%d-%H-%M-%S"``.
    interval : int or float
        Step between consecutive timestamps, in minutes. Must be positive.

    Returns
    -------
    list of str
        Timestamps in the same format as the inputs. The cursor is advanced
        *before* each value is recorded, so the first element is
        ``start + interval`` (``start`` itself is not included) and the last
        element is the first step that lies after ``end``.

    Raises
    ------
    ValueError
        If ``interval`` is not positive, or if ``start``/``end`` do not match
        the expected format.
    """
    if interval <= 0:
        # A zero or negative step would never move past ``end``.
        raise ValueError("interval must be a positive number of minutes")

    time_format = "%Y-%m-%d-%H-%M-%S"
    step = timedelta(minutes=interval)
    now = datetime.strptime(start, time_format)
    end = datetime.strptime(end, time_format)

    string_list = []
    while now <= end:
        # Advance first, then record: existing callers rely on this shifted window.
        now += step
        string_list.append(now.strftime(time_format))
    return string_list

#time_list = get_time_string_list("2019-08-04-23-45-00","2019-12-08-23-45-00",15)

### URLs for the data of all sites in each 15-minute window

In [19]:
def get_url_list_all_data(time_list):
    """Build one movement-history URL per pair of consecutive timestamps.

    Parameters
    ----------
    time_list : list of str
        Timestamps formatted as ``"YYYY-mm-dd-HH-MM-SS"``.

    Returns
    -------
    list of str
        ``len(time_list) - 1`` URLs (none when fewer than two timestamps are
        given). URL ``k`` uses ``time_list[k]`` as ``start`` and
        ``time_list[k + 1]`` as ``end``.
    """
    base_url = 'https://gcc.azure-api.net/traffic/v1/movement/history?size=9999'

    def _as_query_time(stamp):
        # Only date, hour and minute are taken from the stamp; the seconds
        # field of the query value is always written as "00".
        parts = stamp.split('-')
        return f"{parts[0]}-{parts[1]}-{parts[2]}T{parts[3]}%3A{parts[4]}%3A00Z"

    url_list = []
    for position in range(len(time_list) - 1):
        window_start = _as_query_time(time_list[position])
        window_end = _as_query_time(time_list[position + 1])
        url_list.append(base_url + '&start=' + window_start + '&end=' + window_end)

    return url_list

### URL of every site from 2019-10-01 to 2023-09-30

In [4]:
def get_url_list_whole_period(sites_list):
    """Build one movement-history URL per site for the fixed research period.

    Parameters
    ----------
    sites_list : iterable of str
        Site identifiers; each one is inserted verbatim as the ``site`` query value.

    Returns
    -------
    list of str
        One URL per entry of ``sites_list``, in the same order, each covering
        2019-10-01T00:00:00Z to 2023-10-01T00:00:00Z.
    """
    url_head = 'https://gcc.azure-api.net/traffic/v1/movement/history?size=90000&site='
    url_tail = '&start=2019-10-01T00%3A00%3A00Z&end=2023-10-01T00%3A00%3A00Z'
    return [url_head + site_id + url_tail for site_id in sites_list]

## 2. Get data from URL - 1033 sites

In [5]:
def get_data(url, timeout=300):
    """Fetch ``url`` with an HTTP GET and return the decoded JSON body.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : float or tuple, optional
        Passed to ``requests.get``. Without it a stalled connection would
        block the notebook indefinitely. The default is deliberately generous
        because the history queries ask for large result sets.

    Returns
    -------
    object
        Whatever ``response.json()`` decodes the body to.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status, so an error payload is
        not silently handed on as if it were data.
    requests.exceptions.Timeout
        If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()

## 3. Reconstruct the data 

In [6]:
def _flatten_point(point):
    """Flatten one record into a single-level dict.

    Top-level keys are copied unchanged, except ``'site'``, whose value is
    expected to be a dict and is expanded:

    * entries of ``site['from']`` become ``'origin' + key.capitalize()``
    * entries of ``site['to']`` become ``'destination' + key.capitalize()``
    * every other ``site`` entry becomes ``'site' + key.capitalize()``

    Note that ``str.capitalize`` lower-cases everything after the first
    character, e.g. ``'siteId'`` -> ``'siteSiteid'``.
    """
    endpoint_prefix = {'from': 'origin', 'to': 'destination'}

    flat = {}
    for feature, feature_value in point.items():
        if feature != 'site':
            flat[feature] = feature_value
            continue
        for site_feature, site_feature_value in feature_value.items():
            prefix = endpoint_prefix.get(site_feature)
            if prefix is None:
                flat['site' + site_feature.capitalize()] = site_feature_value
            else:
                for name, value in site_feature_value.items():
                    flat[prefix + name.capitalize()] = value
    return flat


def construct_data(data):
    """Turn a list of nested records into a flat DataFrame.

    Parameters
    ----------
    data : list of dict
        Records as decoded from the JSON response. A nested ``'site'`` dict
        (with optional ``'from'`` / ``'to'`` sub-dicts) is flattened into
        prefixed columns; see ``_flatten_point``.

    Returns
    -------
    pandas.DataFrame
        One row per record. An empty ``data`` yields an empty DataFrame
        instead of raising, and a key missing from some records produces a
        missing value in that column rather than a ``KeyError``.
    """
    all_point_list = [_flatten_point(point) for point in data]
    # Building from the list of records lets pandas align keys across records.
    return pd.DataFrame(all_point_list)


## 4. Clean the data - 974 sites

### Clean the data via coords 

In [7]:
def data_clean_coords(all_point_df, min_lat=55.0):
    """Keep only the rows whose origin/destination coordinates look valid.

    Parameters
    ----------
    all_point_df : pandas.DataFrame
        Flattened records; must contain every column listed in ``new_cols``
        below. Coordinate columns may hold strings or numbers.
    min_lat : float, optional
        Rows whose origin or destination latitude is below this value are
        dropped. Defaults to 55, the threshold this notebook has always used.

    Returns
    -------
    pandas.DataFrame
        The ``new_cols`` columns, in that order, for the surviving rows, with
        a fresh 0..n-1 index. Coordinate values are returned as they came in
        (they are only converted to numbers for the checks).

    Notes
    -----
    Coordinates are compared numerically. Comparing the raw strings is
    lexicographic (``'6.5' < '55'`` is False) and would only recognise the
    exact text ``'0'`` as a zero coordinate.
    A row is dropped when any of its four coordinates is zero or cannot be
    parsed as a number, or when either latitude is below ``min_lat``.
    """
    #clean and rearrange df
    new_cols = ['type', 'lastUpdate', 'timestamp', 'batchIdentifier', 'siteSiteid', 'id', 'originDescription', 'originLat', 'originLong', 'destinationDescription', 'destinationLat', 'destinationLong', 'flow', 'concentration']
    output_df = all_point_df[new_cols]

    coord_cols = ['originLat', 'originLong', 'destinationLat', 'destinationLong']
    coords = pd.DataFrame(
        {col: pd.to_numeric(output_df[col], errors='coerce') for col in coord_cols}
    )

    # Drop rows with a missing/unparseable or zero coordinate.
    coords_present = coords.notna().all(axis=1) & coords.ne(0).all(axis=1)
    # Drop rows whose origin or destination latitude is below the threshold.
    lat_ok = coords[['originLat', 'destinationLat']].ge(min_lat).all(axis=1)

    return output_df.loc[coords_present & lat_ok].reset_index(drop=True)

### Clean the data via timestamp 

In [8]:
def data_unique_time(all_time_data):
    """Collapse records to one row per timestamp and add an occurrence count.

    Parameters
    ----------
    all_time_data : pandas.DataFrame
        Flattened records; must contain every column listed in ``new_cols``
        below except ``'count'``, which is computed here.

    Returns
    -------
    pandas.DataFrame
        For each distinct ``timestamp`` the first row carrying it (in input
        order), plus a ``count`` column with how many input rows shared that
        timestamp. Columns follow ``new_cols`` and the index is 0..n-1.

    Notes
    -----
    Rows are selected with ``drop_duplicates`` rather than by feeding index
    labels to ``iloc``, so the result does not depend on the input having a
    default 0..n-1 index.
    """
    new_cols = ['lastUpdate', 'timestamp', 'siteSiteid', 'count', 'originDescription', 'originLat', 'originLong', 'destinationDescription', 'destinationLat', 'destinationLong', 'flow', 'concentration', 'batchIdentifier', 'id', 'type']

    # How many rows share each timestamp.
    timestamp_counts = all_time_data['timestamp'].value_counts(sort=False, dropna=False)

    #unique the all_time_data dataframe via timestamp
    first_rows = all_time_data.drop_duplicates(subset='timestamp').reset_index(drop=True)
    perfect_time_df = first_rows.assign(count=first_rows['timestamp'].map(timestamp_counts))

    #clean and rearrange df
    return perfect_time_df[new_cols]

### Convert coords WGS84 to OSGB36 

In [9]:
from convertbng.util import convert_bng

def convert_coords(great_time_df):
    """Add easting/northing columns computed from the lat/long columns.

    The four coordinate columns (``originLat``, ``originLong``,
    ``destinationLat``, ``destinationLong``) are converted to floats and
    passed to ``convert_bng`` as ``(longitudes, latitudes)``. Element 0 of
    each result is stored as the easting and element 1 as the northing.

    Parameters
    ----------
    great_time_df : pandas.DataFrame
        Must contain the four coordinate columns with values ``float()`` accepts.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in. It is modified in place by adding
        ``originEasting``, ``originNorthing``, ``destinationEasting`` and
        ``destinationNorthing``.
    """
    def _column_as_floats(column_name):
        return [float(value) for value in list(great_time_df[column_name])]

    # Parse every coordinate column before any conversion is attempted.
    origin_lat = _column_as_floats('originLat')
    origin_long = _column_as_floats('originLong')
    destination_lat = _column_as_floats('destinationLat')
    destination_long = _column_as_floats('destinationLong')

    # convert_bng takes longitudes first, then latitudes.
    origin_bng = convert_bng(origin_long, origin_lat)
    destination_bng = convert_bng(destination_long, destination_lat)

    great_time_df['originEasting'] = origin_bng[0]
    great_time_df['originNorthing'] = origin_bng[1]
    great_time_df['destinationEasting'] = destination_bng[0]
    great_time_df['destinationNorthing'] = destination_bng[1]

    return great_time_df

## 5. Implement

### Get the list of recorded sites
Download the data for a single interval to obtain the list of all sites with correct coordinates.

In [22]:
# Build the timestamps for a single window. get_time_string_list records each
# stamp *after* stepping, so these arguments yield 06:15 and 06:30 and the one
# URL built from them covers 06:15-06:30.
time_list = get_time_string_list("2021-12-10-06-00-00","2021-12-10-06-15-00",15)
url_list = get_url_list_all_data(time_list)
# Each iteration overwrites all_sites_df, so only the last URL's cleaned frame
# survives the loop (there is exactly one URL here).
for url in url_list:
    data = get_data(url)
    #print(len(data))
    all_point_df = construct_data(data)
    all_sites_df = data_clean_coords(all_point_df)
# One entry per cleaned row; the same site id may therefore appear repeatedly.
all_sites_list = list(all_sites_df['siteSiteid'])
# Number of distinct sites that passed the coordinate cleaning.
print(len(set(all_sites_list)))   

1033


### Download the data for the whole research period for the sites with correct coordinates

In [None]:
# all_sites_list holds one entry per cleaned row, so a site id can occur more
# than once. De-duplicate (keeping first-seen order) so that every site is
# downloaded and written exactly once instead of repeating the request and
# overwriting the same CSV.
unique_sites_list = list(dict.fromkeys(all_sites_list))
url_list_whole_period = get_url_list_whole_period(unique_sites_list)
for url in url_list_whole_period:
    data = get_data(url)
    all_time_data = construct_data(data)
    great_time_df = data_unique_time(all_time_data)
    output_df = convert_coords(great_time_df)

    # The site id is the value of the second query parameter ("site=<id>").
    csv_name = url.split('&', 2)[1].split('=')[1]
    output_df.to_csv('data/' + csv_name + '&2019_10_01-2023_09_30.csv', index = False)