In [None]:
import geopandas as gpd
import pandas as pd
import collections
from collections import Counter
from scipy.stats import entropy
import numpy as np
import os

## POI

## 1. Define functions

### 1.1 Read detector(buffer)_POI  join data

In [3]:
def read_detector_POI_join_file(detector_POI_join_shp):
    """Load a detector-buffer/POI spatial-join shapefile.

    Parameters
    ----------
    detector_POI_join_shp : str
        File name without the ``.shp`` extension, looked up inside the
        "Spatial Join" data folder.

    Returns
    -------
    Whatever ``gpd.read_file`` returns for that shapefile.
    """
    join_dir = "../20211028_SCOOT_Data/SCOOT_data_points-shp/Spatial Join/"
    shapefile_path = join_dir + detector_POI_join_shp + ".shp"
    return gpd.read_file(shapefile_path)

### 1.2 Get POI num & percentage of each site

In [4]:
def percentage_on_POI(detector_POI_join):
    """Compute, per site, the number of joined POIs and the share of each POI group.

    Parameters
    ----------
    detector_POI_join : DataFrame
        Detector/POI join table with at least the columns ``siteId``,
        ``JOIN_FID`` and ``groupname``.  Rows with ``JOIN_FID == -1`` are
        treated as "no POI joined".

    Returns
    -------
    DataFrame
        One row per site that has at least one row with ``JOIN_FID != -1``,
        sorted by ``siteId``, with columns ``siteId``, ``POI_num`` and one
        share column (0..1) per POI group.  A POI group that does not occur
        in the input at all is returned as a column of zeros instead of
        raising ``KeyError``; an input without any joined POI yields an
        empty frame with the same columns.
    """
    poi_groups = ['Public Infrastructure', 'Commercial Services', 'Education and Health',
                  'Transport', 'Retail', 'Sport and Entertainment',
                  'Accommodation, Eating and Drinking', 'Manufacturing and Production',
                  'Attractions']

    # Sites that have at least one real POI match.
    has_poi = detector_POI_join['JOIN_FID'] != -1
    valid_sites = set(detector_POI_join.loc[has_poi, 'siteId'])
    site_rows_all = detector_POI_join.loc[detector_POI_join['siteId'].isin(valid_sites)]

    # One pass over the table (groupby yields the sites in sorted order).
    records = []
    for site_id, site_rows in site_rows_all.groupby('siteId', sort=True):
        poi_count = len(site_rows)
        record = {group: count / poi_count
                  for group, count in Counter(site_rows['groupname']).items()}
        record['POI_num'] = poi_count
        record['siteId'] = site_id
        records.append(record)

    # Groups missing for a given site are NaN after construction -> share 0.
    df_POI_percent = pd.DataFrame(records).fillna(0)
    # reindex (rather than [...]) so that groups absent from the whole input
    # become zero columns instead of a KeyError.
    df_POI_percent = df_POI_percent.reindex(columns=['siteId', 'POI_num'] + poi_groups,
                                            fill_value=0.0)
    df_POI_percent = df_POI_percent.sort_values(by=['siteId']).reset_index(drop=True)

    return df_POI_percent

### 1.3 Get the degree of mixing POI (entropy)

In [5]:
def get_POI_mix(df_POI_percent):
    """Add a ``POI_mix`` column: the normalised entropy of each site's POI shares.

    For every row the non-zero shares (all columns after the first two, which
    are expected to be ``siteId`` and ``POI_num``) are passed to
    ``scipy.stats.entropy`` and divided by ``log(k)``, where ``k`` is the
    number of non-zero shares.  Rows with fewer than two non-zero shares get a
    mix of 0 (the original 0/0 case that was later filled with 0).

    Parameters
    ----------
    df_POI_percent : DataFrame
        Output of ``percentage_on_POI``.  It is not modified; the result is
        built on a copy.

    Returns
    -------
    DataFrame
        Copy of the input with the ``POI_mix`` column set and remaining NaN
        values replaced by 0.
    """
    result = df_POI_percent.copy()
    # Exclude an already existing POI_mix column so a second call is idempotent.
    share_columns = [col for col in result.columns[2:] if col != 'POI_mix']

    mixes = []
    # Iterate positionally so the result does not depend on the row index labels.
    for shares in result[share_columns].to_numpy(dtype=float):
        shares = shares[shares != 0]
        if len(shares) > 1:
            mixes.append(entropy(shares) / np.log(len(shares)))
        else:
            # log(1) == 0 would give 0/0; a single group means no mixing.
            mixes.append(0.0)

    result['POI_mix'] = mixes
    all_df_POI_percent = result.fillna(0)
    return all_df_POI_percent

### 1.4 Get statistics

In [31]:
def get_sta(df_POI_percent, buffer_size):
    """Summarise every described column by mean, std, min and max.

    NOTE(review): this notebook defines ``get_sta`` in several sections;
    the definition executed last is the one in effect.

    Parameters
    ----------
    df_POI_percent : DataFrame
        Table to summarise with ``DataFrame.describe``.
    buffer_size : str
        Appended (after an underscore) to each statistic's column name.

    Returns
    -------
    DataFrame
        One row per described column, columns ``mean_<buffer_size>``,
        ``std_<buffer_size>``, ``min_<buffer_size>``, ``max_<buffer_size>``.
    """
    wanted_stats = ['mean', 'std', 'min', 'max']
    summary = df_POI_percent.describe().loc[wanted_stats].T
    return summary.add_suffix('_' + buffer_size)

## 2. Implement

### 2.1 Get POI percent of all buffer size

In [None]:
# Build one wide table with the POI features of every buffer size.
# doc_list holds the buffer sizes as they appear in the join file names.
doc_list = ['100', '200', '300', '400']
df_list = []
for doc in doc_list:
    detector_POI_join = read_detector_POI_join_file('useful_detector_POI_join_' + doc)
    df_POI_percent = percentage_on_POI(detector_POI_join)
    all_df_POI_percent = get_POI_mix(df_POI_percent)

    # Index by siteId so that pd.concat aligns the buffers by site.
    # percentage_on_POI drops sites without any POI, so the buffers can contain
    # different sites; aligning by row position would mix up sites.
    all_df_POI_percent = all_df_POI_percent.set_index('siteId').add_suffix('_' + doc)

    df_list.append(all_df_POI_percent)

# Sites missing from a buffer have no POI there -> all of its features are 0.
all_POI = pd.concat(df_list, axis=1).fillna(0)
all_POI = all_POI.rename_axis('siteId').sort_index().reset_index()
site_list = all_POI['siteId'].tolist()
all_POI

#all_POI.to_csv('all_buffer_POI_percent.csv', index = False)

In [None]:
# Summary statistics of the POI features, one block of columns per buffer size.
# doc_list holds the buffer sizes as they appear in the join file names.
doc_list = ['100', '200', '300', '400']
df_list = []
for doc in doc_list:
    # Re-reads and re-processes the join file for this buffer size.
    detector_POI_join = read_detector_POI_join_file('useful_detector_POI_join_' + doc)
    df_POI_percent = percentage_on_POI(detector_POI_join)
    all_df_POI_percent = get_POI_mix(df_POI_percent)
    # mean/std/min/max per feature, column names suffixed with the buffer size
    POI_sta = get_sta(all_df_POI_percent, doc)
    df_list.append(POI_sta)

# Rows are features (aligned on the feature name), columns are the suffixed statistics.
all_sta = pd.concat(df_list, axis=1)
all_sta

## Landcover

## 1. Define functions

### 1.1 Read buffer_landcover intersect data

In [3]:
def read_landcover_intersect_file(buffer_landcover_intersect_csv):
    """Load a buffer/land-cover intersection table.

    Parameters
    ----------
    buffer_landcover_intersect_csv : str
        File name without the ``.csv`` extension, looked up inside the
        "Spatial Join" data folder.

    Returns
    -------
    Whatever ``gpd.read_file`` returns for that CSV file.
    """
    join_dir = "../20211028_SCOOT_Data/SCOOT_data_points-shp/Spatial Join/"
    csv_path = join_dir + buffer_landcover_intersect_csv + ".csv"
    return gpd.read_file(csv_path)

### 1.2 Organise landcover percentage

In [4]:
def get_landcover_percent(landcover_intersect):
    """Pivot the long land-cover intersection table into one row per site.

    Parameters
    ----------
    landcover_intersect : DataFrame
        Long table with the columns ``siteId``, ``class_2018`` and
        ``PERCENTAGE`` (values convertible to float, on a 0-100 scale).

    Returns
    -------
    DataFrame
        One row per site sorted by ``siteId``; the first column is ``siteId``
        followed by one column per land-cover class (sorted by name) holding
        the share as a fraction (``PERCENTAGE / 100``).  Classes that do not
        occur for a site are 0.  If a (site, class) pair occurs in several
        rows their percentages are summed rather than keeping only one row.
    """
    long_table = landcover_intersect[['siteId', 'class_2018', 'PERCENTAGE']].copy()
    long_table['PERCENTAGE'] = long_table['PERCENTAGE'].astype(float)  # may arrive as text

    # groupby + unstack gives a deterministic (sorted) row and column order.
    wide = (long_table
            .groupby(['siteId', 'class_2018'])['PERCENTAGE']
            .sum()
            .unstack(fill_value=0))
    wide = wide.astype(float) / 100  # percent -> fraction

    df_landcover_percent = wide.rename_axis(columns=None).sort_index().reset_index()

    return df_landcover_percent

### 1.3 Merge the landcover classes into 6 groups

In [14]:
def merge_landcover_class(df_landcover_percent):
    """Aggregate the detailed land-cover class columns into broader groups.

    The class names are read from ``Land_cover_shp/land_cover_osgb36.shp``
    and sorted; each group is defined by positions in that sorted list, so
    the grouping depends on the set of classes present in that shapefile.
    The class at position 11 is not part of any merged group and keeps its
    own name (NOTE(review): presumably 'Industrial, commercial, public,
    military and private units' -- confirm against the shapefile).

    Parameters
    ----------
    df_landcover_percent : DataFrame
        One row per site with a ``siteId`` column and one share column per
        land-cover class (as produced by ``get_landcover_percent``).

    Returns
    -------
    DataFrame
        ``siteId`` followed by the six group columns, each the sum of the
        shares of its member classes.  A group without any member class in
        the input is returned as a column of zeros instead of raising
        ``KeyError``.
    """
    Landcover_osgb = gpd.read_file("Land_cover_shp/land_cover_osgb36.shp")
    landcover_class = sorted(list(set(Landcover_osgb.class_2018)))

    # Positions (in the sorted class list) of the classes merged into each group.
    group_positions = {
        'Urban residential areas': [2, 3, 4, 5, 6, 12],
        'Roads and railways': [7, 15, 18],
        'Green urban areas (including Sports and leisure facilities)': [9, 19],
        'Natural areas (Arable land, forests, herbaceous vegetation associations, pastures and water)': [0, 8, 10, 16, 20],
        'Other (Construction sites, land without current use, mineral extraction and dump sites and port areas)': [1, 13, 14, 17],
    }
    class_to_group = {landcover_class[position]: group
                      for group, positions in group_positions.items()
                      for position in positions}

    output_groups = ['Industrial, commercial, public, military and private units',
                     'Roads and railways', 'Urban residential areas',
                     'Green urban areas (including Sports and leisure facilities)',
                     'Natural areas (Arable land, forests, herbaceous vegetation associations, pastures and water)',
                     'Other (Construction sites, land without current use, mineral extraction and dump sites and port areas)']

    renamed = df_landcover_percent.rename(columns=class_to_group)
    site_ids = renamed['siteId']

    # Sum the columns that now share a group name.  Grouping the transposed
    # frame replaces the deprecated DataFrame.groupby(..., axis=1).
    shares = renamed.drop(columns='siteId')
    landcover_percent_merge = shares.T.groupby(level=0).sum().T
    landcover_percent_merge = landcover_percent_merge.reindex(columns=output_groups, fill_value=0.0)
    landcover_percent_merge.insert(loc=0, column='siteId', value=site_ids)

    return landcover_percent_merge

### 1.4 Get the degree of mixing land use

In [16]:
def get_landuse_mix(landcover_percent_merge):
    """Add a ``landuse_mix`` column: the normalised entropy of each site's land-use shares.

    For every row the non-zero shares (all columns after the first one, which
    is expected to be ``siteId``) are passed to ``scipy.stats.entropy`` and
    divided by ``log(k)``, where ``k`` is the number of non-zero shares.
    Rows with fewer than two non-zero shares get a mix of 0 (the original
    0/0 case that was later filled with 0).

    Parameters
    ----------
    landcover_percent_merge : DataFrame
        Output of ``merge_landcover_class``.  It is not modified; the result
        is built on a copy.

    Returns
    -------
    DataFrame
        Copy of the input with the ``landuse_mix`` column set and remaining
        NaN values replaced by 0.
    """
    result = landcover_percent_merge.copy()
    # Exclude an already existing landuse_mix column so a second call is idempotent.
    share_columns = [col for col in result.columns[1:] if col != 'landuse_mix']

    mixes = []
    # Iterate positionally so the result does not depend on the row index labels.
    for shares in result[share_columns].to_numpy(dtype=float):
        shares = shares[shares != 0]
        if len(shares) > 1:
            mixes.append(entropy(shares) / np.log(len(shares)))
        else:
            # log(1) == 0 would give 0/0; a single land use means no mixing.
            mixes.append(0.0)

    result['landuse_mix'] = mixes
    all_landcover_percent_merge = result.fillna(0)
    return all_landcover_percent_merge

### 1.5 Get statistics

In [18]:
def get_sta(df_landcover_percent, buffer_size):
    """Summarise every described column by mean, std, min and max.

    NOTE(review): this notebook defines ``get_sta`` in several sections;
    the definition executed last is the one in effect.

    Parameters
    ----------
    df_landcover_percent : DataFrame
        Table to summarise with ``DataFrame.describe``.
    buffer_size : str
        Appended (after an underscore) to each statistic's column name.

    Returns
    -------
    DataFrame
        One row per described column, columns ``mean_<buffer_size>``,
        ``std_<buffer_size>``, ``min_<buffer_size>``, ``max_<buffer_size>``.
    """
    described = df_landcover_percent.describe()
    selected = described.loc[['mean', 'std', 'min', 'max']]
    suffix = '_' + buffer_size
    return selected.T.add_suffix(suffix)

## 2. Implement

### 2.1 Get landcover percent of all buffer size

In [None]:
# Build one wide table with the land-cover features of every buffer size.
# doc_list holds the buffer sizes as they appear in the intersection file names.
doc_list = ['100', '200', '300', '400']
df_list = []
for doc in doc_list:
    landcover_intersect = read_landcover_intersect_file('useful_detector_buffer_landcover_intersection_' + doc)
    df_landcover_percent = get_landcover_percent(landcover_intersect)
    landcover_percent_merge = merge_landcover_class(df_landcover_percent)
    all_landcover_percent_merge = get_landuse_mix(landcover_percent_merge)

    # Index by siteId so that pd.concat aligns the buffers by site instead of
    # by row position (which is only correct if every file holds the same sites).
    all_landcover_percent_merge = all_landcover_percent_merge.set_index('siteId').add_suffix('_' + doc)

    df_list.append(all_landcover_percent_merge)

all_landcover = pd.concat(df_list, axis=1)
all_landcover = all_landcover.rename_axis('siteId').sort_index().reset_index()
site_list = all_landcover['siteId'].tolist()
all_landcover

#all_landcover.to_csv('all_buffer_landcover_percent_new.csv', index = False)

In [None]:
# Summary statistics of the land-cover features, one block of columns per buffer size.
# doc_list holds the buffer sizes as they appear in the intersection file names.
doc_list = ['100', '200', '300', '400']
df_list = []
for doc in doc_list:
    # Re-reads and re-processes the intersection file for this buffer size.
    landcover_intersect = read_landcover_intersect_file('useful_detector_buffer_landcover_intersection_' + doc)
    df_landcover_percent = get_landcover_percent(landcover_intersect)
    landcover_percent_merge = merge_landcover_class(df_landcover_percent)
    all_landcover_percent_merge = get_landuse_mix(landcover_percent_merge)
    # mean/std/min/max per feature, column names suffixed with the buffer size
    landcover_sta = get_sta(all_landcover_percent_merge, doc)
    df_list.append(landcover_sta)

# Rows are features (aligned on the feature name), columns are the suffixed statistics.
all_sta = pd.concat(df_list, axis=1)
all_sta

## Road link

## 1. Define functions

### 1.1 Read road intersect data

In [3]:
def read_road_intersect_file(buffer_road_intersect_csv):
    """Load a buffer/road-link intersection table.

    Parameters
    ----------
    buffer_road_intersect_csv : str
        File name without the ``.csv`` extension, looked up inside the
        "Spatial Join" data folder.

    Returns
    -------
    Whatever ``gpd.read_file`` returns for that CSV file.
    """
    join_dir = "../20211028_SCOOT_Data/SCOOT_data_points-shp/Spatial Join/"
    csv_path = join_dir + buffer_road_intersect_csv + ".csv"
    return gpd.read_file(csv_path)

### 1.2 Get road density(km/sq.km) of each road group

In [120]:
def get_road_density(road_link_intersect):
    """Compute the road density of each road group for every site.

    The road length per (site, road class) is divided by
    ``pi * BUFF_DIST**2 / 1000`` and the classes are then summed into four
    groups.  NOTE(review): this yields km per sq.km (as the output column
    names say) only if LENGTH and BUFF_DIST are both in metres -- confirm
    against the intersection files.

    Parameters
    ----------
    road_link_intersect : DataFrame
        Long table with the columns ``siteId``, ``routeHiera``, ``LENGTH``
        and ``BUFF_DIST`` (values convertible to float).  The buffer radius
        is taken from the first row and must be the same for all rows.

    Returns
    -------
    DataFrame
        One row per site sorted by ``siteId`` with the columns ``siteId``,
        ``motorway(km/sq.km)``, ``majorRoad(km/sq.km)``,
        ``secondaryRoad(km/sq.km)`` and ``localRoad(km/sq.km)``.  Road
        classes that do not occur in the input contribute 0 instead of
        raising ``KeyError``; repeated (site, class) rows are summed.
    """
    # Road classes (values of routeHiera) that make up each output group.
    road_groups = {
        'motorway(km/sq.km)': ['Motorway'],
        'majorRoad(km/sq.km)': ['A Road', 'A Road Primary'],
        'secondaryRoad(km/sq.km)': ['B Road', 'Minor Road'],
        'localRoad(km/sq.km)': ['Local Road', 'Local Access Road', 'Restricted Local Access Road',
                                'Secondary Access Road', 'Restricted Secondary Access Road'],
    }

    lengths = road_link_intersect[['siteId', 'routeHiera', 'LENGTH']].copy()
    lengths['LENGTH'] = lengths['LENGTH'].astype(float)  # may arrive as text

    # Total length per site and road class; groupby gives a deterministic order.
    length_per_class = (lengths
                        .groupby(['siteId', 'routeHiera'])['LENGTH']
                        .sum()
                        .unstack(fill_value=0))

    # iloc[0] reads the first row regardless of the index labels.
    buffer_radius = float(road_link_intersect['BUFF_DIST'].iloc[0])
    density_per_class = length_per_class / (np.pi * buffer_radius ** 2 / 1000)

    df_road_density = pd.DataFrame(index=density_per_class.index)
    for group_name, road_classes in road_groups.items():
        # reindex adds zero columns for classes absent from this buffer size.
        df_road_density[group_name] = (density_per_class
                                       .reindex(columns=road_classes, fill_value=0.0)
                                       .sum(axis=1))

    df_road_density = df_road_density.rename_axis('siteId').sort_index().reset_index()

    return df_road_density

### 1.3 Get statistics

In [127]:
def get_sta(df_road_density,buffer_size):
    """Summarise every described column by mean, std, min and max.

    NOTE(review): this notebook defines ``get_sta`` in several sections;
    the definition executed last is the one in effect.

    Parameters
    ----------
    df_road_density : DataFrame
        Table to summarise with ``DataFrame.describe``.
    buffer_size : str
        Appended (after an underscore) to each statistic's column name.

    Returns
    -------
    DataFrame
        One row per described column, columns ``mean_<buffer_size>``,
        ``std_<buffer_size>``, ``min_<buffer_size>``, ``max_<buffer_size>``.
    """
    stat_rows = ['mean', 'std', 'min', 'max']
    per_column = df_road_density.describe().loc[stat_rows].T
    return per_column.add_suffix('_' + buffer_size)

## 2. Implement

### 2.1 Get road density of all buffer size

In [None]:
# Build one wide table with the road densities of every buffer size.
# doc_list holds the buffer sizes as they appear in the intersection file names.
doc_list = ['100', '200', '300', '400']
df_list = []
for doc in doc_list:
    road_link_intersect = read_road_intersect_file('road_intersection_' + doc)
    df_road_density = get_road_density(road_link_intersect)

    # Index by siteId so that pd.concat aligns the buffers by site instead of
    # by row position (which is only correct if every file holds the same sites).
    df_road_density = df_road_density.set_index('siteId').add_suffix('_' + doc)

    df_list.append(df_road_density)

all_road = pd.concat(df_list, axis=1)
all_road = all_road.rename_axis('siteId').sort_index().reset_index()
site_list = all_road['siteId'].tolist()
all_road

#all_road.to_csv('all_buffer_road_percent.csv', index = False)

In [None]:
# Summary statistics of the road densities, one block of columns per buffer size.
# doc_list holds the buffer sizes as they appear in the intersection file names.
doc_list = ['100', '200', '300', '400']
df_list = []
for doc in doc_list:
    # Re-reads and re-processes the intersection file for this buffer size.
    road_link_intersect = read_road_intersect_file('road_intersection_' + doc)
    df_road_density = get_road_density(road_link_intersect)
    # mean/std/min/max per feature, column names suffixed with the buffer size
    road_sta = get_sta(df_road_density, doc)
    df_list.append(road_sta)

# Rows are features (aligned on the feature name), columns are the suffixed statistics.
all_sta = pd.concat(df_list, axis=1)
all_sta

## Census

## 1. Define functions

### 1.1 Read census csv & Reconstruct the dataframe

In [369]:
def reconstruct_census_csv(path):
    """Read every census CSV in ``path`` and combine them into one table.

    Only output areas whose council area code in
    ``census2011_csv/Index/OA_TO_HIGHER_AREAS.csv`` is ``S12000046`` are kept.

    Each census file is expected to have its table title in the first cell of
    the first data row and its column labels in the fourth data row; the text
    between the first and second ``-`` of the title is used as a prefix for
    the column names (NOTE(review): layout inferred from the indexing below --
    confirm against the raw files).

    Parameters
    ----------
    path : str
        Directory containing the census ``.csv`` files (a trailing slash is
        optional).

    Returns
    -------
    DataFrame
        One row per output area with the output-area code in a ``code``
        column followed by the prefixed columns of all files (files are
        processed in sorted name order).  ``'-'`` cells become 0, thousands
        separators are removed and columns are converted to numbers where
        possible.
    """
    def _to_numeric_where_possible(column):
        # Stand-in for the private DataFrame._convert(numeric=True), which was
        # removed from pandas: coerce to numbers, but leave a column untouched
        # when none of its values can be converted.
        if pd.api.types.is_numeric_dtype(column):
            return column
        numeric = pd.to_numeric(column, errors='coerce')
        return column if numeric.isna().all() else numeric

    #get output area index
    index = pd.read_csv('census2011_csv/Index/OA_TO_HIGHER_AREAS.csv')[['OutputArea2011Code', 'CouncilArea2011Code']]
    GCC_index = index.loc[index['CouncilArea2011Code'] == 'S12000046'].reset_index(drop=True) # S12000046 for GCC area
    gcc_codes = GCC_index[['OutputArea2011Code']]

    all_census = []
    # sorted() makes the column order reproducible; os.listdir order is arbitrary.
    for filename in sorted(os.listdir(path)):
        if not filename.endswith(".csv"):
            continue

        census_csv = pd.read_csv(os.path.join(path, filename))
        # Move the table title into the label row so it becomes the first column label.
        census_csv.iloc[3,0] = census_csv.iloc[0,0]
        census_csv.columns = census_csv.iloc[3]
        census_csv = census_csv.add_prefix(census_csv.columns[0].split('-')[1]+': ')
        census_csv = census_csv.rename(columns={census_csv.columns[0]: 'OutputArea2011Code'})

        # Inner merge keeps the selected output areas in index-file order and
        # replaces the row-by-row lookup (one full scan per output area).
        GCC_census = gcc_codes.merge(census_csv, on='OutputArea2011Code', how='inner')
        all_census.append(GCC_census)

    dfs = [df.set_index('OutputArea2011Code') for df in all_census]
    all_census_csv = pd.concat(dfs, axis=1)
    all_census_csv = all_census_csv.replace('-', 0).replace({',': ''}, regex=True)
    all_census_csv = all_census_csv.apply(_to_numeric_where_possible)
    all_census_csv = all_census_csv.rename_axis("code").reset_index()

    return all_census_csv

### 1.2 Get the useful census

In [185]:
def get_useful_census(all_census_csv):
    """Select the census variables used later and convert the counts to per-area values.

    The two "car or van" travel columns (driving / passenger) are summed into
    a single ``' Method of travel to work or study (1): Car or van'`` column.
    The table is then joined with the ``SHAPE_1_Ar`` column of the output-area
    shapefile, and every column from the fifth one onwards (i.e. everything
    after ``code``, ``SHAPE_1_Ar`` and the two age columns) is divided by
    ``SHAPE_1_Ar``.

    Parameters
    ----------
    all_census_csv : DataFrame
        Output of ``reconstruct_census_csv``; it is not modified.

    Returns
    -------
    DataFrame
        Columns ``code``, ``SHAPE_1_Ar``, mean age, median age, followed by
        the per-area census columns.  The column order is relied upon by
        ``get_census_percent``.
    """
    driving_col = ' Method of travel to work or study (1): Driving a car or van'
    passenger_col = ' Method of travel to work or study (1): Passenger in a car or van'
    car_or_van_col = ' Method of travel to work or study (1): Car or van'

    # .copy() so the new column is added to an independent frame rather than
    # to a slice of all_census_csv (avoids SettingWithCopyWarning).
    useful_census_csv = all_census_csv[['code',' Age structure: Mean age',' Age structure: Median age',
                    ' Usual resident population: All people',' Usual resident population: Males',' Ethnic group: White',
                    ' Qualifications and students: All people aged 16 and over: Highest level of qualification: Level 4 qualifications and above',
                    ' Distance travelled (1) to work: All people aged 16 to 74 in employment',
                    driving_col, passenger_col,
                    ' Method of travel to work or study (1): Bicycle',' Method of travel to work or study (1): On foot']].copy()
    useful_census_csv[car_or_van_col] = useful_census_csv[driving_col] + useful_census_csv[passenger_col]
    useful_census_csv = useful_census_csv.drop([driving_col, passenger_col], axis=1)

    OA_shp = gpd.read_file("census2011_shp/output-area-2011-eor/OutputArea2011_EoR_GCC.shp")[['code','SHAPE_1_Ar']]
    useful_census_csv_area = pd.merge(OA_shp, useful_census_csv,how="right", on='code')

    # Replace whole columns by name: writing float results into integer
    # columns through .iloc is deprecated in recent pandas.
    count_columns = useful_census_csv_area.columns[4:]
    useful_census_csv_area[count_columns] = useful_census_csv_area[count_columns].div(useful_census_csv_area['SHAPE_1_Ar'], axis=0)

    return useful_census_csv_area

### 1.3 Read output area intersect (detector buffer) data

In [6]:
def read_OA_intersect_file(buffer_OA_intersect_csv):
    """Load a buffer/output-area intersection table.

    Parameters
    ----------
    buffer_OA_intersect_csv : str
        File name without the ``.csv`` extension, looked up inside the
        "Spatial Join" data folder.

    Returns
    -------
    Whatever ``gpd.read_file`` returns for that CSV file.
    """
    join_dir = "../20211028_SCOOT_Data/SCOOT_data_points-shp/Spatial Join/"
    csv_path = join_dir + buffer_OA_intersect_csv + ".csv"
    return gpd.read_file(csv_path)

### 1.4 Merge useful census data to intersect csv and calculate each percent

In [211]:
def get_census_percent(OA_link_intersect, useful_census_csv_area):
    """Aggregate the census variables of the intersected output areas per site.

    After the merge the columns are used by position: the two columns after
    ``SHAPE_1_Ar`` (mean and median age) are weighted by ``PERCENTAGE / 100``
    and all following per-area columns are multiplied by ``AREA``.  The
    weighted values are summed per site and turned into the output features.
    The two density features divide by ``pi * BUFF_DIST**2 / 1000000``
    (NOTE(review): persons per sq.km, as the column names say, only if
    BUFF_DIST is in metres -- confirm).

    Parameters
    ----------
    OA_link_intersect : DataFrame
        Buffer/output-area intersection with the columns ``siteId``,
        ``code``, ``AREA``, ``PERCENTAGE`` and ``BUFF_DIST`` (values
        convertible to numbers).  The buffer radius is taken from the first
        row and must be the same for all rows.
    useful_census_csv_area : DataFrame
        Output of ``get_useful_census`` (column order matters, see above).

    Returns
    -------
    DataFrame
        One row per site sorted by ``siteId`` with the columns ``siteId``,
        ``mean_age``, ``median_age``, the two density columns and the six
        share columns (each divided by the site's total resident population).
    """
    census_intersect = pd.merge(OA_link_intersect[['siteId','code','AREA','PERCENTAGE']], useful_census_csv_area,how="left", on='code')

    # pd.to_numeric replaces the private Series._convert(numeric=True),
    # which was removed from pandas.
    overlap_percent = pd.to_numeric(census_intersect['PERCENTAGE'], errors='coerce')
    overlap_area = pd.to_numeric(census_intersect['AREA'], errors='coerce')

    age_columns = census_intersect.columns[5:7]
    per_area_columns = census_intersect.columns[7:]
    census_intersect[age_columns] = census_intersect[age_columns].multiply(overlap_percent, axis="index") / 100
    census_intersect[per_area_columns] = census_intersect[per_area_columns].multiply(overlap_area, axis="index")

    # One grouped sum instead of filtering the whole table once per site;
    # groupby also returns the sites in sorted order.
    value_columns = list(census_intersect.columns[5:])
    site_totals = census_intersect.groupby('siteId')[value_columns].sum()

    # iloc[0] reads the first row regardless of the index labels.
    buffer_radius = float(OA_link_intersect['BUFF_DIST'].iloc[0])
    buffer_area = np.pi * buffer_radius ** 2 / 1000000
    population = site_totals[' Usual resident population: All people']

    df_census_percent = pd.DataFrame({
        'mean_age': site_totals[' Age structure: Mean age'],
        'median_age': site_totals[' Age structure: Median age'],
        'population_density(persons/sq.km)': population / buffer_area,
        'employment_density(persons/sq.km)': site_totals[' Distance travelled (1) to work: All people aged 16 to 74 in employment'] / buffer_area,
        'male_percent': site_totals[' Usual resident population: Males'] / population,
        'white_percent': site_totals[' Ethnic group: White'] / population,
        'collegeDegree_percent': site_totals[' Qualifications and students: All people aged 16 and over: Highest level of qualification: Level 4 qualifications and above'] / population,
        'bicycle_percent(commute)': site_totals[' Method of travel to work or study (1): Bicycle'] / population,
        'walk_percent(commute)': site_totals[' Method of travel to work or study (1): On foot'] / population,
        'car_percent(commute)': site_totals[' Method of travel to work or study (1): Car or van'] / population,
    })
    df_census_percent = df_census_percent.rename_axis('siteId').sort_index().reset_index()

    return df_census_percent

### 1.5 Get statistics

In [215]:
def get_sta(df_census_percent, buffer_size):
    """Summarise every described column by mean, std, min and max.

    NOTE(review): this notebook defines ``get_sta`` in several sections;
    the definition executed last is the one in effect.

    Parameters
    ----------
    df_census_percent : DataFrame
        Table to summarise with ``DataFrame.describe``.
    buffer_size : str
        Appended (after an underscore) to each statistic's column name.

    Returns
    -------
    DataFrame
        One row per described column, columns ``mean_<buffer_size>``,
        ``std_<buffer_size>``, ``min_<buffer_size>``, ``max_<buffer_size>``.
    """
    kept_stats = ['mean', 'std', 'min', 'max']
    overview = df_census_percent.describe()
    return overview.loc[kept_stats].T.add_suffix('_' + buffer_size)

## 2. Implement

### 2.1 Get census of all buffer size

In [None]:
# Build one wide table with the census features of every buffer size.
# doc_list holds the buffer sizes as they appear in the intersection file names.
doc_list = ['100', '200', '300', '400']
df_list = []

# The census tables do not depend on the buffer size, so they are built once.
all_census_csv = reconstruct_census_csv('census2011_csv/useful_data/')
useful_census_csv_area = get_useful_census(all_census_csv)

for doc in doc_list:
    OA_link_intersect = read_OA_intersect_file('census_OA_intersection_' + doc)
    df_census_percent = get_census_percent(OA_link_intersect, useful_census_csv_area)

    # Index by siteId so that pd.concat aligns the buffers by site instead of
    # by row position (which is only correct if every file holds the same sites).
    df_census_percent = df_census_percent.set_index('siteId').add_suffix('_' + doc)

    df_list.append(df_census_percent)

all_census = pd.concat(df_list, axis=1)
all_census = all_census.rename_axis('siteId').sort_index().reset_index()
site_list = all_census['siteId'].tolist()
all_census

#all_census.to_csv('all_buffer_census_percent.csv', index = False)

In [None]:
# Summary statistics of the census features, one block of columns per buffer size.
# NOTE: relies on `useful_census_csv_area` created in the previous cell.
doc_list = ['100', '200', '300', '400']
df_list = []
for doc in doc_list:
    # Re-reads and re-processes the intersection file for this buffer size.
    OA_link_intersect = read_OA_intersect_file('census_OA_intersection_' + doc)
    df_census_percent = get_census_percent(OA_link_intersect, useful_census_csv_area)
    # mean/std/min/max per feature, column names suffixed with the buffer size
    census_sta = get_sta(df_census_percent, doc)
    df_list.append(census_sta)

# Rows are features (aligned on the feature name), columns are the suffixed statistics.
all_sta = pd.concat(df_list, axis=1)
all_sta