In [9]:
!pip install -r requirements.txt
import datatable as dt
import datetime
import glob
import googlemaps
import json
import matplotlib.pyplot as plt
import pandas as pd
import requests
import numpy as np
import plotly.graph_objects as go
from sklearn.cluster import KMeans, DBSCAN
from sklearn_extra.cluster import KMedoids
from plotly.subplots import make_subplots
from sklearn.neighbors import NearestNeighbors
from datetime import datetime
from pathlib import Path
import os
import pyprojroot
os.chdir(pyprojroot.here())

# Resolves "no renderer" error
# pio.renderers.default = 'iframe_connected'
# Default Matplotlib Plot Settings
graph_style = "ggplot"
graph_size = (25, 15)

# Reads in api key from text file called "api_key.txt" in local directory
API_KEY = Path("api_key.txt").read_text()
# Initialise googlemaps API client given the API_KEY
client = googlemaps.Client(API_KEY)

# Miscellaneous functions
# https://www.geeksforgeeks.org/adding-value-labels-on-a-matplotlib-bar-chart/
def addlabels(x, y):
    """Adds labels to center bars of matplotlib barplots

    Args:
        x (float)): x coord of element we want to label
        y (float): y coord of element we want to label
    """
    for i in range(len(x)):
        plt.text(i, y[i] // 2, y[i], ha="center", fontsize="xx-large")




# Downloading / Cleaning Postcode Area Data

Created several functions for downloading any postcode area's data from doogal uk.


In [10]:
def download_postcode_area_data(area):
    """Downloads postcode area data and save as file to local dir

    Args:
        area (str): The postcode area we want to download information for
    """
    # Initialise file directory / name
    file_path = f"data/{area}_postcodes.csv"
    # If file containing this postcode area's data doesn't already exist
    if not Path(file_path).is_file():
        # Get request with URL
        req = requests.get(
            f"https://www.doogal.co.uk/UKPostcodesCSV.ashx?Search={area}"
        )
        # Create new file in local dir
        csv_file = open(f"data/{area}_postcodes.csv", "wb")
        # Write get request's csv content to file
        csv_file.write(req.content)
        # Close file so it's no longer loaded in memory and is saved in storage
        csv_file.close()


def download_london_postcode_data():
    """Downloads london data and save as file to local dir"""
    # Get London Postcodes
    file_path = "data/london_postcodes.csv"
    if not Path(file_path).is_file():
        req = requests.get(f"https://www.doogal.co.uk/UKPostcodesCSV.ashx?area=London")
        url_content = req.content
        postcode_content.append(url_content)
        csv_file = open(f"data/london_postcodes.csv", "wb")
        csv_file.write(url_content)
        csv_file.close()


def combine_postcode_area_data():
    """Combines all postcode area files into a single file"""
    if Path("combined_postcodes_csv.csv").is_file():
        print("Reading in global csv")
        combined_csv = dt.fread("combined_postcodes_csv.csv").to_pandas()
    else:
        # Read all postcode csv files
        all_filenames = [i for i in glob.glob("*_postcodes.{}".format("csv"))]
        # combine all files in the list
        combined_csv = pd.concat([dt.fread(f).to_pandas() for f in all_filenames])
        # export to csv
        dataframes = []
        for file in all_filenames:
            dataframes.append(dt.fread(f"{file}").to_pandas())
        combined_csv = pd.concat(dataframes)
        combined_csv.to_csv(
            "combined_postcodes_csv.csv", index=False, encoding="utf-8-sig"
        )


def read_postcode_area_data(area):
    """Reads postcode area data directly from URL and returns the dataframe
    If postcode area file doesn't exist, it downloads it and reads it into memory then returns it
    Args:
        area (str)): postcode area

    Returns:
        pd.DataFrame: postcode area data
    """
    # Location of postcode area data in local dir
    file_path = f"data/{area}_postcodes.csv"
    # If file doesn't exist, download it and save to storage
    if not Path(file_path).is_file():
        # Download postcode area data
        if area == "london":
            download_postcode_area_data()
        else:
            download_postcode_area_data(area)
    print(f"[*]\tPostcode area '{area}' data loaded into memory")
    # Load postcode area data from storage into memory and return it
    return pd.read_csv(f"data/{area}_postcodes.csv")


def clean_data(df):
    """Data preprocessing/cleaning of postcode area data
    - Removes rows with invalid/useless longitude/latitude values
    - Removes rows with postcodes that are no longer in use (Assuming it's because the locations are used by other postcodes, deprecating old ones)
    Args:
        df (pd.DataFrame): postcode area data

    Returns:
        pd.DataFrame: Refined/cleaned postcode area data
    """
    # print(df.shape)

    # Replace lat/long values that equal 0 with null values
    df["Longitude"] = df["Longitude"].replace(0, np.nan)
    df["Latitude"] = df["Latitude"].replace(0, np.nan)
    # Remove any and all rows with null long/lat values
    df.dropna(axis=0, how="any", subset=["Latitude", "Longitude"], inplace=True)

    # NOTE: Commented out due to discoveries made in EDA (Despite this purging thousands of rows from the data)
    # Replace NaN objects in 'Terminated' with empty strings (If empty, the postcode is still in use)
    # df['Terminated'] = df["Terminated"].fillna("")
    # Filters dataframe based on 'Terminated' column as "" values mean the postcode is still valid/in use
    # df = df[df["Terminated"] == ""]

    # print(df.shape)
    return df


def get_address_long_lat(address):
    """Gets adress's longitude and latitude values via Google Maps API (Enable Geocoding API)

    Args:
        address (string): Address

    Returns:
        dict: Longitude and latitude values of address
    """
    response = requests.get(
        f"https://maps.googleapis.com/maps/api/geocode/json?address={address}&key={API_KEY}"
    )
    data = json.loads(response.content)
    location = data.get("results")[0].get("geometry").get("location")
    return location


def get_distances_to_from_locations(postcode_area_data, to_postcode):
    """Gets distances and travel durations from each postcode in postcode area data to the 'to_postcode'

    Args:
        postcode_area_data (pd.DataFrame): postcode area data
        to_postcode (str): destination
    """
    sample = postcode_area_data
    now = datetime.now()
    results = []
    # No representative datapoints
    for index, row in sample.iterrows():
        from_postcode = f"{row['Latitude']}, {row['Longitude']}"
        travel_methods = ["driving", "transit", "walking"]
        distance = ""
        travel_times = []
        travel_distances = []
        for travel_method in travel_methods:
            journey = client.directions(
                from_postcode, to_postcode, mode=travel_method, departure_time=now
            )
            # Extract distance and time for the travel_method specified
            distance = journey[0]["legs"][0]["distance"]["text"]
            travel_time = journey[0]["legs"][0]["duration"]["text"]
            # Append time/distance to corresponding lists for retrieval later
            travel_times.append(travel_time)
            travel_distances.append(distance)

        # Initialise appropriate values for table
        postcode = row["Postcode"]
        drive_time = travel_times[0]
        transit_time = travel_times[1]
        walking_time = travel_times[2]
        drive_distance = travel_distances[0]
        transit_distance = travel_distances[1]
        walking_distance = travel_distances[2]

        # Create temporary dataframe consisting of a single row using the above data
        temp = pd.DataFrame(
            {
                "from_postcode": postcode,
                "driving_duration": drive_time,
                "driving_distance": drive_distance,
                "transit_time": transit_time,
                "transit_distance": transit_distance,
                "walking_time": walking_time,
                "walking_distance": walking_distance,
            },
            index=[0],
        )
        # Append single row dataframe to list
        results.append(temp)
    # Concatenate / Join single row dataframes to create one big dataframe
    df = pd.concat(results).reset_index()
    del df["index"]
    df.to_csv("data/first_five_postcodes.csv", index=False)
    display(df)


# Downloading RH Postcode Data (Use a for loop over a list of postcode areas for all areas)
# Read in postcode area
postcode_area_data = read_postcode_area_data("RH")
# Cleans data
postcode_area_data = clean_data(postcode_area_data)
# Creates file "first_five_postcodes.csv" containing the travel data via google maps api
to_postcode = get_address_long_lat("Whiteknights RG6 6AH")
# Creates a sample table using first 5 postcodes in postcode area data to test google maps api
get_distances_to_from_locations(postcode_area_data.head(), to_postcode)


[*]	Postcode area 'RH' data loaded into memory


Unnamed: 0,from_postcode,driving_duration,driving_distance,transit_time,transit_distance,walking_time,walking_distance
0,RH1 1AA,1 hour 2 mins,71.1 km,3 hours 17 mins,108 km,13 hours 59 mins,67.7 km
1,RH1 1AB,1 hour 1 min,71.0 km,3 hours 18 mins,108 km,13 hours 54 mins,67.3 km
2,RH1 1AD,1 hour 3 mins,71.4 km,3 hours 20 mins,108 km,14 hours 0 mins,67.8 km
3,RH1 1AE,1 hour 4 mins,71.4 km,3 hours 20 mins,108 km,14 hours 1 min,67.9 km
4,RH1 1AF,1 hour 4 mins,71.8 km,3 hours 25 mins,109 km,14 hours 3 mins,68.0 km


# Exploratory Data Analysis

From checking the postcode area data and the Dataset Description: https://www.doogal.co.uk/PostcodeCsvFields.php, I've identified the columns that are the most useful to me for the task of checking travel distance / duration to whiteknights campus from each postcode.

- In Use : Refers to whether the postcode is in use and links with a couple other columns, one of which is 'Terminated'.
- Terminated : Indicates whether a postcode is still in use, if it's not, I've made the assumption that it's been replaced or made redundant by surrounding postcodes within close proximity or in the same place.
- Longitude/Latitude : The coordinate values representing the location of each postcode which will be essential when retrieving distances to whiteknights campus


In [11]:
def plot_top_population_per_postcode(df):
    """Plots the five postcodes with highest populations, largest to smallest (I'll admit, a pointless plot for the task set but leaving it in anyway)"""
    # Check how many areas have population data
    total_records = df["Population"].shape[0]

    # Filter records to only store those with valid populations
    df = df[df["Population"].notna()]

    print(
        f"There are {df['Population'].astype(int).shape[0]}/{total_records} records with valid population data"
    )
    # Print the top 5 records with the highest populations
    # Convert population column value datatype to integers from object/string
    df["Population"] = df["Population"].astype(int)
    # Sort records in dataframe by population value in descending order
    df = df.sort_values("Population", ascending=False)
    x = df["Postcode"].head(5)
    y = df["Population"].head(5).tolist()
    plt.figure(figsize=graph_size)
    plt.bar(x, y)
    plt.title("Populations per postcode")
    plt.xlabel("Postcodes")
    plt.ylabel("Population")
    # plt.grid(visible=False)
    addlabels(x, y)
    plt.show()


def add_terminated_postcodes_plot(df, area, showlegend):
    """Plots the postcodes that have been terminated

    Args:
        df (pd.DataFrame): Uncleaned postcode area data
        area (str): Postcode area
    """
    # Partial cleaning on temporary dataframe
    # Filter data to only include terminated postcodes
    df = df[df["Terminated"] != ""]
    # Replace lat/long values that equal 0 with null values
    df["Longitude"] = df["Longitude"].replace(0, np.nan)
    df["Latitude"] = df["Latitude"].replace(0, np.nan)
    # Remove any and all rows with null long/lat values
    df.dropna(axis=0, how="any", subset=["Latitude", "Longitude"], inplace=True)

    # Initialise longitude/latitude values
    longitudes = df["Longitude"].tolist()
    latitudes = df["Latitude"].tolist()
    trace = go.Scatter(
        x=longitudes,
        y=latitudes,
        name="Terminated postcodes",
        mode="markers",
        legendgroup="Terminated",
        showlegend=showlegend,
        opacity=0.75,
        text=df["Postcode"],
        marker=dict(
            symbol="x",
            color="Red",
        ),
    )
    return trace


def add_postcode_area_long_lat_plot(df, area, showlegend):
    """Plots longitude and latitude coordinates of each postcode on scatter plot

    Args:
        df (pd.DataFrame): postcode area data
        area (str): postcode area
    """
    # Initialise longitude/latitude values
    longitudes = df["Longitude"].tolist()
    latitudes = df["Latitude"].tolist()

    trace = go.Scatter(
        x=longitudes,
        y=latitudes,
        name="Active postcodes",
        legendgroup="Active",
        showlegend=showlegend,
        mode="markers",
        text=df["Postcode"],
        opacity=0.7,
        marker=dict(
            symbol="circle-open",
            color="Green",
        ),
    )

    return trace


df = pd.read_csv("data/RH_postcodes.csv")
fig = make_subplots(
    rows=3,
    cols=2,
    specs=[
        [{}, {}],
        [{"rowspan": 2, "colspan": 2}, None],
        [None, None],
    ],
)
showlegend = True
terminated = add_terminated_postcodes_plot(df, "RH", showlegend)

df = clean_data(df)
# plot_top_population_per_postcode(df)
active = add_postcode_area_long_lat_plot(df, "RH", showlegend)

trace1 = add_terminated_postcodes_plot(df, "RH", showlegend)
trace2 = add_postcode_area_long_lat_plot(df, "RH", showlegend)

fig.add_trace(terminated, row=1, col=1)
fig.add_trace(active, row=1, col=2)
fig.add_trace(terminated, row=2, col=1)
fig.add_trace(active, row=2, col=1)
# TODO: Have single legends for active and terminate postcodes
fig.update_layout(
    height=800, width=1200, title_text="Terminated postcodes against used postcodes"
)
fig.show()


Originally I was intending to remove any and all terminated postcodes from the dataset. However, upon zooming into the plot containing both terminated and active postcodes, it's obvious there are thousands of 'Terminated' postcodes that are in locations far away from 'active' postcodes. This makes these areas valid locations unless these locations are occupied by some other form of postcode... Therefore I will be keeping the terminated postcodes in use as travelling from these locations is still viable whether they're used as a means to identify the location of housing or not.


In [21]:
def plot_elbow_curve(df, postcode_area):
    # Refine dataframe to only include lat/long columns
    X = df.loc[:, ["Latitude", "Longitude"]]
    # Elbow curve for optimal number of clusters for kmeans
    K_clusters = range(1, 100)
    kmeans = [KMeans(n_clusters=i) for i in K_clusters]
    Y_axis = df[["Latitude"]]
    X_axis = df[["Longitude"]]
    score = [kmeans[i].fit(Y_axis).score(Y_axis) for i in range(len(kmeans))]
    # Visualize
    plt.style.use(graph_style)
    plt.plot(K_clusters, score)
    plt.xlabel(f"Elbow Curve : {postcode_area}")
    plt.ylabel("Score")
    plt.title("Elbow Curve")
    plt.show()


# https://towardsdatascience.com/machine-learning-clustering-dbscan-determine-the-optimal-value-for-epsilon-eps-python-example-3100091cfbc
def plot_epsilon_curve(X):
    """Plots a curve indicating the optimal epsilon value for DBSCAN at the sharpest point

    Args:
        X (pd.DataFrame): postcode area data
    """
    neigh = NearestNeighbors(n_neighbors=2)
    nbrs = neigh.fit(df[['Longitude', 'Latitude']])
    distances, indices = nbrs.kneighbors(df[['Longitude', 'Latitude']])
    distances = np.sort(distances, axis=0)
    distances = distances[:,1]
    plt.plot(distances)

    
def plot_data_kmeans(X, postcode_area):
    clusters = 3
    # Apply k means clustering
    kmeans = KMeans(n_clusters=clusters, init="k-means++")
    kmeans.fit(X[X.columns[1:clusters]])  # Compute k-means clustering.
    X["cluster_label"] = kmeans.fit_predict(X[X.columns[1:clusters]])
    centers = kmeans.cluster_centers_  # Coordinates of cluster centers.
    labels = kmeans.predict(X[X.columns[1:clusters]])  # Labels of each point
    print(X.head(10))

    plt.figure(figsize=graph_size)
    plt.title(f"K-Means : {graph_style}")
    plt.style.use(graph_style)
    plt.grid(b=None)
    # # X.plot.scatter(x = 'Latitude', y = 'Longitude', c=labels, s=50, cmap='viridis')
    plt.title(f"K-Means Clustering  : Postcode area: {postcode_area}")
    plt.xlabel("Latitude")
    plt.ylabel("Longitude")
    plt.scatter(
        X.Latitude,
        X.Longitude,
        c=labels,
        cmap="Set3",
        s=100,
        edgecolors="gray",
        linewidths=1,
    )

    plt.scatter(
        centers[:, 0],
        centers[:, 1],
        c="black",
        s=300,
        alpha=0.9,
        marker="o",
        edgecolors="gray",
        linewidths=10,
    )
    plt.show()


def plot_single_postcode_kmeans(df, postcode_area):
    """Plot a single postcode k-means plot

    Args:
        postcode_area (str): beginning of postcode
    """
    X = df[["Postcode", "Latitude", "Longitude"]]
    # Filter data to only contain RG1 Postcodes
    X = X[X.Postcode.str.contains(postcode_area)]
    # If there's no postcodes beginning with postcode_area, skip this loop
    if X.shape[0] == 0:
        return
    # Elbow plots
    # plot_elbow_curve(postcode_area)
    # plot_data_kmeans(X, postcode_area)
    plotly_postcode_area(X, postcode_area)


def plot_several_postcode_kmeans(postcode_areas: list):
    """Plot multiple k-means plots using postcode areas

    Args:
        postcode_areas (list): list of starter postcode data
    """
    for postcode_area in postcode_areas:
        plot_single_postcode_kmeans(postcode_area)


def plot_data_kmeans_with_style(X, postcode_area, style):
    # Apply k means clustering
    kmeans = KMeans(n_clusters=3, init="k-means++")
    kmeans.fit(X[X.columns[1:3]])  # Compute k-means clustering.
    X["cluster_label"] = kmeans.fit_predict(X[X.columns[1:3]])
    centers = kmeans.cluster_centers_  # Coordinates of cluster centers.
    labels = kmeans.predict(X[X.columns[1:3]])  # Labels of each point
    X.head(10)

    plt.figure(figsize=graph_size)
    plt.title(f"K-Means : {graph_style}")
    plt.style.use(style)
    plt.grid(b=None)
    # # X.plot.scatter(x = 'Latitude', y = 'Longitude', c=labels, s=50, cmap='viridis')
    plt.title(f"K-Means Clustering  : Style '{style}'")
    plt.xlabel("Latitude")
    plt.ylabel("Longitude")
    plt.scatter(X.Latitude, X.Longitude, c=labels, cmap="Set3", s=100)

    plt.scatter(centers[:, 0], centers[:, 1], c="black", s=300, alpha=0.9, marker="^")
    plt.show()


def plot_postcode_area_clusters(df, postcode_area, method):
    """Plot a single postcode areas DBSCAN / k-means plot
    Args:
        postcode_area (str): beginning of postcode
    """

    X = df[["Postcode", "Latitude", "Longitude"]]
    # Filter data to only contain RG1 Postcodes
    X = X[X.Postcode.str.contains(postcode_area)]
    # If there's no postcodes beginning with postcode_area, skip this loop
    if X.shape[0] == 0:
        return
    # Plot(ly) postcode area
    plotly_postcode_area(X, postcode_area, method)


def kmeans(X):
    kmeans = KMeans(n_clusters=3, init="k-means++")
    kmeans.fit(X[["Longitude", "Latitude"]])  # Compute k-means clustering.
    X["cluster_label"] = kmeans.fit_predict(X[["Longitude", "Latitude"]])
    X["cluster_text"] = (
        "Cluster " + X["cluster_label"].astype(str) + "<br>" + X["Postcode"]
    )
    centers = kmeans.cluster_centers_  # Coordinates of cluster centers.
    labels = kmeans.predict(X[["Longitude", "Latitude"]])  # Labels of each point
    return X


def dbscan(X):
    model = DBSCAN(eps=0.0005, min_samples=2).fit(X[["Longitude", "Latitude"]])
    class_predictions = model.labels_
    X["CLUSTER_DBSCAN"] = class_predictions
    X["CLUSTER_DBSCAN_LABEL"] = (
        "Cluster " + X["CLUSTER_DBSCAN"].astype(str) + "<br>" + X["Postcode"]
    )
    return X


def plotly_postcode_area(X, area, method):
    # Apply k means clustering
    if method == "kmeans":
        X = kmeans(X)
        # Create figure with Kmeans
        fig = go.Figure(
            data=go.Scatter(
                x=X["Longitude"],
                y=X["Latitude"],
                text=X["cluster_text"],
                mode="markers",
                marker_symbol="circle-open",
                marker_color=X["cluster_label"],
            ),
            layout=go.Layout(
                width=800,
                height=800,
                paper_bgcolor="rgb(240,240,240)",  # transparent background
                plot_bgcolor="rgb(240,240,240)",  # transparent 2nd background
            ),
        )
    elif method == "DBSCAN":
        X = dbscan(X)
        # Create figure with DBSCAN
        fig = go.Figure(
            data=go.Scatter(
                x=X["Longitude"],
                y=X["Latitude"],
                text=X["CLUSTER_DBSCAN_LABEL"],
                mode="markers",
                marker_symbol="circle-open",
                marker_color=X["CLUSTER_DBSCAN"],
            ),
            layout=go.Layout(
                width=800,
                height=800,
            ),
        )

    fig.update_layout(
        height=600,
        width=800,
        margin={"r": 0, "t": 25, "l": 0, "b": 0},
        title=dict(
            text = f"Postcode Area: {area}",
            x = 0.5
        ),
        template="plotly_dark",
    )

    # fig.update_geos(fitbounds="locations")
    fig.show()


def identify_and_set_single_representative_datapoint(cluster, df):
    """Identifies and sets a single representative datapoint for a single cluster

    Args:
        cluster (int): cluster value
        df (pd.DataFrame): postcode dataframe

    Returns:
        pd.DataFrame: Dataframe containing representative datapoint
    """
    temp = df[df["CLUSTER_DBSCAN"] == cluster]
    x = temp["Latitude"].tolist()
    y = temp["Longitude"].tolist()
    GetMedoid = (
        lambda vX: KMedoids(n_clusters=1)
        .fit(temp[["Latitude", "Longitude"]])
        .cluster_centers_
    )
    # Got medoid
    medoid = GetMedoid([x, y])[0]
    # X.loc[(X["Latitude"] == medoid[0]) & (X["Longitude"] == medoid[1]), "Representative"]
    results = temp[temp["Latitude"] == medoid[0]]
    results = temp[temp["Longitude"] == medoid[1]]
    results = results.reset_index()
    results.loc[:0, "Representative"] = True

    # Where latitude and longitude values match medoid, replace those rows with results
    try:
        df.loc[
            (df["Latitude"] == medoid[0]) & (df["Longitude"] == medoid[1]),
            "Representative",
        ] = results["Representative"].tolist()
    except:
        print("Erro")
    return df


def set_all_representative_datapoints(df):
    """Sets all representative datapoints in postcode data

    Args:
        df (pd.DataFrame): postcode area data

    Returns:
        pd.DataFrame: postcode area data containing representative datapoints
    """
    df["Representative"] = False
    for i in df["CLUSTER_DBSCAN"].unique():
        # -1 represents a node that has no cluster - it's independent
        if i == -1:
            # Each independent postcode is representative - set it to True
            df.loc[(X["CLUSTER_DBSCAN"] == i), "Representative"] = True
            continue
        df = identify_and_set_single_representative_datapoint(i, df)
    # print(temp[temp['Representative'] != False].shape)
    print("[*]\tRepresentative data points set")
    return df


def plot_cluster_and_rep_point(cluster, df):
    """Plots the cluster and shows representative datapoint

    Args:
        cluster (int): The cluster value
        df (pd.DataFrame): Dataframe with all postcode data
    """
    df = df[df["CLUSTER_DBSCAN"] == cluster]
    df["CLUSTER_DBSCAN_LABEL"] = (
        "Cluster "
        + df["CLUSTER_DBSCAN"].astype(str)
        + "<br>"
        + df["Postcode"]
        + "<br>Representative Postcode: "
        + df["Representative"].astype(str)
    )

    fig = make_subplots(
        rows=1,
        cols=1,
        specs=[
            [{}],
        ],
    )
    temp = df[df["Representative"] == True]
    representative = go.Scatter(
            x=temp["Longitude"],
            y=temp["Latitude"],
            text=temp["CLUSTER_DBSCAN_LABEL"],
            mode="markers",
            name = "Representative Postcode",
            marker = dict(
                symbol = "x-thin-open",
                color = "blue",
                size = 16
            ),
        )
    temp = df[df["Representative"] == False] 
    non_representative = go.Scatter(
            x=temp["Longitude"],
            y=temp["Latitude"],
            text=temp["CLUSTER_DBSCAN_LABEL"],
            mode="markers",
            name = "Non-Representative Postcode",
            marker = dict(
                symbol = "circle-open",
                color = "black",
                size = 16
            ),
        )

    fig.add_trace(representative)
    fig.add_trace(non_representative)

    fig.update_layout(
            width=800,
            height=400,
            title = f"{len(df['CLUSTER_DBSCAN'])} postcodes : Cluster {cluster}",
            xaxis = dict(
                title = "Longitude",
                tickformat = ".3f"
            ),
            yaxis = dict(
                title = "Latitude",
                tickformat = ".3f"
            ),
            font=dict(
                family="Arial",
                size=18,
                color="Black"
            ),
        )
    fig.show()


def get_distances_to_from_locations_refined(postcode_area_data):
    """Creates a table filled with travel times for walking, car, transit transportation methods.
    Caters for representative datapoints - reducing number of requests to perform.
    Args:
        postcode_area_data (pd.DataFrame): Dataframe containing all postcode data
    """
    journey_data = []
    travel_methods = ["driving", "transit", "walking"]

    # Refine dataframe down to representative data points
    sample = postcode_area_data[postcode_area_data["Representative"] == True]

    # No representative datapoints
    print(f"Iterating through {len(sample)}")
    for index, row in sample.iterrows():
        now = datetime.now()
        coord = f"{row['Latitude']}, {row['Longitude']}"
        distance = ""
        travel_times = []
        travel_distances = []
        # For each of the three travel methods
        for travel_method in travel_methods:
            # Send request for travel time using the travel_method specified
            journey = client.directions(
                coord, "51.457625,-0.945636", mode=travel_method, departure_time=now
            )
            # Retrieve distance travelled and time taken to travel
            try:
                distance = journey[0]["legs"][0]["distance"]["text"]
                travel_time = journey[0]["legs"][0]["duration"]["text"]
            except:
                distance = 0
                travel_time = 0
            # Append times and distances to list
            travel_distances.append(distance)
            travel_times.append(travel_time)

        # Initialise variables for table's column values
        postcode = row["Postcode"]
        drive_time = travel_times[0]
        transit_time = travel_times[1]
        walking_time = travel_times[2]
        drive_distance = travel_distances[0]
        transit_distance = travel_distances[1]
        walking_distance = travel_distances[2]
        # TODO: Add different transportation distances?
        #
        postcode_journey_data = pd.DataFrame(
            {
                "from_postcode": postcode,
                "driving_duration": drive_time,
                "driving_distance": drive_distance,
                "transit_time": transit_time,
                "transit_distance": transit_distance,
                "walking_time": walking_time,
                "walking_distance": walking_distance,
            },
            index=[0],
        )
        journey_data.append(postcode_journey_data)
        end = datetime.now()
        print(end - now)

    # Create dataframe from list of journey data
    df = pd.concat(journey_data).reset_index()
    del df["index"]

    other_postcodes_journey_data = []
    # For each representative point
    for row in df.itertuples():
        # Identify cluster of representative point
        representative_postcode = postcode_area_data[
            postcode_area_data["Postcode"] == row.from_postcode
        ]
        cluster = representative_postcode.iloc[0]["CLUSTER_DBSCAN"]
        # Find other postcodes
        other_postcodes = postcode_area_data[
            postcode_area_data["CLUSTER_DBSCAN"] == cluster
        ]
        other_postcodes = other_postcodes["Postcode"]
        for postcode in other_postcodes:
            # Replace the other postcode's data with this data for walking,driving, bus times
            other_postcodes_journey_data.append(
                pd.DataFrame(
                    {
                        "from_postcode": postcode,
                        "distance": row.distance,
                        "driving_duration": row.driving_duration,
                        "transit_time": row.transit_time,
                        "walking_time": row.walking_time,
                    },
                    index=[0],
                )
            )

    result = pd.concat(other_postcodes_journey_data)
    result.append(df)
    # TODO: Replace rows where postcodes match that in results with values
    results.to_csv("qk007990_S2T3_RH.csv")


def plot_clusters_with_more_than_five_postcodes(data):
    """Plots clusters and their representative points with more than five postcodes in their cluster

    Args:
        X (pd.DataFrame): Processed and clustered postcode area data
    """
    data = data[data["Representative"] == True]
    for cluster in data["CLUSTER_DBSCAN"]:
        # Ignore clusters with less than 5 postcodes
        if len(data[data['CLUSTER_DBSCAN'] == cluster]) < 5:
            continue
        # Ignore cluster -1 : These are independent datapoints / postcodes
        if cluster == -1:
            continue
        plot_cluster_and_rep_point(cluster, data)


# Load in and clean RH postcode data
df = clean_data(pd.read_csv("data/RH_postcodes.csv"))
# print(len(df))
# # Perform DBSCAN clustering on dataframe
X = dbscan(df)
# # Create new column stating whether each postcode is representative of its cluster - set all to False
X["Representative"] = False
# # Set the representative boolean to true for each representative postcode in each cluster
X = set_all_representative_datapoints(X)

plotly_postcode_area(X, "RH", "DBSCAN")
# # Filter dataframe to only have representative datapoints

get_distances_to_from_locations_refined(X)
# Used to derive best graph
# plot_clusters_with_more_than_five_postcodes(X)
# Plot best representation of representative point in cluster of postcodes
# plot_cluster_and_rep_point(73, X)


Erro
Erro
Erro
Erro
Erro
Erro
Erro
Erro
Erro
Erro
Erro
[*]	Representative data points set


Iterating through 5842
0:00:00.689156
0:00:00.661175
0:00:00.564673
0:00:00.533118
0:00:00.587133
0:00:00.621281
0:00:00.598573
0:00:00.612098
0:00:00.574262
0:00:00.623038
0:00:00.566563
0:00:00.540117
0:00:00.643511
0:00:00.752366
0:00:00.617163
0:00:00.591809
0:00:00.605137
0:00:00.561856
0:00:00.623141
0:00:00.609826
0:00:00.669149
0:00:00.671005
0:00:00.633051
0:00:00.582120
0:00:00.659788
0:00:00.627124
0:00:00.604788
0:00:00.619962
0:00:00.597132
0:00:00.643941
0:00:00.640003
0:00:00.674993
0:00:00.650325
0:00:00.610591
0:00:00.611011
0:00:00.629863
0:00:00.624163
0:00:00.651956
0:00:00.646020
0:00:00.586958
0:00:00.596134
0:00:00.632860
0:00:00.701897
0:00:00.632197
0:00:00.691271
0:00:00.688568
0:00:00.651165
0:00:00.626782
0:00:00.641054
0:00:00.705045
0:00:00.608073
0:00:00.618954
0:00:00.596921
0:00:00.673035
0:00:00.587053
0:00:00.616023
0:00:00.720352
0:00:00.658614
0:00:00.611988
0:00:00.644013
0:00:00.586037
0:00:00.649986
0:00:00.571951
0:00:00.575035
0:00:00.588007
0: