# EDA

In [None]:
import pandas as pd
from data_handler import  get_data, get_gdf

# Load the POI table and preview its first rows.
df = get_data()
display(df.head())

# Count the distinct values in each of the three code columns, then report them.
n_groups = df['group_code'].nunique()
n_categories = df['category_code'].nunique()
n_classes = df['pointx_class_code'].nunique()

print(f'Number of Groups: {n_groups}')
print(f'Number of Categories: {n_categories}')
print(f'Number of Classes: {n_classes}')


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Rank-frequency view of `pointx_class`: number of rows per class, ordered from
# the most to the least frequent class. Left panel shows raw counts, right panel
# shows the natural log of the counts.
pointx_class = df['pointx_class']
pointx_class_counts = pointx_class.value_counts()
pointx_class_counts = pointx_class_counts.sort_values(ascending=False)
x = list(range(1, len(pointx_class_counts) + 1))
y1 = pointx_class_counts
y2 = np.log(pointx_class_counts)

fig, axs = plt.subplots(1, 2, figsize=(20, 8))

sns.scatterplot(x=x, y=y1, marker='o', ax=axs[0], s=150)
sns.scatterplot(x=x, y=y2, marker='o', ax=axs[1], s=150)
axs[0].set_xlabel('Rank')
axs[0].set_ylabel('Frequency')
axs[0].set_title('Rank vs. Frequency of pointx_class')
axs[0].grid(True)
axs[1].set_xlabel('Rank')
# The right panel plots np.log(counts), so label the axis accordingly.
axs[1].set_ylabel('log(Frequency)')
axs[1].set_title('Rank vs. Frequency of pointx_class Log Scale')
axs[1].grid(True)

# Label the most frequent classes (at most four; fewer if the data has fewer
# classes). Each label is pushed further right than the previous one so the
# texts do not overlap.
n_annotated = min(4, len(pointx_class_counts))
for i in range(n_annotated):
    offset = (i + 1) * 40
    # The Series are indexed by class label, so positional access must go
    # through .iloc; a bare `y1[i]` is a label lookup in current pandas.
    count = y1.iloc[i]
    log_count = y2.iloc[i]
    axs[0].arrow(x[i], count, offset, 0, head_width=3, color='black')
    axs[0].text(offset + x[i], count, y1.index[i], fontsize=14)
    axs[1].arrow(x[i], log_count, offset, 0, head_width=0.08, color='black')
    axs[1].text(offset + x[i], log_count, y2.index[i], fontsize=14)


plt.show()



Goal: Generate random points with a minimal interval to construct POI sequences in Exeter. 

- These random points are generated along the road network (excluding unclassified roads).
- The distance interval between any two random points is 50 metres.
- These constraints produce a structured set of random points R = {r1, r2, ..., ri, ..., rn}.
- The maximum size possible for the set R is N.
- For each random point ri, we search its accessible POIs within 200 metres and thus obtain an accessible POI set Si. 
- Compute the distance for all of the pairs between the central random point and POIs within Si.
- Use the distance as a reference by which to build a sequentially ordered POI list Li = [poi1, poi2, poi3, ..., poin], where i refers to the index of the random point, n is the number of accessible POIs around the random point ri, and the elements of the list are POI classes.

# Data Preprocessing

In [None]:
from analysis import get_poi_sequences
from vis import full_sequence_map, sample_sequence_map

# Build the POI sequences with the project helper and pass them to the map helper.
# The map call is the last expression of the cell, so whatever it returns is
# rendered as the cell output.
full_sequence_gdf = get_poi_sequences()
full_sequence_map(full_sequence_gdf)

# Data Loader

In [None]:
from dataloader import get_dataloader

# Build a data loader and pull a single batch to check the shapes it yields.
dl = get_dataloader(context_length=128, bs=16, shuffle=True)
# NOTE: this rebinds `x`, which the rank-frequency cell earlier in the notebook
# set to the list of ranks.
x, y = next(iter(dl))
print(x.shape, y.shape)

# Doc2Vec

In [None]:
import torch
from doc2vec import  main, save_checkpoint
# Dataset and loader settings, forwarded unchanged to `main`.
pois_dataset_args = {
    'force_recreate': False,
    'max_sequence_size': 200,
}
dataloader_args = {
    'batch_size': 1024,
    'num_workers': 0,
    'shuffle': True,
    'drop_last': True,
}

# Train on the GPU when torch can see one, otherwise on the CPU.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

# Training hyperparameters.
epochs = 10000
lr = 0.01
min_delta = None

# Model hyperparameters.
vec_dim = 20
context_size = 10
n_negative_samples = 5
vocab_min_count = 0
concat = False

model, training_losses, vocab, ds = main(
    dataset_args=pois_dataset_args,
    dataloader_args=dataloader_args,
    concat=concat,
    vec_dim=vec_dim,
    vocab_min_count=vocab_min_count,
    n_negative_samples=n_negative_samples,
    context_size=context_size,
    device=device,
    epochs=epochs,
    lr=lr,
    min_delta=min_delta,
)

In [None]:
# Hand the training results to `save_checkpoint`; the path it returns is kept
# in `ckpt_path`.
checkpoint_path = 'models/doc2vec_checkpoint.pt'
ckpt_path = save_checkpoint(
    model=model,
    training_losses=training_losses,
    vocab=vocab,
    ds=ds,
    filename=checkpoint_path,
    add_timestamp=True,
)

# Evaluation:

In [None]:
from doc2vec import  load_checkpoint
import pandas as pd
import os
import time

# Select the latest checkpoint. Candidates are compared by file name, so this
# assumes the timestamp added by `save_checkpoint(..., add_timestamp=True)`
# sorts chronologically as text -- TODO confirm the timestamp format.
checkpoint_dir = 'models'
checkpoint_files = [f for f in os.listdir(checkpoint_dir) if f.startswith('doc2vec_checkpoint')]
if not checkpoint_files:
    # Fail with an actionable message instead of max()'s bare "empty" error.
    raise FileNotFoundError(
        f"No 'doc2vec_checkpoint*' file found in '{checkpoint_dir}/'; "
        "run the training and save-checkpoint cells first."
    )
ckpt_path = f'{checkpoint_dir}/{max(checkpoint_files)}'
print(ckpt_path)

# Rebinds model / training_losses / vocab / ds to the values stored in the checkpoint.
model, training_losses, vocab, ds = load_checkpoint(ckpt_path)

# Walk the dataset once and tabulate the three fields read from every item.
ds_items = list(ds)
seq_ids = [item['seq_id'] for item in ds_items]
seqs = pd.DataFrame({
    'seq': [item['seq'] for item in ds_items],
    'paragraph': [item['paragraph'] for item in ds_items],
    'seq_id': seq_ids,
})

# Embedding matrices as NumPy arrays (detached from autograd, moved to the CPU).
word_embeddings = model.word_matrix.detach().cpu().numpy()
paragraph_embeddings = model.paragraph_matrix.detach().cpu().numpy()

# Vocabulary lookups exposed by the loaded `vocab` object.
words, words2idx = vocab.words, vocab.word2idx
paragraphs, paragraphs2idx = vocab.paragraphs, vocab.paragraph2idx



In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt

# Cosine similarity between every pair of rows of the word-embedding matrix
# (with a single argument the matrix is compared against itself).
word_sim = cosine_similarity(word_embeddings)

# Diverging colour map centred on 0, so positive and negative similarities
# get different hues.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(word_sim, cmap='RdBu', center=0, ax=ax)

In [None]:
# Show the shape of the word-embedding array taken from `model.word_matrix`.
word_embeddings.shape

In [None]:
from sklearn.cluster import AgglomerativeClustering
# Clustering settings.
n_clusters = 9
metric = 'cosine'
linkage = 'complete'
compute_full_tree = True
# Needed so the fitted estimator exposes `distances_`, which `plot_dendrogram` reads.
compute_distances = True

# NOTE(review): this rebinds `model`, replacing the Doc2Vec model loaded from the
# checkpoint; re-running the embedding-extraction cell afterwards requires
# reloading the checkpoint first.
model = AgglomerativeClustering(
    n_clusters=n_clusters,
    metric=metric,
    linkage=linkage,
    compute_full_tree=compute_full_tree,
    compute_distances=compute_distances,
).fit(word_embeddings)

In [None]:
import numpy as np
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram

def plot_dendrogram(model, **kwargs):
    """Draw a dendrogram for a fitted hierarchical-clustering estimator.

    Builds a linkage matrix with one row per merge, laid out as
    ``[child_a, child_b, distance, sample_count]``, from the estimator's
    ``children_``, ``distances_`` and ``labels_`` attributes and hands it to
    ``dendrogram``.

    Args:
        model: Fitted estimator exposing ``children_``, ``distances_`` and
            ``labels_``.
        **kwargs: Forwarded unchanged to ``dendrogram``.
    """
    merges = model.children_
    leaf_total = len(model.labels_)

    # subtree_sizes[step] = number of original samples under the node created at `step`.
    subtree_sizes = np.zeros(merges.shape[0])
    for step, pair in enumerate(merges):
        # Ids below leaf_total are single samples; larger ids point at the
        # node produced by merge number (id - leaf_total).
        subtree_sizes[step] = sum(
            1 if node < leaf_total else subtree_sizes[node - leaf_total]
            for node in pair
        )

    linkage_matrix = np.column_stack(
        [merges, model.distances_, subtree_sizes]
    ).astype(float)
    dendrogram(linkage_matrix, **kwargs)

# Draw the dendrogram for the estimator currently bound to `model`.
plt.title('Hierarchical Clustering Dendrogram')
# truncate_mode='level' with p=3 limits the drawing to the top levels of the tree.
plot_dendrogram(model, truncate_mode='level', p=3)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()



In [None]:



def plot_dendrogram(model, **kwargs):
    """Create a linkage matrix from a fitted estimator and plot its dendrogram.

    NOTE(review): this redefines the `plot_dendrogram` already defined in an
    earlier cell of this notebook with the same logic.

    Args:
        model: Fitted estimator exposing ``children_``, ``distances_`` and
            ``labels_``.
        **kwargs: Forwarded unchanged to ``dendrogram``.
    """
    n_leaves = len(model.labels_)
    n_merges = model.children_.shape[0]

    # node_sizes[k] holds the number of original samples under the node made by merge k.
    node_sizes = np.zeros(n_merges)
    for k in range(n_merges):
        total = 0
        for node in model.children_[k]:
            # Ids below n_leaves are leaves; larger ids refer to an earlier merge.
            total += 1 if node < n_leaves else node_sizes[node - n_leaves]
        node_sizes[k] = total

    # One row per merge: the two children, the merge distance, the sample count.
    columns = [model.children_, model.distances_, node_sizes]
    linkage_matrix = np.column_stack(columns).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)


# Reference example on the iris data set.
# `load_iris` is imported here because no earlier cell brings it into scope;
# without this import the cell raises NameError on a fresh kernel.
from sklearn.datasets import load_iris

iris = load_iris()
X = iris.data

# setting distance_threshold=0 ensures we compute the full tree.
# NOTE(review): this rebinds `model` again, replacing the estimator fitted on
# the word embeddings.
model = AgglomerativeClustering(distance_threshold=0, n_clusters=None)

model = model.fit(X)
plt.title("Hierarchical Clustering Dendrogram")
# plot the top three levels of the dendrogram
plot_dendrogram(model, truncate_mode="level", p=3)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()

In [None]:
# Show the shape of `X`, the iris feature array assigned in the previous cell.
X.shape