# Adding transition probabilities to random walks

This notebook creates transition probability matrices, based on user movement between pages, for *1)* a directed graph and *2)* an undirected graph. It then performs repeated random walks using the probability matrix of each graph, and saves the results for each graph as a separate csv file. 

In [None]:
import networkx as nx
import numpy as np
import pandas as pd
import randomwalks as rw
from scipy.sparse import csr_matrix

## Create transition matrix

### Directed graph

In [None]:
# Get directed graph
# NOTE(review): `read_gpickle` unpickles the file, and unpickling can execute
# arbitrary code, so only load graphs written by a trusted pipeline. The
# function was removed in NetworkX 3.0 - confirm the pinned NetworkX version.
G = nx.read_gpickle("../../data/processed/functional_directed_graph_uk.gpickle")

# Create a dense array of edge weights: entry (i, j) holds the `edgeWeight` of
# the edge from node i to node j, with rows/columns in `G.nodes` order.
T = nx.adjacency_matrix(G, weight="edgeWeight").todense()
T_array = np.array(T)

# Transform edge weights into probabilities by dividing each row by its total.
sum_of_rows = T_array.sum(axis=1)

# Rows that sum to 0 evaluate 0 / 0 = nan. That is expected and repaired just
# below, so suppress the RuntimeWarning numpy would otherwise print.
with np.errstate(divide="ignore", invalid="ignore"):
    T_probs = T_array / sum_of_rows[:, np.newaxis]

# Rows with only 0s are nan: replace them with a uniform 1 / (number of nodes)
# so that every row is a valid probability distribution.
np.nan_to_num(T_probs, nan=1 / T_array.shape[0], copy=False)

# Convert into a sparse transition matrix (for the random walks function)
T_directed = csr_matrix(T_probs)

### Undirected graph
*`G.to_undirected()` cannot be used to control what data the undirected edges get; therefore we need to create a new graph and sum the weights of the directed edges between each pair of nodes.*

In [None]:
# Build the undirected graph by hand: it has the same nodes (and node
# attributes) as G, and one undirected edge per connected pair of nodes whose
# `edgeWeight` is the total of the directed weights between that pair.
G_undirected = nx.Graph()
G_undirected.add_nodes_from(G.nodes(data=True))

for source, target, attrs in G.edges(data=True):
    # Weight accumulated so far for this pair (0 the first time it is seen)
    existing = G_undirected.get_edge_data(source, target, default={})
    weight_so_far = existing.get("edgeWeight", 0)
    G_undirected.add_edge(
        source, target, edgeWeight=weight_so_far + attrs["edgeWeight"]
    )

In [None]:
# Create a dense array of edge weights, with rows/columns in
# `G_undirected.nodes` order. The dense intermediate gets its own name so that
# `T_undirected` only ever refers to the final sparse transition matrix.
T_undirected_dense = nx.adjacency_matrix(
    G_undirected, weight="edgeWeight"
).todense()
T_undirected_array = np.array(T_undirected_dense)

# Normalisation: divide each row by its total to get probabilities.
sum_of_rows = T_undirected_array.sum(axis=1)

# Rows that sum to 0 evaluate 0 / 0 = nan. That is expected and repaired just
# below, so suppress the RuntimeWarning numpy would otherwise print.
with np.errstate(divide="ignore", invalid="ignore"):
    T_undirected_probs = T_undirected_array / sum_of_rows[:, np.newaxis]

# Rows with only 0s are nan: replace them with a uniform 1 / (number of nodes)
np.nan_to_num(T_undirected_probs, nan=1 / T_undirected_array.shape[0], copy=False)

# Convert into a sparse transition matrix (for the random walks function)
T_undirected = csr_matrix(T_undirected_probs)

## Random walks

In [None]:
# Seed pages: the page paths from which the random walks are initialised.
# They are listed in two groups purely for readability; `seeds` is the single
# tuple that the rest of the notebook uses.
browse_seed_pages = (
    "/browse/visas-immigration/work-visas",
    "/browse/visas-immigration/what-you-need-to-do",
)
other_seed_pages = (
    "/check-uk-visa",
    "/apply-to-come-to-the-uk",
    "/contact-ukvi-inside-outside-uk",
    "/skilled-worker-visa",
)
seeds = browse_seed_pages + other_seed_pages

### Directed graph

In [None]:
# Make the graph compliant with the existing random walk functions: store each
# node's current label (the page path) under `properties["name"]`, then relabel
# the nodes with consecutive integers starting at 0.
# NOTE: this cell is not idempotent - once G has been relabelled, running it
# again would store the integer label as the name.
for page_path, attrs in G.nodes(data=True):
    attrs["properties"] = {"name": page_path}

G = nx.convert_node_labels_to_integers(
    G,
    label_attribute=None,
    ordering="default",
    first_label=0,
)

In [None]:
# Settings for the repeated random walks on the directed graph, collected in
# one place so they are easy to compare with the undirected run.
# NOTE(review): no random seed is set in this notebook, so results may differ
# between runs - confirm whether `rw` seeds its generator internally.
directed_walk_settings = {
    "steps": 100,
    "repeats": 100,
    "T": T_directed,
    "G": G,
    "seed_pages": seeds,
    "proba": True,
    "combine": "union",
    "level": 1,
    "n_jobs": 1,
}

results_directed = rw.repeat_random_walks(**directed_walk_settings)

# Rank the pages found by the walks
page_scores_directed = rw.page_freq_path_freq_ranking(results_directed)

### Undirected graph

In [None]:
# Make the graph compliant with the existing random walk functions: store each
# node's current label (the page path) under `properties["name"]`, then relabel
# the nodes with consecutive integers starting at 0.
# NOTE: this cell is not idempotent - once G_undirected has been relabelled,
# running it again would store the integer label as the name.
for page_path, attrs in G_undirected.nodes(data=True):
    attrs["properties"] = {"name": page_path}

G_undirected = nx.convert_node_labels_to_integers(
    G_undirected,
    label_attribute=None,
    ordering="default",
    first_label=0,
)

In [None]:
# Settings for the repeated random walks on the undirected graph, collected in
# one place so they are easy to compare with the directed run.
# NOTE(review): no random seed is set in this notebook, so results may differ
# between runs - confirm whether `rw` seeds its generator internally.
undirected_walk_settings = {
    "steps": 100,
    "repeats": 100,
    "T": T_undirected,
    "G": G_undirected,
    "seed_pages": seeds,
    "proba": True,
    "combine": "union",
    "level": 1,
    "n_jobs": 1,
}

results_undirected = rw.repeat_random_walks(**undirected_walk_settings)

# Rank the pages found by the walks
page_scores_undirected = rw.page_freq_path_freq_ranking(results_undirected)

## Create output


In [None]:
# Document supertypes
# Each set lists the document types that belong to one supertype. The sets are
# used for membership tests when labelling pages; a document type that appears
# in none of them is labelled "other" further down the notebook.
# Every document type is listed exactly once (repeated literals in a set are
# silently collapsed, so duplicates only add noise).
news_and_comms_doctypes = {
    "medical_safety_alert",
    "drug_safety_update",
    "news_article",
    "news_story",
    "press_release",
    "world_location_news_article",
    "world_news_story",
    "fatality_notice",
    "tax_tribunal_decision",
    "utaac_decision",
    "asylum_support_decision",
    "employment_appeal_tribunal_decision",
    "employment_tribunal_decision",
    "service_standard_report",
    "cma_case",
    "decision",
    "oral_statement",
    "written_statement",
    "authored_article",
    "correspondence",
    "speech",
    "government_response",
    "case_study",
}

service_doctypes = {
    "completed_transaction",
    "local_transaction",
    "form",
    "calculator",
    "smart_answer",
    "simple_smart_answer",
    "place",
    "licence",
    "step_by_step_nav",
    "transaction",
    "answer",
    "guide",
}

guidance_and_reg_doctypes = {
    "regulation",
    "detailed_guide",
    "manual",
    "manual_section",
    "guidance",
    "map",
    "calendar",
    "statutory_guidance",
    "notice",
    "international_treaty",
    "travel_advice",
    "promotional",
    "international_development_fund",
    "countryside_stewardship_grant",
    "esi_fund",
    "business_finance_support_scheme",
    "statutory_instrument",
    "hmrc_manual",
    "standard",
}

policy_and_engage_doctypes = {
    "impact_assessment",
    "policy_paper",
    "open_consultation",
    "closed_consultation",
    "consultation_outcome",
    "policy_and_engagement",
}

research_and_stats_doctypes = {
    "dfid_research_output",
    "independent_report",
    "research",
    "statistics",
    "national_statistics",
    "statistics_announcement",
    "national_statistics_announcement",
    "official_statistics_announcement",
    "statistical_data_set",
    "official_statistics",
}

transparency_doctypes = {
    "transparency",
    "corporate_report",
    "foi_release",
    "aaib_report",
    "raib_report",
    "maib_report",
}

### Directed graph

In [None]:
# Create a df with one row per ranked page: `pagePath` plus the node attributes
# `documentType`, `sessionHitsAll`, `entranceHit`, `exitHit`,
# `entranceAndExitHit` and `sessionHits`.
info_columns = [
    "documentType",
    "sessionHitsAll",
    "entranceHit",
    "exitHit",
    "entranceAndExitHit",
    "sessionHits",
]

# Build the lookup of ranked pages once, as a set, instead of converting the
# column to a list (and scanning it) for every node in the graph.
ranked_page_paths = set(page_scores_directed["pagePath"])

# Keep only the nodes whose page path appears in the random walk ranking
df_dict = {
    info["properties"]["name"]: [info[column] for column in info_columns]
    for _, info in G.nodes(data=True)
    if info["properties"]["name"] in ranked_page_paths
}
df_info = (
    pd.DataFrame.from_dict(df_dict, orient="index", columns=info_columns)
    .rename_axis("pagePath")
    .reset_index()
)

In [None]:
# Create a df that maps every document type present in `df_info` to its
# supertype. The groups are checked in the order listed; a document type that
# belongs to none of them is labelled "other".
supertype_groups = (
    (news_and_comms_doctypes, "news and communication"),
    (service_doctypes, "services"),
    (guidance_and_reg_doctypes, "guidance and regulation"),
    (policy_and_engage_doctypes, "policy and engagement"),
    (research_and_stats_doctypes, "research and statistics"),
    (transparency_doctypes, "transparency"),
)

document_type_dict = {
    doc_type: next(
        (label for members, label in supertype_groups if doc_type in members),
        "other",
    )
    for doc_type in list(set(df_info["documentType"]))
}

df_docSuper = pd.DataFrame(
    document_type_dict.items(), columns=["documentType", "documentSupertype"]
)

In [None]:
# Merge dfs: attach the page metadata to the ranking, then the supertype of
# each document type. The join keys are spelled out so that the second merge
# does not depend on whichever columns the two frames happen to share.
df_merged = pd.merge(page_scores_directed, df_info, on="pagePath")
df_merged = pd.merge(df_merged, df_docSuper, how="left", on="documentType")

# Reorder and rename df columns. A single mapping drives both steps: its key
# order is the output column order, and its values are the output headers.
output_columns = {
    "pagePath": "page path",
    "documentType": "document type",
    "documentSupertype": "document supertype",
    "sessionHitsAll": "number of sessions that visit this page",
    "entranceHit": "number of sessions where this page is an entrance hit",
    "exitHit": "number of sessions where this page is an exit hit",
    "entranceAndExitHit": "number of sessions where this page is both an entrance and exit hit",
    "sessionHits": "all sessions that visit this page, regardless of the session visiting a seed page",
    "tfdf_max": "how frequent the page occurs in the whole user journey",
}
df_merged = df_merged[list(output_columns)].rename(columns=output_columns)

# Save df
df_merged.to_csv("../../outputs/pages_ranked_directed_uk.csv", index=False)

### Undirected graph

In [None]:
# Create a df with one row per ranked page: `pagePath` plus the node attributes
# `documentType`, `sessionHitsAll`, `entranceHit`, `exitHit`,
# `entranceAndExitHit` and `sessionHits`.
info_columns = [
    "documentType",
    "sessionHitsAll",
    "entranceHit",
    "exitHit",
    "entranceAndExitHit",
    "sessionHits",
]

# Build the lookup of ranked pages once, as a set, instead of converting the
# column to a list (and scanning it) for every node in the graph.
ranked_page_paths = set(page_scores_undirected["pagePath"])

# Keep only the nodes whose page path appears in the random walk ranking
df_dict = {
    info["properties"]["name"]: [info[column] for column in info_columns]
    for _, info in G_undirected.nodes(data=True)
    if info["properties"]["name"] in ranked_page_paths
}
df_info = (
    pd.DataFrame.from_dict(df_dict, orient="index", columns=info_columns)
    .rename_axis("pagePath")
    .reset_index()
)

In [None]:
# Create a df that maps every document type present in `df_info` to its
# supertype. The groups are checked in the order listed; a document type that
# belongs to none of them is labelled "other".
supertype_groups = (
    (news_and_comms_doctypes, "news and communication"),
    (service_doctypes, "services"),
    (guidance_and_reg_doctypes, "guidance and regulation"),
    (policy_and_engage_doctypes, "policy and engagement"),
    (research_and_stats_doctypes, "research and statistics"),
    (transparency_doctypes, "transparency"),
)

document_type_dict = {
    doc_type: next(
        (label for members, label in supertype_groups if doc_type in members),
        "other",
    )
    for doc_type in list(set(df_info["documentType"]))
}

df_docSuper = pd.DataFrame(
    document_type_dict.items(), columns=["documentType", "documentSupertype"]
)

In [None]:
# Merge dfs: attach the page metadata to the ranking, then the supertype of
# each document type. The join keys are spelled out so that the second merge
# does not depend on whichever columns the two frames happen to share.
df_merged = pd.merge(page_scores_undirected, df_info, on="pagePath")
df_merged = pd.merge(df_merged, df_docSuper, how="left", on="documentType")

# Reorder and rename df columns. A single mapping drives both steps: its key
# order is the output column order, and its values are the output headers.
output_columns = {
    "pagePath": "page path",
    "documentType": "document type",
    "documentSupertype": "document supertype",
    "sessionHitsAll": "number of sessions that visit this page",
    "entranceHit": "number of sessions where this page is an entrance hit",
    "exitHit": "number of sessions where this page is an exit hit",
    "entranceAndExitHit": "number of sessions where this page is both an entrance and exit hit",
    "sessionHits": "all sessions that visit this page, regardless of the session visiting a seed page",
    "tfdf_max": "how frequent the page occurs in the whole user journey",
}
df_merged = df_merged[list(output_columns)].rename(columns=output_columns)

# Save df
df_merged.to_csv("../../outputs/pages_ranked_undirected_uk.csv", index=False)