In [1]:
import heapq
from collections import deque, defaultdict

import pandas as pd

In [2]:
# Page metadata table; later cells read its page_id and page_title columns.
pages_df = pd.read_csv('simple_english_wiki_pages.csv')
# Link table; later cells read its pl_from, pl_to and pl_title columns.
pagelinks_df = pd.read_csv('simple_english_wiki_pagelinks.csv')


In [3]:
# Adjacency sets: source page id -> set of page ids it links to.
# Iterating the two columns with zip avoids DataFrame.iterrows(), which
# builds a full Series object for every row and is far slower on large tables.
graph = defaultdict(set)
for source_id, target_id in zip(pagelinks_df['pl_from'], pagelinks_df['pl_to']):
    graph[source_id].add(target_id)


def bfs_search(graph, start, end):
    """
    Breadth-first search for a path with the fewest hops in a directed graph.

    Args:
        graph: mapping of node -> iterable of successor nodes (for example a
            dict of sets). Nodes without an entry are treated as having no
            outgoing links; the mapping is never modified.
        start: node to start from.
        end: node to reach.

    Returns:
        The list of nodes from ``start`` to ``end`` inclusive (``[start]`` when
        the two are equal), or ``None`` when ``end`` cannot be reached.
    """
    # Predecessor of every discovered node. It doubles as the "seen" set, so
    # each node is enqueued at most once and no per-entry path copies are made.
    no_parent = object()
    parents = {start: no_parent}
    queue = deque([start])

    while queue:
        current = queue.popleft()
        if current == end:
            # Rebuild the path by walking predecessors back to the start.
            path = []
            node = current
            while node is not no_parent:
                path.append(node)
                node = parents[node]
            path.reverse()
            return path

        # .get() instead of graph[current]: indexing a defaultdict would
        # silently insert an empty entry for every sink node we touch.
        for neighbor in graph.get(current, ()):
            if neighbor not in parents:
                parents[neighbor] = current
                queue.append(neighbor)

    return None


In [29]:
# Look up the page ids of the two articles by title (first match wins).
start_page = pages_df.loc[pages_df['page_title'] == 'Analytics', 'page_id'].values[0]
end_page = pages_df.loc[pages_df['page_title'] == 'Algorithm', 'page_id'].values[0]

# Fewest-hops path over the link graph.
shortest_path = bfs_search(graph, start_page, end_page)

if not shortest_path:
    print("нет пути :с")
else:
    print(f"самый короткий путь от 'Analytics' к 'Algorithm': {shortest_path}")
    # A path of k nodes consists of k - 1 link traversals.
    print(f"число переходов: {len(shortest_path) - 1}")

самый короткий путь от 'Analytics' к 'Algorithm': [747593, 31531, 2957, 110, 170677]
число переходов: 4


In [31]:
def find_title_by_id(pages_df, page_id):
    """
    Look up a page title by its id.

    Args:
        pages_df: DataFrame with ``page_id`` and ``page_title`` columns.
        page_id: id to search for.

    Returns:
        The ``page_title`` of the first row whose ``page_id`` equals
        ``page_id``, or ``None`` when no row matches.
    """
    matches = pages_df['page_id'] == page_id
    if not matches.any():
        return None
    return pages_df.loc[matches, 'page_title'].values[0]


my_page_id = 110
title = find_title_by_id(pages_df, my_page_id)

# A missing (None) or empty title is reported as "no such article".
message = (
    f"статью с page_id {my_page_id} зовут: {title}"
    if title
    else "нет такой статьи :с"
)
print(message)

статью с page_id 110 зовут: Computer_science


In [None]:
# Weighted adjacency: source page id -> {target page id: weight}, where the
# weight of a link is the length of its pl_title rendered as a string.
# If the same (source, target) pair occurs more than once, the last row wins.
# zip over the columns replaces DataFrame.iterrows(), which allocates a Series
# per row and is far slower on large tables.
graph = defaultdict(dict)
for source_id, target_id, link_title in zip(
    pagelinks_df['pl_from'], pagelinks_df['pl_to'], pagelinks_df['pl_title']
):
    graph[source_id][target_id] = len(str(link_title))

def dijkstra(graph, start, end):
    """
    Find a minimum-total-weight path using Dijkstra's algorithm.

    Args:
        graph: mapping of node -> {neighbor: weight}. Nodes without an entry
            are treated as having no outgoing edges; the mapping is never
            modified. Weights must be non-negative for the result to be
            optimal.
        start: node to start from.
        end: node to reach.

    Returns:
        The list of nodes from ``start`` to ``end`` inclusive (``[start]`` when
        the two are equal), or ``None`` when ``end`` cannot be reached.
    """
    # Heap entries: (cost so far, node, path leading up to but excluding node).
    # heapq pops the smallest tuple, the same entry min() would select, but in
    # O(log n) instead of an O(n) scan plus an O(n) list.remove().
    frontier = [(0, start, [])]
    visited = set()

    while frontier:
        cost, current, path = heapq.heappop(frontier)

        if current == end:
            return path + [current]

        if current in visited:
            continue  # stale entry: this node was already settled more cheaply
        visited.add(current)

        path_to_current = path + [current]
        # .get() instead of graph[current]: indexing a defaultdict would
        # insert an empty entry for every sink node we touch.
        for neighbor, weight in graph.get(current, {}).items():
            # Entries for settled nodes would only be discarded on pop.
            if neighbor not in visited:
                heapq.heappush(frontier, (cost + weight, neighbor, path_to_current))

    return None

# Look up the page ids of the two articles by title (first match wins).
start_page = pages_df.loc[pages_df['page_title'] == 'Analytics', 'page_id'].values[0]
end_page = pages_df.loc[pages_df['page_title'] == 'Algorithm', 'page_id'].values[0]

# Minimum-weight path over the weighted link graph.
shortest_path = dijkstra(graph, start_page, end_page)

In [33]:
new_page_id = 4069
if not shortest_path:
    print("нет пути :с")
else:
    print(f"самый короткий путь от 'Analytics' к 'Algorithm': {shortest_path}")

    # Sum the weights of the consecutive edges along the path.
    total_length = sum(
        graph[source][target]
        for source, target in zip(shortest_path, shortest_path[1:])
    )
    print(f"общая длина переходов: {total_length}")

    # Show the title of the page with id new_page_id.
    title = find_title_by_id(pages_df, new_page_id)
    print(f"статью с page_id {new_page_id} зовут: {title}")


самый короткий путь от 'Analytics' к 'Algorithm': [747593, 2958, 911, 6309, 5723, 4069, 170677]
общая длина переходов: 29
статью с page_id 4069 зовут: Logic
