Обработка данных

In [None]:
import numpy as np
import pandas as pd

# --- Configuration ---------------------------------------------------------
filename = 'worklist.csv'
sep1 = ';'
start_state = 'start General course'
end_states = ['Finished course', 'Dropped General', 'fell asleep']
steps = 1000  # exponent used for the transition-matrix power below

# Read the event log; the file has no header row.
df = pd.read_csv(filename, sep=sep1, header=None)

# Assign column names manually.
df.columns = ['student_id', 'state', 'event_time']

# Event times use a decimal comma. Going through `astype(str)` keeps this
# working even if pandas already parsed the column as numeric (in which case
# the `.str` accessor alone would raise).
df['event_time'] = (
    df['event_time'].astype(str).str.replace(',', '.', regex=False).astype(float)
)
# Order every student's events chronologically.
df = df.sort_values(by=['student_id', 'event_time'])

# State labels in order of first appearance; positions index the matrices.
states = df['state'].unique().tolist()
state_index = {state: idx for idx, state in enumerate(states)}

# Fail early, with a readable message, if a configured state never occurs.
missing_states = [s for s in [start_state, *end_states] if s not in state_index]
if missing_states:
    raise ValueError(f'States not present in {filename!r}: {missing_states}')

# --- Transition counts -----------------------------------------------------
# A transition is a pair of consecutive rows that belong to the same student.
student_ids = df['student_id'].to_numpy()
state_codes = df['state'].map(state_index).to_numpy()
same_student = student_ids[:-1] == student_ids[1:]

calc_matrix = np.zeros((len(states), len(states)), dtype=int)
# `np.add.at` accumulates repeated (from, to) pairs, unlike plain fancy `+=`.
np.add.at(
    calc_matrix,
    (state_codes[:-1][same_student], state_codes[1:][same_student]),
    1,
)

# Discard anything recorded after an end state: those rows must be absorbing.
calc_matrix[[state_index[end_state] for end_state in end_states], :] = 0

# --- Row-normalise into probabilities --------------------------------------
row_sums = calc_matrix.sum(axis=1, keepdims=True)
TransitionMatrix = np.divide(
    calc_matrix,
    row_sums,
    out=np.zeros(calc_matrix.shape, dtype=float),
    where=row_sums > 0,
)

# Rows without outgoing transitions become absorbing (self-loop with p = 1).
absorbing_rows = np.flatnonzero(row_sums.ravel() == 0)
TransitionMatrix[absorbing_rows, absorbing_rows] = 1

# --- Distribution after `steps` transitions --------------------------------
# Initial distribution: all probability mass on the start state.
InitialVector = np.zeros(len(states))
InitialVector[state_index[start_state]] = 1

# `steps`-step transition probabilities (computed once; also kept for later use).
ProbabilityMatrix = np.linalg.matrix_power(TransitionMatrix, steps)
ProbabilityVector = np.dot(InitialVector, ProbabilityMatrix)

# Round the probabilities for display.
ProbabilityVector = np.round(ProbabilityVector, 2)
Result = dict(zip(states, ProbabilityVector))
print('\nProbability Vector:\n', Result)


Probability Vector:
 {'start General course': 0.0, 'successful class': 0.0, 'Dropped General': 0.29, 'level up': 0.0, 'Finished course': 0.15, 'fell asleep': 0.56, 'App session': 0.0, 'level down': 0.0}


Делаем вывод по вероятностям:

Finished course: 0.148431

Dropped General: 0.294281

fell asleep: 0.557288

Граф

Создание графа

In [None]:
!apt-get install graphviz
!pip install pydot pillow

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
graphviz is already the newest version (2.42.2-6).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.


In [None]:
import networkx as nx

# Directed multigraph of the chain: one edge per transition whose probability
# is non-zero after rounding to two decimals.
G = nx.MultiDiGraph()

n_states = len(TransitionMatrix)
for i in range(n_states):
    for j in range(n_states):
        edge_probability = round(TransitionMatrix[i][j], 2)
        if edge_probability != 0:
            G.add_edge(
                states[i],
                states[j],
                weight=edge_probability,
                label="{:.02f}".format(TransitionMatrix[i][j]),
            )

# Attach node sizes (probabilities expressed in percent).
for i in range(n_states):
    # Round before converting: plain int() truncates floating-point noise,
    # e.g. 0.29 * 100 == 28.999999999999996 would become 28 instead of 29.
    G.add_node(states[i], size=int(round(ProbabilityVector[i] * 100)))

# Save the graph to a .gexf file.
nx.write_gexf(G, "Graph.gexf")

Сохранение графа

In [None]:
import pydot
import fitz
from PIL import Image
# Write the graph out in Graphviz .dot format.
dot_file = "Graph.dot"
nx.drawing.nx_pydot.write_dot(G, dot_file)

# Load the .dot file back through pydot (exactly one graph is expected)
# and render it to PDF.
parsed_graphs = pydot.graph_from_dot_file(dot_file)
(graph,) = parsed_graphs
graph.write_pdf('Graph.pdf')

Расчет по формулам

In [None]:
# Split the chain into transient and absorbing (end) states.
transient_states = [state for state in states if state not in end_states]
transient_idx = [states.index(state) for state in transient_states]
absorbing_idx = [states.index(state) for state in end_states]

# Q: transient -> transient block; R_matrix: transient -> absorbing block.
Q = TransitionMatrix[np.ix_(transient_idx, transient_idx)]
R_matrix = TransitionMatrix[np.ix_(transient_idx, absorbing_idx)]

# Fundamental matrix N = (I - Q)^-1.
I = np.eye(len(transient_states))
N = np.linalg.inv(I - Q)

# Absorption probabilities B = N R: rows follow `transient_states`, columns
# follow `end_states`. The final comparison cell of this notebook reads `B`.
B = N.dot(R_matrix)

# Expected number of steps before absorption, per transient start state.
expected_times = N.sum(axis=1)

# Variance of the number of steps before absorption: (2N - I) t - t^2.
var_times = (2 * N - I).dot(expected_times) - expected_times**2

# Results keyed by transient state.
expected_time_to_absorption = dict(zip(transient_states, expected_times))
variance_time_to_absorption = dict(zip(transient_states, var_times))

print('\nExpected Time to Absorption:\n', expected_time_to_absorption[start_state])
print('\nVariance of Time to Absorption:\n', variance_time_to_absorption[start_state])


Expected Time to Absorption:
 21.62065082733475

Variance of Time to Absorption:
 441.12230685916813


Моделирование

In [None]:
import numpy as np

num_simulations = 100_000  # Number of simulation runs

# Функция для моделирования процесса
def simulate_chain(transition_matrix, states, start_state, end_states, max_steps=1000, rng=None):
    """Simulate one trajectory of the Markov chain.

    Parameters
    ----------
    transition_matrix : 2-D array-like
        Row ``i`` holds the probabilities of moving from ``states[i]`` to
        every state, in the order of ``states``.
    states : list
        State labels; positions correspond to rows/columns of the matrix.
    start_state : object
        Label of the state the trajectory starts in.
    end_states : container
        Labels at which the trajectory stops.
    max_steps : int, optional
        Upper bound on the number of transitions performed.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, NumPy's global random state is
        used, so ``np.random.seed`` keeps working for existing callers.

    Returns
    -------
    tuple
        ``(state, time)``: the label the trajectory stopped in (an end state,
        or whatever state was current when ``max_steps`` ran out) and the
        number of transitions performed.

    Raises
    ------
    ValueError
        If the current state is not an end state and is missing from ``states``.
    """
    sampler = np.random if rng is None else rng
    n_states = len(states)

    current_state = start_state
    time = 0
    while current_state not in end_states and time < max_steps:
        row = transition_matrix[states.index(current_state)]
        # Sample a position instead of a label: passing the labels themselves
        # to `choice` would coerce them through a NumPy array (returning
        # NumPy scalars and failing for tuple or mixed-type labels).
        current_state = states[sampler.choice(n_states, p=row)]
        time += 1
    return current_state, time

# Seed NumPy's global random state so that re-running the notebook reproduces
# the same simulation results.
SIMULATION_SEED = 42
np.random.seed(SIMULATION_SEED)

# Counters for the end states and the collected absorption times.
end_state_counts = {end_state: 0 for end_state in end_states}
times_to_absorption = []
unabsorbed_runs = 0  # runs that hit the step limit before reaching an end state

# Run the simulations.
for _ in range(num_simulations):
    end_state, time = simulate_chain(TransitionMatrix, states, start_state, end_states, steps)
    if end_state not in end_state_counts:
        # The trajectory was cut off by the step limit: it has no absorption
        # time and no end state, so it is only counted, not accumulated
        # (indexing the counter dict with it would raise KeyError).
        unabsorbed_runs += 1
        continue
    end_state_counts[end_state] += 1
    times_to_absorption.append(time)

# Estimated probability of ending in each end state.
probabilities = {end_state: count / num_simulations for end_state, count in end_state_counts.items()}

# Sample mean and (population) variance of the absorption time.
expected_time = np.mean(times_to_absorption)
variance_time = np.var(times_to_absorption)

# Report the results.
print('\nExpected Time to Absorption:', expected_time)
print('Variance of Time to Absorption:', variance_time)
print('\nProbabilities of Reaching Each End State:')
for end_state, probability in probabilities.items():
    print(f'{end_state}: {probability:.4f}')
if unabsorbed_runs:
    print(f'\nRuns not absorbed within {steps} steps: {unabsorbed_runs}')



Expected Time to Absorption: 21.58539
Variance of Time to Absorption: 440.34006854790005

Probabilities of Reaching Each End State:
Finished course: 0.1487
Dropped General: 0.2932
fell asleep: 0.5581


Итог

Ниже приведена разность между расчётными значениями и значениями, полученными в результате моделирования.

In [None]:
# Analytic absorption probabilities B = N R, computed here so this cell does
# not depend on a variable that no cell of the notebook defines.
# Rows follow `transient_states`, columns follow `end_states`.
B = N.dot(R_matrix)
start_row = transient_states.index(start_state)

# Differences: analytic value minus simulated estimate.
print('\nExpected Time to Absorption:', expected_time_to_absorption[start_state] - expected_time)
print('Variance of Time to Absorption:', variance_time_to_absorption[start_state] - variance_time)

print('\nProbabilities of Reaching Each End State:')
for end_state, probability in probabilities.items():
    # Look the column up by label instead of relying on dict iteration order.
    analytic_probability = B[start_row][end_states.index(end_state)]
    print(f'{end_state}: {analytic_probability - probability:.4f}')


Expected Time to Absorption: 0.03526082733474922
Variance of Time to Absorption: 0.7822383112680882

Probabilities of Reaching Each End State:
Finished course: -0.0003
Dropped General: 0.0011
fell asleep: -0.0008
