In [2]:
import pandas as pd
import numpy as np

import plotly.graph_objects as go

---

## Чтение обработанной таблицы с темпами

---

In [6]:
# Load the pre-processed stroke table; the first CSV column is used as the row index.
df = pd.read_csv('stroke.csv', index_col=0)
# Bare expression: display the loaded frame as the cell output.
df

Unnamed: 0_level_0,_50,_100,_150,_200,_250,_300,_350,_400,_450,_500,...,_1800,_1850,_1900,_1950,_2000,mean_stroke_500,mean_stroke_1000,mean_stroke_1500,mean_stroke_2000,date_competition
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00090381-a835-4600-b27b-7036a7ef511b,40,40,40,38,37,36,35,34,34,34,...,33,33,33,33,33,36,33,32,33,2010-05-30
00dc4915-dcbf-4651-8af3-6db3d348a9d4,40,40,40,40,40,40,40,40,39,39,...,38,39,41,41,41,39,37,36,38,2010-05-30
010b059a-61f6-48bf-bd4c-0a28d9428391,40,40,40,40,39,36,34,33,32,32,...,31,31,32,31,31,36,32,32,31,2010-05-30
01116325-195b-4d3b-8ef1-cb281ebc7ccd,40,40,39,38,37,36,36,35,35,35,...,34,35,35,35,35,37,33,33,34,2010-05-30
01357d80-9474-4325-a9f2-3efaa98f4300,41,41,42,41,39,37,36,35,34,34,...,29,28,30,30,29,38,31,29,29,2010-05-30
034c3e35-b2e4-4cc4-821a-aff3904d171d,40,41,41,40,40,39,38,38,38,37,...,37,38,38,39,39,39,36,35,37,2010-05-30
038a4b2b-1b85-4fc7-a031-439ed8be4e4b,41,39,40,38,36,35,36,35,34,36,...,37,38,36,34,33,37,36,36,37,2010-05-30
03eb764e-187c-4070-a423-84f895b30184,40,40,40,40,39,38,34,31,30,29,...,31,31,31,31,31,36,30,30,30,2010-05-30
03ed9d80-290b-4f13-9093-12961cca7809,40,40,40,41,42,41,42,40,42,40,...,34,34,34,34,32,40,36,34,33,2010-05-30
0471508f-6e75-458f-8f3e-28848af55559,41,41,41,40,40,40,39,38,38,38,...,38,38,39,40,34,39,37,36,37,2010-05-30


In [7]:
def iqr(series, whisker=1.5):
    """Keep only the values that lie inside the interquartile-range fences.

    Despite its name, the function does not return the IQR itself; it returns
    the data restricted to ``[Q1 - whisker * IQR, Q3 + whisker * IQR]``,
    where ``Q1``/``Q3`` are the 25th/75th percentiles and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    series : pandas.Series or pandas.DataFrame
        Numeric data. For a DataFrame the fences are computed separately for
        every column.
    whisker : float, default 1.5
        Multiplier of the IQR that sets how far the fences extend beyond the
        quartiles. The default reproduces the classic 1.5 * IQR rule.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        For a Series: the in-range values only (out-of-range and NaN entries
        are dropped, the original index labels are kept). For a DataFrame:
        a frame of the same shape with out-of-range cells replaced by NaN.
    """
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    spread = q3 - q1

    lower = q1 - whisker * spread
    upper = q3 + whisker * spread

    # Both fences are inclusive.
    return series[(series >= lower) & (series <= upper)]

In [12]:
def plot_stroke_scatter(df):
    """Draw a bubble chart of stroke-rate frequencies at every 50 m checkpoint.

    For each checkpoint column (``_50`` ... ``_2000``) outliers are removed
    with ``iqr``, the relative frequency of every remaining value is computed,
    and one bubble per distinct value is drawn with an area proportional to
    that frequency. The most frequent value (the mode) of each checkpoint is
    highlighted in red; all other bubbles are semi-transparent blue.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the columns ``_50``, ``_100``, ..., ``_2000``.

    Returns
    -------
    None
        The figure is rendered with ``fig.show()``.
    """
    # Checkpoint columns "_50", "_100", ..., "_2000" (40 columns in total).
    label_columns = [f"_{distance}" for distance in range(50, 2001, 50)]

    # Target size of the largest bubble within one checkpoint (used in sizeref).
    max_marker_size = 10
    # An alpha channel requires rgba(); a four-component rgb() is not a valid
    # colour triplet and the transparency may be silently dropped.
    regular_color = 'rgba(25, 70, 186, .7)'
    mode_color = 'rgb(236, 11, 67)'

    fig = go.Figure()

    for col in label_columns:
        filtered = iqr(df[col])
        if filtered.empty:
            # Nothing to draw (e.g. an all-NaN column); idxmax() would raise.
            continue

        # Share of each distinct value, ordered by the value itself.
        share = filtered.value_counts(normalize=True, sort=False).sort_index()
        mode_position = share.index.get_loc(share.idxmax())

        colors = [regular_color] * len(share)
        colors[mode_position] = mode_color

        fig.add_trace(
            go.Scatter(
                x=[col[1:]] * len(share),  # strip the leading underscore
                y=share.index,
                name=col[1:],
                mode='markers',
                marker=dict(
                    size=share,
                    sizemode='area',
                    # Plotly's recommended scaling for area-sized bubbles.
                    sizeref=2. * share.max() / (max_marker_size ** 2),
                    color=colors,
                ),
            )
        )

    fig.update_layout(showlegend=False,
                      title="Наиболее используемые темпа",
                      xaxis_title="Дистанция (м)",
                      yaxis_title="Темп (1/мин)")
    fig.show()

# Render the bubble chart for the table loaded from stroke.csv.
plot_stroke_scatter(df)

Наиболее используемые темпы среди всех классов лодок и участников

---

In [11]:
def plot_stroke_violin(df):
    """Show violin plots of the four ``mean_stroke_*`` columns.

    Each column is passed through ``iqr`` to drop outliers and is then drawn
    as its own violin with the inner box plot enabled. The figure is rendered
    with ``fig.show()``; nothing is returned.
    """
    mean_columns = (
        'mean_stroke_500',
        'mean_stroke_1000',
        'mean_stroke_1500',
        'mean_stroke_2000',
    )

    fig = go.Figure()

    for column in mean_columns:
        values = iqr(df[column])
        violin = go.Violin(
            # Every point of a violin shares the same category on the x axis.
            x=[column] * len(values),
            y=values,
            name=column,
            box_visible=True,
        )
        fig.add_trace(violin)

    layout_options = dict(
        showlegend=False,
        title="Наиболее используемые темпа",
        xaxis_title="Дистанция (м)",
        yaxis_title="Темп (1/мин)",
    )
    fig.update_layout(**layout_options)
    fig.show()

# Render the violin plots for the table loaded from stroke.csv.
plot_stroke_violin(df)

Наиболее используемые средние темпы для отрезков 500, 1000, 1500 и 2000 метров с использованием ядерной оценки плотности (KDE)

---

In [15]:
def plot_heatmap_stroke(df, split=1):
    """Show a heatmap of pairwise correlations between checkpoint columns.

    The 40 checkpoint columns (``_50`` ... ``_2000``) are first cleaned with
    ``iqr`` (out-of-range cells become NaN). Optionally, consecutive columns
    are averaged row-wise into ``split`` groups before the correlation matrix
    is computed. Each group is labelled by the distance of its last column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the columns ``_50``, ``_100``, ..., ``_2000``.
    split : int, default 1
        Number of consecutive, near-equal groups (as produced by
        ``numpy.array_split``) the columns are averaged into. ``1`` is a
        special case that skips aggregation and correlates all 40 columns
        individually, which gives the same picture as ``split=40``.

    Raises
    ------
    ValueError
        If ``split`` is smaller than 1 or larger than the number of columns.

    Returns
    -------
    None
        The figure is rendered with ``fig.show()``.
    """
    # Checkpoint columns "_50", "_100", ..., "_2000" (40 columns in total).
    label_columns = [f"_{distance}" for distance in range(50, 2001, 50)]

    if not 1 <= split <= len(label_columns):
        raise ValueError(
            f"split must be between 1 and {len(label_columns)}, got {split!r}"
        )

    filtered = iqr(df[label_columns])

    if split != 1:
        groups = [chunk.tolist() for chunk in np.array_split(label_columns, split)]
        # One averaged column per group, named after the group's last distance.
        filtered = pd.DataFrame({
            cols[-1][1:]: filtered[cols].mean(axis=1) for cols in groups
        })
        # Take the labels from the aggregated columns so they always match the
        # correlation matrix, even when `split` does not divide 40 evenly.
        label = list(filtered.columns)
    else:
        label = [col[1:] for col in label_columns]

    fig = go.Figure(data=go.Heatmap(x=label,
                                    y=label,
                                    z=filtered.corr(),
                                    colorscale="YlOrRd"))

    fig.show()
    
# split=40 puts each of the 40 columns in its own group, i.e. one per 50 m.
plot_heatmap_stroke(df, split=40)

Корреляция между темпами для каждых 50 метров на дистанции 2000 м

---

In [14]:
# split=8 averages the 40 columns in groups of five, i.e. one group per 250 m.
plot_heatmap_stroke(df, split=8)

Корреляция между темпами для каждых 250 метров на дистанции 2000 м

---

In [16]:
# split=4 averages the 40 columns in groups of ten, i.e. one group per 500 m.
plot_heatmap_stroke(df, split=4)

Корреляция между темпами для каждых 500 метров на дистанции 2000 м

---