In [420]:
from __future__ import annotations

import cufflinks as cf  # noqa
import pandas as pd
import plotly.offline  # noqa

In [421]:
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True, theme='white')

In [422]:
%reload_ext autoreload
%autoreload 2

# Страна исследования: Сингапур

In [423]:
data_df = pd.read_csv('data/csv/data.csv', sep=',', header=2)
data_df

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,Unnamed: 65
0,Singapore,SGP,"Intentional homicides, female (per 100,000 fem...",VC.IHR.PSRC.FE.P5,,,,,,,...,2.344646e-01,2.308808e-01,2.657929e-01,1.125425e-01,2.597589e-01,1.837857e-01,2.551097e-01,,,
1,Singapore,SGP,Merchandise exports to low- and middle-income ...,TX.VAL.MRCH.R5.ZS,1.273683e+01,,1.314017e+01,9.845885e+00,,,...,3.868837e+00,4.159699e+00,4.319948e+00,4.484547e+00,4.452890e+00,4.661110e+00,4.580456e+00,4.395976e+00,,
2,Singapore,SGP,Export unit value index (2000 = 100),TX.UVI.MRCH.XD.WD,,,,,,,...,1.123405e+02,1.082040e+02,1.071401e+02,8.882217e+01,8.326430e+01,8.888975e+01,9.429905e+01,9.196523e+01,,
3,Singapore,SGP,Merchandise imports from low- and middle-incom...,TM.VAL.MRCH.R2.ZS,,,,,,,...,1.524159e+00,1.785507e+00,2.628323e+00,2.131843e+00,1.256616e+00,1.793111e+00,1.901593e+00,1.684297e+00,,
4,Singapore,SGP,"Share of tariff lines with specific rates, pri...",TM.TAX.TCOM.SR.ZS,,,,,,,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1438,Singapore,SGP,Trade (% of GDP),NE.TRD.GNFS.ZS,3.393066e+02,2.994019e+02,2.866445e+02,2.968815e+02,2.550307e+02,2.575412e+02,...,3.692130e+02,3.670418e+02,3.604673e+02,3.294714e+02,3.033184e+02,3.157393e+02,3.253423e+02,3.235176e+02,3.205635e+02,
1439,Singapore,SGP,Gross capital formation (current US$),NE.GDI.TOTL.CD,7.987064e+07,8.813537e+07,1.277930e+08,1.591206e+08,1.771201e+08,2.115837e+08,...,8.635120e+10,9.220986e+10,9.266072e+10,7.808889e+10,8.439472e+10,9.381794e+10,9.394617e+10,9.231967e+10,7.690527e+10,
1440,Singapore,SGP,Exports of goods and services (current LCU),NE.EXP.GNFS.CN,3.512800e+09,3.333700e+09,3.493500e+09,3.965000e+09,3.341700e+09,3.676900e+09,...,7.254439e+11,7.507965e+11,7.657946e+11,7.553591e+11,7.254890e+11,8.111598e+11,8.979204e+11,8.983483e+11,8.267387e+11,
1441,Singapore,SGP,Households and NPISHs Final consumption expend...,NE.CON.PRVT.PP.CD,,,,,,,...,1.281558e+11,1.301487e+11,1.358406e+11,1.436858e+11,1.478207e+11,1.544493e+11,1.653194e+11,1.739774e+11,1.500816e+11,


In [424]:
data_router = {
    'Y': {
        'indicator': 'GDP (constant 2010 US$)',
        'ru': {
            'full': 'Объем ВВП, $',
            'short': 'ВВП, $'
        }
    },
    'C': {
        'indicator': 'Final consumption expenditure (% of GDP)',
        'ru': {
            'full': 'Уровень потребления, $',
            'short': 'Потребление, $'
        }
    },
    'I': {
        'indicator': 'Net investment in nonfinancial assets (% of GDP)',
        'ru': {
            'full': 'Объем инвестиций, $',
            'short': 'Инвестиции, $'
        }
    },
    'G': {
        'indicator': 'General government final consumption expenditure (constant 2010 US$)',
        'ru': {
            'full': 'Величина государственных расходов, $',
            'short': 'Гос. расходы, $'
        }
    },
    'Y(t-1)': {
        'indicator': None,
        'ru': {
            'full': 'Объем ВВП за предыдущий период, $',
            'short': 'ВВП(пред. период), $'
        }
    },
    'Y(t-1) - Y(t-2)': {
        'indicator': None,
        'ru': {
            'full': 'Прирост ВВП за предыдущий период, $',
            'short': 'Прирост ВВП(пред. период), $'
        }
    },
    'G(t-1)': {
        'indicator': None,
        'ru': {
            'full': 'Величина государственных расходов за предыдущий период, $',
            'short': 'Гос. расходы(пред. период), $'
        }
    },
}

### Объем ВВП

In [425]:
indicator = data_router['Y']['indicator']
gdp_series = data_df[data_df['Indicator Name'] == indicator]
gdp_series = gdp_series.loc[:, '1960':'2020'].squeeze()

### Уровень потребления

In [426]:
indicator = data_router['C']['indicator']
fce_series = data_df[data_df['Indicator Name'] == indicator]
fce_series = fce_series.loc[:, '1960':'2020'].squeeze()
fce_series = fce_series * gdp_series / 100

### Объем инвестиций

In [427]:
indicator = data_router['I']['indicator']
investment_series = data_df[data_df['Indicator Name'] == indicator]
investment_series = investment_series.loc[:, '1960':'2020'].squeeze()
investment_series = investment_series * gdp_series / 100

### Величина государственных расходов

In [428]:
indicator = data_router['G']['indicator']
gfce_series = data_df[data_df['Indicator Name'] == indicator]
gfce_series = gfce_series.loc[:, '1960':'2020'].squeeze()

In [429]:
df = pd.concat(
    [
        gdp_series.rename('Y'),
        fce_series.rename('C'),
        investment_series.rename('I'),
        gfce_series.rename('G')
    ],
    axis=1
)
df.index = pd.to_datetime(df.index.values)

In [430]:
df = df / 1_000_000
for k in data_router:
    for form in data_router[k]['ru']:
        data_router[k]['ru'][form] = data_router[k]['ru'][form].replace('$', 'млн. $')

df

Unnamed: 0,Y,C,I,G
1960-01-01,5768.016072,5592.360813,,436.828932
1961-01-01,6237.353063,6196.316093,,535.679734
1962-01-01,6708.525608,6477.773334,,586.067752
1963-01-01,7382.103260,6869.376900,,681.665573
1964-01-01,7153.012025,6615.987338,,694.810274
...,...,...,...,...
2016-01-01,308895.962132,144473.392591,-3138.312297,29747.120836
2017-01-01,322859.231805,146496.813523,,30666.386827
2018-01-01,334151.135292,148811.619317,,31635.310574
2019-01-01,338646.194793,155587.359202,,32705.143133


# Диаграммы рассеяния

In [431]:
scatter_config = {
    'kind': 'scatter',
    'mode': 'markers',
    'size': 8,
}

In [432]:
df['Y(t-1)'] = df['Y'].shift(1)
df['Y(t-1) - Y(t-2)'] = df['Y(t-1)'] - df['Y'].shift(2)
df['G(t-1)'] = df['G'].shift(1)
df['C + G + I'] = df['C'] + df['G'] + df['I']
df

Unnamed: 0,Y,C,I,G,Y(t-1),Y(t-1) - Y(t-2),G(t-1),C + G + I
1960-01-01,5768.016072,5592.360813,,436.828932,,,,
1961-01-01,6237.353063,6196.316093,,535.679734,5768.016072,,436.828932,
1962-01-01,6708.525608,6477.773334,,586.067752,6237.353063,469.336991,535.679734,
1963-01-01,7382.103260,6869.376900,,681.665573,6708.525608,471.172545,586.067752,
1964-01-01,7153.012025,6615.987338,,694.810274,7382.103260,673.577652,681.665573,
...,...,...,...,...,...,...,...,...
2016-01-01,308895.962132,144473.392591,-3138.312297,29747.120836,298944.012931,8674.756736,28648.011444,171082.201129
2017-01-01,322859.231805,146496.813523,,30666.386827,308895.962132,9951.949201,29747.120836,
2018-01-01,334151.135292,148811.619317,,31635.310574,322859.231805,13963.269673,30666.386827,
2019-01-01,338646.194793,155587.359202,,32705.143133,334151.135292,11291.903487,31635.310574,


In [433]:
df[['Y', 'C', 'I', 'G']].scatter_matrix()

In [434]:
df.iplot(
    x='Y(t-1)', y='C',
    xTitle=f'{data_router["Y(t-1)"]["ru"]["short"]}',
    yTitle=f'{data_router["C"]["ru"]["short"]}',
    **scatter_config,
)

In [435]:
df.iplot(
    x='Y(t-1) - Y(t-2)', y='C',
    xTitle=f'{data_router["Y(t-1) - Y(t-2)"]["ru"]["short"]}',
    yTitle=f'{data_router["C"]["ru"]["short"]}',
    **scatter_config,
)

In [436]:
df.iplot(
    x='G(t-1)', y='G',
    xTitle=f'{data_router["G(t-1)"]["ru"]["short"]}',
    yTitle=f'{data_router["G"]["ru"]["short"]}',
    **scatter_config,
)

In [437]:
df.iplot(
    x='C', y='Y',
    xTitle=f'{data_router["C"]["ru"]["short"]}',
    yTitle=f'{data_router["Y"]["ru"]["short"]}',
    **scatter_config,
)

In [438]:
df.iplot(
    x='G', y='Y',
    xTitle=f'{data_router["G"]["ru"]["short"]}',
    yTitle=f'{data_router["Y"]["ru"]["short"]}',
    **scatter_config,
)

In [439]:
df.iplot(
    x='I', y='Y',
    xTitle=f'{data_router["I"]["ru"]["short"]}',
    yTitle=f'{data_router["Y"]["ru"]["short"]}',
    **scatter_config,
)

In [440]:
df.iplot(
    x='C + G + I', y='Y',
    xTitle=f'{data_router["C"]["ru"]["short"][:-8]} + '
           f'{data_router["G"]["ru"]["short"][:-8]} + '
           f'{data_router["I"]["ru"]["short"][:-8]}, млн. $',
    yTitle=f'{data_router["Y"]["ru"]["short"]}',
    **scatter_config,
)

# Описательная статистика

In [441]:
df: pd.DataFrame = df[['Y', 'C', 'I', 'G']]
df

Unnamed: 0,Y,C,I,G
1960-01-01,5768.016072,5592.360813,,436.828932
1961-01-01,6237.353063,6196.316093,,535.679734
1962-01-01,6708.525608,6477.773334,,586.067752
1963-01-01,7382.103260,6869.376900,,681.665573
1964-01-01,7153.012025,6615.987338,,694.810274
...,...,...,...,...
2016-01-01,308895.962132,144473.392591,-3138.312297,29747.120836
2017-01-01,322859.231805,146496.813523,,30666.386827
2018-01-01,334151.135292,148811.619317,,31635.310574
2019-01-01,338646.194793,155587.359202,,32705.143133


### Дисперсия

In [442]:
df.var()

Y    1.108403e+10
C    2.250774e+09
I    1.357648e+07
G    1.043170e+08
dtype: float64

### Стандартное отклонение

In [443]:
df.std()

Y    105280.731050
C     47442.323900
I      3684.627763
G     10213.568642
dtype: float64

### Среднее

In [444]:
df.mean()

Y    109814.320581
C     55111.324553
I     -1905.442287
G     10764.772821
dtype: float64

### Медиана

In [445]:
df.median()

Y    68779.542852
C    37206.186686
I      -13.357079
G     6219.102965
dtype: float64

### Мода

In [446]:
from typing import Literal
from operator import add, sub

# И кто говорил, что у Яндекса плохие собеседования?


def interval_mode(s: pd.Series, bins) -> float | pd.Series | None:  # почему ты вообще работаешь?
    """
    Формула
    -------
    x + h * (mc - mc_prev) / ((mc - mc_prev) + (mc - mc_next))
    x - левая граница модального интервала
    h - длина модального интервала
    mc - частота модального интервала
    mc_prev - частота предыдущего модального интервала
    mc_next - частота следующего модального интервала

    Особенные случаи
    ----------------
    – если модальный интервал крайний, то m-1 = 0 либо m+1 = 0;
    – если обнаружатся несколько модальных интервалов, которые находятся рядом,
      то рассматриваем модальный интервал (левая граница левого, правая граница правого)
    – если между модальными интервалами есть расстояние,
      то применяем формулу к каждому интервалу,
      получая тем самым 2 или большее количество мод.
    """

    def _interval_mode(_interval: pd.Interval, _freq, _prev_freq=0, _next_freq=0) -> float | None:
        _denominator = (_freq - _prev_freq) + (_freq - _next_freq)
        if _denominator != 0:
            return _interval.left + _interval.length * (_freq - _prev_freq) / _denominator

    intervals = pd.cut(s, bins)
    intervals_freq = intervals.value_counts(sort=False)
    max_freq = intervals_freq.max()
    max_freq_intervals = intervals_freq[intervals_freq == max_freq]

    if len(max_freq_intervals) == 0:
        return None

    elif len(max_freq_intervals) == 1:
        return pd.Series(_interval_mode(max_freq_intervals.index[0], max_freq))

    else:

        def _expand_intervals(_intervals: pd.Series) -> list[dict]:
            _expanded = []
            _i = 0
            _prev: pd.Interval = _intervals.index[_i]
            while _i < len(_intervals) - 1:
                _current: pd.Interval = _intervals.index[_i]
                _next: pd.Interval = _intervals.index[_i + 1]

                left = _current.left
                right = _current.right
                freq = _intervals.iloc[_i]
                n_expanded = 0

                if left == _prev.right:
                    left = _prev.left  # расширяем левую границу
                    freq += _intervals[_prev]
                    n_expanded += 1

                if right == _next.left:
                    right = _next.right  # чуть-чуть расширяем правую границу
                    freq += _intervals[_next]
                    n_expanded += 1

                    _offset = 1
                    for _next_next in _intervals.index[_i + 2:]:
                        if _next_next.left == right:
                            right = _next_next.right  # если возможно расширяем ещё
                            freq += _intervals[_next_next]
                            n_expanded += 1
                            _offset += 1
                        else:
                            break

                    _i += _offset

                _expanded.append({
                    'freq': freq,
                    'interval': pd.Interval(left, right),
                    'n': n_expanded,
                })
                _prev = _current
                _i += 1

            if _i < len(_intervals):
                _expanded.append({
                    'freq': _intervals.iloc[-1],
                    'interval': _intervals.index[-1],
                    'n': 0,
                })

            return _expanded

        def _calc_prev_next_freq(_targets: list[dict], _intervals: pd.Series) -> list[tuple[int, int]]:

            def _expand(mode: Literal['left', 'right'], _intervals: pd.Series, _i: int, _n: int) -> int:
                freq = _intervals.iloc[_i]
                _op = sub if mode == 'left' else add
                _j = 0
                while _j < _n and _op(_i, _j) > 0:  # возможно while лишний, но сегодня его день
                    _j += 1
                    freq += _intervals.iloc[_op(_i, _j)]

                return freq

            _prev_next_freq = [
                {
                    'left_freq': _expand('left', _intervals, 0, _targets[0]['n']),
                    'right_freq': _expand('right', _intervals, 0, _targets[0]['n'])
                },
            ]
            _i = 0
            for _target in _targets[1:]:
                left_freq, right_freq = None, 0
                while not (left_freq and right_freq) and _i < len(_intervals):

                    if left_freq is None:
                        if _intervals.index[_i].right > _target['interval'].left:
                            if _i == 0:
                                left_freq = 0
                            else:
                                left_freq = _expand('left', _intervals, _i - 1, _target['n'])
                    else:
                        if _target['interval'].right <= _intervals.index[_i].left:
                            right_freq = _expand('right', _intervals, _i, _target['n'])

                    _i += 1

                _prev_next_freq.append({
                    'left_freq': left_freq,
                    'right_freq': right_freq
                })

            return _prev_next_freq

        expanded_intervals = _expand_intervals(max_freq_intervals)
        prev_next_freq = _calc_prev_next_freq(expanded_intervals, intervals_freq)

        return pd.Series([
            _interval_mode(
                expanded_intervals[i]['interval'],
                expanded_intervals[i]['freq'],
                prev_next_freq[i]['left_freq'],
                prev_next_freq[i]['right_freq'],
            )
            for i in range(len(expanded_intervals))
        ]).dropna().reset_index(drop=True)


df.agg(lambda x: interval_mode(x, 10))

Unnamed: 0,Y,C,I,G
0,22245.486,13017.1135,380.786,2237.9585


### Эксцесс

In [447]:
df.kurtosis()

Y   -0.519363
C   -0.740015
I    0.230176
G   -0.351445
dtype: float64

### Асимметрия

In [448]:
df.skew()

Y    0.889595
C    0.780991
I   -1.059061
G    0.913836
dtype: float64

### Корреляция

In [449]:
df.corr()

Unnamed: 0,Y,C,I,G
Y,1.0,0.997251,-0.664374,0.993676
C,0.997251,1.0,-0.624497,0.99446
I,-0.664374,-0.624497,1.0,-0.602825
G,0.993676,0.99446,-0.602825,1.0


# Графики распределения

In [450]:
legend = {k: data_router[k]['ru']['short'] for k in list(data_router.keys())[:4]}
df.rename(columns=legend).iplot(kind='hist', subplots=True)
