In [None]:
def arange(
        arg1: float,
        arg2: float or None = None,
        arg3: float or None = None,
        arg4: bool or None = None):
    
    '''
    Realization of simple range (based on np.arange) with protection from 
    float large decimals, e.g. 1.100000000009 except 1.1)
    
    default:
        arg1 - start
        arg2 - stop
        arg3 - step
        arg4 - endpoint (if True: 'stop' value included in range; if False: 'stop' value not included in range)

    variations:
        arange(arg1) -> range(start=0, stop=arg1, step=1, endpoint=False)
        
        arange(arg1, arg2):
            arange(float, float) -> (start=arg1, stop=arg2, step=1, endpoint=False)
            arange(float, bool) -> range(start=0, stop=arg1, step=1, endpoint=arg2)
            
            
        arange(arg1, arg2, arg3):
            arange(float, float, float) -> (start=arg1, stop=arg2, step=arg3, endpoint=False)
            arange(float, float, bool) -> range(start=arg1, stop=arg2, step=1, endpoint=arg3)
            
        arange(arg1, arg2, arg3, arg4):
            arange(float, float, float, bool) -> range(start=arg1, stop=arg2, step=arg3, endpoint=arg4)

    dependencies:
        libraries: numpy, decimal, numbers
    '''

    # list of argument values
    arg_values = locals().values()

    # create list with decimals of arguments values
    round_idxs = []
    for i in arg_values:
        if (isinstance(i, numbers.Number) and not
            isinstance(i, bool)):
            decimals = decimal.Decimal(str(i)).as_tuple().exponent
            round_idxs.append(abs(decimals))
    # find maximum number of decimals - 
    # all values would be round to it later to avoid X.XXXXXXXXXX float
    round_dec = max(round_idxs)
    
    # True/False marker if result should be all integers
    is_int = False

    # if only one argument: arange(arg1)
    if ((arg1 is not None) & (arg2 is None) &
        (arg3 is None) & (arg4 is None)):
        # equivalent (start=0, stop=arg1, step=1, endpoint=False)
        start = 0
        stop = arg1
        # return empty array if start and stop equals
        if start == stop:
            arr = np.empty(0)
            return arr
        step = 1
        endpoint = False
        # rememeber decimal number of stop variable
        round_dec_for_stop = decimal.Decimal(str(stop)).as_tuple()
        round_dec_for_stop = abs(round_dec_for_stop.exponent)
        
        if isinstance(arg1, int):
            is_int = True

    # if two arguments: arange(arg1, arg2)
    if ((arg1 is not None) & (arg2 is not None) &
        (arg3 is None) & (arg4 is None)):
        
        # if second argument boolean: arange(number1, True)
        if isinstance(arg2, bool):
            # equivalent (start=0, stop=arg1, step=1, endpoint=arg2)
            start = 0
            stop = arg1
            step = 1
            endpoint = arg2
            # rememeber decimal number of stop variable
            round_dec_for_stop = decimal.Decimal(str(stop)).as_tuple()
            round_dec_for_stop = abs(round_dec_for_stop.exponent)
        # if second argument not boolean: arange(number1, number2)
        else:
            # equivalent (start=arg1, stop=arg2, step=1, endpoint=False)
            start = arg1
            stop = arg2
            # return empty array if start and stop equals
            if start == stop:
                arr = np.empty(0)
                return arr
            step = 1
            endpoint = False
            # rememeber decimal number of stop variable
            round_dec_for_stop = decimal.Decimal(str(stop)).as_tuple()
            round_dec_for_stop = abs(round_dec_for_stop.exponent)

        if isinstance(arg1, int) & isinstance(arg2, int):
            is_int = True

    # if three arguments: arange(arg1, arg2, arg3)
    if ((arg1 is not None) & (arg2 is not None) &
        (arg3 is not None) & (arg4 is None)):
        # if third argument boolean: arange(number1, number2, True)
        if isinstance(arg3, bool):
            # equivalent (start=arg1, stop=arg2, step=1, endpoint=arg3)
            start = arg1
            stop = arg2
            # return empty array if start and stop equals
            if start == stop:
                arr = np.empty(0)
                return arr
            step = 1
            endpoint = arg3
            # rememeber decimal number of stop variable
            round_dec_for_stop = decimal.Decimal(str(stop)).as_tuple()
            round_dec_for_stop = abs(round_dec_for_stop.exponent)
        # if third argument not boolean: arange(number1, number2, number3)
        else:
            # equivalent (start=arg1, stop=arg2, step=arg3, endpoint=False)
            start = arg1
            stop = arg2
            # return empty array if start and stop equals
            if start == stop:
                arr = np.empty(0)
                return arr
            step = arg3
            endpoint = False
            # rememeber decimal number of stop variable
            round_dec_for_stop = decimal.Decimal(str(stop)).as_tuple()
            round_dec_for_stop = abs(round_dec_for_stop.exponent)

        if (isinstance(arg1, int) & isinstance(arg2, int) &
               isinstance(arg3, int)):
            is_int = True

    # if all arguments: arange(arg1, arg2, arg4, True)
    if ((arg1 is not None) & (arg2 is not None) &
        (arg3 is not None) & (arg4 is not None)):
        # equivalent (start=arg1, stop=arg2, step=arg3, endpoint=arg4)
        start = arg1
        stop = arg2
        # return empty array if start and stop equals
        if start == stop:
            arr = np.empty(0)
            return arr
        step = arg3
        endpoint = arg4
        # rememeber decimal number of stop variable
        round_dec_for_stop = decimal.Decimal(str(stop)).as_tuple()
        round_dec_for_stop = abs(round_dec_for_stop.exponent)

        if (isinstance(arg1, int) & isinstance(arg2, int) &
            isinstance(arg3, int)):
            is_int = True

    # arr = step * np.arange(start/step, stop/step)
    arr = np.arange(start, stop, step)
    # round array to avoid X.XXXXXXXXXXXX float
    arr = np.around(arr, decimals=round_dec)
    # if last value of arr plus step equals to stop it concatenates to arr
    last_value = arr[-1]
    # also round this value to avoid X.XXXXXXXXXXXX float (number decimals as in stop variable)
    last_value_plus_step = np.around(last_value+step, round_dec_for_stop)
    if endpoint and last_value_plus_step==stop:
        arr = np.concatenate([arr,[stop]])
    if is_int:
        arr = np.around(arr, decimals=0)
        arr = arr.astype(int)

    return arr

In [1]:
def save_session(name, dir='sessions'):
    '''
    Dump the current interpreter session with dill.dump_session.

    Arguments:
    name: file name of the dump inside the target directory
    dir: sub-directory of 'sessions/' to write into; the default value
        'sessions' writes directly into 'sessions/'
    '''
    if dir != 'sessions':
        dir = f'sessions/{dir}/'
    else:
        dir = 'sessions/'
    # makedirs instead of mkdir: 'sessions/<dir>/' has to be creatable even
    # when the parent 'sessions' directory does not exist yet
    os.makedirs(dir, exist_ok=True)
    # save session
    dill.dump_session(dir+name)

In [3]:
def calculate_sum_transferts(data, columns, regions, mapping_dict):
    '''
    For every region, write column sums into the region's total row.

    The total row of a region is the row where 'index0' equals the region
    and 'index1' equals 'Всего'. The summed rows are those whose 'ОКТМО'
    value is listed in mapping_dict[region].

    index1  | ОКТМО  | column | index0
    -------------------------------------
    Всего   |        | result | region     <- receives the sums
    -------------------------------------
    ...     | value  |        | ...        <- summed if value is in the mapping

    Arguments:
    data: source DataFrame (not modified, a copy is returned)
    columns: columns to sum
    regions: region names to process
    mapping_dict: {'region': [ОКТМО values]}

    Regions missing from mapping_dict, or without a total row, are reported
    with print() and skipped.
    '''
    df = data.copy()
    for region in regions:
        if region not in mapping_dict:
            print(f'Region {region} not in "Minucipal-Town Dictionary"')
            continue

        total_mask = (df['index0'] == region) & (df['index1'] == 'Всего')
        total_labels = df.index[total_mask].tolist()
        if not total_labels:
            # previously an opaque IndexError; report it like a missing region
            print(f'Region {region} has no "Всего" row')
            continue
        total_label = total_labels[0]

        # the rows to sum do not depend on the column, so select them once
        member_mask = df['ОКТМО'].isin(mapping_dict[region])
        for column in columns:
            df.loc[total_label, column] = df.loc[member_mask, column].sum()
    return df

In [34]:
def check_oktmo_duplication(data, column_check, normalize_oktmo_list):
    '''
    Return the keys of normalize_oktmo_list whose values match more than
    two rows of data[column_check].

    Arguments:
    data: DataFrame to inspect
    column_check: name of the column holding the codes
    normalize_oktmo_list: {'region': [codes]}
    '''
    return [
        region
        for region, codes in normalize_oktmo_list.items()
        if data[column_check].isin(codes).sum() > 2
    ]

In [37]:
def transform_replace(x, i, j):
    '''
    Return x with every occurrence of i replaced by j when x is a string;
    any other value is returned unchanged.
    '''
    return x.replace(i, j) if isinstance(x, str) else x

In [40]:
def replace_inlist(x, lst, replace, kind='in'):
    '''
    Return 'replace' instead of x depending on membership of x in lst.

    kind='in'  - substitute when x is in lst
    kind='not' - substitute when x is not in lst
    Any other kind returns x unchanged.
    '''
    if kind == 'in' and x in lst:
        return replace
    if kind == 'not' and x not in lst:
        return replace
    return x

In [None]:
def replace_elements(x, replace, in_list):
    '''
    Return string x in which every substring listed in in_list has been
    replaced by 'replace'. Substitutions are applied one after another in
    list order.
    '''
    result = x
    for fragment in in_list:
        result = result.replace(fragment, replace)
    return result

In [41]:
def extract_table_from_docx(link):
    '''
    Return the first table of a .docx document as a DataFrame of cell texts.

    Arguments:
    link: value passed to docx.Document (path of the document)

    Returns:
    pd.DataFrame with one row per table row and one column per table column;
    empty cells contain ''.

    Raises:
    IndexError: if the document contains no tables
    '''
    # open the document
    document = docx.Document(link)
    tables = document.tables
    if not tables:
        raise IndexError(f'No tables found in document: {link!r}')
    # only the first table is returned, so only the first table is converted
    table = tables[0]
    grid = [['' for _ in range(len(table.columns))]
            for _ in range(len(table.rows))]
    for i, row in enumerate(table.rows):
        for j, cell in enumerate(row.cells):
            grid[i][j] = cell.text
    return pd.DataFrame(grid)

In [42]:
def replace_comma_point(x):
    '''
    Convert x to a string and swap every comma for a point (decimal comma
    to decimal point).

    Usage: df = df.applymap(replace_comma_point)
    '''
    text = str(x)
    return text.replace(',', '.')

In [43]:
def transform_concat_rows_strings(df, column_name='index'):
    '''
    In one column, merge a row into the previous row when both hold strings.

    | string1 |
    -----------  -> do nothing (one of the two values is not a string)
    |   NaN   |
    -----------
    | string1 |
    -----------  -> string2 is appended to string1, the string2 row is dropped
    | string2 |
    -----------

    Arguments:
    df: DataFrame to process. The merged values are written into this object
        before the rows are dropped, so the caller's frame is modified too.
    column_name: column to inspect

    Returns:
    (df, drop_indexes1): the frame without the merged-away rows, and a list
    that starts with 0 followed by the labels of the rows that received a
    merged value.

    NOTE(review): labels are used arithmetically (i-1, i-2), so this assumes
    an integer index, presumably consecutive - confirm against callers.
    '''
    # labels of the rows that were merged into their predecessor (dropped below)
    drop_indexes = []
    # labels of the rows that received a merged value
    # NOTE(review): the list is seeded with 0 - reason not visible here
    drop_indexes1 = [0]
    for i in df[column_name].index:
        if i > 1:
            # the pair inspected for label i is (i-2, i-1); the row i itself
            # is only inspected by later iterations
            if (isinstance(df[column_name].loc[i-1], str) &
                isinstance(df[column_name].loc[i-2], str)):
                new_value = df[column_name].loc[i-2] + df[column_name].loc[i-1]
                # written immediately, so the next iteration already reads
                # the updated value of row i-2
                df.loc[i-2, column_name] = new_value
                drop_indexes.append(i-1)
                drop_indexes1.append(i-2)
    df = df.drop(drop_indexes, axis=0)

    return df, drop_indexes1

In [44]:
def smoothed(x, y=None, n=300, k=3, return_type='df', datetime_index=False):
    '''
    Smooth data for plots by spline interpolation on a denser grid.

    Arguments:
    x: pd.DataFrame, pd.Series or array-type
        - DataFrame / Series: the index is used as x, the values as y
        - otherwise: x values (must provide .min() and .max())
    y: array-type, y values; only used when x is not a DataFrame or Series
    n: number of points of the new grid (np.linspace between min and max of x)
    k: passed as 'k' to make_interp_spline (the B-spline degree in SciPy)
    return_type:
        - if 'df' (default) - returns pd.DataFrame with one column 'value'
          indexed by the new grid
        - if 'array' - returns x_new, y_new
        - if 'dict' - returns dict with {'x': x_new, 'y': y_new}
    datetime_index:
        If the index of x is datetime-type, 'datetime_index' should be True.
        A pd.date_range with n periods between the first and last index value
        is then used in the output in place of the numeric grid, and the
        interpolation itself runs on the reset (positional) index.

    NOTE(review): for a DataFrame input
        - 'df' returns only the values of the last column, named 'value'
        - 'array' returns the numeric grid and the first column, also when
          datetime_index is True
        - 'dict' is not handled and the function returns None
    An unknown return_type returns None in every branch.

    Libraries:
    from scipy.interpolate import make_interp_spline, BSpline
    '''
    if datetime_index:
        start = x.index[0]
        end = x.index[-1]
        # evenly spaced timestamps that stand in for the numeric grid on output
        time_range = \
            pd.date_range(start=start, end=end, periods=n)
        # interpolate over positions 0..len-1 instead of timestamps
        x = x.reset_index(drop=True)

    if isinstance(x, pd.DataFrame):
        x_index = x.index
        x_new = np.linspace(x_index.min(), x_index.max(), n)
        df = pd.DataFrame(index=x_new, columns=x.columns)
        # smooth every column separately; after the loop y_new still holds
        # the values of the last column
        for col in x.columns:
            y = x[col]
            spl = make_interp_spline(x_index, y, k=k)  # type: BSpline
            y_new = spl(x_new)
            df[col] = y_new
        # df.index = x_new
        if return_type == 'df':
            # NOTE(review): the frame filled above is replaced by a
            # single-column frame built from y_new (last column only)
            if datetime_index:
                df = pd.DataFrame(data=y_new, columns=['value'], index=time_range)
            else:
                df = pd.DataFrame(data=y_new, columns=['value'], index=x_new)
            return df
        if return_type == 'array':
            # grid and first smoothed column
            return np.array(df.index), np.array(df.iloc[:, 0])
        
    else:
        if isinstance(x, pd.Series):
            # a Series carries both axes: values are y, the index is x
            y = x.copy()
            x = x.index
        
        # n represents number of points to make between T.min and T.max
        x_new = np.linspace(x.min(), x.max(), n) 
    
        spl = make_interp_spline(x, y, k=k)  # type: BSpline
        y_new = spl(x_new)
    
        if return_type == 'dict':
            if datetime_index:
                ret_dict = {
                    'x': time_range,
                    'y': y_new
                    }
            else:
                ret_dict = {
                    'x': x_new,
                    'y': y_new
                    }
            return ret_dict
        elif return_type == 'array':
            if datetime_index:
                return time_range, y_new
            else:
                return x_new, y_new
        elif return_type == 'df':
            if datetime_index:
                df = pd.DataFrame(data=y_new, columns=['value'], index=time_range)
            else:
                df = pd.DataFrame(data=y_new, columns=['value'], index=x_new)
            return df

In [None]:
def axis_new_year(
        months=(1, 4, 7, 10),
        month_format='%b',
        year_format='%Y',
        add_year_axis=True,
        year_axis_pad=-0.105,
        language='eng',
        months_as_minor=False,
        months_pad=5,
        capitalize=True,
        ax=None):

    '''
    Change the date labels of a plot from plain datetimes (for example,
    '2021-01-01') to month labels with the years on a secondary axis:

    ---|-------|----- ... ---|--------|-------
      Jan     Feb           Dec      Jan
      2021                           2022

    /// IMPORTANT: the function calls locale.setlocale(locale.LC_ALL, ...),
        which changes the locale of the whole process. With language='rus'
        the locale stays 'ru_RU.UTF-8' afterwards; call set_location('EN')
        or reset_location('EN') after plt.show() to switch back.

    /// Also, the 'language' set by the last call applies to all axes of
        the plot.

    Arguments:
    months: month numbers that get a tick
    month_format: strftime format of the month labels
    year_format: strftime format of the year labels
    add_year_axis: add a secondary x-axis with one tick per year
    year_axis_pad: location passed to ax.secondary_xaxis
    language: 'eng' or 'rus'; any other value only prints a message
    months_as_minor: if True, month 1 is the major tick and the other
        months are minor ticks
    months_pad: pad of the minor tick labels (used with months_as_minor)
    capitalize: capitalize the month labels
    ax: axes to modify; the current axes when None
    '''

    # set ax
    if ax is None:
        ax = plt.gca()

    # specify 1st month for major ticks and other months for minor ticks
    if months_as_minor:
        # minor months - all except 1
        months_minor = [month for month in months if month != 1]
        months_major = 1
        loc_month_minor = mdates.MonthLocator(bymonth=months_minor)
    else:
        months_major = months

    # major ticks
    loc_month_major = mdates.MonthLocator(bymonth=months_major)
    # set format of months labels
    fmt_month = mdates.DateFormatter(month_format)

    # set month major ticks
    ax.xaxis.set_major_locator(loc_month_major)
    ax.xaxis.set_major_formatter(fmt_month)
    # set month minor ticks if necessary
    if months_as_minor:
        ax.xaxis.set_minor_locator(loc_month_minor)
        ax.xaxis.set_minor_formatter(fmt_month)
        ax.tick_params(axis='x', which='minor', pad=months_pad)

    # set secondary axis with major ticks as year
    if add_year_axis:
        second_xaxis = ax.secondary_xaxis(year_axis_pad)
        second_xaxis.xaxis.set_major_locator(mdates.YearLocator())
        second_xaxis.xaxis.set_major_formatter(
            mdates.DateFormatter(year_format))
        # hide the second x-axis spines and ticks; kept inside this branch
        # because the secondary axis only exists when add_year_axis is True
        second_xaxis.spines['bottom'].set_visible(False)
        second_xaxis.tick_params(bottom=False)

    # translate months if necessary
    if language=='eng':
        locale.setlocale(locale.LC_ALL,'en_US')
    elif language=='rus':
        locale.setlocale(locale.LC_ALL,'ru_RU.UTF-8')
    else:
        print("'language' have to be 'eng' or 'rus'")

    # capitalize months if necessary
    if capitalize:
        def capitalized_month(x, pos):
            # reuse one formatter instead of building a new one per tick
            return fmt_month(x, pos).capitalize()

        ax.xaxis.set_major_formatter(
            matplotlib.ticker.FuncFormatter(capitalized_month))
        ax.xaxis.set_minor_formatter(
            matplotlib.ticker.FuncFormatter(capitalized_month))

In [45]:
def federal_district_concat(data, column_name, federal_district_list):
    '''
    Join federal district names that are split over two consecutive rows.

    Raw:
    | column_name       | ...   |
    -----------------------------
    | Южный             | NaN   |
    -----------------------------  -> joined into the second row,
    | федеральный округ | 12345 |     the first row is dropped
    -----------------------------

    Result:
    ---------------------------------
    | Южный федеральный округ | 12345
    ---------------------------------

    Arguments:
    data: source DataFrame (a copy is processed and returned)
    column_name: column with region names
    federal_district_list: list with the first words of federal district
        names ('Южный', 'Центральный', 'Северо-Западный')

    The row following a match is addressed as label + 1.
    '''
    df = data.copy()

    # labels of the rows whose text was moved into the following row
    merged_labels = []
    for label in df.index:
        first_part = df.loc[label, column_name]
        if first_part not in federal_district_list:
            continue
        second_part = df.loc[label + 1, column_name]
        df.loc[label + 1, column_name] = first_part + ' ' + second_part
        merged_labels.append(label)

    return df.drop(merged_labels, axis=0)

In [None]:
def isna(df):
    '''
    Return the rows of df that contain at least one NaN.
    When there are none, print a message and return None.
    '''
    rows_with_nan = df[df.isna().any(axis=1)]
    if rows_with_nan.shape[0] > 0:
        return rows_with_nan
    print("There're no NaN values in DataFrame")

In [None]:
def is_nan(df):
    '''
    Select the rows of df holding a NaN in any column and return them.
    Prints a message and returns None when no such row exists.
    '''
    has_nan = df.isna().any(axis=1)
    selected = df[has_nan]
    if len(selected) == 0:
        print("There're no NaN values in DataFrame")
        return None
    return selected

In [None]:
def is_float(x):
    '''
    Returns True if x could be converted to float, False otherwise.

    Values float() rejects with TypeError (None, lists, ...) or
    OverflowError (integers too large for a float) also give False.
    '''
    try:
        float(x)
    except (ValueError, TypeError, OverflowError):
        return False
    return True

In [47]:
def to_float(x, errors=False):
    '''
    Convert x to Float if possible, otherwise return x unchanged.

    Arguments:
    x: value to convert
    errors: if True, print the kind of the conversion error and the value
    '''
    try:
        return float(x)
    except ValueError:
        # report before returning - a print after 'return' is never reached
        if errors:
            print(f'ValueError: {x}')
        return x
    except TypeError:
        if errors:
            print(f'TypeError: {x}')
        return x

In [None]:
def to_int(x, errors=False):
    '''
    Convert x to Int if possible, otherwise return x unchanged.

    Arguments:
    x: value to convert
    errors: if True, print the kind of the conversion error and the value
    '''
    try:
        return int(x)
    except ValueError:
        # report before returning - a print after 'return' is never reached
        if errors:
            print(f'ValueError: {x}')
        return x
    except TypeError:
        if errors:
            print(f'TypeError: {x}')
        return x

In [None]:
def to_string(x, errors=False):
    '''
    Convert x to String if possible, otherwise return x unchanged.

    Arguments:
    x: value to convert
    errors: if True, print the kind of the conversion error and the value
    '''
    try:
        return str(x)
    except ValueError:
        # report before returning - a print after 'return' is never reached
        if errors:
            print(f'ValueError: {x}')
        return x
    except TypeError:
        if errors:
            print(f'TypeError: {x}')
        return x

In [None]:
def sum_rows(data, column, values_list, value):
    '''
    Merge two rows into one: NaN cells of the first row are filled from the
    second row, the first row is renamed and the second row is dropped.

    values_list = [value1, value2]
    Order is critical: value1 identifies the row with NaNs, value2 the row
    that supplies the missing values.

    Input:
    | column | ... | ... | ... | ... | ... | ... | ..... |
    ------------------------------------------------------
    | value1 | NaN | NaN | NaN | NaN | NaN | 213 | 13123 |
    ------------------------------------------------------
    | value2 | 123 | 354 | 435 | 435 | 345 | 555 | 55555 |

    Result:
    | value  | 123 | 354 | 435 | 435 | 345 | 213 | 13123 |

    Arguments:
    data: source DataFrame (a copy is processed and returned)
    column: column in which value1 / value2 are looked up
    values_list: [value1, value2]
    value: new content of 'column' in the merged row
    '''
    df = data.copy()
    target_value, donor_value = values_list[0], values_list[1]
    # labels of the row to fill and of the row that provides the values
    target_labels = df[df[column] == target_value].index
    donor_labels = df[df[column] == donor_value].index
    # names of the columns that are NaN in the row to fill
    target_rows = df.loc[target_labels]
    columns_fill = target_rows.columns[target_rows.isna().any()].tolist()
    # copy the donor values into the NaN cells
    donor_values = df.loc[donor_labels, columns_fill].values
    df.loc[target_labels, columns_fill] = donor_values
    # rename the merged row and remove the donor row
    df.loc[target_labels, column] = value
    return df.drop(donor_labels, axis=0)

In [None]:
def plot_timemarker(
        text, x, y_text, y_line, delta, color_text=None, color_scatter='#AF4035',
        ha='left', weight='bold', size=8, show=None, ax=None, **kwargs):
    '''
    Mark a position on the x-axis with a point, a dotted vertical line and
    a text label.

    Arguments:
    text: label text
    x: x position of the marker
    y_text: y position of the point and of the label
    y_line: passed as 'ymax' to ax.axvline (the line starts at ymin=0)
    delta: horizontal offset of the label from x; its sign is flipped
        when ha == 'right'
    color_text: color of the label
    color_scatter: color of the point and of the line
    ha: horizontal alignment of the label
    weight, size: font weight and size of the label
    show: call plt.show() when truthy
    ax: axes to draw on; the current axes when None
    **kwargs: passed to ax.text
    '''
    if ha == 'right':
        delta = -delta

    if ax is None:
        ax = plt.gca()

    # point
    ax.scatter(
        x=x,
        y=y_text, color=color_scatter, s=5, zorder=6)
    # line
    ax.axvline(
        x=x,
        ymin=0, ymax=y_line, lw=0.85, ls=':',
        color=color_scatter, alpha=0.75, zorder=0)
    # text
    x_text = x + delta
    ax.text(
        x=x_text,
        y=y_text, s=text, ha=ha, va='center', weight=weight,
        size=size, color=color_text, alpha=1, **kwargs)

    # truthiness test: show=False must not open the figure
    if show:
        plt.show()

In [None]:
def plot_fill_between(x, y1, y2, color, alpha=0.1, ax=None, **kwargs):
    '''
    Fill the area between y1 and y2 without an edge line.

    Draws through plt when ax is None, otherwise on the given axes.
    Extra keyword arguments are passed to fill_between.
    '''
    # plt and an Axes expose fill_between with the same arguments
    target = plt if ax is None else ax
    target.fill_between(
        x, y1, y2,
        interpolate=True, color=color, ec='none', alpha=alpha, **kwargs)

In [None]:
def transform_pci(data):
    '''
    Convert a raw table into a numeric frame with the 'Год' row on top.

    The positions are hard-coded for one specific table layout: the first 2
    and the last 3 rows are cut, the first remaining row becomes the header,
    the row labelled 16 is dropped and the row labelled 17 is named 'Год'.
    NOTE(review): presumably a price index table with months as rows and
    years as columns - confirm against the source file.

    Arguments:
    data: raw DataFrame (a copy is processed)

    Returns:
    DataFrame of floats rounded to 1 decimal; the 'Год' row, originally
    last, is moved to the first position.
    '''
    df = data.copy()

    df = df[2:-3].copy()
    # the top-left cell becomes the name of the future index column
    df.iloc[0,0] = '-'
    df.columns = df.iloc[0, :]
    df = df[2:].copy()
    df.columns = [to_int(i) for i in df.columns]
    df = df.drop(16, axis=0)
    df.loc[17, '-'] = 'Год'
    df = df.set_index('-', drop=True)
    df.index.name = None
    # footnote marks are removed, decimal commas become points
    replace_dict = {'1)': '', '2)': '', ',': '.'}
    df = df.astype(str)
    for column in df.columns:
        for key, value in replace_dict.items():
            # regex=False: the keys are literal text; '1)' is not a valid
            # regular expression and pandas < 2.0 treats patterns as regex
            df[column] = df[column].str.replace(key, value, regex=False)
    df = df.astype(float)
    df = df.round(1)
    # extract last row with 'Год'
    first_row = df.iloc[-1].to_frame().T
    # add it as first row
    df = pd.concat([first_row, df], axis=0)
    # remove the now duplicated last row
    df = df.iloc[:-1]

    return df

In [None]:
def pci_months_data_preparation(data, years, drop_first_rows=0):
    '''
    Stack the columns year1..year2 of data into one long column and build
    matching tick positions and date labels.

    Arguments:
    data: DataFrame whose columns are labelled by year
    years: (year1, year2), both included
    drop_first_rows: number of leading rows to skip before stacking

    Returns:
    (df, xticks, labels):
        df - single-column frame with the stacked values
        xticks - list of the positional index of df
        labels - strings '01-MM-YYYY' for the 12 months of every year in
                 year1..year2
    '''
    year1, year2 = years[0], years[1]
    selected = data.copy().loc[:, year1:year2][drop_first_rows:].copy()
    # melt stacks the columns; the first melted column (variable) is dropped
    stacked = selected.melt().iloc[:, 1:].reset_index(drop=True)
    xticks = stacked.index.tolist()
    labels = [
        '01-' + str(month).zfill(2) + '-' + str(year)
        for year in range(year1, year2 + 1)
        for month in range(1, 13)
    ]
    return stacked, xticks, labels

In [None]:
def plot_pci_curve(data, step=6, ylim=(90, 140), rotation=45, ax=None, **plot_kwargs):
    '''
    Plot a smoothed curve with every step-th tick labelled.

    Arguments:
    data: sequence (values, xticks, labels); values are smoothed with
        smoothed(..., return_type='array') before plotting
    step: keep every step-th tick and label
    ylim: y-axis limits
    rotation: rotation of the tick labels
    ax: axes to draw on; plt is used when None
    **plot_kwargs: passed to plot
    '''
    curve = smoothed(data[0], return_type='array')
    if ax is not None:
        ax.plot(*curve, **plot_kwargs)
        ax.set_xticks(ticks=data[1][::step], labels=data[2][::step], rotation=rotation)
        ax.set_ylim(ylim)
        return
    plt.plot(*curve, **plot_kwargs)
    plt.xticks(ticks=data[1][::step], labels=data[2][::step], rotation=rotation)
    plt.ylim(ylim)

In [None]:
def saveit_excel(data, name, path, sheet):
    '''
    Write data to the sheet 'sheet' of the workbook '<path><name>.xlsx'.
    An existing workbook at that location is overwritten.

    Arguments:
    data: DataFrame to save
    name: file name without extension
    path: directory prefix, concatenated with name as is
    sheet: sheet name

    NOTE(review): a second definition of saveit_excel further down in this
    file replaces this one when both cells are run.
    '''
    path_ = path+name+'.xlsx'
    # Passing the path lets pandas open, save and close the workbook itself.
    # The previous version created an ExcelWriter that was never closed, and
    # passed float_format=',' which is not a valid %-format for float cells.
    data.to_excel(
        excel_writer=path_,
        sheet_name=sheet
    )

In [None]:
def align_center(x):
    '''
    Return one 'text-align: center' CSS declaration per element of x.
    '''
    return ['text-align: center' for _ in x]

In [None]:
def saveit_excel(data, filename, path, sheet):
    '''
    Save data to the sheet 'sheet' of the workbook '<path>/<filename>.xlsx'.

    If the workbook already exists, the sheet is added to it (a sheet with
    the same name is replaced); otherwise a new workbook is created.

    Arguments:
    data: DataFrame to save
    filename: file name without extension
    path: target directory; created (with parents) if missing
    sheet: sheet name
    '''
    if path:
        # makedirs: nested target directories must be creatable as well
        os.makedirs(path, exist_ok=True)
    # join instead of '+', so that 'path' works with or without a trailing
    # separator
    path_ = os.path.join(path, filename + '.xlsx')
    # if such file exist - append to it new sheet
    if os.path.exists(path_):
        with pd.ExcelWriter(
            path_,
            mode="a",
            engine="openpyxl",
            if_sheet_exists="replace",
        ) as writer:
            data.to_excel(
                excel_writer=writer,
                sheet_name=sheet
            )
        # reported only after the writer has saved and closed the workbook
        print(f"'{sheet}' sheet created in file '{filename}.xlsx'")
    # if not exist - create new .xlsx
    else:
        data.to_excel(
            excel_writer=path_,
            sheet_name=sheet
        )
        print(f"File '{filename}' created")

In [None]:
def loadit(name, dir='files', type='.pkl'):
    '''
    Load a pickled object from '<dir>/<name><type>' with pd.read_pickle.

    Arguments:
    name: file name without extension
    dir: directory of the file, with or without a trailing separator
    type: file extension including the dot

    NOTE: unpickling can execute arbitrary code - load trusted files only.
    '''
    # join instead of plain concatenation: with the default dir='files' the
    # path used to become 'files<name>.pkl', while saveit() writes to
    # 'files/<name>.pkl'
    return pd.read_pickle(os.path.join(dir, f'{name}{type}'))

In [None]:
def saveit(file, name, dir='files'):
    '''
    Pickle an object to '<dir>/<name>.pkl'.

    Arguments:
    file: object to pickle
    name: file name without extension
    dir: target directory; created (with parents) if missing
    '''
    # check if dir exists and create it if not (including parent directories)
    os.makedirs(dir, exist_ok=True)
    # save file; the context manager closes the handle even if dump() fails
    with open(f'{dir}/{name}.pkl', 'wb') as filehandler:
        pickle.dump(file, filehandler)
    print(f"File '{name}' saved in directory '{dir}'")

In [None]:
def savefig(name, dir='img', format='png', dpi=100, transparent=True, figure=None, **kwargs):
    '''
    Save a figure to '<dir>/<name>.<format>' with a tight bounding box.

    Arguments:
    name: file name without extension
    dir: target directory ('img' when None); created with parents if missing
    format: image format, also used as file extension
    dpi: resolution
    transparent: save with a transparent background
    figure: figure to save
    **kwargs: passed to figure.savefig
    '''
    if figure is None:
        # NOTE(review): falls back to a module-level variable named 'fig';
        # raises NameError when no such variable exists
        figure = fig
    if dir is None:
        dir = 'img'
    # check if dir exists and create it if not (including parent directories)
    os.makedirs(dir, exist_ok=True)
    figure.savefig(
        f'{dir}/{name}.{format}',
        transparent=transparent,
        bbox_inches='tight',
        dpi=dpi,
        format=format,
        **kwargs
    )
    print(f"Image '{name}.{format}' successfully saved into '{dir}' directory")

In [None]:
def remove_duplicated_whitespaces(x):
    '''
    Convert x to a string, collapse every run of whitespace into a single
    space and strip leading / trailing whitespace.
    '''
    words = str(x).split()
    return ' '.join(words)

In [None]:
def reshape_series(series, type='feature'):
    '''
    Return the values of a Series as a 2-D array.

    type='feature' - one column, shape (-1, 1)
    type='sample'  - one row, shape (1, -1)
    Any other type prints a message and returns None.
    '''
    if type == 'feature':
        shape = (-1, 1)
    elif type == 'sample':
        shape = (1, -1)
    else:
        print("'type' argument must be 'feature' or 'sample'")
        return None
    return series.values.reshape(shape)

In [None]:
def replace_indict(x, replace_dict, replacement_type='full'):
    '''
    Replace x, or parts of x, according to replace_dict {old: new}.

    replacement_type:
        'full' - x is replaced only when it equals a key
        'part' - every occurrence of a key inside x is replaced
        'both' - x is replaced when it equals a key, otherwise occurrences
                 of the key inside x are replaced
    Keys are applied one after another in dict order, each to the result of
    the previous step. Any other replacement_type returns x unchanged.
    '''
    for old, new in replace_dict.items():
        if replacement_type == 'part':
            x = x.replace(old, new)
        elif replacement_type in ('full', 'both'):
            if x == old:
                x = new
            elif replacement_type == 'both':
                x = x.replace(old, new)
    return x

In [None]:
def transform_indexes(data):
    '''
    Turn a cut raw table into a transposed numeric frame.

    Argument - cut DataFrame that contains a column 'Unnamed: 1' (dropped),
    the header in its first row and the row names in its first column.

    Returns the transposed frame as floats rounded to 1 decimal; the former
    row names are lower-cased and become the columns.
    '''
    df = data.copy().drop('Unnamed: 1', axis=1)
    # the top-left cell names the future index column
    df.iloc[0, 0] = 'idx'
    # header row without duplicated whitespaces becomes the column names
    df.iloc[0, :] = df.iloc[0, :].map(remove_duplicated_whitespaces)
    df.columns = df.iloc[0, :]

    body = df[1:].copy()
    body['idx'] = body['idx'].map(remove_duplicated_whitespaces)
    body = body.set_index('idx', drop=True)
    body.columns.name = None
    body.index.name = None
    body.index = [str(label).lower() for label in body.index]

    return body.T.astype(float).round(1)

In [None]:
def reset_location(loc='EN'):
    '''
    Set the process locale (LC_ALL): 'EN' -> 'en_US', 'RU' -> 'ru_RU.UTF-8'.
    Any other value only prints a message.
    '''
    if loc == 'EN':
        target = 'en_US'
    elif loc == 'RU':
        target = 'ru_RU.UTF-8'
    else:
        print("Location have to be 'EN' or 'RU'")
        return
    locale.setlocale(locale.LC_ALL, target)

In [None]:
def set_location(loc='EN'):
    '''
    Switch the process locale (LC_ALL) to English ('EN' -> 'en_US') or
    Russian ('RU' -> 'ru_RU.UTF-8'). Other values print a message and
    leave the locale untouched.
    '''
    if loc not in ('EN', 'RU'):
        print("Location have to be 'EN' or 'RU'")
        return
    locale_name = 'en_US' if loc == 'EN' else 'ru_RU.UTF-8'
    locale.setlocale(locale.LC_ALL, locale_name)

In [None]:
def plot_index_product(data_list, product_names):
    '''
    Stack one product column from several frames into a single time series.

    Arguments:
    data_list: list with df

          | product
    --------------
    month | index_value

        The index values are strings in the format '%B %Y'
        (month name and year).

    product_names: list with the name of the product column in every
        dataset, for example 'овощи т'; if the name is the same in all
        datasets, a single-element list is enough, for example ['овощи т']

    Returns:
    DataFrame with one column, named after the last entry of product_names,
    indexed by datetime (parsed with the current locale's month names).
    '''
    if len(product_names) == 1:
        product_names = product_names * len(data_list)
    # concatenating the Series themselves gives one long Series regardless
    # of their names; a single concat also avoids re-copying on every step
    columns = [df[name] for df, name in zip(data_list, product_names)]
    plot_data = pd.concat(columns, axis=0).to_frame(name=product_names[-1])
    plot_data.index = [dt.datetime.strptime(i, '%B %Y') for i in plot_data.index]

    return plot_data

In [None]:
def is_equal(data1, data2):
    '''
    Print 'Equal' when data1.equals(data2); otherwise display the rows of
    data1 that do not occur as rows in data2.
    '''
    if data1.equals(data2):
        print('Equal')
        return
    # compare whole rows as tuples
    rows1 = data1.apply(tuple, 1)
    rows2 = data2.apply(tuple, 1)
    display(data1[~rows1.isin(rows2)])

In [None]:
def transform_prices_9_6(data_raw, month, year, insert_column=False):
    '''
    Reduce a raw table to a single row of values labelled '<month> <year>'.

    Only the rows labelled 2 (names, used as header) and 4 (values) are
    kept, and the first column is cut off.

    Arguments:
    data_raw: raw DataFrame (a copy is processed)
    month, year: strings that form the row label
    insert_column: if True, a copy of the first column is inserted in
        front, so that cutting the first column keeps the original one
    '''
    df = data_raw.copy()
    if insert_column:
        df.insert(loc=0, column='delete', value=df.iloc[:, 0].copy())
    # keep the header row and the value row, without the first column
    df = df.loc[[2, 4], :].iloc[:, 1:].copy()
    # header without duplicated whitespaces
    df.loc[2] = df.loc[2].map(remove_duplicated_whitespaces)
    # first column: name of the index column and the label of the value row
    df.iloc[0, 0] = 'idx'
    df.iloc[1, 0] = ' '.join((month, year))
    # the row labelled 2 becomes the header
    df.columns = df.loc[2]
    df = df.drop(2)
    df.columns.name = None
    # the 'idx' column becomes the index
    df = df.set_index('idx', drop=True)
    df.index.name = None

    return df

In [None]:
def last_row_to_first(data):
    '''
    Return a copy of the DataFrame in which the last row has been moved to
    the first position; the order of the other rows is kept.
    '''
    # the last row as a one-row frame
    moved_row = data.iloc[-1].to_frame().T
    # put it on top, then cut the original last row off the bottom
    stacked = pd.concat([moved_row, data], axis=0)
    return stacked.iloc[:-1].copy()

In [None]:
def cut_rows(data, start, end):
    '''
    Cut 'start' rows at the beginning of the DataFrame and 'end' rows at
    the end of the DataFrame; all columns are kept.

    Arguments:
    data: source DataFrame
    start: number of leading rows to remove
    end: number of trailing rows to remove (0 removes none)

    Returns a copy of the remaining rows.
    '''
    # -0 would be an empty slice, so 'no rows cut at the end' needs None
    stop = None if end == 0 else -end
    # plain row slice: no star-unpacking inside the subscript, which is
    # only valid syntax on Python 3.11+
    return data.iloc[start:stop].copy()

In [None]:
def replace_with_dict(x, replace_dict):
    '''
    Return string x in which every occurrence of each replace_dict key has
    been replaced by the corresponding value (keys applied in dict order).
    '''
    for old, new in replace_dict.items():
        x = x.replace(old, new)
    return x

In [None]:
def transform_indexes_11(data, cut_rows_start, cut_rows_end):
    '''
    Convert a raw table into a numeric frame with integer column labels and
    the 'Год' row on top.

    Steps: cut the leading / trailing rows, drop the row whose first cell
    is 'К предыдущему месяцу', use the first remaining row as header, name
    the last row 'Год' and move it to the top, strip the footnote marks
    '1)'..'8)' from the header and convert it to int, convert the values to
    float rounded to 1 decimal.

    Arguments:
    data: raw DataFrame
    cut_rows_start: number of rows to cut at the beginning
    cut_rows_end: number of rows to cut at the end
    '''
    df = cut_rows(data, start=cut_rows_start, end=cut_rows_end)
    # first column without duplicated whitespaces
    df.iloc[:, 0] = [remove_duplicated_whitespaces(cell) for cell in df.iloc[:, 0]]
    # the sub-header row is expected exactly once (.item())
    subheader_label = df[df.iloc[:, 0] == 'К предыдущему месяцу'].index.item()
    df = df.drop(subheader_label, axis=0)
    # first row becomes the header, its first cell names the index column
    df.iloc[0, 0] = 'idx'
    df.columns = df.iloc[0, :]
    df.columns.name = None
    df = df.iloc[1:, :].copy()
    # the last row holds the yearly value: name it and move it to the top
    df.iloc[-1, 0] = 'Год'
    df = last_row_to_first(df).set_index('idx', drop=True)
    df.index.name = None
    # header: text -> footnote marks removed -> integer labels
    footnote_marks = {f'{number})': '' for number in range(1, 9)}
    df.columns = [
        int(float(replace_with_dict(str(label), footnote_marks)))
        for label in df.columns
    ]

    return df.astype(float).round(1)

In [38]:
def transform_to_float(x):
    '''
    Return float(x); if the conversion raises ValueError, return 'x' unchanged
    '''
    try:
        return float(x)
    except ValueError:
        return x

In [39]:
def transform_round(x, scale=1):
    '''
    Round 'x' to 'scale' decimals; strings are returned untouched
    '''
    if isinstance(x, str):
        return x

    return round(x, scale)

In [None]:
def industry_indexes_product_11_11(data, cut_rows_start, cut_rows_end):
    '''
    Clean a raw table and return it transposed, as floats rounded to
    2 decimals: rows are cut with cut_rows, the second column is dropped,
    the first row becomes the header and the first column the index.
    Footnote markers '1)' ... '8)' are removed and decimal commas are
    replaced by dots before the conversion to float.
    '''
    # regex patterns for the footnote markers, each mapped to ''
    footnotes = {rf'{n}\)': '' for n in range(1, 9)}

    table = cut_rows(data, start=cut_rows_start, end=cut_rows_end)
    table = table.drop(table.columns[1], axis=1)
    # top-left cell names the future index column
    table.iloc[0, 0] = 'idx'
    # clean the first column ...
    table.iloc[:, 0] = [
        remove_duplicated_whitespaces(cell) for cell in table.iloc[:, 0]]
    table.iloc[:, 0] = table.iloc[:, 0].replace(footnotes, regex=True)
    # ... and the first row the same way
    table.iloc[0, :] = [
        remove_duplicated_whitespaces(cell) for cell in table.iloc[0, :]]
    table.iloc[0, :] = table.iloc[0, :].replace(footnotes, regex=True)
    # first row as header, 'idx' column as index
    table.columns = table.iloc[0, :]
    table.columns.name = None
    table = table.iloc[1:, :].copy().set_index('idx', drop=True)
    table.index.name = None
    # decimal comma -> dot, strip footnote markers left in the values
    table = table.replace(',', '.', regex=True).replace(footnotes, regex=True)

    return table.astype(float).round(2).T

In [None]:
def months_translate(x, kind='rus-eng', add_year=None, capitalize=True):
    '''
    Translate a month name between Russian and English
    'январь' --> 'January'

    x: month name (any letter case)
    kind: 'rus-eng' (Russian to English) or 'eng-rus' (English to Russian)
    add_year: if not None, appended after a space:
        add_year==2021: 'январь' --> 'January 2021'
    capitalize: if False the result stays lowercase: 'январь' --> 'january'

    Raises ValueError if 'kind' is not supported or 'x' is not a month name
    (previously both cases ended in an UnboundLocalError).
    '''
    rus_eng = {
        'январь': 'january',
        'февраль': 'february',
        'март': 'march',
        'апрель': 'april',
        'май': 'may',
        'июнь': 'june',
        'июль': 'july',
        'август': 'august',
        'сентябрь': 'september',
        'октябрь': 'october',
        'ноябрь': 'november',
        'декабрь': 'december'
    }
    if kind == 'rus-eng':
        replace_dict = rus_eng
    elif kind == 'eng-rus':
        # the reverse mapping is derived so both directions cannot diverge
        replace_dict = {eng: rus for rus, eng in rus_eng.items()}
    else:
        raise ValueError("'kind' must be 'rus-eng' or 'eng-rus'")

    # lookup is case-insensitive
    try:
        x_new = replace_dict[x.lower()]
    except KeyError:
        raise ValueError(
            f"{x!r} is not a month name for kind={kind!r}") from None

    if capitalize:
        x_new = x_new.capitalize()

    if add_year is not None:
        x_new = x_new + ' ' + str(add_year)

    return x_new

In [None]:
def to_date(x, kind='%B %Y', translate=False):
    '''
    String to Date: parse 'x' with datetime.strptime using format 'kind'.

    If translate is True, every whitespace-separated word of 'x' that is
    a Russian month name (lowercase or capitalized) is first replaced by
    its capitalized English name; words are re-joined with single spaces.
    '''
    lowercase_months = [
        'январь', 'февраль', 'март', 'апрель', 'май', 'июнь', 'июль',
        'август', 'сентябрь', 'октябрь', 'ноябрь', 'декабрь',
    ]
    # both spellings are recognized: 'январь' and 'Январь'
    months_list = lowercase_months + [m.capitalize() for m in lowercase_months]

    if translate:
        words = [
            months_translate(word, kind='rus-eng', capitalize=True)
            if word in months_list else word
            for word in x.split()]
        x = ' '.join(words)

    return dt.datetime.strptime(x, kind)

In [None]:
def normalized(data, reshape=True, return_type='df'):
    '''
    Scale 'data' with MinMaxScaler().fit_transform
    (NOTE(review): presumably sklearn's MinMaxScaler - confirm the import).

    data: Series, DataFrame or anything np.array() accepts
    reshape: if True, data is first turned into a single column
        (np.array(data).reshape(-1, 1))
    return_type:
        'df' - DataFrame; carries the index of 'data' when it is a
            Series/DataFrame, otherwise a default index
        'array' - the scaler output as is
        anything else - a message is printed and None is returned
    '''
    # plain arrays/lists have no index; None lets pandas build a default one
    # (the name used to be unbound here, raising NameError for such inputs)
    if isinstance(data, (pd.Series, pd.DataFrame)):
        idxs = data.index.copy()
    else:
        idxs = None
    if reshape:
        data = np.array(data).reshape(-1, 1)
    data_new = MinMaxScaler().fit_transform(data)
    if return_type == 'df':
        data_new = pd.DataFrame(data=data_new, index=idxs)
    elif return_type == 'array':
        pass
    else:
        print("return_type must be 'df' or 'array'")
        return None

    return data_new

In [None]:
def np_index(array, value):
    '''
    Return index (along the first axis) of the first element of Array
    equal to Value; IndexError if there is no such element
    '''
    first_axis_hits = np.where(array == value)[0]

    return first_axis_hits[0]

In [None]:
def normalized_by_first(data, return_type='df'):
    '''
    Divide every value of 'data' by its first value:
        first_value -> first_value / first_value
        second_value -> second_value / first_value
        third_value -> third_value / first_value

    return_type: 'df', 'series' (both reuse data.index), 'array' or 'list';
    for any other value a message is printed and the plain list is returned
    '''
    base = list(data)[0]
    data_new = [(item/base) for item in data]

    if return_type == 'df':
        return pd.DataFrame(data=data_new, index=data.index)
    if return_type == 'series':
        return pd.Series(data=data_new, index=data.index)
    if return_type == 'array':
        return np.array(data_new)
    if return_type == 'list':
        return list(data_new)

    print("'return_type' must be 'df', 'series', 'array', 'list'")
    return data_new

In [None]:
def load_session(name, dir='sessions'):
    '''
    Load a session file with dill.load_session.

    The path is 'sessions/<name>' by default and
    'sessions/<dir>/<name>' when another 'dir' is given.
    '''
    folder = 'sessions/' if dir == 'sessions' else f'sessions/{dir}/'
    # load session
    dill.load_session(folder+name)

In [2]:
def remove_charachters(string_var, chars, where='start'):
    '''
    Remove characters from a string:
        where='start' - str.lstrip(chars): leading characters that occur
            in 'chars' (treated as a set of characters, not as a prefix)
        where='end' - str.rstrip(chars): same for trailing characters
        where='inside' - every occurrence of the substring 'chars'
    Any other 'where' returns the string unchanged.
    '''
    if where == 'start':
        return string_var.lstrip(chars)
    if where == 'end':
        return string_var.rstrip(chars)
    if where == 'inside':
        return string_var.replace(chars, '')

    return string_var

In [3]:
def transform_first_rows(df, header_loc=0, rows_drop=2):
    '''
    Make row 'header_loc' (cast to str) the column labels and drop the rows
    labelled 0 .. rows_drop-1.

    Written for tables where:
        the first row holds the years
        the next few rows have to be dropped
        the first two columns are 'Регион' (region) and 'Код' (code)

    Note: the new column labels are assigned on the passed DataFrame
    itself; the returned frame is the one without the dropped rows.
    '''
    header = df.loc[header_loc].astype(str)
    df.columns = header
    leading_rows = list(np.arange(rows_drop))

    return df.drop(leading_rows)

In [4]:
def merge_columns(x):
    '''
    Join the non-null values of Series 'x' (as strings) with commas
    '''
    present = x[x.notnull()]

    return ','.join(present.astype(str))

In [5]:
def merge_columns_similar_names(df):
    '''
    Group the columns of 'df' by label and collapse every group into one
    column: for each row the non-null values of the group are joined with
    merge_columns.

    NOTE(review): groupby(..., axis=1) is deprecated in recent pandas
    releases - confirm against the pandas version this notebook runs on.
    '''
    def _merge_group(group):
        # row by row: join the values of all same-named columns
        return group.apply(merge_columns, axis=1)

    same_name_groups = df.groupby(level=0, axis=1)

    return same_name_groups.apply(_merge_group)

In [6]:
def merge_two(how, data, col1, col2, col_new):
    '''
    Replace two columns (how='columns') or two rows (how='index') by their
    sum, stored under the label 'col_new' at the position of 'col1'.

    NaN and '-' are replaced by 0 in the whole table before summing.
    For any other 'how' the function returns None.
    '''
    # rows are handled as columns of the transposed copy
    df = data.copy().T if how == 'index' else data.copy()
    df = df.fillna(0).replace('-', 0)

    # the sum first lands in a column of its own ...
    df[col_new] = df[col1] + df[col2]
    # ... which is then moved to where col1 stands
    merged = df.pop(col_new)
    df.insert(df.columns.get_loc(col1), col_new, merged)

    df = df.drop([col1, col2], axis=1)

    if how == 'index':
        return df.T
    if how == 'columns':
        return df
    return None

In [7]:
def drop_code_column(df):
    '''
    Return 'df' without the column 'Код'
    '''
    return df.drop(columns=['Код'])

In [8]:
def transpose_data(df):
    '''
    Transpose data from the layout:
    Регион   | 2016   | 2017    | etc.
    ----------------------------------
    РФ       | 1234   | 12312   | etc.
    Белгород | 234234 | 2343242 | etc.

    to the layout:

         | РФ     | Белгород | etc.
    ----------------------------------
    2016 | 1234   | 234234   | etc.
    2017 | 12312  | 2343242  | etc.

    The 'Регион' column supplies the new column labels; index and columns
    of the result carry no name.
    '''
    flipped = df.T
    # the transposed 'Регион' row holds the region names
    header = flipped.loc['Регион']
    flipped = flipped.drop('Регион')
    flipped.columns = header
    flipped.index.name = None
    flipped.columns.name = None

    return flipped

In [9]:
def transform_population_growth(data_raw, regions=None, unit=None):
    '''
    Reshape a raw population-growth table into a years x regions table.

    data_raw: raw table; row 0 holds year labels like '2016 г.'
        (NaN cells of that row are left as they are); 'Регион' and 'Код'
        columns are expected after the header is applied
    regions: optional column selection applied to the result
    unit: stored as the name of the columns axis

    Empty strings and NaN in the result are replaced by '-'.
    '''
    df = data_raw.copy()
    # change '2016 г.' to 2016 in all non-NaN cells of row 0
    loc_c = ~df.loc[0].isna()
    df.loc[0, loc_c] = df.loc[0, loc_c].apply(
        lambda x: int(remove_charachters(x, ' г.', 'end')))
    # row 0 becomes the header, the first two rows are dropped
    df = transform_first_rows(df)
    # the code column is not needed
    # (an unused 'Регион'/'Код' lookup table used to be built here)
    df = drop_code_column(df)
    # regions to columns, years to rows
    df = transpose_data(df)
    # keep only the requested regions
    if regions is not None:
        df = df[regions]
    # unit
    df.columns.name = unit
    # empty values and NaN are shown as '-'
    df = df.replace('', '-').fillna('-')

    return df

In [10]:
def transform_population_town(data_raw, regions=None, unit=None):
    '''
    Reshape a raw town-population table into a years x regions table.

    data_raw: raw table; row 0 holds labels like 'январь 2016 г.', of
        which the characters [-7:-3] (the year) are kept (NaN cells of
        that row are left as they are); 'Регион' and 'Код' columns are
        expected after the header is applied
    regions: optional column selection applied to the result
    unit: stored as the name of the columns axis

    Empty strings and NaN in the result are replaced by '-'.
    '''
    df = data_raw.copy()
    # change 'январь 2016 г.' to '2016' in all non-NaN cells of row 0
    loc_c = ~df.loc[0].isna()
    df.loc[0, loc_c] = df.loc[0, loc_c].apply(lambda x: x[-7:-3])
    # row 0 becomes the header, only that row is dropped
    df = transform_first_rows(df, rows_drop=1)
    # the code column is not needed
    # (an unused 'Регион'/'Код' lookup table used to be built here)
    df = drop_code_column(df)
    # regions to columns, years to rows
    df = transpose_data(df)
    # keep only the requested regions
    if regions is not None:
        df = df[regions]
    # unit
    df.columns.name = unit
    # empty values and NaN are shown as '-'
    df = df.replace('', '-').fillna('-')

    return df

In [11]:
def transform_data_demography(
        data, federal_districts_names_list, regions_names_list,
        regions_replace_dict, regions_drop_list,
        drop_end_rows=1, drop_locs=None):
    '''
    Clean a raw demography table and return it indexed by region.

    data: raw table; must contain a column 'Unnamed: 1' (dropped) and a
        row labelled 0 that serves as the header
    federal_districts_names_list: region names to exclude
    regions_names_list: only regions from this list stay in the result
    regions_replace_dict: mapping for renaming regions
    regions_drop_list: further region names to exclude
    drop_end_rows: number of rows to cut from the end (0 - cut nothing)
    drop_locs: optional row labels to drop

    Header cells are shortened to their first 4 characters and passed
    through to_int (defined elsewhere in the file).
    '''
    # -0 == 0 as a slice stop would remove every row, so "cut nothing"
    # needs an open-ended slice
    stop = -drop_end_rows if drop_end_rows else None
    df = data.iloc[:stop].copy()
    if drop_locs:
        df = df.drop(drop_locs, axis=0)
    df = df.drop('Unnamed: 1', axis=1)
    # header row: keep the first 4 characters, name the first cell
    df.iloc[0] = [str(i)[0:4] for i in df.iloc[0]]
    df.iloc[0, 0] = 'Регион'
    df.columns = df.iloc[0, :]
    df.columns = [to_int(i) for i in df.columns]
    df = df.drop(0, axis=0)
    # region names: strip, filter, rename
    df['Регион'] = [i.strip() for i in df['Регион']]
    df = df[~df['Регион'].isin(federal_districts_names_list)].copy()
    df = df[~df['Регион'].isin(regions_drop_list)].copy()
    df['Регион'] = df['Регион'].replace(regions_replace_dict)
    df = df.set_index('Регион', drop=True)
    df.index.name = None
    df = df[df.index.isin(regions_names_list)].copy()

    return df

In [12]:
def transform_data_income(
        data, first_rows_to_drop=3, header_loc1=2, header_loc2=3, drop_first_columns=4):
    '''
    Clean a raw income table: build a two-level column header
    ('Год', 'Квартал'), use the first column as index and merge the
    duplicated Far-Eastern/Siberian district and region rows.

    data: raw table
    first_rows_to_drop: not used by the function (kept for compatibility)
    header_loc1: label of the row with the years (cells like '2016 год')
    header_loc2: label of the row with the second header level; cells
        containing 'год' are normalized to 'Год'
    drop_first_columns: number of leading ROWS (labels 0..n-1) dropped
        after the header is built

    Relies on the module-level name federal_districts_names_list_xlsx.
    '''
    df = data.copy()
    df = df.fillna('-')

    # cells of the year row that actually hold a value
    # (the row label used to be hard-coded as 2 instead of header_loc1)
    loc_c = (~(df.loc[header_loc1] == '-'))

    # '2016 год' -> 2016; some files mark years as '2016 год**'
    try:
        df.loc[header_loc1, loc_c] = \
            df.loc[header_loc1, loc_c].apply(lambda x: int(remove_charachters(x, ' год', 'end')))
    except ValueError:
        df.loc[header_loc1, loc_c] = \
            df.loc[header_loc1, loc_c].apply(lambda x: int(remove_charachters(x, ' год**', 'end')))

    for i in df.columns:
        if 'год' in df.loc[header_loc2, i]:
            df.loc[header_loc2, i] = 'Год'

    # spread every year over the '-' cells that follow it
    # remember j as '-' for loop start
    j = '-'
    # go through row by column
    for i in df.columns:
        # if value in row not '-'
        if df.loc[header_loc1, i] != '-':
            # remember this value in var j
            j = df.loc[header_loc1, i]
        else:
            # if value in row == '-', replace it by remembered value in j
            df.loc[header_loc1, i] = j

    # create list of tuples for multi index columns
    multi_index_columns = []
    for i in df.columns:
        col1 = str(df.loc[header_loc1, i])
        col2 = str(df.loc[header_loc2, i])
        multi_index_columns.append((col1, col2))

    df.columns=pd.MultiIndex.from_tuples(multi_index_columns, names=['Год', 'Квартал'])
    df = df.drop(range(drop_first_columns), axis=0)

    # first column as index
    df.index = df.iloc[:, 0]
    df.index.name = None
    # the first column's label is a tuple; dropping its level-0 part
    df = df.drop(df.iloc[:, 0].name[0], axis=1)
    # drop last two rows
    df = df.iloc[:-2, :]
    # managing rows with similar names
    df = merge_two(
        how='index',
        data=df,
        col1='Дальневосточный федеральный округ (до 2018г.)',
        col2='Дальневосточный федеральный округ (с 2018г.)',
        col_new='Дальневосточный федеральный округ'
    )
    df = merge_two(
        how='index',
        data=df,
        col1='Сибирский федеральный округ (до 2018г.)',
        col2='Сибирский федеральный округ (с 2018г.)',
        col_new='Сибирский федеральный округ'
    )
    df = merge_two_similar('index', df, 'Забайкальский край')
    df = merge_two_similar('index', df, 'Республика Бурятия')
    # drop federal districts
    df = df[~df.index.isin(federal_districts_names_list_xlsx)]

    return df

In [35]:
def normalize_nenec(data, column):
    '''
    If more than one row has 'ОКТМО' == '11', write
    'Ненецкий автономный округ (Архангельская область)' into 'column'
    of the last such row. Works on a copy of 'data'.
    '''
    df = data.copy()

    matches = df[df['ОКТМО'] == '11']

    if len(matches) > 1:
        last_label = matches.index[-1]
        df.loc[last_label, column] = \
            'Ненецкий автономный округ (Архангельская область)'

    return df

In [36]:
def normalize_khantymanci(data, column):
    '''
    Disambiguate the rows sharing 'ОКТМО' == '71' by writing explicit
    region names into 'column' (on a copy of 'data'):

        2 such rows: the last one becomes
            'Ханты-Мансийский автономный округ - Югра (Тюменская область)'
        3 such rows, in order:
            'Тюменская область',
            'Ханты-Мансийский автономный округ - Югра (Тюменская область)',
            'Ямало-Ненецкий автономный округ (Тюменская область)'

    Any other number of matching rows leaves the data unchanged.

    The names are written to 'column' (as normalize_nenec does); the
    target used to be hard-coded as 'index0' regardless of the argument.
    '''
    df = data.copy()

    labels = df.index[df['ОКТМО'] == '71']

    if len(labels) == 2:
        df.loc[labels[-1], column] = \
            'Ханты-Мансийский автономный округ - Югра (Тюменская область)'
    elif len(labels) == 3:
        names = (
            'Тюменская область',
            'Ханты-Мансийский автономный округ - Югра (Тюменская область)',
            'Ямало-Ненецкий автономный округ (Тюменская область)',
        )
        for label, name in zip(labels, names):
            df.loc[label, column] = name

    return df

In [13]:
def merge_two_similar(how, data, col):
    '''
    Collapse all columns (how='column' / 'columns') or rows (how='index')
    that share the label 'col' into a single one holding their sum,
    placed where the first duplicate stood.

    'columns' is accepted as an alias of 'column' so the argument matches
    merge_two; any other 'how' still returns None.
    'col' has to occur more than once (df[col] must be a DataFrame).
    '''
    df = data.copy()
    # rows are handled as columns of the transposed copy
    if how == 'index':
        df = df.T
    # values for new col as sum of all duplicate columns
    new_col = df[col].sum(axis=1)
    # column labels without duplicates, first occurrence order kept
    new_col_names = list(dict.fromkeys(df.columns))
    # drop all duplicate columns
    df = df.drop(col, axis=1)
    # create new column with sum of all duplicate columns
    df[col] = new_col
    # restore the order: the new column takes the place of the
    # first duplicate
    df = df[new_col_names].copy()

    if how == 'index':
        return df.T
    if how in ('column', 'columns'):
        return df
    return None

In [14]:
def get_data_two_level(data, level0=None, level1=None, indexes=None, kind='column'):
    '''
    Select data from a table with a two-level MultiIndex on its columns
    (kind='column') or on its index (kind='index').

    data: table to select from; a copy is used
    level0, level1: key for the outer / inner level; None selects the
        whole level
    indexes: not used by the function
    kind: 'column' or 'index'; for 'index' the selection is done on the
        transposed table and the result is transposed back

    Levels addressed by a single string are dropped from the result.
    For an unsupported 'kind' a message is printed and, unless both
    levels are None, the function ends up returning None.
    '''
    
    df = data.copy()
    # check 'kind' argument
    if kind == 'index':
        df = df.T
    elif kind == 'column':
        pass
    else:
        print("'kind' argument must be 'column' or 'index'")
    # nothing to select: return the (copied) table as is
    # NOTE(review): for kind='index' this returns the transposed copy,
    # it is not transposed back here - confirm this is intended
    if (level0 is None) & (level1 is None):
        return df
    # turn a missing 'level' argument into slice(None), i.e. "everything"
    if level0 is None:
        level0 = slice(level0)
    if level1 is None:
        level1 = slice(level1)
    # address both levels at once
    df = df.loc[:, (level0, level1)]
    # both levels given as single strings
    if isinstance(level0, str) & isinstance(level1, str):
        if isinstance(df, pd.Series):
            # a single column comes back as a Series: rebuild a frame
            df = df.to_frame()
            df.columns = df.columns.droplevel(1)
            df.columns.name = data.columns.names[0]
        else:
            df.columns = df.columns.droplevel(0)
    else:
        # only one level is a single string: drop that level
        if isinstance(level0, str):
            df.columns = df.columns.droplevel(0)
        if isinstance(level1, str):
            df.columns = df.columns.droplevel(1)
    # return (falls through to an implicit None for any other 'kind')
    if kind == 'index':
        return df.T
    if kind == 'column':
        return df

In [15]:
def transform_strings_in_row(data, row_index, charachters, where='start'):
    '''
    In row 'row_index' convert every non-NaN cell to int after removing
    'charachters' with remove_charachters(cell, charachters, where).
    If that raises ValueError, the conversion is retried with
    charachters + '**'. Works on a copy of 'data'.
    '''
    df = data.copy()
    # cells of the row that hold a value
    filled = ~pd.isna(df.loc[row_index])

    def _as_ints(chars):
        # strip 'chars' from every filled cell and cast the rest to int
        return df.loc[row_index, filled].apply(
            lambda cell: int(remove_charachters(cell, chars, where)))

    try:
        df.loc[row_index, filled] = _as_ints(charachters)
    except ValueError:
        df.loc[row_index, filled] = _as_ints(charachters+'**')

    return df

In [16]:
def transform_fill_values_by_previous(data, kind='row', row_index=None, column_name=None):
    '''
    Fill NaN cells with the closest previous non-NaN value
    (on a copy of 'data'):

        kind='row' - along row 'row_index', going through the columns
        kind='column' - along column 'column_name', going through the index

    NaN cells before the first value stay NaN. For any other 'kind'
    a message is printed and the copy is returned unchanged.
    '''
    df = data.copy()
    # last value seen so far
    # (np.nan: the np.NaN alias does not exist in NumPy 2.0 and later)
    j = np.nan
    if kind == 'row':
        # go through row by column
        for i in df.columns:
            # if value in row not NaN
            if not pd.isna(df.loc[row_index, i]):
                # remember this value in var j
                j = df.loc[row_index, i]
            else:
                # if value in row is NaN, replace it by remembered value in j
                df.loc[row_index, i] = j
    elif kind == 'column':
        # go through column by index
        for i in df.index:
            # if value in column not NaN
            if not pd.isna(df.loc[i, column_name]):
                # remember this value in var j
                j = df.loc[i, column_name]
            else:
                # if value in column is NaN, replace it by remembered value in j
                df.loc[i, column_name] = j
    else:
        print("Argument 'kind' must be 'row' or 'column'")
    return df

In [17]:
def transform_string_replace_if_contains(data, row_index, contains, replace):
    '''
    In row 'row_index' set every cell whose string form contains
    'contains' to 'replace'. Works on a copy of 'data'.
    '''
    df = data.copy()
    matching = [
        column for column in df.columns
        if contains in str(df.loc[row_index, column])]
    for column in matching:
        df.loc[row_index, column] = replace

    return df

In [18]:
def transform_string_replace_value_if_contains(
        value, contains, replace, full_list=False):
    '''
    Return 'replace' if 'value' contains the given text, else 'value':

        contains is a str - it has to occur in 'value'
        contains is a list, full_list=True - all items have to occur
            in 'value'
        contains is a list, full_list=False - at least one item has to
            occur in str(value)
    '''
    if isinstance(contains, str):
        return replace if contains in value else value

    if isinstance(contains, list):
        if full_list:
            matched = all([x in value for x in contains])
        else:
            matched = any([i in str(value) for i in contains])
        if matched:
            value = replace

    return value

In [19]:
def transform_string_replace_value_if_not_contains(
        value, contains, replace, full_list=False):
    '''
    Return 'replace' if 'value' does NOT contain the given text,
    else 'value':

        contains is a str - replaced when it does not occur in 'value'
        contains is a list, full_list=True - replaced unless all items
            occur in 'value'
        contains is a list, full_list=False - replaced unless at least
            one item occurs in 'value'
    '''
    if isinstance(contains, str):
        if contains not in value:
            value = replace

    if isinstance(contains, list):
        # all items required vs. any single item is enough
        check = all if full_list else any
        if not check([item in value for item in contains]):
            value = replace

    return value

In [20]:
def transform_make_header_from_rows(data, rows_index, names=None):
    '''
    Build the column header from rows of the table and drop those rows
    (on a copy of 'data'):

        rows_index is a list - a MultiIndex with one level per row
            (cells cast to str), level names taken from 'names'
        otherwise - the single row 'rows_index' becomes the header
    '''
    df = data.copy()

    if isinstance(rows_index, list):
        # one tuple per column: the cells of the header rows, top to bottom
        header_tuples = [
            tuple(str(df.loc[row, column]) for row in rows_index)
            for column in df.columns]
        df.columns = pd.MultiIndex.from_tuples(header_tuples, names=names)
    else:
        df.columns = df.loc[rows_index]

    return df.drop(rows_index, axis=0)

In [21]:
def transform_make_index_from_columns(data, columns_names, names=None):
    '''
    Replace the index of a copy of 'data' by a MultiIndex built from the
    columns 'columns_names' (cells cast to str, one level per column);
    'names' are the level names. The columns themselves are kept.
    '''
    df = data.copy()
    # one tuple per row: the cells of the chosen columns, left to right
    index_tuples = [
        tuple(str(df.loc[row, column]) for column in columns_names)
        for row in df.index]
    df.index = pd.MultiIndex.from_tuples(index_tuples, names=names)

    return df

In [22]:
def transform_cut_rows(data, rows_drop_start, rows_drop_end):
    '''
    Remove 'rows_drop_start' rows from the beginning and 'rows_drop_end'
    rows from the end of a copy of 'data' (0 at the end - remove nothing)
    '''
    df = data.copy()
    # an open-ended slice when nothing is cut at the end
    stop = None if rows_drop_end == 0 else -rows_drop_end

    return df.iloc[rows_drop_start:stop, :]

In [23]:
def transform_column_as_index(data, kind='iloc', iloc=None, name=None):
    '''
    Move one column of a copy of 'data' into the index (without a name):

        kind='iloc' - the column at position 'iloc'
        kind='name' - the column labelled 'name'

    Any other 'kind' returns the copy unchanged.
    '''
    df = data.copy()
    if kind == 'iloc':
        key_column = df.iloc[:, iloc]
        df.index = key_column
        df = df.drop(key_column.name, axis=1)
        df.index.name = None
    elif kind == 'name':
        df.index = df.loc[:, name]
        df.index.name = None
        df = df.drop(name, axis=1)

    return df

In [24]:
def transform_cut_with_list(data, values_list, type='index', kind='stay'):
    '''
    Filter a copy of 'data' by the labels in 'values_list':

        type: 'index' (rows) or 'column' (columns)
        kind: 'stay' - keep only listed labels, 'drop' - remove them

    Any other combination returns the copy unchanged.
    '''
    df = data.copy()
    selector = (type, kind)

    if selector == ('index', 'stay'):
        return df[df.index.isin(values_list)]
    if selector == ('index', 'drop'):
        return df[~df.index.isin(values_list)]
    if selector == ('column', 'stay'):
        return df.loc[:, df.columns.isin(values_list)]
    if selector == ('column', 'drop'):
        return df.loc[:, ~df.columns.isin(values_list)]

    return df

In [25]:
def transform_column_move(data, column, kind='end', insert_before=None):
    '''
    Move 'column' inside a copy of 'data':

        kind='end' - to the last position
        kind='start' - to the first position
        kind='index' - right before the column 'insert_before'

    Any other 'kind' returns the copy unchanged.
    '''
    df = data.copy()

    if kind == 'end':
        order = df.columns.tolist()
        order.remove(column)
        order.append(column)
        df = df[order].copy()
    elif kind in ('start', 'index'):
        moved = df.pop(column)
        # the target position is looked up after the column is taken out
        position = 0 if kind == 'start' else df.columns.get_loc(insert_before)
        df.insert(position, column, moved)

    return df

In [26]:
def transform_ckeck_index_column(data, column, contains_list, replace_values_list):
    '''
    For every pair (contains, replacement) taken in parallel from
    'contains_list' and 'replace_values_list', pass each cell of 'column'
    through transform_string_replace_value_if_contains with full_list=True.
    Works on a copy of 'data'.
    '''
    df = data.copy()
    for needles, replacement in zip(contains_list, replace_values_list):

        def _swap(cell, needles=needles, replacement=replacement):
            return transform_string_replace_value_if_contains(
                cell, needles, replacement, full_list=True)

        df[column] = df[column].apply(_swap)

    return df

In [27]:
def transform_normalize(data, column, normalize_dict):
    '''
    For every 'pattern -> value' pair of normalize_dict, set the cells of
    'column' that contain the pattern (case-sensitive str.contains) to
    the value. Pairs are applied one after another on a copy of 'data'.
    '''
    df = data.copy()
    for pattern, normalized_value in normalize_dict.items():
        hits = df[column].str.contains(pattern, case=True)
        df[column] = np.where(hits, normalized_value, df[column])

    return df

In [28]:
def add_leading_zeros_to_str(
        x, zeros=1, length_less_than=False, length_equals=False, all_values=False):
    '''
    Prepend 'zeros' leading zeros to str(x) when one of the conditions
    holds, otherwise return 'x' unchanged (in its original type):

        length_less_than - len(str(x)) is less than this number
        length_equals - len(str(x)) equals this number
        all_values - always
    '''
    length_x = len(str(x))
    # the padded width is always based on the original length, so more
    # than one matching condition still pads only once
    pad = (
        (length_less_than and length_x < length_less_than)
        or (length_equals and length_x == length_equals)
        or all_values)
    if pad:
        return str(x).zfill(zeros+length_x)

    return x

In [29]:
def transferts_various(col1, col2):
    '''
    Return 'Всего' when 'col1' has length 2, otherwise return 'col2'
    '''
    return 'Всего' if len(col1) == 2 else col2

In [30]:
def normalize_oktmo(x):
    '''
    Bring a code string to a uniform look (all whitespace is removed first):

        1 character - padded with a leading zero to 2 characters
        7 characters - padded with a leading zero to 8, then grouped
        8 characters - grouped as 'XX XXX XXX'

    Any other length is returned without whitespace and otherwise unchanged.
    '''
    code = ''.join(x.split())
    if len(code) == 1:
        return code.zfill(2)
    if len(code) == 7:
        code = code.zfill(8)
    if len(code) == 8:
        return code[:2] + ' ' + code[2:5] + ' ' + code[5:]

    return code

In [31]:
def normalize_municipal_town(x):
    '''
    Remove all whitespace from 'x', then put a single space back:
        19 characters - after the 13th character
        15 characters - after the 9th character
    Any other length is returned without whitespace.
    '''
    compact = ''.join(x.split())
    # length of the string -> position of the space
    split_at = {19: 13, 15: 9}.get(len(compact))
    if split_at is None:
        return compact

    return compact[:split_at] + ' ' + compact[split_at:]

In [32]:
def normalize_oktmo_municip_town(data, regions, normalize_dict):
    '''
    For every region of 'regions' overwrite the 'ОКТМО' values of the
    selected rows (on a copy of 'data') with the values listed for that
    region in municipal_town_dict, paired in order.

    Rows are selected by 'index0' == region and:
        'Ненецкий автономный округ (Архангельская область)' -
            'ОКТМО'.str.count('') > 3
            (NOTE(review): presumably a length check - confirm)
        any other region - more than four '0' characters in 'ОКТМО'

    NOTE(review): the parameter 'normalize_dict' is not used; the values
    come from the module-level name municipal_town_dict - confirm whether
    the parameter was meant to be used instead.
    '''
    df = data.copy()
    nenec = 'Ненецкий автономный округ (Архангельская область)'

    for region in regions:
        in_region = df['index0'] == region
        if region == nenec:
            cond = in_region & (df['ОКТМО'].str.count('') > 3)
        else:
            cond = in_region & (df['ОКТМО'].str.count('0') > 4)
        row_labels = df.loc[cond, :].index.tolist()
        # zip stops at the shorter of the two sequences
        for label, value in zip(row_labels, municipal_town_dict[region]):
            df.loc[label, 'ОКТМО'] = value

    return df

In [None]:
def transform_cost_of_living(
        col, year_name, cut_start, cut_end,
        drop_regions_list, replace_regions_dict):
    '''
    Build a one-column table (regions as index, 'year_name' as column)
    from the first two columns of 'col'.

    col: raw table; its first two columns are region and value
    year_name: label for the value column
    cut_start, cut_end: rows removed at the beginning / end
        (see transform_cut_rows)
    drop_regions_list: region names to remove
    replace_regions_dict: mapping for renaming regions
    '''
    # only the first two columns are used: region and value
    col = col[col.columns[:2]]
    col.columns = ['Регион', year_name]
    # cut rows
    col = transform_cut_rows(col, cut_start, cut_end)
    # remove spaces from both sides of region names
    col['Регион'] = col['Регион'].apply(lambda x: x.strip())
    # remove the markers '1)' ... '5)' one after another
    for marker in ('1)', '2)', '3)', '4)', '5)'):
        col['Регион'] = col['Регион'].apply(
            lambda x, marker=marker: transform_replace(x, marker, ''))
    # drop regions
    col = col[~col['Регион'].isin(drop_regions_list)].copy()
    # replace regions names
    col['Регион'] = col['Регион'].replace(replace_regions_dict)
    # set region as index
    col = col.set_index('Регион', drop=True)
    col.index.name = None

    return col

In [46]:
def transform_resources(
        data, year, FD_partial_names_list, federal_districts_names_list,
        drop_rows_end=None):
    '''
    Clean a raw table and return it in long form: a Series-like frame
    with one column 'year' and a two-level index (region, variable), the
    variables being 'Всего', 'Городская местность', 'Сельская местность'.

    data: raw table; the first four columns are expected to be
        'Unnamed: 0' ... 'Unnamed: 3', data rows start at position 7
    year: label of the value column
    FD_partial_names_list: passed to federal_district_concat (defined
        elsewhere in the file) to re-join names split over two rows
    federal_districts_names_list: region names to remove
    drop_rows_end: number of rows to remove at the end
        (None or 0 - remove nothing)
    '''
    df_raw = data.copy()
    # rows from position 7 on; -0 as a stop would select nothing, so
    # "remove nothing at the end" needs an open-ended slice
    if not drop_rows_end:
        slice_ = slice(7, None)
    else:
        slice_ = slice(7, -drop_rows_end)
    # first four columns, rows cut at both ends
    df = df_raw.iloc[:, :4][slice_].copy()
    df['Unnamed: 0'] = [i.strip() for i in df['Unnamed: 0']]
    # replace symbols (keys are treated as regular expressions)
    replace_dict_part = {
        '/n': '',
        '\n': ' ',
        'Kемеровская область': 'Кемеровская область',
        'г. Москва': 'Москва',
        'г. Санкт-Петербург': 'Санкт-Петербург',
        'г. Севастополь': 'Севастополь'
    }
    df = df.replace(replace_dict_part, regex=True)
    # placeholders for missing values (whole-cell matches)
    # np.nan: the np.NaN alias does not exist in NumPy 2.0 and later
    replace_dict_full = {
        ' -': np.nan,
        '-': np.nan,
        '…': np.nan
    }
    df = df.replace(replace_dict_full)
    # reset index
    df = df.reset_index(drop=True)
    # concatenate federal districts that names are separated in two rows
    df = federal_district_concat(df, 'Unnamed: 0', FD_partial_names_list)
    df = df[~df['Unnamed: 0'].isin(federal_districts_names_list)]

    df = df.rename(columns={
        'Unnamed: 0': year,
        'Unnamed: 1': 'Всего',
        'Unnamed: 2': 'Городская местность',
        'Unnamed: 3': 'Сельская местность'
    })
    # regions as index
    df = df.set_index(year, drop=True)
    df.index.name = None
    # drop regions that are not needed
    drop_list = [
    'в том числе:                     Ханты-Мансийский  автономный округ - Югра',
    'в том числе:                   Ненецкий автономный округ',
    'Ямало-Ненецкий  автономный округ',
    'Тюменская область',
    'Архангельская область',
    ]
    df = df.drop(drop_list, axis=0)
    # transform columns to values in two columns: 'variables' and 'values'
    # 'variables' is named 'index' because it becomes index level1
    # 'values' is named after the year
    df = df.melt(
        var_name='index',
        value_name=year,
        ignore_index=False)
    # create multiindex
    df = df.set_index([df.index, 'index'], drop=True)
    # remove multiindex names
    df.index.names = (None, None)

    return df

In [None]:
def outward_axis(ax=None, x_offset=5, y_offset=5):
    '''
    Move the bottom spine outward by 'x_offset' and the left spine
    outward by 'y_offset'; 'ax' defaults to plt.gca()
    '''
    if ax is None:
        ax = plt.gca()

    for side, offset in (('bottom', x_offset), ('left', y_offset)):
        ax.spines[side].set_position(('outward', offset))

In [None]:
def not_none(x):
    '''
    True if 'x' is not None, False otherwise
    '''
    return x is not None

In [2]:
def axis_rstyle(
        y_ticks: list | None = None,
        x_ticks: list | None = None,
        y_slice: list | None = None,
        x_slice: list | None = None,
        y_lim: list | None = None,
        x_lim: list | None = None,
        x_spine_lim: list | None = None,
        x_spine_hide: bool = False,
        y_spine_lim: list | None = None,
        y_spine_hide: bool = False,
        offset_left: float = 5,
        offset_bottom: float = 5,
        width: float = 0.75,
        margin: bool = True,
        customize_colors: bool = True,
        spines_color: str ='#AAAAAA',
        ticks_color: str ='#AAAAAA',
        ticklabels_color: str ='#909090',
        grid: bool = False,
        ax=None):
    
    '''
    Restyle the bottom and left axes of a plot: ticks, spines bounded by
    the outermost ticks and moved outward, colors, line widths, limits
    and an optional dotted grid.

    x_ticks: (x_min, x_max) or (x_min, x_max, step); ticks are built with
        arange(..., True); without a step the distance between the first
        two current ticks is used
    y_ticks: (y_min, y_max) or (y_min, y_max, step), same as x_ticks
    x_slice, y_slice: arguments for slice(), applied to the ticks
    x_lim, y_lim: (min, max) passed to set_xlim / set_ylim
    x_spine_lim, y_spine_lim: spine bounds (first and last element are
        used) replacing the bounds taken from the ticks
    x_spine_hide, y_spine_hide: hide the bottom / left spine
    offset_left, offset_bottom: outward offset of the left / bottom spine
    width: line width of spines and ticks (nothing is set if falsy)
    margin: None - margins are not touched; iterable - passed to
        ax.margins(*margin); True - 0.01; a number - used as x margin,
        the y margin is scaled by ax width / ax height
    customize_colors: apply spines_color, ticks_color, ticklabels_color
    grid: draw dotted lines at all ticks except the first and the last
        one of each axis; TypeError if truthy but not a bool
    ax: axes to style, plt.gca() by default

    Dependencies: 
        import: collections
        functions: arange
    '''
    
    if ax is None: ax = plt.gca()

    # order of steps (important):
        # 1 - get ticks
        # 2 - set margins if necessary
        # 3 - manipulations with ticks
        # 4 - update ticks
        # 5 - spines modification
        # 6 - set limits
        # 7 - tick params
        # 8 - grid

    # get ticks
    xticks = ax.get_xticks()
    yticks = ax.get_yticks()

    if margin is not None:
        if isinstance(margin, collections.abc.Iterable):
            # explicit margins given by the caller
            ax.margins(*margin)
        else:
            margin = 0.01 if margin is True else margin
            # calculate the margins so that both look equal on screen:
            # 1st step: take the size of the ax (ax_width, ax_height)
            # 2nd step: margin_x is given, then
                # ax_width * margin_x = ax_height * margin_y
                # margin_y = (margin_x * ax_width) / ax_height
            # margins calculated this way are equal and do NOT depend
            # on the figure (or ax) size
            ax_height, ax_width = ax.bbox.height, ax.bbox.width
            margin_y = margin * ax_width / ax_height
            ax.margins(x=margin, y=margin_y)

    # declare xticks and yticks if necessary
    if x_ticks is not None:
        # if step not specified: use the spacing of the current ticks
        if len(x_ticks) == 2:
            x_step = xticks[1] - xticks[0]
            x_ticks = np.append(x_ticks, x_step)
        xticks = arange(x_ticks[0], x_ticks[1], x_ticks[2], True)
    if y_ticks is not None:
        # if step not specified: use the spacing of the current ticks
        if len(y_ticks) == 2:
            y_step = yticks[1] - yticks[0]
            y_ticks = np.append(y_ticks, y_step)
        yticks = arange(y_ticks[0], y_ticks[1], y_ticks[2], True)

    # declare xticks and yticks with slices if necessary
    if x_slice is not None:
        x_slice_ = slice(*x_slice)
        xticks = xticks[x_slice_]
    if y_slice is not None:
        y_slice_ = slice(*y_slice)
        yticks = yticks[y_slice_]

    # update ticks
    ax.set_xticks(xticks)
    ax.set_yticks(yticks)

    # customize spines: bound them by the outermost ticks, move outward
    ax.spines['bottom'].set_bounds(xticks[0], xticks[-1])
    ax.spines['bottom'].set_position(('outward', offset_bottom))
    ax.spines['left'].set_bounds(yticks[0], yticks[-1])
    ax.spines['left'].set_position(('outward', offset_left))

    # explicit spine bounds replace the tick-based ones
    if x_spine_lim:
        ax.spines['bottom'].set_bounds(x_spine_lim[0], x_spine_lim[-1])
    if y_spine_lim:
        ax.spines['left'].set_bounds(y_spine_lim[0], y_spine_lim[-1])

    if x_spine_hide:
        ax.spines['bottom'].set_visible(False)
    if y_spine_hide:
        ax.spines['left'].set_visible(False)

    if customize_colors:
        ax.spines['bottom'].set_color(spines_color)
        ax.spines['left'].set_color(spines_color)
        ax.tick_params(which='both', color=ticks_color)
        ax.tick_params( which='both', labelcolor=ticklabels_color)

    if width:
        ax.spines['bottom'].set_linewidth(width)
        ax.spines['left'].set_linewidth(width)
        ax.tick_params(which='both', width=width)

    # set limits if necessary
    if x_lim is not None:
        ax.set_xlim(x_lim[0], x_lim[1])
    if y_lim is not None:
        ax.set_ylim(y_lim[0], y_lim[1])
    
    # set tick params
    ax.tick_params(
        which='both', direction='out', bottom=True, size=3, left=True)

    # grid customization (exclude grid lines at the edge of spines)
    if grid:
        if not isinstance(grid, bool):
            raise TypeError ("'grid' agrument must be Bool")
            
        # the built-in grid is switched off; lines are drawn by hand
        ax.grid(False)
        x_ticks_ = ax.get_xticks()
        y_ticks_ = ax.get_yticks()

        # vertical lines at the inner x ticks
        for i in x_ticks_:
            if (i == x_ticks_[0]) | (i == x_ticks_[-1]):
                pass
            else:
                ax.plot(
                    [i, i], [y_ticks_[0], y_ticks_[-1]],
                    lw=0.5, ls=':', color='#D9D9D9')
        # horizontal lines at the inner y ticks
        for i in y_ticks_:
            if (i == y_ticks_[0]) | (i == y_ticks_[-1]):
                pass
            else:
                ax.plot(
                    [x_ticks_[0], x_ticks_[-1]], [i, i],
                    lw=0.5, ls=':', color='#D9D9D9')
    else:
        ax.grid(False)

In [None]:
def spines_adjust_barplot(axis='x', ax=None):
    '''
    Bound a spine by the bars (ax.patches) of a bar plot:

        axis='x' - bottom spine from the x of the first patch to the
            right edge (x + width) of the last patch
        axis='y' - left spine from the y of the first patch to the
            top edge (y + height) of the last patch

    'ax' defaults to plt.gca()
    '''
    if ax is None:
        ax = plt.gca()

    if axis == 'x':
        first, last = ax.patches[0], ax.patches[-1]
        ax.spines['bottom'].set_bounds(
            first.get_x(), last.get_x() + last.get_width())

    if axis == 'y':
        first, last = ax.patches[0], ax.patches[-1]
        ax.spines['left'].set_bounds(
            first.get_y(), last.get_y() + last.get_height())