In [None]:
def arange(arg1, arg2=None, arg3=None, arg4=None):

    '''
    Build an evenly spaced array, optionally including the stop value.

    default:
        arg1 - start
        arg2 - stop
        arg3 - step
        arg4 - endpoint (True: includes, False: not includes)

    variations:
        arange(arg1) -> range(start=0, stop=arg1, step=1, endpoint=False)

        arange(arg1, arg2):
            arange(num, num) -> (start=arg1, stop=arg2, step=1, endpoint=False)
            arange(num, bool) -> range(start=0, stop=arg1, step=1, endpoint=arg2)

        arange(arg1, arg2, arg3):
            arange(num, num, num) -> (start=arg1, stop=arg2, step=arg3, endpoint=False)
            arange(num, num, bool) -> range(start=arg1, stop=arg2, step=1, endpoint=arg3)

        arange(arg1, arg2, arg3, arg4) -> range(start=arg1, stop=arg2, step=arg3, endpoint=arg4)

    Returns:
        np.ndarray. Integer dtype when start, stop and step are all Python
        ints, float dtype otherwise.

    Raises:
        TypeError: if the arguments are not a contiguous prefix
        (arg1 is None, or e.g. arg3 is given while arg2 is None).
    '''

    args = (arg1, arg2, arg3, arg4)
    n_args = sum(arg is not None for arg in args)
    if n_args == 0 or any(arg is None for arg in args[:n_args]):
        raise TypeError(
            'arange() arguments must be given in order without gaps: '
            'arange(stop), arange(start, stop), '
            'arange(start, stop, step), arange(start, stop, step, endpoint)')

    # defaults correspond to the one-argument form arange(stop)
    start, stop, step, endpoint = 0, arg1, 1, False

    if n_args == 2:
        if isinstance(arg2, bool):
            # arange(stop, endpoint)
            endpoint = arg2
        else:
            # arange(start, stop)
            start, stop = arg1, arg2
    elif n_args == 3:
        start, stop = arg1, arg2
        if isinstance(arg3, bool):
            # arange(start, stop, endpoint)
            endpoint = arg3
        else:
            # arange(start, stop, step)
            step = arg3
    elif n_args == 4:
        start, stop, step, endpoint = arg1, arg2, arg3, arg4

    # result is cast to int only when every numeric input is a Python int
    is_int = all(isinstance(value, int) for value in (start, stop, step))

    # safe form of np.arange(start, stop, step): count in units of `step`
    # so that float error does not accumulate along the array
    arr = step * np.arange(start/step, stop/step)

    # append `stop` when the next value after the last one lands on it;
    # compared with a tolerance because e.g. 0.2 + 0.1 != 0.3 in floats,
    # and guarded against empty ranges where arr[-1] does not exist
    if endpoint and arr.size > 0:
        tolerance = 1e-9 * max(abs(step), abs(stop))
        if abs(arr[-1] + step - stop) <= tolerance:
            arr = np.concatenate([arr, [stop]])

    if is_int:
        # round before casting: astype(int) truncates, so a value such as
        # 0.9999999999999999 (from 49 * (1/49)) would otherwise become 0
        arr = np.rint(arr).astype(int)

    return arr

In [None]:
# smooth data

def smoothed(x, y=None, n=300, k=3, return_type='df', datetime_index=False):
    '''
    Smooth data for plots with an interpolating B-spline.

    Arguments:
    x: pd.DataFrame, pd.Series or array-type
        - pd.DataFrame: every column is smoothed over the frame's index
        - pd.Series: values are smoothed over the series' index
        - array-type: x coordinates, `y` holds the values
    y: array-type (used only when x is array-type)
    n: number of points in the new, evenly spaced x grid
    k: B-spline degree passed to make_interp_spline
    return_type:
        - if 'df' - returns pd.DataFrame indexed by the new x grid
        - if 'array' - returns x_new, y_new
        - if 'dict' - returns dict with {'x': x_new, 'y': y_new}
    datetime_index: if True, x must have a datetime index; smoothing is
        done over row positions and the result is labelled with n evenly
        spaced timestamps between the first and the last index value

    If x is a pd.DataFrame the function returns a pd.DataFrame with all
    smoothed columns, unless return_type == 'array' (then the new index and
    the first smoothed column are returned).

    For pd.Series / array input an unknown return_type returns None.

    Libraries:
    from scipy.interpolate import make_interp_spline, BSpline
    '''

    if datetime_index:
        time_range = pd.date_range(
            start=x.index[0], end=x.index[-1], periods=n)
        # splines need numeric x, so smooth over positions 0..len-1
        x = x.reset_index(drop=True)

    if isinstance(x, pd.DataFrame):
        x_index = x.index
        x_new = np.linspace(x_index.min(), x_index.max(), n)
        df = pd.DataFrame(index=x_new, columns=x.columns)
        for col in x.columns:
            spl = make_interp_spline(x_index, x[col], k=k)  # type: BSpline
            df[col] = spl(x_new)
        if datetime_index:
            df.index = time_range
        if return_type == 'array':
            return np.array(df.index), np.array(df.iloc[:, 0])
        # keep every smoothed column (previously only the last one survived)
        return df

    if isinstance(x, pd.Series):
        y = x.copy()
        x = x.index

    # n represents number of points to make between x.min and x.max
    x_new = np.linspace(x.min(), x.max(), n)

    spl = make_interp_spline(x, y, k=k)  # type: BSpline
    y_new = spl(x_new)

    x_out = time_range if datetime_index else x_new

    if return_type == 'dict':
        return {'x': x_out, 'y': y_new}
    if return_type == 'array':
        return x_out, y_new
    if return_type == 'df':
        return pd.DataFrame(data=y_new, index=x_out)
    return None

In [None]:
# saturate and alpha colors

def saturate_color(color_rgb, saturation=0.75):
    '''
    Scale the HLS saturation of an RGB color by `saturation`.

    color_rgb: sequence whose first three items are r, g, b
    Returns: tuple (r, g, b)
    '''
    red, green, blue = color_rgb[0], color_rgb[1], color_rgb[2]
    hue, lightness, base_saturation = colorsys.rgb_to_hls(red, green, blue)
    return colorsys.hls_to_rgb(hue, lightness, saturation*base_saturation)


def alpha_color(color, alpha):
    '''
    Blend every channel of `color` towards 1 by the factor (1 - alpha).

    Returns: tuple with one blended value per input channel
    '''
    fade = 1 - alpha
    blended = [channel + (1 - channel) * fade for channel in color]
    return tuple(blended)


def saturate_palette(palette, saturation=0.75):
    '''
    Apply saturate_color() to every color of `palette`.

    Returns: list of colors
    '''
    result = []
    for color in palette:
        result.append(saturate_color(color, saturation=saturation))
    return result


def alpha_palette(palette, alpha=0.90):
    '''
    Apply alpha_color() to every color of `palette`.

    Returns: list of colors
    '''
    result = []
    for color in palette:
        result.append(alpha_color(color, alpha=alpha))
    return result

In [None]:
# check if there are NaNs in df

def is_nan(df):
    '''
    Return the rows of `df` that contain at least one NaN.

    When there are no such rows, prints a message and returns None.
    '''
    rows_with_nan = df[df.isna().any(axis=1)]
    if rows_with_nan.shape[0] == 0:
        print("No NaN values in DataFrame")
        return None
    return rows_with_nan

In [None]:
# save-load

def loadit(name, dir='files'):
    '''
    Load object from '{dir}/{name}.pkl' (the path saveit() writes to).

    NOTE: unpickling can execute arbitrary code - load only trusted files.
    '''
    # the separator was missing before, so the default dir produced
    # 'files<name>.pkl' instead of 'files/<name>.pkl'
    result = pd.read_pickle(f'{dir}/{name}.pkl')
    return result

def saveit(file, name, dir='files'):
    '''
    Pickle object `file` to '{dir}/{name}.pkl'.

    The directory (including missing parent directories) is created
    when it does not exist.
    '''
    # makedirs also handles nested paths, which os.mkdir does not
    os.makedirs(dir, exist_ok=True)
    # context manager closes the file even if pickling fails
    with open(f'{dir}/{name}.pkl', 'wb') as filehandler:
        pickle.dump(file, filehandler)
    print(f"File '{name}' saved in directory '{dir}'")

def savefig(name, dir='img', format='png', dpi=100, transparent=True,  figure=None, **kwargs):
    '''
    Saves figure to 'img/' dir (or to its subdirectory 'img/{dir}/')

    Arguments:
    name: file name without extension
    dir: None or 'img' - save into 'img'; any other value - into 'img/{dir}'
    format: file format/extension (PNG by default)
    dpi, transparent, **kwargs: passed to figure.savefig()
    figure: figure to save; when None the global variable `fig` is used
    '''
    if figure is None:
        # relies on a module-level (notebook) variable named `fig`
        figure = fig
    if dir is None:
        dir = 'img'
    elif dir != 'img':
        dir = f'img/{dir}'
    # makedirs creates 'img' as well when saving into 'img/{dir}';
    # os.mkdir failed if the parent 'img' directory did not exist yet
    os.makedirs(dir, exist_ok=True)
    figure.savefig(
        f'{dir}/{name}.{format}',
        transparent=transparent,
        bbox_inches='tight',
        dpi=dpi,
        format=format,
        **kwargs
    )
    print(f"Image '{name}.{format}' successfully saved into '{dir}' directory")

In [None]:
def ci_bootstrap(
        data, statistic=np.mean, n_bootstrap=1000,
        confidence_level=0.95, random_state=42):
    '''
    Bootstrap confidence interval of `statistic` via scipy.stats.bootstrap.

    Returns: dict(statistic, std, ci_min, ci_max, margin)
        statistic - statistic of the original data
        std - sample standard deviation (ddof=1)
        ci_min, ci_max - bounds of the confidence interval
        margin - statistic minus ci_min
    '''
    result = scipy.stats.bootstrap(
        data=(data,),
        statistic=statistic,
        n_resamples=n_bootstrap,
        confidence_level=confidence_level,
        random_state=random_state
    )
    interval = result.confidence_interval
    ci_min, ci_max = interval[0], interval[1]

    if isinstance(data, pd.DataFrame):
        # column-wise values for DataFrame input
        stat = np.array(data.apply(statistic))
        std = np.array(np.std(data, ddof=1))
    else:
        stat = statistic(data)
        std = np.std(data, ddof=1)

    return {
        'statistic': stat,
        'std': std,
        'ci_min': ci_min,
        'ci_max': ci_max,
        'margin': stat - ci_min,
    }

In [None]:
def ci_t_distribution(
        data=None, mean=None, std=None, n=None, confidence_level=0.95):
    '''
    Confidence interval for the mean based on Student's t-distribution.

    Use either `data` (mean, n and standard error are computed from it)
    or the summary statistics `mean`, `std` and `n`. If `data` and `std`
    are both given, the standard error is computed from `std`.

    Returns: dict(min, max, mean, margin, t)

    Raises:
        ValueError: if neither `data` nor all of `mean`, `std`, `n` are given
    '''

    se = None

    if data is not None:
        arr = np.array(data)
        n = len(arr)
        mean = np.mean(arr)
        se = scipy.stats.sem(arr)

    # explicit None checks: a truthiness test would reject mean == 0
    if mean is not None and std is not None and n is not None:
        se = std / np.sqrt(n)

    if se is None:
        raise ValueError(
            "Provide either 'data' or all of 'mean', 'std' and 'n'")

    t = scipy.stats.t.ppf((1+confidence_level) / 2, n-1)
    margin = t * se
    ci_min = mean - margin
    ci_max = mean + margin

    return_dct = {
        'min': ci_min,
        'max': ci_max,
        'mean': mean,
        'margin': margin,
        't': t
    }
    return return_dct

In [None]:
# normality tests

def test_normality(data, alpha=0.05):
    '''
    Run several normality tests on `data` and collect the verdicts.

    Tests: Kolmogorov-Smirnov (against scipy's 'norm' with default
    parameters), Anderson-Darling, Shapiro-Wilk, Jarque-Bera,
    D'Agostino-Pearson.

    For p-value based tests the verdict is 'Not normal' when p < alpha.
    Anderson-Darling reports its statistic instead of a p-value and is
    compared with critical_values[2], independently of `alpha`.

    Returns: pd.DataFrame with columns
        'Test', 'P or Statistic (s)' (rounded to 4 digits), 'Condition'
    '''

    def verdict(rejected):
        return 'Not normal' if rejected else 'Normal'

    rows = []

    # Kolmogorov-Smirnov
    ks_pvalue = stats.kstest(data, 'norm').pvalue
    rows.append(('Kolmogorov-Smirnov', ks_pvalue, verdict(ks_pvalue < alpha)))

    # Anderson-Darling: statistic vs critical value
    anderson = stats.anderson(data, dist='norm')
    critical = anderson.critical_values[2]
    rows.append((
        'Anderson-Darling (s)',
        anderson.statistic,
        verdict(anderson.statistic > critical)))

    # Shapiro-Wilk
    sw_pvalue = stats.shapiro(data).pvalue
    rows.append(('Shapiro-Wilk', sw_pvalue, verdict(sw_pvalue < alpha)))

    # Jarque-Bera: the value labelled 'Chi^2' is used as the p-value
    jb_labels = ["Jarque-Bera", "Chi^2", "Skew", "Kurtosis"]
    jb_values = dict(zip(jb_labels, sms.jarque_bera(data)))
    jb_pvalue = jb_values['Chi^2']
    rows.append(('Jarque-Bera', jb_pvalue, verdict(jb_pvalue < alpha)))

    # D’Agostino and Pearson
    dp_pvalue = stats.normaltest(data).pvalue
    rows.append(('D’Agostino-Pearson', dp_pvalue, verdict(dp_pvalue < alpha)))

    results_df = pd.DataFrame({
        'Test': [name for name, _, _ in rows],
        'P or Statistic (s)': [np.round(value, 4) for _, value, _ in rows],
        'Condition': [condition for _, _, condition in rows],
    })

    return results_df

In [None]:
def order_X_y(data, target):
    '''
    Return a copy of DataFrame with the Target column moved to the end.

    Raises ValueError if `target` is not among the columns.
    '''
    ordered = data.columns.tolist()
    position = ordered.index(target)
    target_name = ordered.pop(position)
    ordered.append(target_name)

    return data[ordered].copy()

In [None]:
def feature_importance_display(
        features, importance,
        top=None, imp_min_level=None, only_features=True):

    '''
    Build a table of features and their importances.

    features: feature names
    importance: importance values, aligned with `features`
    top: keep only the `top` most important features
    imp_min_level: keep only features with importance > imp_min_level
    only_features: return only the 'Feature' column (pd.Series)
        instead of the whole DataFrame

    The table is sorted by importance (descending) only when `top`
    or `imp_min_level` is given.
    '''

    def ranked(frame):
        return (frame
                .sort_values('Importance', ascending=False)
                .reset_index(drop=True))

    table = pd.DataFrame({
        'Feature': features,
        'Importance': importance
    })

    if imp_min_level is not None:
        above_level = table['Importance'] > imp_min_level
        table = ranked(table.loc[above_level, :])

    if top is not None:
        table = ranked(table)
        # label-based slice on the fresh 0..n-1 index, end is inclusive
        table = table.loc[0:top-1]

    return table['Feature'] if only_features else table

In [None]:
def rgb_to_hex(x):
    '''
    Convert color `x` to a hex string with matplotlib.colors.to_hex
    '''
    return matplotlib.colors.to_hex(x)

In [None]:
def outliers_column_iqr(data, feature, scale=1.5):

    '''
    Return a copy of DataFrame with an indicator (1/0) column
    '{feature}_is_out' that marks outliers of Feature.

    A value is an outlier when it lies below Q1 - scale*IQR
    or above Q3 + scale*IQR.
    '''

    result = data.copy()
    values = result[feature]

    first_quartile, third_quartile = values.quantile(0.25), values.quantile(0.75)
    spread = third_quartile - first_quartile

    too_low = values.lt(first_quartile - scale*spread)
    too_high = values.gt(third_quartile + scale*spread)
    result[feature+'_is_out'] = (too_low | too_high).astype(int)

    return result

In [None]:
def correlation_w_target(data, target):

    '''
    Create DataFrame with correlations of all other columns to Target,
    sorted by absolute value (descending)

    Returns: single-column DataFrame named after `target`,
        indexed by the remaining column names
    '''

    correlations = data.corr()[target]
    # remove the target by label: dropping "the first row after sorting"
    # removes a wrong row when another column ties with |r| == 1 or when
    # the target's self-correlation is NaN (constant target)
    df = (correlations
          .drop(target)
          .sort_values(ascending=False, key=abs)
          .to_frame())
    return df

In [None]:
def check_columns_match(data):

    '''
    Check whether all columns of DataFrame are equal row by row.

    Every column is compared with the first one. If all rows match,
    prints a message and returns None; otherwise returns the rows
    that do not match.
    '''

    frame = data.copy()
    first_column = frame.iloc[:, 0]
    row_matches = frame.eq(first_column, axis=0).all(axis=1)
    frame['is_equal'] = row_matches.astype(int)

    if frame['is_equal'].sum() == len(frame):
        print('All values matched')
        return None

    mismatched_rows = frame['is_equal'] == 0
    # leave the helper column out of the result
    original_columns = frame.columns != 'is_equal'
    return frame.loc[mismatched_rows, original_columns].copy()

In [None]:
def fillna_na(data, features_list):

    '''
    Return a copy of DataFrame where NaNs in the columns listed in
    'features_list' are replaced by the string 'NA'
    '''

    filled = data.copy()
    for column_name in features_list:
        column = filled[column_name]
        filled[column_name] = column.fillna('NA')

    return filled

In [None]:
def is_equal(data1, data2):

    '''
    Compare two DataFrames.

    If they are equal, prints 'Equal' and returns None.
    Otherwise returns the rows of data1 that are not present in data2
    (rows are compared as tuples of values).
    '''

    if data1.equals(data2):
        print('Equal')
        return None

    # the selection used to be evaluated and thrown away,
    # so differences were never reported to the caller
    rows1 = data1.apply(tuple, axis=1)
    rows2 = data2.apply(tuple, axis=1)
    return data1[~rows1.isin(rows2)]

In [None]:
def remove_duplicated_whitespaces(x):

    '''
    Convert x to String, collapse every run of whitespace into a single
    space and strip leading/trailing whitespace
    '''

    words = str(x).split()
    return ' '.join(words)

In [None]:
def replace_with_dict(x, replace_dict):

    '''
    Apply x.replace(key, value) for every item of replace_dict,
    in the dict's iteration order, and return the result
    '''

    result = x
    for old, new in replace_dict.items():
        result = result.replace(old, new)

    return result

In [None]:
def df_cutted_rows(data, start, end):
    '''
    Cut n=='start' rows at the beginning of DataFrame and
    n=='end' rows at the end of DataFrame

    Returns a copy; all columns are kept.
    '''
    # -0 == 0 would produce an empty slice, so "cut nothing at the end"
    # has to be expressed as an open-ended slice
    stop = None if end == 0 else -end
    # plain row slice instead of `iloc[*slice_]`: star-unpacking inside
    # a subscript is a syntax error before Python 3.11
    df = data.iloc[start:stop].copy()

    return df

In [None]:
def last_row_to_first(data):

    '''
    Make the last row of DataFrame to be the first

    Returns a new DataFrame; index labels travel with their rows and
    column dtypes are preserved. An empty DataFrame is returned as a copy.
    '''

    if len(data) == 0:
        return data.copy()

    # select the last row as a one-row DataFrame (iloc with a list):
    # going through Series.to_frame().T would cast all columns to one dtype
    last_row = data.iloc[[-1]]
    remaining_rows = data.iloc[:-1]
    df = pd.concat([last_row, remaining_rows], axis=0)

    return df

In [None]:
def np_index(array, value):
    '''
    Returns index (along the first axis) of the first element
    of Array that equals Value

    Raises IndexError if Value is not in Array.
    '''
    matches = np.where(array == value)
    first_axis_positions = matches[0]
    return first_axis_positions[0]

In [None]:
def normalized_by_first(data, return_type='df'):

    '''
    Divide every value by the first value:
        first_value = first_value / first_value (== 1)
        second_value = second_value / first_value
        third_value = third_value / first_value

    data: pd.Series or array-type
    return_type: 'df', 'series', 'array' or 'list'
        ('df' and 'series' keep the index of a pandas input)

    For an unknown return_type prints a message and returns a list.
    '''

    # take the first value by position: data[0] on a Series is a label
    # lookup and fails (or picks a wrong row) when the index is not 0-based
    if isinstance(data, pd.Series):
        first_value = data.iloc[0]
    else:
        first_value = data[0]

    data_new = [(x/first_value) for x in data]

    # only pandas objects carry an index; others get the default one
    if isinstance(data, (pd.Series, pd.DataFrame)):
        index = data.index
    else:
        index = None

    if return_type == 'df':
        return pd.DataFrame(data=data_new, index=index)
    if return_type == 'series':
        return pd.Series(data=data_new, index=index)
    if return_type == 'array':
        return np.array(data_new)
    if return_type == 'list':
        return list(data_new)

    print("'return_type' must be 'df', 'series', 'array', 'list'")
    return data_new

In [None]:
def normalized(data, reshape=True, return_type='df'):

    '''
    MinMaxScaler 0/1

    data: pd.Series, pd.DataFrame or array-type
    reshape: flatten data into a single column of shape (-1, 1)
        before scaling (required for 1-D input)
    return_type:
        - if 'df' - returns pd.DataFrame (keeps the index of pandas input)
        - if 'array' - returns the array produced by MinMaxScaler

    For any other return_type prints a message and returns None.
    '''

    if return_type not in ('df', 'array'):
        print("return_type must be 'df' or 'array'")
        return None

    # default index for inputs without one (lists, arrays);
    # previously `idxs` was undefined for them and 'df' raised NameError
    idxs = None
    if isinstance(data, (pd.Series, pd.DataFrame)):
        idxs = data.index.copy()
    if reshape:
        data = np.array(data).reshape(-1, 1)
    data_new = MinMaxScaler().fit_transform(data)
    if return_type == 'df':
        data_new = pd.DataFrame(data=data_new, index=idxs)

    return data_new

In [None]:
def to_date(x, kind='%B %Y', translate=False):
    '''
    String to Date

    x: string with a date
    kind: strptime format of x
    translate: if True, Russian month names (nominative case, lowercase
        or capitalized) are translated to English with months_translate()
        before parsing

    Returns: datetime.datetime
    '''
    russian_months = (
        'январь', 'февраль', 'март', 'апрель', 'май', 'июнь', 'июль',
        'август', 'сентябрь', 'октябрь', 'ноябрь', 'декабрь',
        'Январь', 'Февраль', 'Март', 'Апрель', 'Май', 'Июнь', 'Июль',
        'Август', 'Сентябрь', 'Октябрь', 'Ноябрь', 'Декабрь'
    )
    if translate:
        # translate word by word, leave everything but month names as is
        words = [
            months_translate(word, kind='rus-eng', capitalize=True)
            if word in russian_months else word
            for word in x.split()
        ]
        x = ' '.join(words)
    return dt.datetime.strptime(x, kind)

In [None]:
def months_translate(x, kind='rus-eng', add_year=None, capitalize=True):

    '''
    Translate month name between Russian and English
    (case of the input does not matter)
    'январь' --> 'January'

    kind: 'rus-eng' or 'eng-rus'
    if add_year==2021: 'январь' --> 'January 2021'
    if capitalize==False: 'январь' --> 'january'

    Raises:
        ValueError: if `kind` is unknown or x is not a month name
        in the source language
    '''

    rus_to_eng = {
        'январь': 'january',
        'февраль': 'february',
        'март': 'march',
        'апрель': 'april',
        'май': 'may',
        'июнь': 'june',
        'июль': 'july',
        'август': 'august',
        'сентябрь': 'september',
        'октябрь': 'october',
        'ноябрь': 'november',
        'декабрь': 'december'
    }

    if kind == 'rus-eng':
        replace_dict = rus_to_eng
    elif kind == 'eng-rus':
        replace_dict = {eng: rus for rus, eng in rus_to_eng.items()}
    else:
        raise ValueError("'kind' must be 'rus-eng' or 'eng-rus'")

    # direct lookup of the lowercased name; an unknown name is reported
    # explicitly instead of failing later on an unbound variable
    x_old = x.lower()
    if x_old not in replace_dict:
        raise ValueError(f"Unknown month name for kind '{kind}': {x}")
    x_new = replace_dict[x_old]

    if capitalize:
        x_new = x_new.capitalize()

    if add_year is not None:
        x_new = x_new + ' ' + str(add_year)

    return x_new

In [None]:
def axis_new_year(
        months=[1, 4, 7, 10],
        month_format='%b',
        year_format='%Y',
        add_year_axis=True,
        year_axis_pad=-0.105,
        language='eng',
        months_as_minor=False,
        months_pad=5,
        capitalize=True,
        ax=None):

    '''
    /// IMPORTANT: this function changes the process-wide locale
        (locale.setlocale): 'en_US' for language=='eng',
        'ru_RU.UTF-8' for language=='rus'.
        If language=='rus' was used for the last ax of a figure,
        call set_location('EN') after plt.show() to restore English.

    /// Also, the 'language' argument set for the last ax
        is applied to all axes of the plot.

    Modify date format of x axis from datetime (for example, '2021-01-01') to:

    ---|-------|----- ... ---|--------|-------
      Jan     Feb           Dec      Jan
      2021                           2022

    Arguments:
    months: month numbers that get ticks
    month_format, year_format: strftime formats of month and year labels
    add_year_axis: add secondary x axis with year labels
    year_axis_pad: position of the secondary axis (passed to secondary_xaxis)
    language: 'eng' or 'rus'
    months_as_minor: January becomes the only major tick,
        other `months` become minor ticks
    months_pad: padding of minor tick labels
    capitalize: capitalize month labels
    ax: axes to modify (current axes by default)
    '''

    # set ax
    if ax is None:
        ax = plt.gca()

    # specify 1st month for major ticks and other months for minor ticks
    if months_as_minor:
        # minor months - all except 1
        months_minor = [month for month in months if month != 1]
        months_major = 1
        loc_month_minor = mdates.MonthLocator(bymonth=months_minor)
    else:
        months_major = months

    # major ticks
    loc_month_major = mdates.MonthLocator(bymonth=months_major)
    # set format of months labels
    fmt_month = mdates.DateFormatter(month_format)
    # major ticks every year
    loc_year = mdates.YearLocator()
    # set format of year labels
    fmt_year = mdates.DateFormatter(year_format)

    # set month major ticks
    ax.xaxis.set_major_locator(loc_month_major)
    ax.xaxis.set_major_formatter(fmt_month)
    # set month minor ticks if necessary
    if months_as_minor:
        ax.xaxis.set_minor_locator(loc_month_minor)
        ax.xaxis.set_minor_formatter(fmt_month)
        ax.tick_params(axis='x', which='minor', pad=months_pad)

    # set secondary axis with major ticks as year
    if add_year_axis:
        second_xaxis = ax.secondary_xaxis(year_axis_pad)
        second_xaxis.xaxis.set_major_locator(loc_year)
        second_xaxis.xaxis.set_major_formatter(fmt_year)

        # hide the second x-axis spines and ticks; kept inside this branch
        # because the secondary axis exists only when add_year_axis is True
        second_xaxis.spines['bottom'].set_visible(False)
        second_xaxis.tick_params(bottom=False)

    # translate months if necessary
    if language=='eng':
        locale.setlocale(locale.LC_ALL,'en_US')
    elif language=='rus':
        locale.setlocale(locale.LC_ALL,'ru_RU.UTF-8')
    else:
        print("'language' have to be 'eng' or 'rus'")

    # capitalize months if necessary
    if capitalize:
        function = lambda x,pos: mdates.DateFormatter(month_format)(x,pos).capitalize()
        ax.xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(function))
        ax.xaxis.set_minor_formatter(matplotlib.ticker.FuncFormatter(function))

In [None]:
def set_location(loc='EN'):
    '''
    Set process locale (LC_ALL): 'EN' -> 'en_US', 'RU' -> 'ru_RU.UTF-8'

    For any other value prints a message and changes nothing.
    '''
    if loc not in ('EN', 'RU'):
        print("Location have to be 'EN' or 'RU'")
        return
    locale_name = 'en_US' if loc == 'EN' else 'ru_RU.UTF-8'
    locale.setlocale(locale.LC_ALL, locale_name)

In [None]:
def customize_axis(
        ax=None, x_offset=10, y_offset=10, ticks_width=1, axis_width=1,
        ticks_color='0.9', label_color='0.5', axis_color=None, **kwargs):
    '''
    Restyle ticks and the bottom/left spines of axes.

    ax: axes to modify (current axes by default)
    x_offset, y_offset: outward shift of bottom / left spine
    ticks_width, ticks_color, label_color: passed to ax.tick_params
    axis_width: line width of bottom and left spines
    axis_color: color of bottom and left spines (unchanged if None)
    **kwargs: accepted but not used
    '''

    axes = plt.gca() if ax is None else ax

    axes.tick_params(
        width=ticks_width, colors=ticks_color, labelcolor=label_color)

    for side, shift in (('bottom', x_offset), ('left', y_offset)):
        spine = axes.spines[side]
        spine.set_position(('outward', shift))
        spine.set_linewidth(axis_width)
        if axis_color is not None:
            spine.set_color(axis_color)

In [None]:
def outward_axis(ax=None, x_offset=5, y_offset=5):
    '''
    Shift bottom spine by x_offset and left spine by y_offset outward.

    ax: axes to modify (current axes by default)
    '''

    axes = plt.gca() if ax is None else ax

    for side, shift in (('bottom', x_offset), ('left', y_offset)):
        axes.spines[side].set_position(('outward', shift))

In [None]:
def axis_rstyle(
        y_ticks=None,
        x_ticks=None,
        y_slice=None,
        x_slice=None,
        y_lim=None,
        x_lim=None,
        offset_left=5,
        offset_bottom=5,
        width=0.75,
        margin=True,
        spines_color=None,
        ticks_color=None,
        ticklabels_color=None,
        grid=False,
        ax=None):
    
    '''
    Restyle axes: bottom/left spines are bounded by the first and last
    tick and shifted outward.

    Arguments:
    x_ticks: tuple (x_min, x_max, step); with only (x_min, x_max)
        the step is taken from the distance between current ticks
    y_ticks: tuple (y_min, y_max, step); same rule for a missing step
    x_slice, y_slice: arguments for slice() applied to the tick arrays
    x_lim, y_lim: (min, max) axis limits
    offset_left, offset_bottom: outward shift of left / bottom spine
    width: line width of spines and ticks (skipped if falsy)
    margin: None - leave margins as is; iterable - passed to
        ax.margins(*margin); True - x margin 0.01; any other value - used
        as x margin. In the last two cases the y margin is
        x_margin * ax_width / ax_height.
    spines_color, ticks_color, ticklabels_color: optional colors
    grid: if False the grid is turned off
    ax: axes to modify (current axes by default)

    Dependencies: 
        import: collections
        functions: arange
    '''
    
    if ax is None: ax = plt.gca()

    # order of steps is important:
        # 1 - get ticks
        # 2 - set margins if necessary
        # 3 - manipulations with ticks
        # 4 - update ticks
        # 5 - spines modification
        # 6 - set limits
        # 7 - tick params
        # 8 - grid

    # get ticks
    xticks = ax.get_xticks()
    yticks = ax.get_yticks()

    if margin is not None:
        # NOTE(review): a str is also Iterable and would be unpacked here
        if isinstance(margin, collections.abc.Iterable):
            ax.margins(*margin)
        else:
            margin = 0.01 if margin is True else margin
            # margins are fractions of the data range, so equal fractions
            # give visually different paddings on a non-square ax.
            # To get equal paddings the y margin is derived from x margin:
                # ax_width * margin_x = ax_height * margin_y
                # margin_y = (margin_x * ax_width) / ax_height
            # so both margins look equal and do NOT depend on ax size
            ax_height, ax_width = ax.bbox.height, ax.bbox.width
            margin_y = margin * ax_width / ax_height
            ax.margins(x=margin, y=margin_y)

    # declare xticks and yticks if necessary
    if x_ticks is not None:
        # if step not specified, reuse the spacing of the current ticks
        if len(x_ticks) == 2:
            x_step = xticks[1] - xticks[0]
            x_ticks = np.append(x_ticks, x_step)
        # last argument True: include x_max itself
        xticks = arange(x_ticks[0], x_ticks[1], x_ticks[2], True)
    if y_ticks is not None:
        # if step not specified, reuse the spacing of the current ticks
        if len(y_ticks) == 2:
            y_step = yticks[1] - yticks[0]
            y_ticks = np.append(y_ticks, y_step)
        # last argument True: include y_max itself
        yticks = arange(y_ticks[0], y_ticks[1], y_ticks[2], True)

    # declare xticks and yticks with slices if necessary
    if x_slice is not None:
        x_slice_ = slice(*x_slice)
        xticks = xticks[x_slice_]
    if y_slice is not None:
        y_slice_ = slice(*y_slice)
        yticks = yticks[y_slice_]

    # update ticks
    ax.set_xticks(xticks)
    ax.set_yticks(yticks)

    # customize spines: draw them only between the first and the last tick
    ax.spines['bottom'].set_bounds(xticks[0], xticks[-1])
    ax.spines['bottom'].set_position(('outward', offset_bottom))
    ax.spines['left'].set_bounds(yticks[0], yticks[-1])
    ax.spines['left'].set_position(('outward', offset_left))

    if spines_color is not None:
        ax.spines['bottom'].set_color(spines_color)
        ax.spines['left'].set_color(spines_color)
    if ticks_color is not None:
        ax.tick_params(
            which='both',
            color=ticks_color)
    if ticklabels_color is not None:
        ax.tick_params(
            which='both',
            labelcolor=ticklabels_color)

    # width of 0 / None leaves line widths untouched
    if width:
        ax.spines['bottom'].set_linewidth(width)
        ax.spines['left'].set_linewidth(width)
        ax.tick_params(which='both', width=width)

    # set limits if necessary
    if x_lim is not None:
        ax.set_xlim(x_lim[0], x_lim[1])
    if y_lim is not None:
        ax.set_ylim(y_lim[0], y_lim[1])
    
    # ticks point outward and are shown on bottom and left sides
    ax.tick_params(
        which='both', direction='out', bottom=True, left=True)

    # turn off grid (grid=True leaves the current grid state as is)
    if not grid:
        ax.grid(False)

In [None]:
def to_round(x, scale=1, error='skip'):

    '''
    Round x to `scale` digits if possible

    If x cannot be rounded (TypeError):
        error == 'skip' - returns x unchanged
        error == 'type' - prints a message and returns x unchanged
        otherwise - prints a hint about valid 'error' values, returns None
    '''
    try:
        return round(x, ndigits=scale)
    except TypeError:
        pass

    if error == 'skip':
        return x
    if error == 'type':
        print(f'TypeError: {x}')
        return x

    print("'error' must be 'type' or 'skip'")
    return None

In [None]:
def to_float(x):
    '''
    Convert x to Float if possible

    If conversion fails, prints a message and returns x unchanged.
    '''
    try:
        return float(x)
    except ValueError:
        print(f'ValueError: {x}')
        return x
    except TypeError:
        # e.g. None or a list: not convertible, so return it unchanged
        # like to_int() does instead of raising
        print(f'TypeError: {x}')
        return x

In [None]:
def to_int(x, errors=False):
    '''
    Convert x to Int if possible

    If conversion fails returns x unchanged;
    with errors=True also prints which error occurred.
    '''
    try:
        return int(x)
    except ValueError:
        # report before returning: the print used to sit after `return`
        # and was never executed
        if errors:
            print(f'ValueError: {x}')
        return x
    except TypeError:
        if errors:
            print(f'TypeError: {x}')
        return x

In [None]:
def to_string(x):
    '''
    Convert x to String if possible

    On ValueError or TypeError prints a message and returns x unchanged.
    '''
    try:
        text = str(x)
    except (ValueError, TypeError):
        print(f'ValueError: {x}')
        return x
    return text

In [None]:
def skewness(df):
    '''
    Skewness of numeric columns of DataFrame.

    Returns: pd.DataFrame indexed by column name with columns
        'Skewness' - skewness value
        'Highly skewed' - True if |skewness| > 0.5
    sorted by |skewness| (descending)
    '''

    table = df.skew(numeric_only=True).to_frame('Skewness')
    magnitude = table['Skewness'].abs()

    table['Highly skewed'] = magnitude > 0.5
    # temporary column used only for sorting
    table['abs'] = magnitude

    table = table.sort_values(by=['abs', 'Highly skewed'], ascending=False)

    return table.drop('abs', axis=1)

In [None]:
def kurtosis(df):
    '''
    Kurtosis of numeric columns of DataFrame.

    Returns: pd.DataFrame indexed by column name with columns
        'Kurtosis' - kurtosis value
        'Type' - 'Too Peaked' (> 1), 'Too Flat' (< -1),
                 'Normal' (between -1 and 1 inclusive),
                 NaN if kurtosis is NaN
    sorted by |kurtosis| (descending)
    '''

    df = pd.DataFrame(df.kurtosis(numeric_only=True),
                      columns=['Kurtosis'],
                      index=None)
    # object column: writing string labels into a float NaN column is an
    # incompatible-dtype assignment that pandas deprecates
    df['Type'] = pd.Series(np.nan, index=df.index, dtype=object)

    df.loc[df['Kurtosis'] > 1, 'Type'] = 'Too Peaked'
    df.loc[df['Kurtosis'] < -1, 'Type'] = 'Too Flat'
    df.loc[(df['Kurtosis'] <= 1) & (df['Kurtosis'] >= -1), 'Type'] = 'Normal'

    # temporary column used only for sorting
    df['abs'] = abs(df['Kurtosis'])
    df = df.sort_values(by=['abs', 'Type'], ascending=False)
    df = df.drop('abs', axis=1)

    return df

In [None]:
def not_none(x):
    '''
    Return True if x is not None, False otherwise
    '''
    return x is not None

In [None]:
def axis_add_xaxis(
            ticks,
            labels,
            width=0.5,
            offset=0,
            offset_first_axis=5,
            color_labels=None,
            color_ticks=None,
            ax=None):

    '''
    Add extra x axes (labels + ticks) below the plot, one level per
    (ticks, labels) pair. Every level is built from two twin axes:
    one shows only labels, the other only tick marks.

    Arguments:
    ticks: sequence of tick counts, one per level
    labels: sequence of label lists, one per level
    width: width of tick marks
    offset: outward offset of the first level; grows by 15 per level
    offset_first_axis: outward offset of tick marks of the first level
    color_labels, color_ticks: optional colors of labels / tick marks
    ax: base axes (current axes by default)

    Dependencies:
        functions: list_add_after_every
        (presumably interleaves ' ' between labels - verify its contract)
    '''

    if ax is None:
        ax = plt.gca()

    ax_xticks = ax.get_xticks()
    ax_xticks_length = len(ax_xticks)
    ax_xticks_min, ax_xticks_max = ax_xticks[0], ax_xticks[-1]
    ax_xticks_lim = ax.get_xlim()

    for idx, (n, l) in enumerate(zip(ticks, labels)):

        if n < ax_xticks_length:
            # fewer labels than base ticks: double the grid and pad the
            # labels so that every second position carries a label
            n_corrected = (2*n-1) + 2
            l = list_add_after_every(l, ' ', 1)
            l = [''] + l
        else:
            n_corrected = n

        t = np.linspace(ax_xticks_min, ax_xticks_max, n_corrected)

        # axes for labels
        ax_labels = ax.twiny()
        ax_labels.grid(False)
        ax_labels.set_xticks(ticks=t, labels=l)

        if idx == 0:
            ax_labels.set_xlim(ax_xticks_lim)

        ax_labels.spines[['top', 'left', 'bottom', 'right']].set_visible(False)
        ax_labels.spines['bottom'].set_position(('outward', offset))

        ax_labels.tick_params(
            axis='x', direction='out',
            top=False, left=False, right=False, bottom=False,
            labeltop=False, labelbottom=True, labelleft=False, labelright=False)
        if color_labels:
            ax_labels.tick_params(axis='x', labelcolor=color_labels)

        # axes for ticks
        ax_ticks = ax.twiny()
        ax_ticks.grid(False)
        ax_ticks.set_xticks(t)

        if color_ticks:
            ax_ticks.tick_params(axis='x', color=color_ticks)

        if idx == 0:
            # use the caller's value; the former `pad = offset_first_axis=5,`
            # bound the tuple (5,) and overwrote the parameter
            pad = offset_first_axis
            ax_ticks.set_xlim(ax_xticks_lim)
            size = 3
        else:
            pad = offset + 5
            size = 10

        ax_ticks.tick_params(
            axis='x', direction='out', width=width, size=size,
            bottom=True, labelbottom=False, top=False, labeltop=False)

        ax_ticks.spines[['top', 'left', 'bottom', 'right']].set_visible(False)
        ax_ticks.spines['bottom'].set_position(('outward', pad))

        if idx != 0:
            # hide every second tick (not count first and last)
            for i in ax_ticks.xaxis.get_major_ticks()[1:-1][::2]:
                i.set_visible(False)
            ax_ticks.xaxis.get_major_ticks()[0].set_visible(False)
            ax_ticks.xaxis.get_major_ticks()[-1].set_visible(False)

        # next level goes lower
        offset += 15

In [None]:
def put_column_after(data, column_to_move, column_insert_after):

    '''
    Returns a copy of 'data' in which 'column_to_move' sits immediately
    after 'column_insert_after'. The input 'data' is not modified.

    Before:
     col1 | column_insert_after | col2 | col3 | col4 | column_to_move | col5
    -------------------------------------------------------------------------
    
    After:
     col1 | column_insert_after | column_to_move | col2 | col3 | col4 | col5
    -------------------------------------------------------------------------

    If both names are the same, the column order is left unchanged.

    Raises:
        KeyError - if either column is not present in 'data'
    '''

    # Moving a column "after itself" is a no-op. Without this guard the
    # pop() below would remove the anchor column and the subsequent
    # get_loc() would fail with a misleading KeyError.
    if column_to_move == column_insert_after:
        if column_to_move not in data.columns:
            raise KeyError(column_to_move)
        return data.copy()

    df = data.copy()

    # take the column out first so the anchor position is computed
    # against the remaining columns
    col = df.pop(column_to_move)
    idx = df.columns.get_loc(column_insert_after) + 1
    df.insert(idx, column_to_move, col)

    return df

In [None]:
def data_describe(data):

    '''
    Builds a per-column summary of 'data'.

    Returns a DataFrame indexed by the column names of 'data' with columns:
        Type        - dtype of the column
        Count       - number of non-null values
        Unique      - number of unique values (as counted by nunique())
        NaN         - number of null values
        Percentages - share of null values as an integer percent

    Rows are sorted by 'NaN' in descending order.
    '''

    # null mask is reused for both the NaN count and the NaN fraction
    is_null = data.isnull()

    # variable types
    dtypes = data.dtypes.rename('Type')
    # frequency
    frequency = data.count().rename('Count')
    # unique values
    unique = data.nunique().rename('Unique')
    # NaNs
    nans = is_null.sum().rename('NaN')
    # NaNs fraction, rounded to two decimals
    nans_frac = is_null.mean().round(2)
    # Convert the fraction to integer percent. round() is required before the
    # integer cast: e.g. 0.29 * 100 evaluates to 28.999999999999996, which a
    # bare astype('int64') would truncate to 28.
    # fillna(0): a frame with no rows yields a NaN fraction, which cannot be
    # cast to int64; report 0% in that case.
    percentages = (nans_frac * 100).round().fillna(0).astype('int64')
    percentages = percentages.rename('Percentages')

    # df with results
    results = pd.concat(
        [dtypes, frequency, unique, nans, percentages], axis=1)
    results = results.sort_values(['NaN'], ascending=False)

    return results

In [None]:
def plot_gridplot(
        data, features, target=None, figsize=None, ncols=3, kind='reg',
        plot_shape='rectangle', markersize=15, hscale=1, pscale=1, regplot_kwargs=None,
        pointplot_kwargs=None, scatterplot_kwargs=None, histplot_kwargs=None):

    '''
    Draws one seaborn plot per feature on a grid of subplots, shows the
    figure and returns it.

    Arguments:
        data - passed as 'data' to the seaborn plotting function
        features - sized iterable; each element is used as 'x' of its own subplot
        target - used as 'y' for kinds 'reg', 'point' and 'scatter'
                 (not used by 'hist')
        figsize - (width, height) of the figure; if None it is computed
                  from plot_shape, ncols, the number of rows and pscale
        ncols - number of subplot columns
        kind - 'reg' (sns.regplot), 'point' (sns.pointplot),
               'hist' (sns.histplot) or 'scatter' (sns.scatterplot)
        plot_shape - 'square' (2 x 2 per subplot) or 'rectangle' (4 x 2.5
                     per subplot); only used when figsize is None
        markersize - marker size for 'reg', 'point' and 'scatter'
        hscale - multiplier for the vertical spacing between subplot rows
        pscale - multiplier for the computed figure size and for the
                 error bar line width of 'point'
        regplot_kwargs, pointplot_kwargs, scatterplot_kwargs, histplot_kwargs -
            dict (or None) of extra keyword arguments for the matching
            seaborn function; values given here override the defaults set
            by this function. For 'reg', a 'scatter_kws' entry is merged
            into the default scatter settings.

    Raises:
        ValueError - if 'kind' is not one of the supported kinds, or if
                     'plot_shape' is unknown while figsize is None
    '''

    # validate up front so that a typo does not silently produce an empty figure
    if kind not in ('reg', 'point', 'hist', 'scatter'):
        raise ValueError(
            f"unknown kind {kind!r}; expected 'reg', 'point', 'hist' or 'scatter'")

    nrows = math.ceil(len(features) / ncols)

    if figsize is None:
        # (width, height) of a single subplot before scaling
        shape_scales = {'square': (2, 2), 'rectangle': (4, 2.5)}
        if plot_shape not in shape_scales:
            raise ValueError(
                f"unknown plot_shape {plot_shape!r}; expected 'square' or 'rectangle'")
        cell_width, cell_height = shape_scales[plot_shape]
        figsize = (cell_width * ncols * pscale, cell_height * nrows * pscale)

    fig = plt.figure(figsize=figsize)

    # Pick the seaborn function and build its keyword arguments once;
    # caller-supplied kwargs come last so they override the defaults.
    if kind == 'reg':
        extra = dict(regplot_kwargs or {})
        scatter_kws = {'ec': '#606060', 's': markersize, 'alpha': 0.9}
        scatter_kws.update(extra.pop('scatter_kws', None) or {})
        draw = sns.regplot
        plot_kwargs = {'y': target, 'scatter_kws': scatter_kws, **extra}
    elif kind == 'point':
        draw = sns.pointplot
        plot_kwargs = {
            'y': target,
            'markersize': markersize,
            'linestyle': 'none',
            'capsize': 0.031,
            'err_kws': {'lw': 0.81*pscale},
            **(pointplot_kwargs or {}),
        }
    elif kind == 'hist':
        draw = sns.histplot
        plot_kwargs = {'alpha': 0.95, **(histplot_kwargs or {})}
    else:  # kind == 'scatter'
        draw = sns.scatterplot
        plot_kwargs = {'y': target, 's': markersize, **(scatterplot_kwargs or {})}

    # subplot positions are 1-based
    for position, feature in enumerate(features, start=1):
        plt.subplot(nrows, ncols, position)
        draw(data=data, x=feature, **plot_kwargs)
        plt.ylabel(None)
        if kind == 'point':
            plt.xticks(rotation=45)

    plt.subplots_adjust(hspace=0.4*hscale)
    plt.show()
    return fig