In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.metrics import r2_score


from plotly import express as px
import plotly.graph_objects as go

In [581]:
def find_nearest_index(array, value):
    """Return the position of the element of ``array`` closest to ``value``.

    NaN entries are ignored during the search, but the returned position
    refers to the ORIGINAL (unfiltered) input, so it can be used directly to
    index ``array`` itself or a parallel index such as ``series.index``.

    Parameters
    ----------
    array : array-like of numbers (list, ndarray, pandas Series, ...)
        Values to search; may contain NaN.
    value : float
        Target value.

    Returns
    -------
    numpy integer
        Position of the first element with the smallest absolute distance
        to ``value``.

    Raises
    ------
    ValueError
        If ``array`` is empty or contains only NaN.
    """
    # Convert first so plain Python lists work, and search on the unfiltered
    # data: filtering NaNs out before argmin would shift the positions.
    values = np.asarray(array, dtype=float)
    return np.nanargmin(np.abs(values - value))

def _style_psm_figure(fig):
    """Apply the shared titles, font and template to a PSM figure and return it."""
    fig.update_layout(
        title='',
        xaxis_title='Цена',
        yaxis_title='Доля, %',
        legend_title='',
        font=dict(
            family="Gill Sans Nova",
            size=18,
            color="Black"
        ),
        template='plotly_white',
        xaxis=dict(showgrid=False),
        yaxis=dict(showgrid=False)
    )
    return fig


def _crop_range(counts, lower=0.1, upper=0.9):
    """Return the [lower, upper] quantile prices of all answers in ``counts``."""
    totals = counts.sum(axis=1)
    prices = totals.index.to_numpy()
    weights = totals.to_numpy(dtype=float)
    if np.all(np.mod(weights, 1) == 0):
        # Integer counts: expand every price by its count and take plain quantiles.
        expanded = np.repeat(prices, weights.astype(int))
        return [np.quantile(expanded, lower), np.quantile(expanded, upper)]
    # Fractional weights (e.g. shares) cannot be expanded; use the cumulative share.
    shares = np.cumsum(weights) / weights.sum()
    last = len(prices) - 1
    return [prices[min(np.searchsorted(shares, q), last)] for q in (lower, upper)]


def _linear_crossing(prices, gap):
    """Return the price where ``gap`` first reaches zero, interpolating linearly."""
    prices = np.asarray(prices)
    gap = np.asarray(gap, dtype=float)
    if gap.size == 0:
        return np.nan
    for i in range(gap.size - 1):
        left, right = gap[i], gap[i + 1]
        if left == 0:
            return prices[i]
        if left * right < 0:
            # Sign change between two neighbouring prices: intersect the segment with zero.
            return prices[i] + (prices[i + 1] - prices[i]) * left / (left - right)
    # The curves never cross on the grid: fall back to the closest approach.
    return prices[np.nanargmin(np.abs(gap))]


def _best_polynomial_fit(x, y, max_degree, max_r2):
    """Return fitted values of the best polynomial for one curve, or None if every fit failed."""
    best_r2 = -np.inf
    best_fit = None
    for degree in range(2, max_degree):
        try:
            fitted = np.poly1d(np.polyfit(x, y, degree))(x)
            score = r2_score(y, fitted)
        except (np.linalg.LinAlgError, ValueError, TypeError):
            # This degree could not be fitted or scored; try the next one.
            continue
        if score > best_r2:
            best_r2 = score
            best_fit = fitted
        if score > max_r2:
            # Good enough: stop before higher degrees start to overfit.
            break
    return best_fit


def draw_PSM(df, 
             raw_data=True, 
             NMS=False, 
             probabilities=[0.75, 0.25, 0.1, 0, 0], 
             range_x='crop', 
             save=False,
             drop_bad_respondents=True,
             approximation='linear',
             return_intersection_points=True,
             max_degree=50,
             max_r2=0.99
            ):
    """Build and plot PSM curves (presumably Van Westendorp's Price Sensitivity Meter).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns CHEAP, NORM, EXP and TOO_EXP. With
        ``raw_data=True`` each row is one respondent and the cells are prices.
        With ``raw_data=False`` the frame is treated as an already aggregated
        table indexed by price (assumed to be sorted ascending).
    raw_data : bool
        Whether ``df`` holds respondent-level answers.
    NMS : bool
        Only honoured for raw data. The CHEAP_PROBA / EXP_PROBA columns are
        then required, but the NMS calculation itself is not implemented yet.
    probabilities : list
        Currently unused (never read or mutated).
    range_x : None, 'crop' or a two-element sequence
        X-axis range of the plots. ``None`` uses the full price range, 'crop'
        uses the 10%-90% quantiles of all answers.
    save : bool
        Write PSM.svg (and PSM_approx.svg in polynomial mode).
    drop_bad_respondents : bool
        For raw data, keep only rows with CHEAP < NORM < EXP < TOO_EXP.
    approximation : {'linear', 'polynomial'}
        'linear' locates the price points by linear interpolation between
        neighbouring prices; 'polynomial' fits a polynomial to every curve and
        takes the grid price where the fitted curves are closest.
    return_intersection_points : bool
        Compute and return PMC, OPP, IDPP, PME and RANGE = PME - PMC.
    max_degree : int
        Polynomial degrees 2 .. max_degree - 1 are tried.
    max_r2 : float
        The degree search for a curve stops at the first fit whose R^2
        exceeds this value.

    Returns
    -------
    ``df`` alone; ``(df, points, RANGE)``; ``(df, df_approx)``; or
    ``(df, df_approx, points, RANGE)``, depending on ``approximation`` and
    ``return_intersection_points``. ``df`` holds the cumulative shares in
    percent, ``points`` is ``(PMC, OPP, IDPP, PME)``.

    Raises
    ------
    ValueError
        If intersection points are requested for an unknown ``approximation``,
        or if no polynomial could be fitted to a curve.
    """
    price_columns = ['CHEAP', 'NORM', 'EXP', 'TOO_EXP']

    # NMS is only handled for respondent-level data.
    if not raw_data:
        NMS = False

    if not NMS:
        df = df[price_columns]

    if raw_data:
        if drop_bad_respondents:
            consistent = ((df['CHEAP'] < df['NORM'])
                          & (df['NORM'] < df['EXP'])
                          & (df['EXP'] < df['TOO_EXP']))
            df = df[consistent]

        if NMS:
            # TODO: the probability columns are extracted but not used yet.
            df_nms = df[['CHEAP_PROBA', 'EXP_PROBA']].copy()
            df = df[price_columns]

        # One row per distinct answered price; NaN answers cannot be ordered, so drop them.
        answered = pd.unique(df.values.ravel('K'))
        answered = np.sort(answered[~pd.isna(answered)])
        counts = pd.DataFrame(index=answered)
        for column in df.columns:
            counts[column] = df[column].value_counts().reindex(counts.index, fill_value=0)
    else:
        # Aggregated input already is the per-price table.
        counts = df

    # Turn counts into cumulative shares in percent: the two "cheap" curves
    # fall with price, the two "expensive" curves rise.
    df = counts / counts.sum()
    df.columns = ['Слишком дешево', 'Дешево', 'Дорого', 'Слишком дорого']
    df.iloc[:, :2] = 1 - df.iloc[:, :2].cumsum()
    df.iloc[:, 2:] = df.iloc[:, 2:].cumsum()
    df = df * 100

    # crop range according to quantiles
    if range_x is None:
        range_x = [counts.index[0], counts.index[-1]]
    elif isinstance(range_x, str) and range_x == 'crop':
        range_x = _crop_range(counts)

    # polynomial approximation
    df_approx = None
    if approximation == 'polynomial':
        x = np.array(df.index)
        fits = {}
        # Silence fitting warnings locally instead of for the whole session.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            for column in df.columns:
                # Each curve gets its own search, so one curve's fit can never leak into another.
                fitted = _best_polynomial_fit(x, df[column], max_degree, max_r2)
                if fitted is None:
                    raise ValueError(f"Polynomial fit failed for column '{column}'")
                fits[column] = fitted
        df_approx = pd.DataFrame(fits, index=x)

    if return_intersection_points:
        if approximation == 'polynomial':
            curves = df_approx

            def crossing(gap):
                return gap.index[find_nearest_index(gap, 0)]
        elif approximation == 'linear':
            curves = df

            def crossing(gap):
                return _linear_crossing(gap.index, gap.to_numpy())
        else:
            raise ValueError("Specify approximation type (possible values: 'linear', 'polynomial')")

        too_cheap, cheap, expensive, too_expensive = (curves.iloc[:, i] for i in range(4))
        PMC = crossing(too_cheap - expensive)
        OPP = crossing(too_cheap - too_expensive)
        IDPP = crossing(cheap - expensive)
        PME = crossing(cheap - too_expensive)
        RANGE = PME - PMC

    # draw plots
    fig = _style_psm_figure(px.line(df, range_x=range_x))
    fig.show()

    fig_approx = None
    if approximation == 'polynomial':
        fig_approx = _style_psm_figure(px.line(df_approx, range_x=range_x))
        if return_intersection_points:
            # Each marker sits on the fitted curve that defines its price point.
            markers = [
                ('PMC', PMC, 'Слишком дешево'),
                ('OPP', OPP, 'Слишком дешево'),
                ('IDPP', IDPP, 'Дешево'),
                ('PME', PME, 'Дешево'),
            ]
            scatter_trace = go.Scatter(
                x=[price for _, price, _ in markers],
                y=[df_approx.loc[price, column] for _, price, column in markers],
                mode='markers+text',
                name='Price points', marker={'color': 'black', 'size': 7},
                text=[label for label, _, _ in markers], textposition='top center'
            )
            fig_approx.add_trace(scatter_trace)

        fig_approx.show()

    if save:
        if approximation == 'polynomial':
            fig_approx.write_image('PSM_approx.svg', format='svg')
        fig.write_image('PSM.svg', format='svg')

    if approximation == 'polynomial':
        if return_intersection_points:
            return df, df_approx, (PMC, OPP, IDPP, PME), RANGE
        # Price points were not requested, so only the curves can be returned.
        return df, df_approx
    if return_intersection_points:
        return df, (PMC, OPP, IDPP, PME), RANGE
    return df

In [582]:
# Read the Excel workbook into a DataFrame; it is handed to draw_PSM in the next cell.
df = pd.read_excel(
    'CON_2462_Водка_Руст_weight_PSM.xlsx'
)

In [583]:
# Polynomial mode with the default return_intersection_points=True yields
# (df, df_approx, (PMC, OPP, IDPP, PME), RANGE); save=True also writes
# PSM.svg and PSM_approx.svg.
res = draw_PSM(
    df,
    approximation='polynomial',
    save=True,
)