In [1]:
import os
import warnings

import polars as pl
import numpy as np
import pandas as pd
from scipy.optimize import curve_fit
from lets_plot import *
LetsPlot.setup_html()

In [2]:
def double_exp(x, a, b, c, d):
    return a * np.exp(-x * b) + (d) * np.exp(-x * c)


def powerlaw(x, a, b):
    return a * x ** (-b)


def exp_decay(x, a, b):
    return a * np.exp(-x * b)


def tri_exp(x, a, b, c, d, e, f):
    return a * np.exp(-x * b) + c * np.exp(-x * d) + e * np.exp(-x * f)


def quad_exp(x, a, b, c, d, e, f, g, h):
    return (
        a * np.exp(-x * b)
        + c * np.exp(-x * d)
        + e * np.exp(-x * f)
        + g * np.exp(-x * h)
    )


def penta_exp(x, a, b, c, d, e, f, g, h, i, j):
    return (
        a * np.exp(-x * b)
        + c * np.exp(-x * d)
        + e * np.exp(-x * f)
        + g * np.exp(-x * h)
        + i * np.exp(-x * j)
    )

def deleteNaN(y: np.ndarray,t: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """
    delete NaN parts of the input array and time array opened for it,
    and returns time array and values array.

    """

    t = t[~np.isnan(y)]
    y = y[~np.isnan(y)]

    return t, y

def value_fit(
    val: np.ndarray, t: np.ndarray, tmax: int, 
    eq: callable, sigma_w: bool = False,
    delete_nan: bool = True
) -> tuple[np.ndarray, np.ndarray, tuple]:
    """

    Parameters
    ----------
    val : np.ndarray
        Values 1d array to fit.
    eq : callable
        Equation to create a fit.

    Returns
    -------
    y_fit : np.ndarray
        1d Fitted values array.
    ss_res_norm : np.ndarray
        Sum of squares of residuals normalized.
    popt : tuple

    """
    t_range = np.arange(tmax) + 1

    if delete_nan:
        t, val = deleteNaN(val,t)

    if sigma_w:
        sigma = t.astype(float)**-10
        popt, _ = curve_fit(eq, t, val, maxfev=20000000, sigma=sigma)

    else:
        popt, _ = curve_fit(eq, t, val, maxfev=20000000)


    y_fit = eq(t_range, *popt)  # full time length
    y_fit[y_fit < 1] = np.nan  # too small values to be removed
    y_fit[y_fit > np.max(val) * 2] = np.nan  # too big values removed

    return y_fit



In [3]:

def arr_minimize(arr: np.ndarray, method: str = "median") -> np.ndarray:
    """
    Minimizes 1d array by removing repeats, according to the given method.

    Parameters
    ----------
    arr : np.ndarray
        1d array to be minimized.
    method : str, optional
        'median' or 'average'. The default is 'median'.

    Returns
    -------
    arr1 : np.ndarray
        minimized array.

    """

    search = np.unique(arr)  # arr of unique elements
    search = search[search > 0]  # remove nans

    arr1 = arr.copy()

    for s in search:
        (positions,) = np.where(arr == s)
        if method == "median":
            mid = int(np.median(positions))

        elif method == "average":
            mid = int(np.average(positions))
        elif method == "max":
            mid = int(np.max(positions))
        elif method == "min":
            mid = int(np.min(positions))

        arr1[positions] = np.nan
        arr1[mid] = s  # mid value is kept

    return arr1


def df_minimize(df: pd.DataFrame, **kwargs) -> pd.DataFrame:
    """

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame to be minimized.

    Returns
    -------
    df : pd.DataFrame
        Minimized DataFrame.

    """
    for i in range(len(df.columns)):
        df.iloc[:, i] = arr_minimize(
            df.iloc[:, i], **kwargs
        )  # values minimized and returned

    return df

In [4]:
# values dataframe
df = pd.read_csv("./data/duration_cont.csv", index_col=None)
df = df_minimize(df, method="median")
df[df == 0] = np.nan
df = df.dropna(axis=0, how="all")
df["timestep"] = df.index + 1
df = pl.from_pandas(df)

In [5]:
df.tail()

uniform_250,uniform_350,gaus_250,gaus_350,3.50_60,timestep
f64,f64,f64,f64,f64,i64
,2.0,,,,1499
,,,2.0,,1605
,,,,1.0,1772
,1.0,,,,2038
,,,1.0,,2080


In [6]:
def sample_ends_favored_1D(df: pd.DataFrame, n: int = 10) -> pd.DataFrame:
    """sample a dataframe but favor the ends

    Parameters
    ----------
    df : pd.DataFrame

    n : int, optional
        end size, by default 10

    Returns
    -------
    pd.DataFrame
        sampled dataframe
    """


    mid_df = df.iloc[n:-n]
    sampled_df = pd.concat(
        [df.head(n), mid_df.sample(frac=0.20, random_state=42), df.tail(n)],
        ignore_index=True
    )
    return sampled_df

In [7]:
def sample_EF_1D(df: pl.DataFrame, n: int = 10) -> pd.DataFrame:
    """sample a dataframe but favor the ends

    Parameters
    ----------
    df : pd.DataFrame

    n : int, optional
        end size, by default 10

    Returns
    -------
    pd.DataFrame
        sampled dataframe
    """

    

    mid_df = df.slice(n, len(df) - 2 * n)
    sampled_df = pl.concat(
        [df.head(n), mid_df.sample(fraction=0.20, seed=42), df.tail(n)],
        how="vertical",
    ).unpivot(index="timestep", 
              value_name="remaining", 
              variable_name="case")
    
    return sampled_df

In [8]:
def sample_EF_2D(df: pl.DataFrame, n: int = 10) -> pd.DataFrame:

    all_parts = [
        sample_EF_1D(df.select([col, "timestep"]), n=n).with_columns(
            pl.lit(col).alias("case")
        )
        for col in df.columns if col != "timestep"
    ]
    collected = pl.concat(all_parts, how="vertical")

    return collected.drop_nulls()

In [9]:
def add_type_um(df: pl.DataFrame) -> pl.DataFrame:

    return df.with_columns(
        pl.col("case").str.split("_").list.get(0).alias("type"),
        pl.col("case").str.split("_").list.get(1).alias("um"),
    )


In [10]:
df = sample_EF_2D(df, n=40)
df

timestep,case,remaining
i64,str,f64
1,"""uniform_250""",1.192739e6
2,"""uniform_250""",779689.0
3,"""uniform_250""",541090.0
4,"""uniform_250""",389455.0
5,"""uniform_250""",288740.0
…,…,…
996,"""3.50_60""",6.0
1061,"""3.50_60""",5.0
1124,"""3.50_60""",4.0
1379,"""3.50_60""",2.0


In [11]:
df = df.with_columns(
    pl.col("case").str.split("_").list.get(0).alias("distribution"),
    pl.col("case").str.split("_").list.get(1).alias("around"),
)

In [12]:
df.filter(pl.col("case") == "uniform_350").sort("timestep").tail()

timestep,case,remaining,distribution,around
i64,str,f64,str,str
962,"""uniform_350""",6.0,"""uniform""","""350"""
1026,"""uniform_350""",5.0,"""uniform""","""350"""
1153,"""uniform_350""",4.0,"""uniform""","""350"""
1499,"""uniform_350""",2.0,"""uniform""","""350"""
2038,"""uniform_350""",1.0,"""uniform""","""350"""


In [13]:
df.filter(pl.col("case") == "gaus_350").sort("timestep").tail()

timestep,case,remaining,distribution,around
i64,str,f64,str,str
1168,"""gaus_350""",5.0,"""gaus""","""350"""
1289,"""gaus_350""",4.0,"""gaus""","""350"""
1388,"""gaus_350""",3.0,"""gaus""","""350"""
1605,"""gaus_350""",2.0,"""gaus""","""350"""
2080,"""gaus_350""",1.0,"""gaus""","""350"""


In [14]:
cases = df.select("case").unique().to_series().to_list()
cases

['uniform_350', 'gaus_350', 'uniform_250', '3.50_60', 'gaus_250']

In [15]:
df.select("timestep").to_series().max()

2080

In [16]:
# make fit
tmax = df.select("timestep").to_series().max()
fits_arr = list()
fits = pl.DataFrame().with_columns(
    pl.int_range(1, tmax + 1).alias("timestep"),
)
cases = df.select("case").unique().to_series().to_list()
for c in cases:
    if not c == "3.50_60":
        case_df = df.filter(pl.col("case") == c)
        timestep = case_df.select("timestep").to_numpy()
        data = case_df.select("remaining").to_numpy()
    
        fits = fits.with_columns(
            pl.Series(value_fit(data, timestep, tmax=tmax, eq=tri_exp)).alias(f"{c}_tri.exp"),
            pl.Series(value_fit(data, timestep, tmax=tmax, eq=powerlaw)).alias(f"{c}_powerlaw"),
            pl.Series(value_fit(data, timestep, tmax=tmax, eq=powerlaw,sigma_w=True)).alias(f"{c}_powerlaw.weighted"),
        )

  return a * np.exp(-x * b) + c * np.exp(-x * d) + e * np.exp(-x * f)
  return a * x ** (-b)
  return a * x ** (-b)


In [17]:
fits

timestep,gaus_350_tri.exp,gaus_350_powerlaw,gaus_350_powerlaw.weighted,uniform_250_tri.exp,uniform_250_powerlaw,uniform_250_powerlaw.weighted,uniform_350_tri.exp,uniform_350_powerlaw,uniform_350_powerlaw.weighted,gaus_250_tri.exp,gaus_250_powerlaw,gaus_250_powerlaw.weighted
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,25514.630246,31916.090969,,1.1926e6,1.2926e6,,11832.233498,15731.226798,,2.6448e6,2.7771e6,
2,22247.21202,21017.945144,,780436.707191,625054.349329,,10714.146251,11100.63719,,1.4870e6,1.1333e6,
3,19574.471789,16461.396847,,539969.50762,408638.546806,,9779.927878,9052.65556,,871834.760306,670934.010626,
4,17371.847913,13841.106623,,389330.151503,302258.718352,,8993.945529,7833.091952,,524096.472221,462527.591245,
5,15542.309972,12099.482604,,289323.808374,239220.274884,,8327.776569,7001.459549,,320514.583742,346605.786576,
…,…,…,…,…,…,…,…,…,…,…,…,…
2076,,319.772944,1.005129,,430.870434,,,337.46967,,,142.726628,
2077,,319.68015,1.003818,,430.652989,,,337.387935,,,142.637784,
2078,,319.587427,1.002509,,430.435759,,,337.306259,,,142.549039,
2079,,319.494776,1.001203,,430.218743,,,337.224642,,,142.460391,


In [18]:
fitsm = fits.unpivot(index="timestep",
                     value_name="remaining",
                     variable_name="fit_case")

In [19]:
fitsm

timestep,fit_case,remaining
i64,str,f64
1,"""gaus_350_tri.exp""",25514.630246
2,"""gaus_350_tri.exp""",22247.21202
3,"""gaus_350_tri.exp""",19574.471789
4,"""gaus_350_tri.exp""",17371.847913
5,"""gaus_350_tri.exp""",15542.309972
…,…,…
2076,"""gaus_250_powerlaw.weighted""",
2077,"""gaus_250_powerlaw.weighted""",
2078,"""gaus_250_powerlaw.weighted""",
2079,"""gaus_250_powerlaw.weighted""",


In [20]:
fitsm = fitsm.with_columns(
    pl.col("fit_case").str.split("_").list.get(0).alias("distribution"),
    pl.col("fit_case").str.split("_").list.get(1).alias("around"),
    pl.col("fit_case").str.split("_").list.get(2).alias("equation")
)

In [21]:
gauss = fitsm.filter(pl.col("distribution")=="gaus")
uni = fitsm.filter(pl.col("distribution")=="uniform")

In [22]:
gauss.sample(12)

timestep,fit_case,remaining,distribution,around,equation
i64,str,f64,str,str,str
552,"""gaus_350_powerlaw.weighted""",36.43211,"""gaus""","""350""","""powerlaw.weighted"""
510,"""gaus_250_powerlaw.weighted""",,"""gaus""","""250""","""powerlaw.weighted"""
548,"""gaus_350_tri.exp""",,"""gaus""","""350""","""tri.exp"""
581,"""gaus_350_powerlaw""",688.882024,"""gaus""","""350""","""powerlaw"""
989,"""gaus_350_powerlaw.weighted""",7.499901,"""gaus""","""350""","""powerlaw.weighted"""
…,…,…,…,…,…
1364,"""gaus_250_powerlaw""",245.674282,"""gaus""","""250""","""powerlaw"""
419,"""gaus_250_tri.exp""",,"""gaus""","""250""","""tri.exp"""
10,"""gaus_250_tri.exp""",32574.19861,"""gaus""","""250""","""tri.exp"""
121,"""gaus_350_powerlaw.weighted""",2228.721206,"""gaus""","""350""","""powerlaw.weighted"""


In [23]:
df.sample(12)

timestep,case,remaining,distribution,around
i64,str,f64,str,str
220,"""uniform_350""",170.0,"""uniform""","""350"""
217,"""uniform_350""",199.0,"""uniform""","""350"""
223,"""gaus_350""",141.0,"""gaus""","""350"""
3,"""gaus_350""",19447.0,"""gaus""","""350"""
7,"""uniform_250""",170882.0,"""uniform""","""250"""
…,…,…,…,…
142,"""gaus_350""",295.0,"""gaus""","""350"""
692,"""3.50_60""",16.0,"""3.50""","""60"""
46,"""uniform_250""",1156.0,"""uniform""","""250"""
142,"""3.50_60""",361.0,"""3.50""","""60"""


In [24]:
df_gauss =df.filter(pl.col("distribution")=="gaus")

In [25]:
eqs = ["tri.exp","powerlaw","powerlaw.weighted"]

In [122]:
def plot_it(df:pl.DataFrame,fits:pl.DataFrame,color_by):

    return (ggplot()
            + geom_point(data=df, mapping=aes(x="timestep", y="remaining",fill=color_by),color="black",shape=21, stroke=0.5, size=4)
            + geom_line(data=fits, mapping=aes(x="timestep", y="remaining",color=color_by),size=2)
            + scale_color_brewer()
            + scale_fill_brewer()
            + scale_x_log10(format="~e")
            + scale_y_log10(format=".0~e")
            + theme_classic()
            + ggsize(800,400)
            + theme(legend_title=element_blank(), exponent_format="pow" )
            + lims(y=[0.5, float(10**7)], x=[0.5, 10_000])
            
            )

In [123]:
grid_gauss =[
        plot_it(df=df.filter(pl.col("distribution")=="gaus").sort("around"),
                fits=gauss.filter(pl.col("equation")==eq).sort("around"),
                color_by="around") 
                for eq in eqs 
                ]

In [124]:
grid_uni= [
        plot_it(df=df.filter(pl.col("distribution")=="uniform").sort("around"),
                fits=uni.filter(pl.col("equation")==eq).sort("around"),
                color_by="around")+ scale_color_viridis() + scale_fill_viridis()
                for eq in eqs 
                ]

In [125]:
all = gggrid(grid_gauss+grid_uni,ncol=3)

In [126]:
all

In [128]:
eq = "tri.exp"
(plot_it(df=df.filter(pl.col("distribution")=="uniform").sort("around"),
        fits=uni.filter(pl.col("equation")==eq).sort("around"),
        color_by="around")+ scale_color_viridis() + scale_fill_viridis()
        + ggsize(800,600)
)
        

In [129]:
(plot_it(df=df.filter(pl.col("distribution")=="uniform").sort("around"),
        fits=uni.filter(pl.col("equation")==eq).sort("around"),
        color_by="around")+ scale_color_viridis() + scale_fill_viridis()
        + ggsize(800,600)
).to_png("test.png")
        

'c:\\Users\\zafi_\\paper\\residence2\\5_distributedAffinity\\test.png'