# Process retention behavior

This notebook selects the readings closest to a set of key time points in a read retention dataset and writes them to a minified file for caching.

## Import libraries

External libraries: `pandas`.

In [55]:
# Import libraries
import pandas as pd

## Get retention data and preprocess

In [56]:
# Get retention data
# The file is read as tab-separated with three unnamed columns: an address,
# a timestamp and a bracketed, ", "-separated list of level readings.
names = ["addr", "time", "lvl"]
data = pd.read_csv("../data/retention/retention4.csv", delimiter='\t', names=names)

# Expand the "[a, b, c]" string into one row per reading, then make it numeric.
data["lvl"] = data["lvl"].str.strip('[]').str.split(', ')
data = data.explode("lvl", ignore_index=True)
data["lvl"] = pd.to_numeric(data["lvl"])

# Drop zero readings. The explicit copy makes `data` an independent frame, so
# the column assignments below do not operate on a slice of the unfiltered
# frame (which would raise SettingWithCopyWarning).
# The index is deliberately NOT reset here: "i" below is derived from the
# row label assigned by explode(), i.e. the position before filtering.
data = data[data["lvl"] != 0].copy()

# Express time relative to the first remaining reading of each address.
data["time"] -= data.groupby("addr")["time"].transform("first")

# NOTE(review): 64 and 48 are taken as-is from the original cell; presumably
# 64 is the number of distinct levels cycled over addresses and 48 the number
# of readings per source row -- confirm against the measurement setup.
data["lvli"] = data["addr"] % 64
data["i"] = data.index % 48
data

Unnamed: 0,addr,time,lvl,lvli,i
0,0,0.000000,11,0,0
1,0,0.000000,16,0,1
2,0,0.000000,13,0,2
3,0,0.000000,4,0,3
4,0,0.000000,14,0,4
...,...,...,...,...,...
37183962,1845,11923.895216,52,53,42
37183964,1845,11923.895216,49,53,44
37183965,1845,11923.895216,50,53,45
37183966,1845,11923.895216,50,53,46


## Select target times and write to minified file for caching

In [57]:
# Get target times
# For every target time, pick for each (addr, i) pair the reading whose time
# is closest to the target, and keep it only if it lies within the relative
# window [0.9 * target, 1.1 * target]. For target 0 the window collapses to
# exactly time == 0.
times = [0, 0.1, 1, 10, 100, 1000, 10000, 100000]
window_lo, window_hi = 0.9, 1.1  # multiplicative bounds of the acceptance window

selected = []  # one filtered frame per target time
for target in times:
    # Row label of the reading nearest to `target` within each (addr, i) group.
    idx = (data["time"] - target).abs().groupby([data["addr"], data["i"]]).idxmin()
    # isin() keeps the rows in their original index order.
    nearest = data[data.index.isin(idx)]
    in_window = nearest["time"].between(target * window_lo, target * window_hi)
    # assign() returns a new frame, so the label column is never written into
    # a slice of `data` (the original `d["timept"] = time` triggered
    # SettingWithCopyWarning).
    picked = nearest[in_window].assign(timept=target)
    # Report how many readings survived for this target (0 means no reading
    # fell inside the window, e.g. the target lies beyond the recorded times).
    print(target, len(picked))
    selected.append(picked)

# Combine all targets and cache the result; compression is inferred from ".gz".
tdata = pd.concat(selected)
tdata.to_csv("../data/retention/retention4.min.csv.gz")
tdata

0 104676
0.1 94302
1 104676
10 35685
100 103038
1000 102531
10000 89193
100000 0


Unnamed: 0,addr,time,lvl,lvli,i,timept
0,0,0.000000,11,0,0,0.0
1,0,0.000000,16,0,1,0.0
2,0,0.000000,13,0,2,0.0
3,0,0.000000,4,0,3,0.0
4,0,0.000000,14,0,4,0.0
...,...,...,...,...,...,...
37176522,2286,9012.017305,45,46,42,10000.0
37176524,2286,9012.017305,44,46,44,10000.0
37176525,2286,9012.017305,39,46,45,10000.0
37176526,2286,9012.017305,43,46,46,10000.0
