# Process retention behavior

This notebook selects, for each key time point, the closest measurement in a read retention dataset and writes the selection to a minified file for caching.

## Import libraries

External libraries: `pandas`.

In [1]:
# Import libraries
import pandas as pd

## Get retention data and preprocess

In [2]:
# Get retention data
# The file is read as tab-separated with no header row; column names are supplied here.
names = ["addr", "time", "lvl"]
data = pd.read_csv("../data/retention/retention4.csv.gz", delimiter='\t', names=names)

# "lvl" holds a bracketed, ", "-separated list per row: strip the brackets,
# split it, and give every list element its own row (addr/time are repeated).
data["lvl"] = data["lvl"].str.strip('[]').str.split(', ')
data = data.explode("lvl", ignore_index=True)
data["lvl"] = pd.to_numeric(data["lvl"])

# Drop zero readings. The explicit .copy() makes the filtered frame independent
# of the unfiltered one, so the column assignments below do not write into a
# slice (which raises SettingWithCopyWarning on pandas without Copy-on-Write).
data = data[data["lvl"] != 0].copy()

# Make time relative to the first remaining row of each address.
data["time"] -= data.groupby("addr")["time"].transform("first")

# NOTE(review): 64 is a magic number; its meaning is not visible here -- confirm and name it.
data["lvli"] = data["addr"] % 64

# The index still carries the row position assigned by explode(ignore_index=True)
# (filtering does not renumber it), so this is that position modulo 48.
# NOTE(review): presumably 48 is the number of list elements per input row -- confirm.
data["i"] = data.index % 48
data

Unnamed: 0,addr,time,lvl,lvli,i
0,0,0.000000,13,0,0
1,0,0.000000,11,0,1
2,0,0.000000,8,0,2
3,0,0.000000,7,0,3
4,0,0.000000,19,0,4
...,...,...,...,...,...
18661002,754,1498.068309,47,50,42
18661004,754,1498.068309,46,50,44
18661005,754,1498.068309,46,50,45
18661006,754,1498.068309,49,50,46


## Select target times and write to minified file for caching

In [3]:
# Get target times
# For each target time, keep per (addr, i) pair the row whose time is closest
# to the target, provided it lies within +/-10% of the target.
times = [0, 0.1, 1, 10, 100, 1000, 10000, 100000]
frames = []  # one selected frame per target time; concatenated into tdata below
for time in times:
    # Index label of the closest-in-time row for every (addr, i) group
    idx = (data["time"] - time).abs().groupby([data["addr"], data["i"]]).idxmin()
    # isin (rather than .loc[idx]) keeps the rows in their original order
    d = data[data.index.isin(idx)]
    # Reject matches outside [0.9*time, 1.1*time]; for time == 0 only an exact 0 passes.
    # .copy() gives an independent frame so adding "timept" below does not write
    # into a slice of `data` (SettingWithCopyWarning on pandas without Copy-on-Write).
    d = d[(d["time"] <= time*1.1) & (d["time"] >= time*0.9)].copy()
    print(time, len(d))
    d["timept"] = time
    frames.append(d)
tdata = pd.concat(frames)
# Cache the selection; the index (row labels inherited from `data`) is written as well.
tdata.to_csv("../data/retention/retention4.min.csv.gz")
tdata

0 32175
0.1 29172
1 32175
10 4797
100 31785
1000 30498
10000 6240
100000 0


Unnamed: 0,addr,time,lvl,lvli,i,timept
0,0,0.000000,13,0,0,0.0
1,0,0.000000,11,0,1,0.0
2,0,0.000000,8,0,2,0.0
3,0,0.000000,7,0,3,0.0
4,0,0.000000,19,0,4,0.0
...,...,...,...,...,...,...
18632442,159,9004.272041,25,31,42,10000.0
18632444,159,9004.272041,20,31,44,10000.0
18632445,159,9004.272041,26,31,45,10000.0
18632446,159,9004.272041,27,31,46,10000.0
