# Binary Segmentation - converts a frequency histogram CSV into time segments

In [96]:
### Import necessary modules
import numpy as np
import matplotlib.pylab as plt
import ruptures as rpt
import csv

In [150]:
### Set parameters
# Identifiers used to build the input and output file paths.
PID = 'p4'
dataset = '3'

# Settings for a synthetic piecewise-constant signal from ruptures.
# NOTE(review): in the visible cells `signal` is reassigned from a CSV right
# after this and the detection step passes its own n_bkps, so this looks like
# leftover demo setup -- confirm before relying on or removing it.
n = 500  # number of samples
dim = 3  # number of dimensions of the generated signal
n_bkps = 3  # number of change points
sigma = 5  # noise standard deviation
signal, bkps = rpt.pw_constant(n, dim, n_bkps, noise_std=sigma)

In [151]:
### Open file
# Load the comma-separated input for the configured dataset and PID into a
# NumPy array. The with-block guarantees the file handle is closed as soon as
# loadtxt has consumed it (the original left the handle open).
with open("newCSV/Dataset_" + dataset + "/" + PID + ".csv") as file:
    signal = np.loadtxt(file, delimiter=",")

In [152]:
### Generate segments
# Binary-segmentation change point detection on the loaded signal.
model = "l2"  # other options: "l1", "rbf", "linear", "normal", "ar"
algo = rpt.Binseg(model=model, jump=1)
algo.fit(signal)

# Request 10 change points; the printed list below also carries one final
# index in addition to those 10.
my_bkps = algo.predict(n_bkps=10)

# Display the detected breakpoint indices.
print(my_bkps)

[160, 340, 405, 473, 545, 571, 593, 12466, 12508, 12549, 12635]


In [153]:
### Convert breakpoint indices into times:
# For each breakpoint index, read column 0 of the row just before it and
# divide by 10.
# NOTE(review): presumably column 0 is a timestamp in tenths of the output
# unit -- confirm against the CSV producer.
timeArr = [signal[idx - 1][0] / 10 for idx in my_bkps]
print(timeArr)

[1107.6, 1744.8, 2236.8, 2870.4, 3282.9, 3713.3, 4019.9, 4408.8, 4758.0, 5175.0, 5737.8]


In [154]:
### Export segments to CSV
# One file-name prefix per dataset; dataset '1' maps to the first entry.
startText = ['Arms', 'Terrorist', 'Disappearance']
# NOTE(review): PID[1] keeps only a single character after the leading letter,
# so a multi-digit PID such as 'p12' would be truncated -- confirm intent.
outFilename = 'ProvenanceSummaries/code/ProvSegments/Dataset_' + dataset + '/Segmentation/' + startText[int(dataset) - 1] + '_P' + PID[1] + '_20_4_6_Prov_Segments.csv'

header = ['ID', 'start', 'end', 'length']

# The with-block closes the output file even if a write raises part-way.
with open(outFilename, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)

    # First segment runs from time 0 up to the first breakpoint time.
    row = [0, 0, timeArr[0], timeArr[0]]
    writer.writerow(row)

    # Every later segment spans two consecutive breakpoint times. The bound
    # follows len(timeArr) instead of a hard-coded 11, so changing the number
    # of requested breakpoints neither overruns the list nor drops rows.
    for i in range(1, len(timeArr)):
        row = [i, timeArr[i-1], timeArr[i], timeArr[i] - timeArr[i-1]]
        writer.writerow(row)
