# Description
This notebook extract useful metrics from the plaques dataset or some part of
it. It relies on trackmate results to extract this information. Three metrics
are considered: number of infected cells, radius of the plaque, and radial
velocity of infected cells. Only considering infected cells, because these are
the only cells visible in the microscopy images at hand. Metrics are modeled
using a mean and standard deviation for each point in time that there is image
for it. Time is in hours post infection (hpi). Results are saved into a csv file
to be used as a reference to evaluate simulations in infectio.

# Implementation

## Part 1: choose set of files

### For WR virus    

Note: **Quick Fix**

The dataset needs to be changed to be used here. Because in many of the
of the experiments, two or more initial spots are infected and therefore center
and radii computations are not correct. As a quick fix for now, I am considering
only a few of the experiements that consist only of one plaque. For M061-WR
these are: 6, 8, 9, 11, 13, 14.

In [2]:
import os
import pandas as pd
import numpy as np

dataset_name = 'M061_WR_handpicked'
CSV_ROOT = "../dataset/plaques-ashkan/trackmate_output/dVGF_dF11_viruses/M061"
# include only files in range of 1 to 15 in their names, these are basic WR files
# Only consider quick fix files
single_plaque_files = [6, 8, 9, 11, 13, 14]
csv_files = [f for f in os.listdir(CSV_ROOT) if f.endswith(".csv") and int(f.split("-")[0]) in single_plaque_files]
csv_files

['8-spots.csv',
 '11-spots.csv',
 '9-spots.csv',
 '13-spots.csv',
 '6-spots.csv',
 '14-spots.csv']

### For dVGF/dF11

In [3]:
import os
import pandas as pd
import numpy as np

dataset_name = 'M061_dVGFdF11_handpicked'
CSV_ROOT = "../dataset/plaques-ashkan/trackmate_output/dVGF_dF11_viruses/M061"
# include only files in range of 46 to 60 in their names, these are dVGF/dF11 files
# Only consider quick fix files
single_plaque_files = [46, 48, 49, 50, 51, 52, 53, 55, 57, 58, 60]  # exclude 54, 56, 59, also 47 because first few frames not enough (less than 3) spots
csv_files = [f for f in os.listdir(CSV_ROOT) if f.endswith(".csv") and int(f.split("-")[0]) in single_plaque_files]
csv_files

['55-spots.csv',
 '60-spots.csv',
 '58-spots.csv',
 '53-spots.csv',
 '46-spots.csv',
 '52-spots.csv',
 '57-spots.csv',
 '51-spots.csv',
 '48-spots.csv',
 '50-spots.csv',
 '49-spots.csv']

## Part 2: add the time stamps of the time series data

In [4]:
# Because the imaging of the dataset starts with 20 h.p.i and ends 48 hpi with
# 10 minute intervals
time_stamps = [round(x, 2) for x in np.linspace(20.0, 48.0, 169).tolist()]
refdf = pd.DataFrame({'t': time_stamps})

print(refdf)

         t
0    20.00
1    20.17
2    20.33
3    20.50
4    20.67
..     ...
164  47.33
165  47.50
166  47.67
167  47.83
168  48.00

[169 rows x 1 columns]


## Part 3: infected count metrics

In [5]:
unique_track_id_counts = []

for file in csv_files:
    df = pd.read_csv(os.path.join(CSV_ROOT, file), skiprows=[1, 2, 3], low_memory=False)
    unique_counts = df.groupby('FRAME')['TRACK_ID'].nunique()
    unique_track_id_counts.append(unique_counts)

all_counts_df = pd.concat(unique_track_id_counts, axis=1)

# Calculate average and standard deviation for each frame
average_counts = all_counts_df.mean(axis=1)
std_dev_counts = all_counts_df.std(axis=1)

print(average_counts, std_dev_counts)

FRAME
0       13.545455
1       14.000000
2       14.545455
3       15.181818
4       16.090909
          ...    
164    519.545455
165    525.000000
166    530.909091
167    537.000000
168    532.727273
Length: 169, dtype: float64 FRAME
0        8.029491
1        8.473488
2        9.070431
3        9.400193
4        8.971673
          ...    
164    190.898593
165    194.767554
166    194.594684
167    198.061102
168    195.820372
Length: 169, dtype: float64


In [6]:
# Adding count values to refdf
refdf['inf-count-mean'] = average_counts
refdf['inf-count-std'] = std_dev_counts
refdf

Unnamed: 0,t,inf-count-mean,inf-count-std
0,20.00,13.545455,8.029491
1,20.17,14.000000,8.473488
2,20.33,14.545455,9.070431
3,20.50,15.181818,9.400193
4,20.67,16.090909,8.971673
...,...,...,...
164,47.33,519.545455,190.898593
165,47.50,525.000000,194.767554
166,47.67,530.909091,194.594684
167,47.83,537.000000,198.061102


## Part 4: Area & radius reference metrics
Area would be a better metric instead of radius.

In [7]:
from scipy.spatial import ConvexHull

def get_convex_radius(points):
    if len(points) < 3:
        return 0
    hull = ConvexHull(points)
    boundary_points = points[hull.vertices]
    center = np.mean(boundary_points, axis=0)
    radii = (boundary_points - center)
    radii = np.linalg.norm(radii, axis=1)
    return radii.mean()

def get_area(points):
    if len(points) < 3:
        return 0
    hull = ConvexHull(points)
    return hull.volume  # Remember that .volume method gives the area in 2d and not .area

In [8]:
all_radii_stats = []
all_area_stats = []

for file in csv_files:
    df = pd.read_csv(os.path.join(CSV_ROOT, file), skiprows=[1, 2, 3], low_memory=False)
    points_vs_frame = df.groupby('FRAME').apply(lambda x: x[['POSITION_X', 'POSITION_Y']].values)
    radii_vs_frame = [get_convex_radius(points) for points in points_vs_frame]
    area_vs_frame = [get_area(points) for points in points_vs_frame]
    # remove any zeros in there
    radii_vs_frame = [r for r in radii_vs_frame if r != 0]
    all_radii_stats.append(radii_vs_frame)
    all_area_stats.append(area_vs_frame)

In [9]:
array_of_lists = np.array(all_radii_stats) # (Correction: apparently Fiji already counts in the pixel length and width) * 3.1746  # because this dataset, both pixel width and height are this number in microns. 
mean_radii = array_of_lists.mean(axis=0)
std_radii = array_of_lists.std(axis=0)

refdf['radius-mean(um)'] = mean_radii
refdf['radius-std(um)'] = std_radii

area_array_of_lists = np.array(all_area_stats)
mean_area = area_array_of_lists.mean(axis=0)
std_area = area_array_of_lists.std(axis=0)

refdf['area-mean(um2)'] = mean_area
refdf['area-std(um2)'] = std_area

refdf

Unnamed: 0,t,inf-count-mean,inf-count-std,radius-mean(um),radius-std(um),area-mean(um2),area-std(um2)
0,20.00,13.545455,8.029491,99.765082,55.899142,17840.793909,13641.408579
1,20.17,14.000000,8.473488,98.197804,52.199483,18210.787124,13788.443193
2,20.33,14.545455,9.070431,98.277900,50.936504,18643.611634,14094.592710
3,20.50,15.181818,9.400193,85.748698,45.349977,18584.707067,15350.915225
4,20.67,16.090909,8.971673,88.255967,47.372948,19391.190377,15336.910131
...,...,...,...,...,...,...,...
164,47.33,519.545455,190.898593,450.418243,115.710533,660882.906380,303277.539930
165,47.50,525.000000,194.767554,451.823065,82.490207,653435.500620,215088.553691
166,47.67,530.909091,194.594684,458.394335,97.378303,669596.662250,232589.583245
167,47.83,537.000000,198.061102,479.244818,113.513953,674617.706849,234119.121754


## Part 5: Radial velocity reference metrics

In [53]:
# 1. for each file:
# 2. compute the center of for all infected points in the first frame and consider it as center
# 3. for each trackid, compute maximum radial velocity
# 4. average all of these values to get the radial velocity for the file

def max_upto_each_index(arr):
    result = []
    for i in range(len(arr)):
        result.append(np.max(arr[:i+1]))
    return np.array(result)

def compute_radial_velocity_of_trackmate_csvfile(csv_file):
    df = pd.read_csv(csv_file, skiprows=[1, 2, 3], low_memory=False)
    firstframe_points = df[df['FRAME'] == 0][['POSITION_X', 'POSITION_Y']].values
    center = np.mean(firstframe_points, axis=0)
    max_frame = df['FRAME'].max()
    max_radial_velocities = []
    
    trackids = df['TRACK_ID'].unique()
    for trackid in trackids:
        points_framesorted = df[df['TRACK_ID'] == trackid][['POSITION_X', 'POSITION_Y', 'FRAME']].sort_values(by=['FRAME'])
        frames = points_framesorted['FRAME'].values
        points_framesorted = points_framesorted[['POSITION_X', 'POSITION_Y']].values
        radial_vector = points_framesorted - center
        unit_radial_vectors = radial_vector / np.linalg.norm(radial_vector, axis=1, keepdims=True)
        forward_velocity = np.diff(points_framesorted, axis=0)
        radial_velocity = np.sum(forward_velocity * unit_radial_vectors[:-1], axis=1)
        # we need to compute values for each frame and max upto that frame
        max_radial_velocity = np.zeros(max_frame + 1)
        for idx, val in zip(frames, radial_velocity):
            max_radial_velocity[idx] = val
        max_radial_velocity = max_upto_each_index(max_radial_velocity)
        max_radial_velocities.append(max_radial_velocity)
    max_radial_velocities = np.array(max_radial_velocities).mean(axis=0)
    return max_radial_velocities

In [55]:
all_radial_velocities = []

for file in csv_files:
    radial_velocity = compute_radial_velocity_of_trackmate_csvfile(os.path.join(CSV_ROOT, file))
    all_radial_velocities.append(radial_velocity)
all_radial_velocities = np.array(all_radial_velocities)
# divide everything by 10 because these numbers are um/1Frame and 1Frame=10min and we want um/min
all_radial_velocities = all_radial_velocities / 10

# replace 0 with nan so we can compute the mean and std without including zeros
all_radial_velocities[all_radial_velocities == 0] = np.nan
mean_radial_velocity = np.nanmean(all_radial_velocities, axis=0)
std_radial_velocity = np.nanstd(all_radial_velocities, axis=0)

refdf['radial-velocity-mean(um/min)'] = mean_radial_velocity
refdf['radial-velocity-std(um/min)'] = std_radial_velocity

refdf

Unnamed: 0,t,inf-count-mean,inf-count-std,radius-mean(um),radius-std(um),area-mean(um2),area-std(um2),radial-velocity-mean(um/min),radial-velocity-std(um/min)
0,20.00,13.545455,8.029491,99.765082,55.899142,17840.793909,13641.408579,0.000847,0.001134
1,20.17,14.000000,8.473488,98.197804,52.199483,18210.787124,13788.443193,0.002189,0.001182
2,20.33,14.545455,9.070431,98.277900,50.936504,18643.611634,14094.592710,0.002861,0.001252
3,20.50,15.181818,9.400193,85.748698,45.349977,18584.707067,15350.915225,0.003411,0.001409
4,20.67,16.090909,8.971673,88.255967,47.372948,19391.190377,15336.910131,0.003865,0.001478
...,...,...,...,...,...,...,...,...,...
164,47.33,519.545455,190.898593,450.418243,115.710533,660882.906380,303277.539930,0.273864,0.026589
165,47.50,525.000000,194.767554,451.823065,82.490207,653435.500620,215088.553691,0.278101,0.028066
166,47.67,530.909091,194.594684,458.394335,97.378303,669596.662250,232589.583245,0.281426,0.027150
167,47.83,537.000000,198.061102,479.244818,113.513953,674617.706849,234119.121754,0.284152,0.027569


## Part 6: Saving the reference metrics

In [56]:
save_path = os.path.join('..', 'output', 'reference_metrics_for_'+ dataset_name + '.csv')
refdf.to_csv(save_path, index=False)