# Cancer Phylogenetics Project I - Data
# Spatial genetic diversity in hepatocellular carcinoma

Based on data from Ling, S., Hu, Z., Yang, Z., Yang, F., Li, Y., Lin, P., Chen, K., Dong, L., Cao, L., Tao, Y., et al. (2015). Extremely high genetic diversity in a single tumor points to prevalence of non-Darwinian cell evolution. PNAS 112, E6496–E6505.

http://www.pnas.org/content/112/47/E6496.full

Data deposition: The sequence data reported in this paper have been deposited in the genome sequence archive of Beijing Institute of Genomics, Chinese Academy of Sciences, gsa.big.ac.cn (accession no. PRJCA000091).


## Raw Data
Data table S3 of the supplemental dataset (http://www.pnas.org/content/112/47/E6496.abstract?tab=ds) contains genotyping data on 286 individual punches collected from a cross-section of the tumor (~20,000 cells/punch) and from 1 normal sample. Whole-exome sequencing was completed on 22 of these sections and used to identify 35 polymorphic nonsynonymous sites, which were then screened with Sequenom genotyping. The data table records the average mutant allele frequency of each SNV for each site on the tumor. "-" and "F" indicate that the validation was not attempted or failed, respectively. "P" indicates that validation was not attempted but the mutation is inferred to be present based on clonal boundaries.

## Data Parsing
The goal of this script is to write a FASTA-formatted file, alleles_fasta_meta.txt, containing one record per sample in the following format:


"> sample_A|clone|xcoord|ycoord


10000???1110000
"




alleles.csv is the raw .csv data converted from the supplemental Excel sheet.
clones.csv contains the clone assigned to each sample; clones are designated by Greek letters in Ling et al.
locations.csv contains the x and y coordinates of each tumor sample, in pixels, extracted from Figure 2A using Preview.




In [2]:
import sys, csv, os
import pandas as pd
import numpy as np

In [4]:
# Build `data_cleaned`: one row per sample holding its site label followed by
# one character per SNV ('1' = mutant, '0' = wild type, '-' = no call).

# Read the raw genotype table. '-' (validation not attempted) and 'F'
# (validation failed) are parsed as NaN. The file is only needed for the
# read itself, so it is closed before any processing happens.
with open("data/alleles.csv") as file:
    data = pd.read_csv(file, sep=",", na_values=['-', 'F'])

data = data.dropna(axis=1, how="all")  # drop completely empty columns
data = data.dropna(axis=0, how="all")  # drop completely empty rows
data = data.replace(['P'], 1)  # 'P' = mutation inferred present
# Convert every column that parses as numeric; the site-label column does
# not parse and is returned unchanged.
data = data.apply(pd.to_numeric, errors="ignore")

data_sites = data["Sites"]
# Work on an explicit copy so the masked assignments below never write
# through a slice of `data` (avoids pandas' chained-assignment warning).
data_muts = data.iloc[:, 1:].copy()
data_muts[data_muts <= 0.1] = 0  # allele frequency <= 0.1: mutation absent
data_muts[data_muts > 0.1] = 1   # allele frequency > 0.1: mutation present
# Render each call as a string: '0', '1', or 'nan' for missing values.
data_muts = data_muts.applymap(lambda x: '%.0f' % x)

# Put the site labels and the binarised calls back together.
data_cleaned = pd.concat([data_sites, data_muts], axis=1)
data_cleaned = data_cleaned.applymap(str)
# Missing calls are written as '-'.
# NOTE(review): the format example in the notebook text and the original
# comment here show '?' for missing sites, but the code emits '-'; confirm
# which symbol the downstream tool expects.
data_cleaned = data_cleaned.replace("nan", "-")

In [5]:
data_cleaned

Unnamed: 0,Sites,chrX_101139047,chr16_21698785,chr16_69988362,chr2_170031901,chr3_58155491,chr10_101969461,chr11_118375433,chr8_8998522,chr21_46306293,...,chr7_645878,chr17_38240142,chr9_123629149,chr7_72913004,chrX_65420433,chr6_108370471,chr7_44192950,chr18_46190149,chr5_161324267,chr12_9248193
0,A1,1,1,0,0,-,-,-,0,0,...,0,0,0,-,0,-,0,-,-,-
1,A11,1,1,0,0,-,-,-,0,0,...,0,0,0,-,0,-,0,-,-,-
2,A17,1,1,0,0,0,0,0,0,-,...,0,0,-,-,0,-,-,-,0,-
3,A23,1,1,0,0,-,-,-,0,0,...,0,0,0,-,0,-,0,-,-,-
4,A24,1,1,0,0,-,-,-,-,-,...,0,0,-,-,0,-,-,-,-,-
5,A30,1,1,0,0,-,-,-,-,-,...,0,0,-,-,0,-,-,-,-,-
6,A31,1,1,0,0,-,-,-,-,-,...,0,0,-,-,0,-,-,-,-,-
7,A32,1,1,0,0,-,-,-,0,0,...,0,0,0,-,0,-,0,-,-,-
8,A39,1,1,0,0,0,0,0,0,-,...,0,0,-,-,0,-,-,-,0,-
9,A40,1,1,0,0,-,-,-,-,-,...,0,0,-,-,0,-,-,-,-,-


In [6]:
# Load the clone assignment of each sample; only the CSV column at
# position 1 is read.
with open("data/clones.csv") as file:
    meta = pd.read_csv(file, sep=",", usecols=[1])

# Samples without a clone are marked "?". The original list held the string
# 'np.nan' instead of the NaN value itself, so genuinely missing entries
# (which read_csv parses as NaN) were never replaced. The real np.nan is
# now included, matching how the coordinate table is handled; the old
# string entries are kept so previously matched values still match.
meta = meta.replace([np.nan, 'np.nan', 'nan', "ungrouped"], "?")

In [7]:
# Load the sample coordinates; only the CSV columns at positions 1 and 2
# are read ('X Coordinate' and 'Y Coordinate').
with open("data/locations.csv") as file:
    locs = pd.read_csv(file, sep=",", usecols=[1, 2])

# Scale factors: `dim_pix` input units correspond to `dim_mm` output units.
dim_mm = 35
dim_pix = 310.0
# Origin of the new coordinate system: midpoint of the two reference
# values along each axis.
zcoorx = (232.0 + 542.27) / 2
zcoory = (323.2 + 634) / 2

# Shift both axes so the origin sits at (zcoorx, zcoory) ...
locs['X Coordinate'] = locs['X Coordinate'] - zcoorx
locs['Y Coordinate'] = locs['Y Coordinate'] - zcoory
# ... then rescale from the pixel scale to the dim_mm scale.
locs = locs / dim_pix * dim_mm

# Format with two decimals; missing values come out as 'nan' and are then
# replaced by "?".
locs = locs.applymap(lambda value: "%.2f" % value)
locs = locs.replace([np.nan, 'nan'], "?")

In [8]:
# Join the three tables side by side. Rows are aligned on the DataFrame
# index, giving: site label, allele calls, clone, X coordinate, Y coordinate.
data_meta = pd.concat(
    [data_cleaned, meta, locs],
    axis=1,
)

In [9]:
data_meta

Unnamed: 0,Sites,chrX_101139047,chr16_21698785,chr16_69988362,chr2_170031901,chr3_58155491,chr10_101969461,chr11_118375433,chr8_8998522,chr21_46306293,...,chr7_72913004,chrX_65420433,chr6_108370471,chr7_44192950,chr18_46190149,chr5_161324267,chr12_9248193,Clone,X Coordinate,Y Coordinate
0,A1,1,1,0,0,-,-,-,0,0,...,-,0,-,0,-,-,-,zeta,-15.71,-0.29
1,A11,1,1,0,0,-,-,-,0,0,...,-,0,-,0,-,-,-,zeta,-6.00,-1.87
2,A17,1,1,0,0,0,0,0,0,-,...,-,0,-,-,-,0,-,zeta,-7.92,-1.65
3,A23,1,1,0,0,-,-,-,0,0,...,-,0,-,0,-,-,-,zeta,-9.39,-1.08
4,A24,1,1,0,0,-,-,-,-,-,...,-,0,-,-,-,-,-,zeta,-8.14,-3.91
5,A30,1,1,0,0,-,-,-,-,-,...,-,0,-,-,-,-,-,zeta,-11.19,-0.97
6,A31,1,1,0,0,-,-,-,-,-,...,-,0,-,-,-,-,-,zeta,-10.74,-2.66
7,A32,1,1,0,0,-,-,-,0,0,...,-,0,-,0,-,-,-,zeta,-9.95,-4.58
8,A39,1,1,0,0,0,0,0,0,-,...,-,0,-,-,-,0,-,zeta,-13.00,-0.52
9,A40,1,1,0,0,-,-,-,-,-,...,-,0,-,-,-,-,-,zeta,-12.55,-1.76


In [10]:
# Write one FASTA record per sample:
#   >sample|clone|xcoord|ycoord
#   <one character per site>
data_array = data_meta.values

with open("data/alleles_fasta_meta.txt", "w") as file:
    for record in data_array:
        # First column is the sample label; the last three columns are the
        # clone and the two coordinates. Everything in between is the
        # per-site allele call.
        sample = record[0]
        clone, x_coord, y_coord = record[-3], record[-2], record[-1]
        header_fields = [str(sample), str(clone), str(x_coord), str(y_coord)]
        file.write(">" + "|".join(header_fields) + "\n")

        sequence = "".join(record[1:-3])
        file.write(sequence + "\n")

In [11]:
data_meta.values

array([['A1', '1', '1', ..., 'zeta', '-15.71', '-0.29'],
       ['A11', '1', '1', ..., 'zeta', '-6.00', '-1.87'],
       ['A17', '1', '1', ..., 'zeta', '-7.92', '-1.65'],
       ..., 
       ['A42', '0', '0', ..., '?', '-10.85', '-4.81'],
       ['A44', '0', '0', ..., '?', '-9.05', '-7.52'],
       ['A64', '0', '0', ..., '?', '-11.98', '-9.44']], dtype=object)