In [None]:
import os

import pandas as pd

# Function Segments

In [None]:
def filterout_sex_chromosome(data):
    '''
    Drops every row that belongs to a sex chromosome (chrX or chrY).
    Input:
    data (dataframe): Dataframe with a 'Chromosome' column holding labels like 'chr1'
    Output:
    (dataframe): The rows of data whose 'Chromosome' is neither 'chrX' nor 'chrY'
    '''
    sex_chromosomes = ['chrX', 'chrY']
    is_sex_chromosome = data['Chromosome'].isin(sex_chromosomes)
    return data[~is_sex_chromosome]

def annotateSegmentCopyNeutralState(data):
    '''
    Labels every segment with its copy neutral state in a new column called Copy_Neutral.
    A segment only gets a label other than 'None' when one allele has copy number 0:
      'CN-LOH'  if the other allele has copy number 2
      'CN-GAIN' if the other allele has copy number 3 or 4
      'CN-AMP'  if the other allele has a copy number above 4
    The column is written into the passed dataframe itself.
    Input:
    data (dataframe): Dataframe with the columns 'Major_Copy_Number' and 'Minor_Copy_Number'
    Output:
    data (dataframe): The same dataframe with the added 'Copy_Neutral' column
    '''
    major = data['Major_Copy_Number']
    minor = data['Minor_Copy_Number']

    major_lost = major == 0
    minor_lost = minor == 0

    # (label, mask) pairs; a row can match at most one of the masks
    labelled_masks = [
        ('CN-AMP', (minor_lost & (major > 4)) | (major_lost & (minor > 4))),
        ('CN-GAIN', (minor_lost & major.isin([3, 4])) | (major_lost & minor.isin([3, 4]))),
        ('CN-LOH', (minor_lost & (major == 2)) | (major_lost & (minor == 2))),
    ]

    # Everything starts out unlabelled, then the matching rows are overwritten
    data.loc[:, 'Copy_Neutral'] = 'None'
    for label, mask in labelled_masks:
        data.loc[mask, 'Copy_Neutral'] = label

    return data

def annotateSegmentLength(data):
    '''
    Writes the segment length (End minus Start) into a new column called Length.
    The column is written into the passed dataframe itself.
    Input:
    data (dataframe): Dataframe with the columns 'Start' and 'End'
    Output:
    data (dataframe): The same dataframe with the added 'Length' column
    '''
    segment_length = data['End'] - data['Start']
    data.loc[:, 'Length'] = segment_length
    return data
    
    
    

### No longer needed, as these attributes are now set directly when the instance is created
def createPatient(filename):
    '''
    Builds a Patient object and stores the given file name as its name.
    NOTE(review): Patient is called without arguments here, while createAllPatients
    passes the file name to the constructor - confirm this still matches the class.
    Input:
    filename (string): Name of the allelic specific copy number file ('patient name')
    Output:
    (Patient object): New patient whose name attribute is filename
    '''
    new_patient = Patient()
    setattr(new_patient, 'name', filename)
    return new_patient

### No longer needed, as these attributes are now set directly when the instance is created
def createArm(nr_chromosome, arm_start, arm_end, arm_length):
    '''
    Builds a chromosome arm object and fills in its chromosome, start, end and length.
    NOTE(review): ChrArm is called without arguments here, while other functions in
    this file pass five arguments to the constructor - confirm this still matches the class.
    Input:
    nr_chromosome: Value stored as chromosome_number of the arm
    arm_start (Int): Start position of the arm
    arm_end (Int): End position of the arm
    arm_length (Int): Length of the arm
    Output:
    (ChrArm object): New arm with the four attributes set
    '''
    new_arm = ChrArm()

    attribute_values = {
        'chromosome_number': nr_chromosome,
        'start': arm_start,
        'end': arm_end,
        'length': arm_length,
    }
    for attribute, value in attribute_values.items():
        setattr(new_arm, attribute, value)

    return new_arm



# Functions for both Arm and Cytoband

In [None]:
def compute_cin_amp(seg_data):
    '''
    Sums up the copy number excess above 2 over all given segments.
    Only segments with a 'Copy_Number' greater than 2 contribute, each with (Copy_Number - 2).
    Input:
    seg_data (dataframe): Segments of an arm, cytoband or chromosome with a 'Copy_Number' column
    Output:
    Summed excess; 0 if no segment has a copy number above 2
    '''
    copy_number = seg_data['Copy_Number']
    gained = copy_number[copy_number > 2]
    return (gained - 2).sum()

def compute_cin_del(seg_data):
    '''
    Sums up the copy number deficit below 2 over all given segments.
    Only segments with a 'Copy_Number' smaller than 2 contribute, each with (2 - Copy_Number).
    Input:
    seg_data (dataframe): Segments of an arm, cytoband or chromosome with a 'Copy_Number' column
    Output:
    Summed deficit; 0 if no segment has a copy number below 2
    '''
    copy_number = seg_data['Copy_Number']
    lost = copy_number[copy_number < 2]
    return (2 - lost).sum()

def compute_cin(amplifications, deletions):
    '''
    Combines the amplification and the deletion total into the CIN value.
    Input:
    amplifications: Amplification total, e.g. the result of compute_cin_amp
    deletions: Deletion total, e.g. the result of compute_cin_del
    Output:
    CIN, the sum of both totals
    '''
    cin = amplifications + deletions
    return cin


def filter_by_chromosome(data, nr_chromosome):
    '''
    Returns only the rows of data that belong to the given chromosome.
    Input:
    data (dataframe): Dataframe with a 'Chromosome' column
    nr_chromosome (string): chromosome number like 'chr1'
    Output:
    (dataframe): Rows whose 'Chromosome' equals nr_chromosome (original index labels are kept)
    '''
    on_chromosome = data['Chromosome'] == nr_chromosome
    return data.loc[on_chromosome]

def filter_by_segments(chr_data, segment_indices):
    '''
    Picks the rows of chr_data whose index labels are listed in segment_indices.
    Input:
    chr_data (dataframe): Allelic specific copy number dataframe of one sample for one chromosome
    segment_indices (list of Int): Index labels of the segments that should be extracted
    Output:
    (dataframe): The selected rows with all columns
    '''
    # Label based row selection, all columns are kept
    return chr_data.loc[segment_indices, :]

# Functions Arms

In [None]:
## Not used any more
def initiateBothChrArms(chromosome_data, nr_chromosome):
    '''
    Creates the p and the q arm object of one chromosome from the chromosome table.
    The start, end and length of each arm are taken from the first row of
    chromosome_data whose 'Chromosome' equals nr_chromosome.
    Input:
    chromosome_data (dataframe): Dataframe with the columns 'Chromosome', 'start_Sarm_p', 'end_Sarm_p',
                                 'length_Sarm_p', 'start_Larm_q', 'end_Larm_q' and 'length_Larm_q'
    nr_chromosome (string): chromosome number like 'chr1'
    Output:
    p_arm (ChrArm object): Arm built from the *_Sarm_p columns
    q_arm (ChrArm object): Arm built from the *_Larm_q columns
    '''
    chromosome_row = chromosome_data[chromosome_data['Chromosome'] == nr_chromosome]

    def first_value(column):
        # Value of the first matching row; raises IndexError if the chromosome is not listed
        return chromosome_row[column].values[0]

    p_start, p_end, p_length = [first_value(column) for column in
                                ('start_Sarm_p', 'end_Sarm_p', 'length_Sarm_p')]
    q_start, q_end, q_length = [first_value(column) for column in
                                ('start_Larm_q', 'end_Larm_q', 'length_Larm_q')]

    p_arm = ChrArm('p', nr_chromosome, p_start, p_end, p_length)
    q_arm = ChrArm('q', nr_chromosome, q_start, q_end, q_length)

    return p_arm, q_arm



def addSegmentIndicesToArm(chr_data, arm):
    '''
    Stores on the arm the index labels of all segments that overlap the arm.
    A segment overlaps the arm when it starts at or before the arm end and ends at
    or after the arm start. This covers segments that lie inside the arm, segments
    that only start or only end in the arm, and segments that span the whole arm
    (start before it and end after it).
    Input:
    chr_data (dataframe): Allelic specific copy number dataframe of one sample for one chromosome
                          with the columns 'Start' and 'End' (assumes Start <= End per segment)
    arm (ChrArm object): Arm for which the indices are added; its start and end attributes are read
    Output:
    arm (ChrArm object): The same arm with segments_indices set to the list of index labels
    '''
    # Closed-interval overlap test; both boundaries count as part of the arm
    overlaps_arm = (chr_data['Start'] <= arm.end) & (chr_data['End'] >= arm.start)

    arm.segments_indices = chr_data.index[overlaps_arm].tolist()

    return arm


def add_amp_del_cin_to_arm(seg_data, arm):
    '''
    Computes the amplification total, the deletion total and the CIN of an arm and stores them on it.
    Additionally stores norm_cin, the CIN divided by the arm length.
    Input:
    seg_data (dataframe): Allelic specific copy number dataframe of one sample for one chromosome containing the segments on the arm
    arm (ChrArm object): Its length attribute is read for the normalisation
    Output:
    arm (ChrArm object): The same arm with amplifications, deletions, cin and norm_cin set
    '''
    amp_total = compute_cin_amp(seg_data)
    del_total = compute_cin_del(seg_data)
    cin_total = compute_cin(amp_total, del_total)

    arm.amplifications, arm.deletions, arm.cin = amp_total, del_total, cin_total

    # CIN relative to the arm length
    arm.norm_cin = arm.cin / arm.length

    return arm
    
def add_cn_numbers_to_arm(seg_data, arm):
    '''
    Counts the segments per copy neutral state (CN-LOH, CN-AMP, CN-GAIN) and stores the counts on the arm.
    A state that does not occur gets the count 0.
    Input:
    seg_data (dataframe): Segments on the arm with a 'Copy_Neutral' column
    arm (ChrArm object)
    Output:
    arm (ChrArm object): The same arm with number_cn_loh, number_cn_amp and number_cn_gain set
    '''
    states = seg_data['Copy_Neutral']
    event_counts = states[states != 'None'].value_counts()

    arm.number_cn_loh = event_counts.get('CN-LOH', 0)
    arm.number_cn_amp = event_counts.get('CN-AMP', 0)
    arm.number_cn_gain = event_counts.get('CN-GAIN', 0)

    return arm

def compute_cn_status_arm(seg_data, arm, threshold=90):
    '''
    Decides which copy neutral status (CN-AMP, CN-GAIN, CN-LOH or 'None') the arm as a whole gets.
    For every state with a non-zero count on the arm, the summed 'Length' of the segments with
    that state is expressed as a percentage of the arm length (full segment lengths are used).
    The state with the largest percentage becomes the status, but only if that percentage is
    strictly greater than threshold; otherwise the status is 'None'. If several states share
    the largest percentage, the first one in the order CN-AMP, CN-GAIN, CN-LOH is taken.
    Input:
    seg_data (dataframe): Segments on the arm with the columns 'Copy_Neutral' and 'Length'
    arm (ChrArm object): Its length, number_cn_amp, number_cn_gain and number_cn_loh attributes are read
    threshold (Int): Percentage that has to be exceeded
    Output
    arm (ChrArm object): The same arm with cn_status set
    '''
    arm_length = arm.length

    event_counts = {
        'CN-AMP': arm.number_cn_amp,
        'CN-GAIN': arm.number_cn_gain,
        'CN-LOH': arm.number_cn_loh,
    }

    # Without any copy neutral event there is nothing to compare
    if all(count == 0 for count in event_counts.values()):
        arm.cn_status = 'None'
        return arm

    covered = {}
    for label, count in event_counts.items():
        if count == 0:
            continue
        label_length = seg_data.loc[seg_data['Copy_Neutral'] == label, 'Length'].sum()
        covered[label] = (label_length / arm_length) * 100

    best_label = max(covered, key=covered.get)
    if covered[best_label] <= threshold:
        arm.cn_status = 'None'
    else:
        arm.cn_status = best_label

    return arm

def createBothArms(data, chromosome_data, nr_chromosome):
    '''
    Builds the p and the q arm of one chromosome and fills in all of their attributes.
    For each arm the overlapping segments are determined first; from these segments the
    amplification/deletion/CIN values, the copy neutral counts and the copy neutral status
    (with a threshold of 90) are derived.
    Input:
    data (dataframe): Allelic specific copy number dataframe of one sample
    chromosome_data (dataframe): Dataframe containing information about the chromosomes such as arm length ect. (look at Chromosome_data.ipynb)
    nr_chromosome (string): chromosome number like 'chr1'
    Output:
    arm_p (ChrArm object): Arm built from the *_Sarm_p columns of chromosome_data
    arm_q (ChrArm object): Arm built from the *_Larm_q columns of chromosome_data
    '''
    chromosome_row = chromosome_data[chromosome_data['Chromosome'] == nr_chromosome]

    def first_value(column):
        # Value of the first matching row of the chromosome table
        return chromosome_row[column].values[0]

    p_start, p_end, p_length = [first_value(column) for column in
                                ('start_Sarm_p', 'end_Sarm_p', 'length_Sarm_p')]
    q_start, q_end, q_length = [first_value(column) for column in
                                ('start_Larm_q', 'end_Larm_q', 'length_Larm_q')]

    arm_p = ChrArm('p', nr_chromosome, p_start, p_end, p_length)
    arm_q = ChrArm('q', nr_chromosome, q_start, q_end, q_length)

    chr_data = filter_by_chromosome(data, nr_chromosome)

    finished_arms = []
    for arm in (arm_p, arm_q):
        arm = addSegmentIndicesToArm(chr_data, arm)
        arm_segments = filter_by_segments(chr_data, arm.segments_indices)

        arm = add_amp_del_cin_to_arm(arm_segments, arm)
        arm = add_cn_numbers_to_arm(arm_segments, arm)
        arm = compute_cn_status_arm(arm_segments, arm, threshold = 90)

        finished_arms.append(arm)

    arm_p, arm_q = finished_arms
    return arm_p, arm_q

# Functions Cytoband

In [None]:
## Not used anymore
def initiateAllCytobands(cytoband_data, nr_chromosome):
    '''
    Creates one Cytoband object for every cytoband listed for the given chromosome.
    Start, end and length of a cytoband are taken from the first row of that chromosome
    whose 'cytoband' value equals the cytoband name.
    Input:
    cytoband_data (dataframe): Cytoband info dataframe with the columns 'chromosome', 'cytoband', 'start', 'end' and 'length'
    nr_chromosome (string): chromosome number like 'chr1'
    Output:
    (List of Cytoband objects): In the order in which the cytobands appear in cytoband_data
    '''
    bands_on_chr = cytoband_data[cytoband_data['chromosome'] == nr_chromosome]

    def build_band(band_name):
        band_row = bands_on_chr[bands_on_chr['cytoband'] == band_name]
        band_start = band_row['start'].values[0]
        band_end = band_row['end'].values[0]
        band_length = band_row['length'].values[0]
        return Cytoband(band_name, nr_chromosome, band_start, band_end, band_length)

    return [build_band(band_name) for band_name in bands_on_chr['cytoband']]

def addSegmentIndicesToCytoband(chr_data, cytoband):
    '''
    Stores on the cytoband the index labels of all segments that overlap the cytoband.
    A segment overlaps the cytoband when it starts at or before the cytoband end and ends
    at or after the cytoband start. This covers segments that lie inside the cytoband,
    segments that only start or only end in it, and segments that span the whole cytoband
    (start before it and end after it).
    Input:
    chr_data (dataframe): Allelic specific copy number dataframe of one sample for one chromosome
                          with the columns 'Start' and 'End' (assumes Start <= End per segment)
    cytoband (Cytoband object): Cytoband for which the indices are added; its start and end attributes are read
    Output:
    cytoband (Cytoband object): The same cytoband with segments_indices set to the list of index labels
    '''
    # Closed-interval overlap test; both boundaries count as part of the cytoband
    overlaps_band = (chr_data['Start'] <= cytoband.end) & (chr_data['End'] >= cytoband.start)

    cytoband.segments_indices = chr_data.index[overlaps_band].tolist()

    return cytoband

def add_amp_del_cin_to_cytoband(seg_data, cytoband):
    '''
    Computes the amplification total, the deletion total and the CIN of a cytoband and stores them on it.
    Additionally stores norm_cin, the CIN divided by the cytoband length.
    Input:
    seg_data (dataframe): Allelic specific copy number dataframe of one sample for one chromosome containing the segments on the cytoband
    cytoband (Cytoband object): Its length attribute is read for the normalisation
    Output
    cytoband (Cytoband object): The same cytoband with amplifications, deletions, cin and norm_cin set
    '''
    amp_total = compute_cin_amp(seg_data)
    del_total = compute_cin_del(seg_data)
    cin_total = compute_cin(amp_total, del_total)

    cytoband.amplifications, cytoband.deletions, cytoband.cin = amp_total, del_total, cin_total

    # CIN relative to the cytoband length
    cytoband.norm_cin = cytoband.cin / cytoband.length

    return cytoband

def add_cn_numbers_to_cytoband(seg_data, cytoband):
    '''
    Counts the segments per copy neutral state (CN-LOH, CN-AMP, CN-GAIN) and stores the counts on the cytoband.
    A state that does not occur gets the count 0.
    Input:
    seg_data (dataframe): Segments on the cytoband with a 'Copy_Neutral' column
    cytoband (Cytoband object)
    Output:
    cytoband (Cytoband object): The same cytoband with number_cn_loh, number_cn_amp and number_cn_gain set
    '''
    states = seg_data['Copy_Neutral']
    event_counts = states[states != 'None'].value_counts()

    cytoband.number_cn_loh = event_counts.get('CN-LOH', 0)
    cytoband.number_cn_amp = event_counts.get('CN-AMP', 0)
    cytoband.number_cn_gain = event_counts.get('CN-GAIN', 0)

    return cytoband

def compute_cn_status_cytoband(seg_data, cytoband, threshold=90):
    '''
    Decides which copy neutral status (CN-AMP, CN-GAIN, CN-LOH or 'None') the cytoband as a whole gets.
    For every state with a non-zero count on the cytoband, the summed 'Length' of the segments with
    that state is expressed as a percentage of the cytoband length (full segment lengths are used).
    The state with the largest percentage becomes the status, but only if that percentage is
    strictly greater than threshold; otherwise the status is 'None'. If several states share
    the largest percentage, the first one in the order CN-AMP, CN-GAIN, CN-LOH is taken.
    Input:
    seg_data (dataframe): Segments on the cytoband with the columns 'Copy_Neutral' and 'Length'
    cytoband (Cytoband object): Its length, number_cn_amp, number_cn_gain and number_cn_loh attributes are read
    threshold (Int): Percentage that has to be exceeded
    Output
    cytoband (Cytoband object): The same cytoband with cn_status set
    '''
    band_length = cytoband.length

    event_counts = {
        'CN-AMP': cytoband.number_cn_amp,
        'CN-GAIN': cytoband.number_cn_gain,
        'CN-LOH': cytoband.number_cn_loh,
    }

    # Without any copy neutral event there is nothing to compare
    if all(count == 0 for count in event_counts.values()):
        cytoband.cn_status = 'None'
        return cytoband

    covered = {}
    for label, count in event_counts.items():
        if count == 0:
            continue
        label_length = seg_data.loc[seg_data['Copy_Neutral'] == label, 'Length'].sum()
        covered[label] = (label_length / band_length) * 100

    best_label = max(covered, key=covered.get)
    if covered[best_label] <= threshold:
        cytoband.cn_status = 'None'
    else:
        cytoband.cn_status = best_label

    return cytoband


def createAllCytoband_of_chr(data, cytoband_data, nr_chromosome):
    '''
    Builds every cytoband of one chromosome and fills in all of its attributes.
    For each cytoband the overlapping segments are determined first; from these segments the
    amplification/deletion/CIN values, the copy neutral counts and the copy neutral status
    (with a threshold of 90) are derived.
    Input:
    data (dataframe): Allelic specific copy number dataframe of one sample
    cytoband_data (dataframe): Cytoband info dataframe with the columns 'chromosome', 'cytoband', 'start', 'end' and 'length'
    nr_chromosome (string): chromosome number like 'chr1'
    Output:
    (list of Cytoband objects): In the order in which the cytobands appear in cytoband_data
    '''
    chr_data = filter_by_chromosome(data, nr_chromosome)
    bands_on_chr = cytoband_data[cytoband_data['chromosome'] == nr_chromosome]

    finished_bands = []
    for band_name in bands_on_chr['cytoband']:
        # Coordinates come from the first row carrying this cytoband name
        band_row = bands_on_chr[bands_on_chr['cytoband'] == band_name]
        band_start = band_row['start'].values[0]
        band_end = band_row['end'].values[0]
        band_length = band_row['length'].values[0]

        band = Cytoband(band_name, nr_chromosome, band_start, band_end, band_length)

        band = addSegmentIndicesToCytoband(chr_data, band)
        band_segments = filter_by_segments(chr_data, band.segments_indices)

        band = add_amp_del_cin_to_cytoband(band_segments, band)
        band = add_cn_numbers_to_cytoband(band_segments, band)
        band = compute_cn_status_cytoband(band_segments, band, threshold = 90)

        finished_bands.append(band)

    return finished_bands

# Function Chromosome

In [None]:
## not used anymore
def initiateAllChromosome(chromosome_data):
    '''
    Creates one Chromosome object for every entry in the 'Chromosome' column of chromosome_data.
    The length passed to a Chromosome is the 'length' value of the first row with that chromosome name.
    Input:
    chromosome_data (dataframe): Dataframe with the columns 'Chromosome' and 'length' (look at Chromosome_data.ipynb)
    Output:
    (List of Chromosome objects): In the order of the 'Chromosome' column
    '''
    def build_chromosome(chr_name):
        matching_rows = chromosome_data[chromosome_data['Chromosome'] == chr_name]
        return Chromosome(chr_name, matching_rows['length'].values[0])

    return [build_chromosome(chr_name) for chr_name in chromosome_data['Chromosome']]

def add_amp_del_cin_to_chromosome(chr_data, chromosome):
    '''
    Computes the amplification total, the deletion total and the CIN of a chromosome and stores them on it.
    Additionally stores norm_cin, the CIN divided by the chromosome length.
    Input:
    chr_data (dataframe): Allelic specific copy number dataframe of one sample for one chromosome
    chromosome (Chromosome object): Its length attribute is read for the normalisation
    Output
    chromosome (Chromosome object): The same chromosome with amplifications, deletions, cin and norm_cin set
    '''
    amp_total = compute_cin_amp(chr_data)
    del_total = compute_cin_del(chr_data)
    cin_total = compute_cin(amp_total, del_total)

    chromosome.amplifications, chromosome.deletions, chromosome.cin = amp_total, del_total, cin_total

    # CIN relative to the chromosome length
    chromosome.norm_cin = chromosome.cin / chromosome.length

    return chromosome

def add_cn_numbers_to_chromosome(chr_data, chromosome):
    '''
    Counts the segments per copy neutral state (CN-LOH, CN-AMP, CN-GAIN) and stores the counts on the chromosome.
    A state that does not occur gets the count 0.
    Input:
    chr_data (dataframe): Segments of one chromosome with a 'Copy_Neutral' column
    chromosome (Chromosome object)
    Output:
    chromosome (Chromosome object): The same chromosome with number_cn_loh, number_cn_amp and number_cn_gain set
    '''
    states = chr_data['Copy_Neutral']
    event_counts = states[states != 'None'].value_counts()

    chromosome.number_cn_loh = event_counts.get('CN-LOH', 0)
    chromosome.number_cn_amp = event_counts.get('CN-AMP', 0)
    chromosome.number_cn_gain = event_counts.get('CN-GAIN', 0)

    return chromosome

def compute_cn_status_chromosome(chr_data, chromosome, threshold=90):
    '''
    Decides which copy neutral status (CN-AMP, CN-GAIN, CN-LOH or 'None') the chromosome as a whole gets.
    For every state with a non-zero count on the chromosome, the summed 'Length' of the segments
    with that state is expressed as a percentage of the chromosome length.
    The state with the largest percentage becomes the status, but only if that percentage is
    strictly greater than threshold; otherwise the status is 'None'. If several states share
    the largest percentage, the first one in the order CN-AMP, CN-GAIN, CN-LOH is taken.
    Input:
    chr_data (dataframe): Segments of one chromosome with the columns 'Copy_Neutral' and 'Length'
    chromosome (Chromosome object): Its length, number_cn_amp, number_cn_gain and number_cn_loh attributes are read
    threshold (Int): Percentage that has to be exceeded
    Output
    chromosome (Chromosome object): The same chromosome with cn_status set
    '''
    chromosome_length = chromosome.length

    event_counts = {
        'CN-AMP': chromosome.number_cn_amp,
        'CN-GAIN': chromosome.number_cn_gain,
        'CN-LOH': chromosome.number_cn_loh,
    }

    # Without any copy neutral event there is nothing to compare
    if all(count == 0 for count in event_counts.values()):
        chromosome.cn_status = 'None'
        return chromosome

    covered = {}
    for label, count in event_counts.items():
        if count == 0:
            continue
        label_length = chr_data.loc[chr_data['Copy_Neutral'] == label, 'Length'].sum()
        covered[label] = (label_length / chromosome_length) * 100

    best_label = max(covered, key=covered.get)
    if covered[best_label] <= threshold:
        chromosome.cn_status = 'None'
    else:
        chromosome.cn_status = best_label

    return chromosome

def createWholeChromosome(data, chromosome_data, mode, cytoband_data = None):
    '''
    Builds a Chromosome object for every chromosome in chromosome_data and fills in its attributes.
    Every chromosome gets its amplification/deletion/CIN values, its copy neutral counts and its
    copy neutral status (with a threshold of 90). Depending on mode, the lower levels are added:
      'Arm'      -> arm_p and arm_q
      'Cytoband' -> cytobands
      'All'      -> arm_p, arm_q and cytobands
    Any other mode (e.g. 'Chromosome') adds no lower level.
    Input:
    data (dataframe): Allelic specific copy number dataframe of one sample
    chromosome_data (dataframe): Dataframe containing information about the chromosomes such as arm length ect. (look at Chromosome_data.ipynb)
    mode (string): Indicates what level should be calculated ('Chromosome', 'Cytoband', 'Arm', 'All')
    cytoband_data (dataframe): Cytoband info dataframe; only used when mode is 'Cytoband' or 'All'
    Output:
    (list of Chromosome objects): In the order of the 'Chromosome' column of chromosome_data
    '''
    with_arms = mode in ('Arm', 'All')
    with_cytobands = mode in ('Cytoband', 'All')

    finished_chromosomes = []
    for chr_name in chromosome_data['Chromosome']:
        chr_length = chromosome_data[chromosome_data['Chromosome'] == chr_name]['length'].values[0]
        chromosome = Chromosome(chr_name, chr_length)

        chr_data = filter_by_chromosome(data, chromosome.chromosome_number)

        chromosome = add_amp_del_cin_to_chromosome(chr_data, chromosome)
        chromosome = add_cn_numbers_to_chromosome(chr_data, chromosome)
        chromosome = compute_cn_status_chromosome(chr_data, chromosome, threshold = 90)

        if with_arms:
            arm_p, arm_q = createBothArms(data, chromosome_data, chromosome.chromosome_number)
            chromosome.arm_p = arm_p
            chromosome.arm_q = arm_q

        if with_cytobands:
            chromosome.cytobands = createAllCytoband_of_chr(data, cytoband_data, chromosome.chromosome_number)

        finished_chromosomes.append(chromosome)

    return finished_chromosomes


# Function Patient

In [None]:
def createAllPatients(files_list, chromosome_data, mode, path_to_files, cytoband_data = None):
    '''
    Creates a patient object for every sample file and adds all attributes and the lower levels.
    For each file the copy number data is loaded and prepared, the chromosomes (without the sex
    chromosomes) are built according to mode, and the patient totals are computed.
    A progress line is printed after every 100 patients.
    Input:
    files_list (list of strings): List containing the file names
    chromosome_data (dataframe): Dataframe containing information about the chromosomes such as arm length ect. (look at Chromosome_data.ipynb)
    mode (string): Indicates what level should be calculated ('Chromosome', 'Cytoband', 'Arm', 'All')
    path_to_files (string): Path to the folder with the files in it
    cytoband_data (dataframe): Cytoband info dataframe, default is None, so that if mode is not Cytoband or All, no input is needed.
    Output:
    patient_list (list of Patient objects): One patient per file, in the order of files_list
    '''
    # The sex chromosome filter does not depend on the patient, so it is applied once up front
    autosome_data = filterout_sex_chromosome(chromosome_data)

    nr_files = len(files_list)
    patient_list = []

    for nr_done, filename in enumerate(files_list, start=1):
        patient = Patient(filename)

        data = loadPatientData(path_to_files, filename)
        data = preparePatientData(data)

        patient.chromosomes = createWholeChromosome(data, autosome_data, mode, cytoband_data)
        patient = add_attributes_to_patient(patient)

        patient_list.append(patient)

        if nr_done % 100 == 0:
            print('Progress: '+str(nr_done)+'/'+str(nr_files)+ ' patients done')

    return patient_list



def add_attributes_to_patient(patient):
    '''
    Sums the values of all chromosomes of a patient and stores the totals on the patient.
    Note that the patient cin is the sum of the norm_cin values of the chromosomes
    (not of their cin values).
    Input:
    patient (Patient object): Its chromosomes attribute is read
    Output
    patient (Patient object): The same patient with amplifications, deletions, cin,
                              number_cn_amp, number_cn_gain and number_cn_loh set
    '''
    chromosomes = list(patient.chromosomes)

    def total(attribute):
        # Sum of one attribute over all chromosomes, 0 for a patient without chromosomes
        return sum(getattr(chromosome, attribute) for chromosome in chromosomes)

    amplifications = total('amplifications')
    deletions = total('deletions')
    cin = total('norm_cin')
    n_cn_amp = total('number_cn_amp')
    n_cn_gain = total('number_cn_gain')
    n_cn_loh = total('number_cn_loh')

    patient.amplifications = amplifications
    patient.deletions = deletions
    patient.cin = cin

    patient.number_cn_amp = n_cn_amp
    patient.number_cn_gain = n_cn_gain
    patient.number_cn_loh = n_cn_loh

    return patient

def loadPatientData(path_to_files, filename):
    '''
    Loads the allelic specific copy number data for a given patient.
    The file is read as a tab separated table whose first line is the header.
    Input:
    path_to_files (string): Path to the folder with the files in it (with or without a trailing separator)
    filename (string): Allelic specific copy number file name
    Output:
    (dataframe): Allelic specific copy number dataframe of one sample
    '''
    # os.path.join inserts the separator only when the folder path does not already end with one
    file_path = os.path.join(path_to_files, filename)
    return pd.read_csv(file_path, sep='\t', header=0)

def preparePatientData(data):
    '''
    Prepares the raw copy number data of one sample for the analysis.
    The sex chromosomes are removed, then the 'Length' column and the 'Copy_Neutral' column are added.
    Each annotation step works on a copy, so the passed dataframe is not modified.
    Input:
    data (dataframe): Allelic specific copy number dataframe of one sample
    Output:
    (dataframe): Filtered copy of the input with the added columns
    '''
    autosomal = filterout_sex_chromosome(data)
    with_length = annotateSegmentLength(autosomal.copy())
    return annotateSegmentCopyNeutralState(with_length.copy())


def addHRDResultsToPatient(patient_list, HRD_results):
    '''
    Adds the HRD results to each patient.
    For every patient the first row of HRD_results whose 'File Name' equals the patient name
    is looked up, and its scores are stored on the patient as HRD, LOH, LST and TAI.
    An IndexError is raised if HRD_results has no row for a patient.
    Input
    patient_list (list of Patient objects): The name attribute of each patient is read
    HRD_results (dataframe): Contains the HRD scores in the columns 'File Name', 'HRD_sum', 'LST', 'LOH' and 'TAI'
    Output:
    patient_list (list of Patient objects): The same list; the patients are updated in place
    '''
    for patient in patient_list:
        # Look the patient up once and read all four scores from that result
        patient_rows = HRD_results[HRD_results['File Name'] == patient.name]

        hrd = patient_rows['HRD_sum'].values[0]
        lst = patient_rows['LST'].values[0]
        loh = patient_rows['LOH'].values[0]
        tai = patient_rows['TAI'].values[0]

        patient.HRD = hrd
        patient.LOH = loh
        patient.LST = lst
        patient.TAI = tai

    return patient_list

# General functions

In [None]:
def loadData(chromosome_data_path, cytoband_data_path, HRD_results_path):
    '''
    Reads the three comma separated input tables of the pipeline.
    The chromosome and cytoband tables use their first column as index. The HRD table keeps the
    default index and is reduced to the rows whose 'Type' column equals 'Primary'.
    Input:
    chromosome_data_path (string): Path to chromosome data
    cytoband_data_path (string): Path to cytoband data
    HRD_results_path (string): Path to HRD results
    Output:
    chromosome_data (dataframe)
    cytobands_data (dataframe)
    HRD_results_primary (dataframe): HRD results of the primary samples only
    '''
    csv_options = {'sep': ',', 'header': 0}

    chromosome_table = pd.read_csv(chromosome_data_path, index_col=0, **csv_options)
    cytoband_table = pd.read_csv(cytoband_data_path, index_col=0, **csv_options)

    all_hrd_results = pd.read_csv(HRD_results_path, **csv_options)
    is_primary = all_hrd_results['Type'] == 'Primary'

    return chromosome_table, cytoband_table, all_hrd_results[is_primary]



def _addAttributeColumns(row, prefix, region, attributes):
    '''
    Copies every attribute listed in attributes from region into the dict row,
    using the key '<prefix>_<attribute>'.
    '''
    for attribute in attributes:
        row[f"{prefix}_{attribute}"] = getattr(region, attribute)


def writeResults(patient_list, project_id, mode, output_path):
    '''
    Writes the results into tables. For each level (Patient, Chromosome, Arm, Cytoband) a table is created, which depends on the mode.
    The patient and chromosome tables are always written. Every table has one row per patient and is
    saved as '<output_path>/<project_id>_level_<level>.csv'.
    Input:
    patient_list (list of Patients objects)
    project_id (string): Cancer type/ Project ID in the form of 'TCGA-LUAD'
    mode (string): 'Arm' additionally writes the arm table, 'Cytoband' additionally writes the cytoband table,
                   'All' writes both. Any other value writes only the patient and chromosome tables.
    output_path (string): Path to the result folder
    '''

    attributes = ['length', 'amplifications', 'deletions', 'cin', 'norm_cin', 'cn_status',
                  'number_cn_amp', 'number_cn_gain', 'number_cn_loh']

    ## 'All' is the union of the 'Arm' and the 'Cytoband' output
    write_arms = mode in ('Arm', 'All')
    write_cytobands = mode in ('Cytoband', 'All')

    chromosome_data = []
    arm_data = []
    cytoband_data = []

    df_patient = pd.DataFrame(columns=['File Name',
                                      'cin',
                                      'amplifications',
                                      'deletions',
                                      'n_cn_amp',
                                      'n_cn_gain',
                                      'n_cn_loh',
                                      'HRD_sum',
                                      'LST',
                                      'LOH',
                                      'TAI'])

    for patient in patient_list:
        ## Add new patient
        df_patient.loc[len(df_patient)] = [patient.name,
                                          patient.cin,
                                          patient.amplifications,
                                          patient.deletions,
                                          patient.number_cn_amp,
                                          patient.number_cn_gain,
                                          patient.number_cn_loh,
                                          patient.HRD,
                                          patient.LST,
                                          patient.LOH,
                                          patient.TAI]

        ## One row per level, all of them keyed by the same 'File Name' column
        patient_data_chromosome = {'File Name': patient.name}
        patient_data_arm = {'File Name': patient.name}
        patient_data_cytoband = {'File Name': patient.name}

        for chromosome in patient.chromosomes:

            ## Add chromosome results
            _addAttributeColumns(patient_data_chromosome, chromosome.chromosome_number, chromosome, attributes)

            ## Add arm results
            if write_arms:
                for arm_label in ['p', 'q']:
                    arm = getattr(chromosome, f"arm_{arm_label}")
                    _addAttributeColumns(patient_data_arm, arm.chromosome_arm, arm, attributes)

            ## Add cytoband results
            if write_cytobands:
                for cytoband in chromosome.cytobands:
                    _addAttributeColumns(patient_data_cytoband, cytoband.chromosome_cyto, cytoband, attributes)

        chromosome_data.append(patient_data_chromosome)
        if write_arms:
            arm_data.append(patient_data_arm)
        if write_cytobands:
            cytoband_data.append(patient_data_cytoband)

    ## Create the dataframes and save them
    ## Every table shares the same prefix, so all files end up inside output_path
    file_prefix = output_path+'/'+project_id+'_level_'

    df_patient.to_csv(file_prefix+'patient.csv', sep=',', header = True)
    pd.DataFrame(chromosome_data).to_csv(file_prefix+'chromosome.csv', sep=',', header = True)

    if write_arms:
        pd.DataFrame(arm_data).to_csv(file_prefix+'arms.csv', sep=',', header = True)

    if write_cytobands:
        pd.DataFrame(cytoband_data).to_csv(file_prefix+'cytobands.csv', sep=',', header = True)

def checkFolder(path):
    '''
    Tells whether the given path points to an existing directory
    Input
    path (string)
    Output:
    (boolean): True if path exists and is a directory, False otherwise
    '''
    ## isdir is already False for paths that do not exist
    return os.path.isdir(path)

def run_CIN_pipeline(path_to_files, chromosome_data_path, cytoband_data_path, HRD_results_path, mode, output_path, version, selection = None):

    '''
    Function to run the pipeline.
    For every cohort the patients are created, the HRD results are added and the result tables are written to
    '<output_path>/CIN_Output_version_<version>/<Project ID>'. A cohort whose result folder already exists and
    contains files is skipped. The run time of every computed cohort is appended to 'timetable_<version>.csv'
    in the version folder.
    Input:
    path_to_files (string): Path to the folder with the allelic specific copy number files
    chromosome_data_path (string): Path to chromosome data
    cytoband_data_path (string): Path to cytoband data
    HRD_results_path (string): Path to HRD results
    mode (string): Indicates for which levels results should be computed. Possible mode are 'Chromosome', 'Arm', 'Cytoband', 'All'.
                    Every level includes Patient and Chromosome output ('Chromosome' is for only Patient + Chromosome level)
    output_path (string): Path to the folder at which location the result folder is created
    version (string): Indicates the run, if the same version is run twice it check for each cancertype if there are results and if there are it skips it.
    selection (list of strings): Default is None (same as an empty list), if given a list of project IDs (like 'TCGA-LUAD') only the selected cohort are computed
    '''

    ## Load data
    chromosome_data, cytoband_data, HRD_results = loadData(chromosome_data_path, cytoband_data_path, HRD_results_path)

    ## Get Project IDs
    cancertypes = np.unique(HRD_results['Project ID'])

    ## Path to result folder
    mainresults_path = output_path + '/CIN_Output_version_'+ version

    ## Creates the results folder if not there
    if not checkFolder(mainresults_path):
        os.makedirs(mainresults_path)

    ## If only selection of cohort should be run, change the cancertype list to the selection
    ## (None and an empty selection both mean: run every cohort found in the HRD results)
    if selection is not None and len(selection) > 0:
        cancertypes = selection

    ## Dict to save the times need to run a cohort
    time_dict = dict()

    for cancertype in cancertypes:

        ## Creates the results subfolder for a cancertype if there is none
        ## If there is one and files are in there, the cancertype is skipped
        output_path_cancertype = mainresults_path + '/' + cancertype
        if checkFolder(output_path_cancertype):
            if len(os.listdir(output_path_cancertype)) > 0:
                print(cancertype + ' already computed')
                continue
        else:
            os.makedirs(output_path_cancertype)

        ## Start timer
        start_time = time.time()

        print('Start of cohort '+ cancertype)

        ## Subset HRD results
        HRD_results_sub = HRD_results[HRD_results['Project ID'] == cancertype]

        ## Get the file names of the samples
        files = list(HRD_results_sub['File Name'])

        ## Create all patients object for each sample (main computation)
        patient_list = createAllPatients(files, chromosome_data, mode, path_to_files, cytoband_data)

        ## Add HRD results
        patient_list = addHRDResultsToPatient(patient_list, HRD_results_sub)

        print('Finished computation, start with writing the results into tables')

        ## Write the results to tables
        writeResults(patient_list, cancertype , mode, output_path_cancertype)

        execution_time = time.time() - start_time

        ## Save needed time
        time_dict[cancertype] = execution_time

        print('Finished cohort '+ cancertype +' in '+str(execution_time))

    time_df = pd.DataFrame.from_dict(time_dict, orient='index', columns=['Time in s'])

    ### Rethink this
    ## Check if time already has been measured for this version, and if that is so,
    ## append the new times to the old ones before the table is written
    timetable_path = mainresults_path+'/timetable_'+version+'.csv'
    if os.path.exists(timetable_path):
        old_results = pd.read_csv(timetable_path, index_col = 0)
        time_df = pd.concat([old_results, time_df])
    time_df.to_csv(timetable_path, sep=',', header = True)