In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# find_isomers(df, column, threshhold)

In [None]:
def find_isomers(df: pd.DataFrame, column: str, threshhold: float):
    '''
    Finds peptides whose values in `column` are as close as, or closer than,
    the threshold to a neighbouring peptide.

    Rows are first de-duplicated on ('Sequence', column) and sorted by `column`.
    Two consecutive rows are linked when their difference is <= threshhold, and
    linked rows are chained: a run of consecutive links forms one set, even if
    its first and last member are further apart than the threshold.

    df: input data; must contain a 'Sequence' column and `column`
    column: name of the numeric column that is compared
    threshhold: largest difference between neighbours that still links them
    ------
    returns a new dataframe with all rows that belong to a set, sorted by
    `column`, with an integer 'Set' column inserted at position 1
    (sets are numbered from 0 in order of increasing `column` value);
    the input dataframe is not modified
    '''
    unique_sorted = (
        df.drop_duplicates(subset=['Sequence', column], ignore_index=True)
          .sort_values(by=column, ignore_index=True)
    )

    # distance of every row to the following row (NaN for the last row, which compares as False)
    gap_to_next = unique_sorted[column].diff().shift(-1)
    linked_to_next = gap_to_next <= threshhold
    # fill_value keeps the boolean dtype, so no fillna / object-dtype detour is needed
    linked_to_prev = linked_to_next.shift(1, fill_value=False)
    in_set = linked_to_next | linked_to_prev

    # a set starts at every member row that is not linked to the row before it;
    # counting the starts seen so far gives the set number of each row
    set_start = in_set & ~linked_to_prev
    set_ids = set_start.cumsum() - 1

    # explicit copy so that adding the column never touches a view of the sorted frame
    isomers_df = unique_sorted[in_set].copy()
    isomers_df.insert(loc=1, column='Set', value=set_ids[in_set])
    return isomers_df

# plot_scatter(df, x, y, c, (figsize))

In [None]:
def plot_scatter(df: pd.DataFrame, x: str, y: str, c: str, figsize=(10,6)):
    '''
    Draws a scatter plot of column `y` against column `x`, with one series per
    distinct value of column `c`.

    df: data to plot
    x, y: names of the columns used for the x and y axis
    c: name of the column whose distinct values define the plotted groups
    figsize: (width, height) passed to plt.subplots; the legend lists at most
             int(3 * height) groups
    ------
    shows the figure and returns nothing; prints a note when not all groups
    are listed in the legend
    '''
    fig, ax = plt.subplots(figsize=figsize)
    legend_labels = []
    for category, group in df.groupby(c):
        ax.scatter(group[x], group[y], label=category)
        legend_labels.append(category)

    # slice bounds must be integers, but figure heights are often floats (e.g. 4.5)
    max_labels = int(3 * figsize[1])

    ax.set_xlabel(x)
    ax.set_ylabel(y)
    # legend is anchored outside the axes, to the right of the plot
    ax.legend(title=c, labels=legend_labels[:max_labels], bbox_to_anchor=(1.0, 1.0), loc='upper left', ncols=1)
    if len(legend_labels) > max_labels:
        print(f'{len(legend_labels)} Labels, but only {max_labels} shown')

    plt.show()

# get_peptides_across_many_fractions(df, cutoff, cat)

In [None]:
def get_peptides_across_many_fractions(df:pd.DataFrame, cutoff:int, cat = 'Sequence'):
    '''
    Selects the rows of all groups that appear in at least `cutoff` different fractions.

    df: input data; must contain a 'Fraction' column and the `cat` column
    cutoff: minimum number of distinct 'Fraction' values a group needs to be kept
    cat: category (column name) used for grouping, 'Sequence' by default
    ------
    returns a new dataframe with the rows of df whose `cat` value occurs in
    `cutoff` or more distinct fractions;
    same columns as in df, plus a 'Color' column holding an integer code per
    distinct `cat` value; the input dataframe is not modified
    '''
    # nunique already ignores repeated values, so no prior de-duplication is needed
    fractions_per_group = df.groupby(cat)['Fraction'].nunique()
    selected_groups = fractions_per_group.index[fractions_per_group >= cutoff]

    # copy before adding a column, otherwise the assignment targets a slice of df
    df_top = df[df[cat].isin(selected_groups)].copy()
    df_top['Color'] = pd.Categorical(df_top[cat]).codes
    return df_top

# difference_mass_modification(df, modification, property, threshhold)

In [None]:
def difference_mass_modification(df: pd.DataFrame, modification: str, property: str, threshhold=10000):
    '''
    Calculates, for every peptide carrying the given modification, the difference
    between its `property` value and the mean `property` of the unmodified
    peptides with the same sequence.

    df: input data with 'Sequence' and 'Modifications' columns and the `property` column
    modification: value of 'Modifications' that selects the modified peptides
    property: name of the numeric column that is compared
    threshhold: maximum number of modified rows used; larger selections are
                randomly down-sampled with a fixed seed
    ------
    returns the modified rows that have an unmodified counterpart, with the
    columns '<property>_mod', '<property>_unmod' (mean over the unmodified rows)
    and 'Difference' (= mod - unmod)
    '''
    # modified peptides that actually have a value for the property
    is_modified = df['Modifications'] == modification
    modified = df.loc[is_modified].dropna(subset=[property])

    # cap the amount of data; the fixed seed keeps the selection reproducible
    if len(modified) > threshhold:
        modified = modified.sample(n=threshhold, random_state=42)

    # mean property per sequence over the unmodified versions of those peptides
    is_reference = (df['Modifications'] == 'Unmodified') & df['Sequence'].isin(modified['Sequence'])
    unmodified_mean = (
        df.loc[is_reference, ['Sequence', property]]
          .groupby('Sequence')[property]
          .mean()
          .reset_index()
    )

    # inner join drops modified peptides without an unmodified counterpart
    merged_df = modified.merge(
        unmodified_mean,
        on='Sequence',
        how='inner',
        suffixes=('_mod', '_unmod'),
    )
    merged_df['Difference'] = merged_df[f'{property}_mod'] - merged_df[f'{property}_unmod']

    return merged_df