# Imports

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%run methods_freshfrozen.ipynb

In [None]:
# Read the tab-separated file into the notebook-wide frame `df`
# (read_table uses a tab separator by default).
df = pd.read_table("evidence_freshfrozen.txt")

# Data Exploration

In [None]:
# One sub-frame per value of the 'Experiment' column.
df_51, df_64, df_28 = (
    df[df["Experiment"] == experiment_id]
    for experiment_id in ('P064051', 'P064064', 'P064428')
)

## General

In [None]:
# Mass vs CCS, one scatter series per charge value.
for charge, charge_rows in df.groupby('Charge'):
    plt.scatter(x=charge_rows['Mass'], y=charge_rows['CCS'], alpha=0.1, label=charge)
plt.legend(title='Charge')
plt.ylabel('CCS')
plt.xlabel('Mass')
plt.show()

In [None]:
# Mass vs 1/K0, one scatter series per charge value.
# No `cmap` is passed: without a `c` array matplotlib has nothing to colormap,
# ignores the argument and emits a UserWarning. Each series takes its colour
# from the default property cycle instead.
groups = df.groupby('Charge')
for name, group in groups:
    plt.scatter(group['Mass'], group['1/K0'], label=name, alpha=0.1)
plt.xlabel('Mass')
plt.ylabel('1/K0')
plt.legend(title='Charge')
plt.show()

In [None]:
# m/z vs 1/K0, one scatter series per charge value.
fig, ax = plt.subplots()
for charge, charge_rows in df.groupby('Charge'):
    ax.scatter(x=charge_rows['m/z'], y=charge_rows['1/K0'], alpha=0.1, label=charge)
ax.set_xlabel('m/z')
ax.set_ylabel('1/K0')
ax.legend(title='Charge')
plt.show()


In [None]:
# CCS vs 1/K0, one scatter series per charge value.
for charge, charge_rows in df.groupby('Charge'):
    plt.scatter(x=charge_rows['CCS'], y=charge_rows['1/K0'], alpha=0.1, label=charge)
plt.xlabel('CCS')
plt.ylabel('1/K0')
plt.legend(title='Charge')
plt.show()

In [None]:
# Mass vs retention time, one scatter series per charge value.
fig, ax = plt.subplots()
for charge, charge_rows in df.groupby('Charge'):
    ax.scatter(x=charge_rows['Mass'], y=charge_rows['Retention time'], alpha=0.1, label=charge)
ax.set_xlabel('Mass')
ax.set_ylabel('Retention time')
ax.legend(title='Charge')
plt.show()

In [None]:
# 1/K0 vs retention time, one scatter series per charge value.
for charge, charge_rows in df.groupby('Charge'):
    plt.scatter(x=charge_rows['1/K0'], y=charge_rows['Retention time'], alpha=0.1, label=charge)
plt.legend(title='Charge')
plt.xlabel('1/K0')
plt.ylabel('Retention time')
plt.show()

In [None]:
# Retention time at each charge value (x is the charge itself, so every
# series forms a vertical strip).
fig, ax = plt.subplots()
for charge, charge_rows in df.groupby('Charge'):
    ax.scatter(x=charge_rows['Charge'], y=charge_rows['Retention time'], alpha=0.1, label=charge)
ax.set_xlabel('Charge')
ax.set_ylabel('Retention time')
ax.legend(title='Charge')
plt.show()

In [None]:
# Distribution of retention time over the whole table.
fig, ax = plt.subplots()
ax.hist(df['Retention time'], bins=30)
ax.set_title('Retention time')
ax.set_xlabel('Retention time')
plt.show()

In [None]:
# TODO: how many data points are there over all fractions?
# Overlaid histograms of the 'Fraction' column, one per experiment. Each
# series is labelled so the overlapping bars can be told apart.
df_51 = df[df["Experiment"]=='P064051']
df_64 = df[df["Experiment"]=='P064064']
df_28 = df[df["Experiment"]=='P064428']

for experiment_id, experiment_rows in (('P064051', df_51), ('P064064', df_64), ('P064428', df_28)):
    plt.hist(experiment_rows['Fraction'], alpha=0.5, bins=48, label=experiment_id)
plt.xlabel('Fraction')
plt.ylabel('Count')
plt.legend(title='Experiment')
plt.show()

In [None]:
# Span between the largest and smallest 1/K0 value in the table.
k0_values = df['1/K0']
im_range = k0_values.max() - k0_values.min()

## Isomers

In [None]:
# Build the isomer table with `find_isomers`, which is not defined in this
# notebook (presumably loaded by `%run methods_freshfrozen.ipynb`).
# Later cells read a 'Set' column from the result.
# NOTE(review): the arguments look like "match on 'm/z' with tolerance 0.0" —
# confirm against the helper's definition.
isomers_df = find_isomers(df, 'm/z', 0.0)

In [None]:
# m/z vs 1/K0 for the isomer table, coloured by charge.
isomers_df.plot(
    kind='scatter',
    x='m/z',
    y='1/K0',
    c='Charge',
    cmap='viridis',
    alpha=0.5,
    figsize=(20, 8),
)

### RT Difference

In [None]:
# Retention-time spread (max - min) inside every isomer set.
im_diff_df = isomers_df[['Set', 'Retention time', 'Fraction', 'Experiment']]
rt_by_set = im_diff_df.groupby('Set')['Retention time']
grouped_diff_df = (rt_by_set.max() - rt_by_set.min()).reset_index()
plt.hist(x=grouped_diff_df['Retention time'], bins=100)
plt.title('RT Difference per Set')
plt.xlabel('RT Difference')

In [None]:
# Keep only the isomer sets whose retention-time spread is at most 1.80.
close_rt_sets = grouped_diff_df.loc[grouped_diff_df['Retention time'] <= 1.80, 'Set']
isomers_rt = isomers_df[isomers_df['Set'].isin(close_rt_sets)]

### IM Difference

In [None]:
# Narrow view of the RT-filtered isomers with a fixed column selection.
rt_small_columns = [
    'Sequence', 'Set', 'Fraction', 'Experiment', 'Charge',
    'm/z', 'Retention time', '1/K0', 'Score', 'Delta score',
]
isomers_rt_small = isomers_rt.loc[:, rt_small_columns]

In [None]:
# Retention time vs 1/K0 for the RT-filtered isomers, coloured by charge.
isomers_rt.plot(
    kind='scatter',
    x='Retention time',
    y='1/K0',
    c='Charge',
    colormap='gist_rainbow',
    figsize=(20, 8),
)

In [None]:
# 1/K0 spread (max - min) inside every RT-filtered isomer set, plotted
# relative to the overall 1/K0 range (im_range).
im_diff_df = isomers_rt[['Set', '1/K0', 'Fraction', 'Experiment']]
k0_by_set = im_diff_df.groupby('Set')['1/K0']
grouped_diff_df = (k0_by_set.max() - k0_by_set.min()).reset_index()
relative_k0_diff = grouped_diff_df['1/K0'] / im_range
plt.hist(x=relative_k0_diff, bins=100)
plt.title('1/K0 Difference per Set')
plt.xlabel('Relative 1/K0 Difference')

In [None]:
# Sets whose 1/K0 spread is below 0.06, and the matching rows of isomers_df.
small_im_diff = grouped_diff_df.loc[grouped_diff_df['1/K0'] < 0.06]
in_small_diff_set = isomers_df['Set'].isin(small_im_diff['Set'])
im_diff_columns = [
    'Sequence', 'Set', 'Fraction', 'Experiment', 'Modifications', 'Charge',
    'm/z', 'Retention time', '1/K0', 'CCS', 'Score', 'Delta score',
]
im_diff_df = isomers_df.loc[in_small_diff_set, im_diff_columns]
print(small_im_diff.shape[0])

#### CCS Variance

In [None]:
# Find rows whose modified sequence occurs more than once within the same
# fraction and experiment.
same_key = ['Modified sequence', 'Fraction', 'Experiment']
rt_isomers = df.groupby(same_key).filter(lambda x: len(x) > 1)

# The following cell reads `same_rt` (row-level table with a 'Set' column) and
# `same_rt_std` (per-'Set' table with a '1/K0' column); neither was defined.
# NOTE(review): reconstructed from how that cell uses them — each repeated
# (modified sequence, fraction, experiment) group becomes one 'Set', and
# `same_rt_std` holds the per-set standard deviation. Confirm this matches the
# intended analysis.
same_rt = rt_isomers.copy()
same_rt['Set'] = same_rt.groupby(same_key).ngroup()
same_rt_std = same_rt.groupby('Set')[['1/K0', 'CCS']].std().reset_index()

In [None]:
# Sets for which the '1/K0' entry of same_rt_std is exactly 0, and the
# matching rows of same_rt.
has_zero_k0 = same_rt_std['1/K0'] == 0
pairs = same_rt_std.loc[has_zero_k0, 'Set']
pair_columns = [
    'Sequence', 'Fraction', 'Charge', 'Modifications', 'Experiment',
    'm/z', '1/K0', 'CCS', 'PEP', 'Score', 'Delta score',
]
print(same_rt.loc[same_rt['Set'].isin(pairs.tolist()), pair_columns])

## Modifications

In [None]:
# One row per distinct (Sequence, Modified sequence, Modifications), keeping
# only rows that have a modified sequence.
df_mod = (
    df.drop_duplicates(subset=['Sequence', 'Modified sequence', 'Modifications'])
      .dropna(subset=['Modified sequence'])
)

# Integer code per modification string, used as the colour value below.
modification_codes = pd.Categorical(df_mod['Modifications']).codes
for x_column in ('Retention time', 'Fraction', 'Length'):
    df_mod.plot.scatter(x=x_column, y='1/K0', c=modification_codes, colormap='viridis', alpha=0.5)

# NOTE(review): no `c` is given here, so the colormap presumably has no effect — confirm.
df_mod.plot.scatter(x='Mass', y='Modifications', colormap='viridis', alpha=0.5)

In [None]:


# One table per 'Modifications' label for the '1/K0' property, built with
# `difference_mass_modification2` (not defined in this notebook; presumably
# loaded by `%run methods_freshfrozen.ipynb`). Later cells read a 'Difference'
# column from these tables.
# NOTE(review): the optional fourth argument (1000) is passed for only some of
# the labels; its meaning is not visible here — confirm against the helper.
df_un = difference_mass_modification2(df, 'Unmodified', '1/K0', 1000)
df_ac = difference_mass_modification2(df, 'Acetyl (Protein N-term)', '1/K0',1000)
df_ac_ox = difference_mass_modification2(df, 'Acetyl (Protein N-term),Oxidation (M)', '1/K0', 1000)
df_ac_2ox = difference_mass_modification2(df, 'Acetyl (Protein N-term),2 Oxidation (M)', '1/K0')
df_ac_3ox = difference_mass_modification2(df, 'Acetyl (Protein N-term),3 Oxidation (M)', '1/K0')
df_ox = difference_mass_modification2(df, 'Oxidation (M)', '1/K0', 1000)
df_ox2 = difference_mass_modification2(df, '2 Oxidation (M)', '1/K0', 1000)
df_ox3 = difference_mass_modification2(df, '3 Oxidation (M)', '1/K0')
df_ox4 = difference_mass_modification2(df, '4 Oxidation (M)', '1/K0')
df_ox5 = difference_mass_modification2(df, '5 Oxidation (M)', '1/K0')

In [None]:
# Mass comparison for 'Oxidation (M)'. This rebinds `df_ox`, which an earlier
# cell assigned for the '1/K0' property.
df_ox = difference_mass_modification2(df, 'Oxidation (M)', 'Mass', 1000)
mass_report_columns = ['Sequence', 'Mass_mod', 'Mass_unmod', 'Difference']
print(df_ox.loc[:, mass_report_columns])

In [None]:
# Histogram of the 'Acetyl (Protein N-term)' 1/K0 differences, divided by 0.5.
df_ac = difference_mass_modification2(df, 'Acetyl (Protein N-term)', '1/K0')
scaled_ac_difference = df_ac['Difference'] / 0.5
plt.hist(x=scaled_ac_difference)

In [None]:
# Rebind `df_ox` to the '1/K0' table for 'Oxidation (M)'; an earlier cell had
# reassigned it to the 'Mass' variant. No fourth argument is passed here.
df_ox = difference_mass_modification2(df, 'Oxidation (M)', '1/K0')


In [None]:

# Histogram of the 'Oxidation (M)' differences, divided by 0.5.
plt.hist(x=df_ox['Difference'].div(0.5))

In [None]:
%run methods_freshfrozen.ipynb
df_list = [df_un, df_ac, df_ac_ox, df_ac_2ox, df_ac_3ox, df_ox,df_ox2, df_ox3, df_ox4, df_ox5]
str_list = ['Unmodified', 'Acetyl (Protein N-term)', 'Acetyl (Protein N-term),Oxidation (M)', 'Acetyl (Protein N-term),2 Oxidation (M)', 'Acetyl (Protein N-term),3 Oxidation (M)', 'Oxidation (M)',
            '2 Oxidation (M)', '3 Oxidation (M)', '4 Oxidation (M)', '5 Oxidation (M)']
prop_list = ['Mass', 'Retention time', '1/K0']
diff = 'Difference'
for prop in prop_list:
    print(f'Difference in {prop}')
    counter = 0
    for df_diff in df_list:
        df_diff = difference_mass_modification2(df, str_list[counter], prop)
        print(f'{str_list[counter]}: {df_diff[diff].mean()}, {df_diff[diff].std()}')
        counter +=1
    print()


In [None]:
# Overlay the 'Difference' distributions of all modified groups. Every series
# is semi-transparent and labelled so overlapping bars stay visible and can be
# told apart.
difference_frames = {
    'Acetyl (Protein N-term)': df_ac,
    'Acetyl (Protein N-term),Oxidation (M)': df_ac_ox,
    'Acetyl (Protein N-term),2 Oxidation (M)': df_ac_2ox,
    'Acetyl (Protein N-term),3 Oxidation (M)': df_ac_3ox,
    'Oxidation (M)': df_ox,
    '2 Oxidation (M)': df_ox2,
    '3 Oxidation (M)': df_ox3,
    '4 Oxidation (M)': df_ox4,
    '5 Oxidation (M)': df_ox5,
}
for modification, difference_frame in difference_frames.items():
    plt.hist(x=difference_frame['Difference'], alpha=0.5, label=modification)
plt.xlabel('Difference')
plt.ylabel('Count')
plt.legend(title='Modifications')
plt.show()

In [None]:
# Row count of each table in df_list.
# The loop variable must not be called `df`: that would rebind the notebook-wide
# dataset to the last table in the list, and every later cell reading `df`
# (PEP, Score, Delta score) would use the wrong frame.
for difference_table in df_list:
    print(len(difference_table))

## Matrix Effect

### Peptides in many fractions

#### Experiment 51

In [None]:
# Experiment 51: one row per distinct (Sequence, Fraction, Experiment, Proteins).
protein_columns = ['Sequence', 'Fraction', 'Experiment', 'Proteins']
df_proteins_51 = (
    df_51.sort_values(by='Proteins', ignore_index=True)
         .drop_duplicates(subset=protein_columns, ignore_index=True)
         .loc[:, protein_columns]
)

# Per sequence: number of distinct experiments and distinct fractions it occurs in.
df_unique_seq_51 = (
    df_proteins_51[['Sequence', 'Fraction', 'Experiment']]
    .groupby('Sequence')
    .agg(
        num_experiments=('Experiment', 'nunique'),
        num_fractions=('Fraction', 'nunique'),
    )
    .reset_index()
)


In [None]:

# Number of distinct fractions each sequence appears in (experiment 51).
df_unique_seq_51.plot(kind='bar', x='Sequence', y='num_fractions', figsize=(20, 8))

In [None]:
# Experiment 51 rows selected by get_peptides_across_many_fractions(df_51, 15)
# (helper not defined in this notebook), plotted against 'Fraction' for
# several x columns.
df_top_ten_51 = get_peptides_across_many_fractions(df_51, 15)

for x_column in ('Mass', 'Retention time', '1/K0'):
    plot_scatter(df_top_ten_51, y='Fraction', x=x_column, c='Sequence')

# Kept as the final bare expression so the cell's displayed value is unchanged.
plot_scatter(df_top_ten_51, y='Fraction', x='Calibrated 1/K0', c='Sequence')

In [None]:
# `top_5_seqs` is first needed here, so it is defined at first use to let the
# notebook run top to bottom on a fresh kernel. The same five sequences are
# assigned again in a later cell.
top_5_seqs = ['AAAIGIDLGTTYSCVGVFQHGK', 'AADFQLHTHVNDGTEFGGSIYQK', 'AAEAAAAPAESAAPAAGEEPSKEEGEPK', 'AAHVFFTDSCPDALFNELVK', 'AALDGTPGMIGYGMAK']

df_top_ten_51= get_peptides_across_many_fractions(df_51, 15)

# Same scatter plots as for the full selection, restricted to the five sequences.
plot_scatter( df_top_ten_51[df_top_ten_51['Sequence'].isin(top_5_seqs)] ,y='Fraction', x= 'Mass', c='Sequence')
plot_scatter(df_top_ten_51[df_top_ten_51['Sequence'].isin(top_5_seqs)], y='Fraction', x= 'Retention time', c='Sequence')
plot_scatter(df_top_ten_51[df_top_ten_51['Sequence'].isin(top_5_seqs)], y='Fraction', x= '1/K0', c='Sequence')

In [None]:
# Row counts per (Sequence, Charge) for experiment 51 as a stacked bar chart.
pivot_table = df_top_ten_51.pivot_table(
    index='Sequence',
    columns='Charge',
    aggfunc='size',
    fill_value=0,
)
ax = pivot_table.plot(kind='bar', stacked=True)
plt.show()

#### Variance in Values

In [None]:
# Box plots of several value columns for experiment 51, grouped by 'Color'
# (and by 'Charge' where listed).
for group_keys, value_column in [
    (['Color', 'Charge'], 'Mass'),
    (['Color', 'Charge'], 'Retention time'),
    (['Color', 'Charge'], '1/K0'),
    (['Color', 'Charge'], 'Calibrated 1/K0'),
    (['Color'], '1/K0'),
    (['Color', 'Charge'], 'm/z'),
]:
    df_top_ten_51.boxplot(by=group_keys, column=value_column)
# Kept as the final bare expression so the cell's displayed value is unchanged.
df_top_ten_51.boxplot(by=['Color', 'Charge'], column='CCS')

##### Investigate Seq 3_3(highest IM Variance)

In [None]:
# Rows of experiment 51 with Color == 3 and Charge == 3.
is_color_3 = df_top_ten_51['Color'] == 3
is_charge_3 = df_top_ten_51['Charge'] == 3
df_seq_51_3_3 = df_top_ten_51.loc[is_color_3 & is_charge_3]
plot_scatter(df_seq_51_3_3, y='Fraction', x='Retention time', c='Sequence')
plot_scatter(df_seq_51_3_3, y='Fraction', x='1/K0', c='Sequence')

#### Experiment 64

In [None]:
# Experiment 64: one row per distinct (Sequence, Fraction, Experiment, Proteins).
protein_columns = ['Sequence', 'Fraction', 'Experiment', 'Proteins']
sorted_64 = df_64.sort_values(by='Proteins', ignore_index=True)
deduplicated_64 = sorted_64.drop_duplicates(subset=protein_columns, ignore_index=True)
df_proteins_64 = deduplicated_64[protein_columns]

# Per sequence: number of distinct experiments and distinct fractions it occurs in.
df_unique_seq_64 = (
    df_proteins_64[['Sequence', 'Fraction', 'Experiment']]
    .groupby('Sequence')
    .agg({'Experiment': 'nunique', 'Fraction': 'nunique'})
    .reset_index()
    .rename(columns={'Experiment': 'num_experiments', 'Fraction': 'num_fractions'})
)


In [None]:
# Number of distinct fractions each sequence appears in (experiment 64).
df_unique_seq_64.plot(kind='bar', x='Sequence', y='num_fractions', figsize=(20, 8))

In [None]:
# Experiment 64 rows selected by get_peptides_across_many_fractions(df_64, 15),
# plotted against 'Fraction' for several x columns.
df_top_ten_64 = get_peptides_across_many_fractions(df_64, 15)
for x_column in ('Mass', 'Retention time'):
    plot_scatter(df_top_ten_64, y='Fraction', x=x_column, c='Sequence')
# Kept as the final bare expression so the cell's displayed value is unchanged.
plot_scatter(df_top_ten_64, y='Fraction', x='1/K0', c='Sequence')

In [None]:
# Same scatter plots for experiment 64, restricted to the sequences in top_5_seqs.
df_top_ten_64 = get_peptides_across_many_fractions(df_64, 15)
for x_column in ('Mass', 'Retention time'):
    # A fresh filtered frame is built for every call, as before.
    plot_scatter(df_top_ten_64[df_top_ten_64['Sequence'].isin(top_5_seqs)], y='Fraction', x=x_column, c='Sequence')
# Kept as the final bare expression so the cell's displayed value is unchanged.
plot_scatter(df_top_ten_64[df_top_ten_64['Sequence'].isin(top_5_seqs)], y='Fraction', x='1/K0', c='Sequence')

In [None]:
# Row counts per (Color, Charge) for experiment 64 as a stacked bar chart.
pivot_table = df_top_ten_64.pivot_table(
    index='Color',
    columns='Charge',
    aggfunc='size',
    fill_value=0,
)
ax = pivot_table.plot(kind='bar', stacked=True)
plt.show()

In [None]:
# Box plots of several value columns for experiment 64, grouped by 'Color'
# (and by 'Charge' where listed).
for group_keys, value_column in [
    (['Color', 'Charge'], 'Mass'),
    (['Color', 'Charge'], 'Retention time'),
    (['Color', 'Charge'], '1/K0'),
    (['Color'], '1/K0'),
    (['Color', 'Charge'], 'm/z'),
]:
    df_top_ten_64.boxplot(by=group_keys, column=value_column)
# Kept as the final bare expression so the cell's displayed value is unchanged.
df_top_ten_64.boxplot(by=['Color', 'Charge'], column='CCS')

##### Investigate Seq 4_3(highest IM Variance)

In [None]:
# Rows of experiment 64 with Color == 3 and Charge == 3.
# NOTE(review): the section heading names "Seq 4_3" while this filter selects
# Color == 3 — confirm which one is intended.
is_color_3 = df_top_ten_64['Color'] == 3
is_charge_3 = df_top_ten_64['Charge'] == 3
df_seq_64_3_3 = df_top_ten_64.loc[is_color_3 & is_charge_3]
plot_scatter(df_seq_64_3_3, y='Fraction', x='Retention time', c='Sequence')
plot_scatter(df_seq_64_3_3, y='Fraction', x='1/K0', c='Sequence')

#### Experiment 28

In [None]:
# Experiment 28: one row per distinct (Sequence, Fraction, Experiment, Proteins).
protein_columns = ['Sequence', 'Fraction', 'Experiment', 'Proteins']
df_proteins_28 = (
    df_28.sort_values(by='Proteins', ignore_index=True)
         .drop_duplicates(subset=protein_columns, ignore_index=True)
         .loc[:, protein_columns]
)

# Per sequence: number of distinct experiments and distinct fractions it occurs in.
per_sequence_28 = df_proteins_28[['Sequence', 'Fraction', 'Experiment']].groupby('Sequence')
df_unique_seq_28 = per_sequence_28.agg(
    num_experiments=('Experiment', 'nunique'),
    num_fractions=('Fraction', 'nunique'),
).reset_index()


In [None]:
# Number of distinct fractions each sequence appears in (experiment 28).
df_unique_seq_28.plot(kind='bar', x='Sequence', y='num_fractions', figsize=(20, 8))

In [None]:
# Experiment 28 rows selected by get_peptides_across_many_fractions(df_28, 15),
# plotted against 'Fraction' for several x columns.
df_top_ten_28 = get_peptides_across_many_fractions(df_28, 15)

for x_column in ('Mass', 'Retention time'):
    plot_scatter(df_top_ten_28, y='Fraction', x=x_column, c='Sequence')
# Kept as the final bare expression so the cell's displayed value is unchanged.
plot_scatter(df_top_ten_28, y='Fraction', x='1/K0', c='Sequence')

In [None]:
# Same scatter plots for experiment 28, restricted to the sequences in top_5_seqs.
df_top_ten_28 = get_peptides_across_many_fractions(df_28, 15)
for x_column in ('Mass', 'Retention time'):
    # A fresh filtered frame is built for every call, as before.
    plot_scatter(df_top_ten_28[df_top_ten_28['Sequence'].isin(top_5_seqs)], y='Fraction', x=x_column, c='Sequence')
# Kept as the final bare expression so the cell's displayed value is unchanged.
plot_scatter(df_top_ten_28[df_top_ten_28['Sequence'].isin(top_5_seqs)], y='Fraction', x='1/K0', c='Sequence')

In [None]:
# Row counts per (Sequence, Charge) for experiment 28 as a stacked bar chart.
pivot_table = df_top_ten_28.pivot_table(
    index='Sequence',
    columns='Charge',
    aggfunc='size',
    fill_value=0,
)
ax = pivot_table.plot(kind='bar', stacked=True)
plt.show()

In [None]:
# Box plots of several value columns for experiment 28, grouped by 'Color'
# (and by 'Charge' where listed).
for group_keys, value_column in [
    (['Color', 'Charge'], 'Mass'),
    (['Color', 'Charge'], 'Retention time'),
    (['Color', 'Charge'], '1/K0'),
    (['Color'], '1/K0'),
    (['Color', 'Charge'], 'm/z'),
]:
    df_top_ten_28.boxplot(by=group_keys, column=value_column)
# Kept as the final bare expression so the cell's displayed value is unchanged.
df_top_ten_28.boxplot(by=['Color', 'Charge'], column='CCS')

##### Investigate Seq 3_3(highest IM Variance)

In [None]:
# Rows of experiment 28 with Color == 3 and Charge == 3.
is_color_3 = df_top_ten_28['Color'] == 3
is_charge_3 = df_top_ten_28['Charge'] == 3
df_seq_28_3_3 = df_top_ten_28.loc[is_color_3 & is_charge_3]
plot_scatter(df_seq_28_3_3, y='Fraction', x='Retention time', c='Sequence')
plot_scatter(df_seq_28_3_3, y='Fraction', x='1/K0', c='Sequence')

In [None]:
# Distinct sequences selected per experiment.
seqs_51 = df_top_ten_51['Sequence'].unique().tolist()
seqs_64 = df_top_ten_64['Sequence'].unique().tolist()
seqs_28 = df_top_ten_28['Sequence'].unique().tolist()

print(seqs_51)
print(seqs_64)
print(seqs_28)

print(len(seqs_51))
print(len(seqs_64))
print(len(seqs_28))

# Count in how many of the three experiment lists each sequence occurs.
# (The earlier list-of-lists assignment to `seqs` was overwritten before use
# and has been dropped.)
seqs = seqs_51 + seqs_64 + seqs_28
value_counts = {}
for seq in seqs:
    value_counts[seq] = value_counts.get(seq, 0) + 1
# Highest count first; the stable sort keeps first-seen order among ties.
value_counts = sorted(value_counts.items(), key=lambda x:x[1], reverse=True)

# Print the counts for each value
for value, count in value_counts:
    print(f"{value}: {count} times")

# Hard-coded selection used by the "top 5" scatter cells.
# NOTE(review): presumably picked from the counts printed above — confirm.
top_5_seqs = ['AAAIGIDLGTTYSCVGVFQHGK', 'AADFQLHTHVNDGTEFGGSIYQK', 'AAEAAAAPAESAAPAAGEEPSKEEGEPK', 'AAHVFFTDSCPDALFNELVK', 'AALDGTPGMIGYGMAK']


## MaxQuant Search

### PEP

In [None]:
# Range of the 'PEP' column and its histogram restricted to [0, 0.15].
pep_values = df['PEP']
print(pep_values.min(), pep_values.max(), sep='\n')
plt.hist(pep_values, range=(0, 0.15), bins=40)

In [None]:
# Same histogram with the lower bound raised to 0.01.
pep_values = df['PEP']
plt.hist(pep_values, range=(0.01, 0.15), bins=40)

### Score

In [None]:
# Range of the 'Score' column and its histogram.
score_values = df['Score']
print(score_values.min(), score_values.max(), sep='\n')
plt.hist(score_values, bins=40)

### Delta Score

In [None]:
# Range of the 'Delta score' column and its histogram.
delta_score_values = df['Delta score']
print(delta_score_values.min(), delta_score_values.max(), sep='\n')
plt.hist(delta_score_values, bins=40)