In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
UCSC_path = '/home/angelosmath/MSc/thesis_ppi_mean/data/UCSC/gg_ppi.csv'
STRING_path = '/home/angelosmath/MSc/thesis_ppi_mean/mapping/STRING_gene.csv'

In [31]:
string_df = pd.read_csv(STRING_path, low_memory=False)
ucsc_df = pd.read_csv(UCSC_path, low_memory=False)

In [32]:
print(string_df.shape)

print(ucsc_df.shape)

(13343180, 3)
(715308, 3)


In [33]:
ucsc_genes = set(ucsc_df['gene1'].unique()).intersection(ucsc_df['gene2'].unique())
string_genes = set(string_df['gene1'].unique()).intersection(string_df['gene2'].unique())


print(f'UCSC unique Genes: {len(list(string_genes))}')
print(f'String unique Genes: {len(list(ucsc_genes))}')


common_genes = list(ucsc_genes.intersection(string_genes))

print(f'Common genes: {len(common_genes)}')

UCSC unique Genes: 11743
String unique Genes: 19074
Common genes: 10879


In [5]:
# Extract unique gene pairs from each DataFrame
string_gene_pairs = set(zip(STRING_df['gene1'], STRING_df['gene2']))
ucsc_gene_pairs = set(zip(UCSC_df['gene1'], UCSC_df['gene2']))

In [9]:
common_pairs = string_gene_pairs.intersection(ucsc_gene_pairs)

#common_pairs_df = pd.DataFrame(list(common_pairs), columns=['gene1', 'gene2'])

print(f'common pairs of genes: {len(common_pairs)}')

common pairs of genes: 234807


In [10]:
# Find gene pairs that exist in STRING_df but not in UCSC_df
unique_pairs_in_string = string_gene_pairs - ucsc_gene_pairs

print("Number of gene pairs unique to STRING_df:", len(unique_pairs_in_string))

unique_STRING_df = pd.DataFrame(list(unique_pairs_in_string), columns=['gene1', 'gene2'])

unique_STRING_df = STRING_df[STRING_df[['gene1', 'gene2']].apply(tuple, axis=1).isin(unique_STRING_df.apply(tuple, axis=1))]

Number of gene pairs unique to STRING_df: 13091674


In [16]:
score = 400

threshold_positive = unique_STRING_df[unique_STRING_df['combined_score'] > score]

print(f'found {threshold_positive.shape[0]} gene paris with score greater than {score}')

found 1706924 gene paris with score greater than 400


In [18]:
n = 100 # sample a number of pairs 

positive_samples = threshold_positive.sample(n)

In [29]:
random_genes = pd.Series()

In [30]:
while len(random_genes) < n:
    gene = pd.Series(list(common_genes - set(random_genes))).sample(1).iloc[0]
    
    if all((gene not in pair) for pair in string_gene_pairs) and all((gene not in pair) for pair in ucsc_gene_pairs):
        random_genes = random_genes.append(pd.Series([gene]), ignore_index=True)


TypeError: unsupported operand type(s) for -: 'list' and 'set'

In [23]:
UCSC_unique = set(UCSC_df['gene1'].unique()).intersection(UCSC_df['gene2'].unique())
STRING_unique = set(STRING_df['gene1'].unique()).intersection(STRING_df['gene2'].unique())


print(f'UCSC unique Genes: {len(list(UCSC_unique))}')
print(f'String unique Genes: {len(list(STRING_unique))}')


common_genes = list(UCSC_unique.intersection(STRING_unique))

print(f'Common genes: {len(common_genes)}')

UCSC unique Genes: 11743
String unique Genes: 19074
Common genes: 10879


number of common genes: 10879 
genes missing: 864


In [None]:
# Filter dataframes to include only common gene names
UCSC_common = UCSC_df[UCSC_df['gene1'].isin(common_genes) & UCSC_df['gene2'].isin(common_genes)]
STRING_common = STRING_df[STRING_df['gene1'].isin(common_genes) & STRING_df['gene2'].isin(common_genes)]

In [None]:
# Identify common gene names
common_genes = set(UCSC_df['gene1']).intersection(UCSC_df['gene2']).intersection(STRING_df['gene1']).intersection(STRING_df['gene2'])

# Perform left join directly
combined_df = pd.merge(UCSC_df, STRING_df, how='left', on=['gene1', 'gene2'], indicator=True)

# Filter rows based on common gene names and create a copy
combined_common_df = combined_df[combined_df['gene1'].isin(common_genes) & combined_df['gene2'].isin(common_genes)].copy()

# Rename the indicator column to 'match'
combined_common_df.rename(columns={'_merge': 'match'}, inplace=True)

# Print the counts
print(combined_common_df[combined_common_df['match'] == 'right_only'].shape)
print(combined_common_df[combined_common_df['match'] == 'left_only'].shape)
print(combined_common_df[combined_common_df['match'] == 'both'].shape)


In [None]:
# Unique values before left join
unique_before = set(UCSC_df['gene1']).union(UCSC_df['gene2']).union(STRING_df['gene1']).union(STRING_df['gene2'])

# Unique values after left join
unique_after = set(combined_df['gene1']).union(combined_df['gene2'])

# Difference in unique values
difference = unique_before.difference(unique_after)

print("Unique values before left join: ", len(unique_before))
print("Unique values after left join: ", len(unique_after))
print("Difference in unique values: ", len(difference))

In [None]:
# Identify missing gene names
missing_gene_names = unique_before.difference(unique_after)

# Print missing gene names
print("Number of missing gene names: ", len(missing_gene_names))
print("Missing gene names: ", missing_gene_names)

In [None]:
# Filter dataframes to include only common gene names
UCSC_common = UCSC_df[UCSC_df['gene1'].isin(common_genes) & UCSC_df['gene2'].isin(common_genes)]
STRING_common = STRING_df[STRING_df['gene1'].isin(common_genes) & STRING_df['gene2'].isin(common_genes)]

# Perform left join
combined_df = UCSC_common.merge(STRING_common, how='left', on=['gene1', 'gene2'], indicator=True)

# Rename the indicator column to 'match'
combined_df.rename(columns={'_merge': 'match'}, inplace=True)

# Print the counts
print(combined_df[combined_df['match'] == 'right_only'].shape)
print(combined_df[combined_df['match'] == 'left_only'].shape)
print(combined_df[combined_df['match'] == 'both'].shape)

In [None]:
STRING_score = STRING_df[STRING_df['combined_score'] > 400].shape

In [None]:
# Plot the distribution
plt.figure(figsize=(10, 6))
plt.hist(STRING_df['combined_score'], bins=50, color='skyblue', edgecolor='black')
plt.title('Distribution of Combined Score')
plt.xlabel('Combined Score')
plt.ylabel('Frequency')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

# Print the mean
mean_combined_score = STRING_df['combined_score'].mean()
print(f"The mean of the combined score is: {mean_combined_score}")

In [None]:
# Perform a left join between the two dataframes and add an indicator column
combined_df = UCSC_df.merge(STRING_df, how='left', on=['gene1', 'gene2'], indicator=True)

# Rename the indicator column to 'match'
combined_df.rename(columns={'_merge': 'match'}, inplace=True)


In [None]:
print(combined_df[combined_df['match'] == 'right_only'].shape)

print(combined_df[combined_df['match'] == 'left_only'].shape)

print(combined_df[combined_df['match'] == 'both'].shape)


In [None]:
combined_df['match'].unique()

In [None]:
235328 - 715308

In [None]:
combined_df[combined_df['match'] == 'right_only']

In [None]:
both_pairs = combined_df[combined_df['match'] == 'both']

print(both_pairs.shape[0])