# Import Dependencies

In [25]:
import pandas as pd

# Cho Preprocessing
By _sourced_ dataset, we mean the dataset that was retrieved from Calderon and Ventures' study. The _raw_ dataset, on the other hand, is the one we retrieved ourselves from Yeung et al. (https://faculty.washington.edu/kayee/cluster/).

## Clean Trailing Whitespace of Datasets

In [None]:
def clean_raw_cho_dataset(source_path='../dataset/raw/cho_original.txt',
                          target_path='../dataset/raw/cho_cleaned.txt'):
  """Write a whitespace-stripped copy of the raw Cho dataset.

  Every line of ``source_path`` has its leading and trailing whitespace
  (including the line terminator) removed and is written to ``target_path``
  followed by a single newline.

  Args:
    source_path: File to read. Defaults to the original raw Cho file.
    target_path: File to (over)write. Defaults to the cleaned raw Cho file.
  """
  # Read everything before opening the target, so the output file is only
  # touched once the input has been read successfully.
  with open(source_path, 'r') as source:
    stripped_lines = [line.strip() for line in source]

  with open(target_path, 'w') as target:
    target.writelines(stripped + '\n' for stripped in stripped_lines)

def _strip_lines_and_drop_gp_header(source_path, target_path):
  """Copy a tab-separated file, stripping each line and dropping 'Gp' from the header.

  Only the first line (the header) is altered structurally: its 'Gp' entry is
  removed. Every other line is copied with leading/trailing whitespace stripped.

  Raises:
    ValueError: If the header row has no 'Gp' entry. The target file is not
      opened in that case, because writing only starts after reading finishes.
  """
  cleaned_lines = []
  with open(source_path, 'r') as source:
    for index, line in enumerate(source):
      cleaned = line.strip()
      if index == 0:
        # In the header row, we omit the extraneous 'Gp' column
        header = cleaned.split('\t')
        header.remove('Gp')
        cleaned = '\t'.join(header)
      cleaned_lines.append(cleaned)

  with open(target_path, 'w') as target:
    target.writelines(cleaned + '\n' for cleaned in cleaned_lines)


def clean_sourced_cho_dataset(source_path='../dataset/sourced/cho_ventures.txt',
                              target_path='../dataset/sourced/cho_ventures_cleaned.txt'):
  """Clean the sourced Cho dataset: strip whitespace and drop 'Gp' from the header.

  Args:
    source_path: File to read. Defaults to the sourced Cho file.
    target_path: File to (over)write. Defaults to the cleaned sourced Cho file.

  Raises:
    ValueError: If the header row does not contain a 'Gp' entry.
  """
  _strip_lines_and_drop_gp_header(source_path, target_path)


def clean_sourced_kegg_dataset(source_path='../dataset/sourced/kegg_ventures.txt',
                               target_path='../dataset/sourced/kegg_ventures_cleaned.txt'):
  """Clean the sourced KEGG dataset: strip whitespace and drop 'Gp' from the header.

  Args:
    source_path: File to read. Defaults to the sourced KEGG file.
    target_path: File to (over)write. Defaults to the cleaned sourced KEGG file.

  Raises:
    ValueError: If the header row does not contain a 'Gp' entry.
  """
  _strip_lines_and_drop_gp_header(source_path, target_path)
# Generate the cleaned dataset. We only need to run this once.
# clean_raw_cho_dataset()
# clean_sourced_cho_dataset()
# clean_sourced_kegg_dataset()

## Comparing Raw Cho vs Sourced Cho

In [None]:
# Load the whitespace-cleaned raw Cho dataset (tab-separated, first row is the header).
raw_cho_data = pd.read_csv('../dataset/raw/cho_cleaned.txt', sep='\t', header=0)
print(raw_cho_data.head())
print("========================================")
# Count nulls per column once, and only claim "no nulls" when that is actually true.
raw_cho_null_counts = raw_cho_data.isna().sum()
if raw_cho_null_counts.sum() == 0:
  print("No null values in raw_cho_data!")
else:
  print(f"Found {raw_cho_null_counts.sum()} null values in raw_cho_data!")
print(raw_cho_null_counts)


      Gene  Gp       c1       c2       c3       c4       c5       c6       c7  \
0  YDL179w   1 -0.75808 -0.90319 -0.98935 -0.73995 -0.67193 -0.12777 -0.95307   
1  YLR079w   1 -0.48845 -0.70828 -0.47688 -0.65814 -0.45374 -0.47302 -0.71214   
2  YER111c   1 -0.42218  0.23887  1.84427 -0.02083 -0.61105 -0.65827 -0.79992   
3  YBR200w   1  0.09824  0.55258 -0.89641 -1.19111 -1.11744 -0.76133  0.09824   
4  YJL194w   1 -1.29859  1.71422 -0.52745 -1.11926 -0.63505 -0.02532 -0.36605   

        c8       c9      c10      c11      c12      c13      c14      c15  \
0 -1.01656  0.79730  2.11688  1.98537  0.61591  0.56603 -0.13684 -0.52228   
1 -1.02839  0.24048  3.11376  1.28952  0.44874  0.04379 -0.31104 -0.30332   
2 -0.39857 -0.09166  2.03314  1.58457  0.68744  0.14443 -0.72910 -1.46097   
3  2.16120  1.46126  1.03148  0.67537 -0.33155 -0.60170 -1.39987 -0.42978   
4 -0.76059  1.44522  2.05496 -0.22259  0.20782 -0.36605  0.01055 -0.77852   

       c16      c17  
0 -0.05068  0.78823  
1 -0.3

In [58]:
# Load the cleaned sourced Cho dataset (tab-separated, header row inferred by pandas).
sourced_cho_data = pd.read_csv('../dataset/sourced/cho_ventures_cleaned.txt', sep='\t')
print(sourced_cho_data.head())
print("========================================")
# Count nulls per column once, and only claim "no nulls" when that is actually true.
sourced_cho_null_counts = sourced_cho_data.isna().sum()
if sourced_cho_null_counts.sum() == 0:
  print("No null values in sourced_cho_data!")
else:
  print(f"Found {sourced_cho_null_counts.sum()} null values in sourced_cho_data!")
print(sourced_cho_null_counts)

      Gene  Level_1  Level_2       c1       c2       c3       c4       c5  \
0  YDL179w        1       11 -0.75808 -0.90319 -0.98935 -0.73995 -0.67193   
1  YLR079w        1       11 -0.48845 -0.70828 -0.47688 -0.65814 -0.45374   
2  YER111c        1       11 -0.42218  0.23887  1.84427 -0.02083 -0.61105   
3  YBR200w        1       12  0.09824  0.55258 -0.89641 -1.19111 -1.11744   
4  YJL194w        1       13 -1.29859  1.71422 -0.52745 -1.11926 -0.63505   

        c6       c7       c8       c9      c10      c11      c12      c13  \
0 -0.12777 -0.95307 -1.01656  0.79730  2.11688  1.98537  0.61591  0.56603   
1 -0.47302 -0.71214 -1.02839  0.24048  3.11376  1.28952  0.44874  0.04379   
2 -0.65827 -0.79992 -0.39857 -0.09166  2.03314  1.58457  0.68744  0.14443   
3 -0.76133  0.09824  2.16120  1.46126  1.03148  0.67537 -0.33155 -0.60170   
4 -0.02532 -0.36605 -0.76059  1.44522  2.05496 -0.22259  0.20782 -0.36605   

       c14      c15      c16      c17  
0 -0.13684 -0.52228 -0.05068  0.78

In [62]:
# Gene identifier column of each dataset.
raw_cho_data_genes = raw_cho_data.loc[:, 'Gene']
sourced_cho_data_genes = sourced_cho_data.loc[:, 'Gene']

print(
  "========================================",
  "What genes are missing in the sourced dataset that is present in the raw dataset?",
  sep='\n',
)
# Keep only the raw genes whose identifier never appears in the sourced dataset.
# NOTE(review): isin() matches strings exactly, so the comparison is case-sensitive;
# the raw identifiers mix lower- and upper-case suffixes (e.g. YOL070c vs YLR297W) —
# confirm the sourced dataset uses the same casing.
missing_genes = raw_cho_data_genes.loc[~raw_cho_data_genes.isin(sourced_cho_data_genes)]
print("Count: ", missing_genes.count())
print(missing_genes)


What genes are missing in the sourced dataset that is present in the raw dataset?
Count:  214
10     YGR044c
11     YML109w
15     YNR001c
16     YKL150w
18     YOR065w
        ...   
379    YOL070c
380    YLR297W
381    YHL028W
382    YHR151C
383    YNL058C
Name: Gene, Length: 214, dtype: object
