# What is this Doing
Takes in a `.csv` file and outputs CSVs which showes us the changes for `'+'` and for `'-'` of each species.

### Imports

In [1]:
from itertools import product

import numpy as np
import pandas as pd

### DataFrames Initialization

In [2]:
"""
A list of reptiles we are currently interested in
"""

reptile_names = [
    'Archelosauria',
    'Archosaura',
    'Aves',
    'Chrysemys picta bellii',
    'Crocodylus porosus',
    'Episquamata',
    'Mammalia',
    'Pogona vitticeps',
    'Saura',
    'Testudines',
    'Toxicofera',
    'Varanus komodoensis',
]

In [3]:
"""
Read the main DF containing the gene change type and count
gene change type is one of `{'+', '-', '0'}`
"""

refs = ['gallus', 'human', 'lizard']
ref_changes = ['plus', 'minus']

# f'...' --> this means formatted string, what's inside the curly parentheses {} is treated as a variable. 
# In our case, {} is our changing reference.

df = pd.read_csv(f'../output/{refs[0]}/df_{ref_changes[0]}.csv', index_col='Gene Id')
df = df.sort_index()

df

Unnamed: 0_level_0,Gene Name,Acanthochromis polyacanthus,Acanthomorphata,Acanthophiinae,Accipiter nisus,Accipitrinae,Actinopterygii,Afrotheria,Ailuropoda melanoleuca,Amazona collaria,...,Vombatus ursinus,Vulpes vulpes,Xenarthra,Xenopus tropicalis,Xiphophorus couchianus,Xiphophorus maculatus,Xiphophorus,Zalophus californianus,Zonotrichia albicollis,Zosterops lateralis melanops
Gene Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSGALG00000000003,PANX2,5.0,,,,,,,,,...,,,,,5.0,,,,,
ENSGALG00000000011,C10orf88,,,,,,,,,,...,,,,2.0,,,,,,
ENSGALG00000000038,CTRB2,3.0,,,1.0,,,,3.0,1.0,...,3.0,4.0,,3.0,2.0,2.0,,5.0,1.0,
ENSGALG00000000044,WFIKKN1,,,,,,,,,,...,,,,,,,,,,
ENSGALG00000000048,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSGALG00000055118,,,,,,,,,7.0,,...,6.0,7.0,,2.0,,,,7.0,,
ENSGALG00000055122,,,,,3.0,,,,,156.0,...,,,,,,,,,1.0,
ENSGALG00000055127,,1.0,,,3.0,,,,1.0,2.0,...,2.0,,,5.0,,2.0,,,4.0,1.0
ENSGALG00000055132,,,,,3.0,,,,,156.0,...,,,,,,,,,1.0,


In [4]:
reptile_names_not_in_df = [
    name
    for name in reptile_names
    if name not in df.columns
]

reptile_names_in_df = [
    name
    for name in reptile_names
    if name in df.columns
]

reptile_names_not_in_df, reptile_names_in_df

(['Archosaura', 'Saura'],
 ['Archelosauria',
  'Aves',
  'Chrysemys picta bellii',
  'Crocodylus porosus',
  'Episquamata',
  'Mammalia',
  'Pogona vitticeps',
  'Testudines',
  'Toxicofera',
  'Varanus komodoensis'])

In [5]:
reptile_df = df[['Gene Name'] + reptile_names_in_df]

reptile_df

Unnamed: 0_level_0,Gene Name,Archelosauria,Aves,Chrysemys picta bellii,Crocodylus porosus,Episquamata,Mammalia,Pogona vitticeps,Testudines,Toxicofera,Varanus komodoensis
Gene Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ENSGALG00000000003,PANX2,,,,,,,,,,
ENSGALG00000000011,C10orf88,,,,,,,,,,
ENSGALG00000000038,CTRB2,,,3.0,2.0,,,,,,2.0
ENSGALG00000000044,WFIKKN1,,,,,,,,,,
ENSGALG00000000048,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
ENSGALG00000055118,,,,45.0,,,,4.0,,,2.0
ENSGALG00000055122,,,,,,,,,,,
ENSGALG00000055127,,,,132.0,6.0,,,13.0,,,8.0
ENSGALG00000055132,,,,,,,,,,,


In [None]:
# reptile_df.to_csv(f'../output/{refs[0]}/reptile_df_{ref_changes[0]}.csv')

## The actual thingy 

In [7]:
for ref, change in product(refs, ref_changes):
    df = pd.read_csv(f'../output/{ref}/df_{change}.csv', index_col='Gene Id')
    df = df.sort_index()

    reptile_names_in_df = [
        name
        for name in reptile_names
        if name in df.columns
    ]

    reptile_df = df[['Gene Name'] + reptile_names_in_df]
    reptile_df.to_csv(f'../output/{ref}/{ref}_reptile_df_{change}.csv')