# Small cookbook for manipulating pandas dataframes

In [None]:
import pandas as pd
import numpy as np

In [None]:
# example dataframe: two numeric rows, one row of strings and one row of zeros
X = pd.DataFrame(
    [[1, 2, 3], [4, 5, 6], ['duck', 'duck', 'goose'], [0, 0, 0]],
    columns=['col1', 'col2', 'col3'],
)
X

# Basic column and row manipulations

In [None]:
def last_to_first(df):
    """
    Return a new dataframe with the last column of ``df`` moved to the front.

    Columns are reordered by position rather than by label, so a frame with
    repeated column names keeps exactly one copy of every column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be rotated. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns in the order: last, first, second, ..., second-to-last.
        A frame without columns is returned as an unchanged copy.
    """
    n_cols = df.shape[1]
    if n_cols == 0:
        # nothing to rotate (a label-based lookup of the "last" column would fail here)
        return df.copy()
    order = [n_cols - 1] + list(range(n_cols - 1))
    return df.iloc[:, order]

# demo on the example frame: the expected column order is col3, col1, col2
last_to_first(X)

In [None]:
def first_to_last(df):
    """
    Return a new dataframe with the first column of ``df`` moved to the end.

    Columns are reordered by position rather than by label, so a frame with
    repeated column names keeps exactly one copy of every column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be rotated. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns in the order: second, third, ..., last, first.
        A frame without columns is returned as an unchanged copy.
    """
    n_cols = df.shape[1]
    if n_cols == 0:
        # nothing to rotate (a label-based lookup of the "first" column would fail here)
        return df.copy()
    order = list(range(1, n_cols)) + [0]
    return df.iloc[:, order]

# demo on the example frame: the expected column order is col2, col3, col1
first_to_last(X)

In [None]:
def reorder_cols(df, cols):
    """
    Select the columns of ``df`` named in ``cols``, in the order
    ``cols`` lists them, and return the result.
    """
    reordered = df[cols]
    return reordered

# demo: reverse the column order of the example frame
reorder_cols(X,['col3','col2','col1'])

In [None]:
def return_every_n(df, n):
    """
    Keep every ``n``-th column of ``df``, counting from the first one
    (column positions 0, n, 2n, ...), together with all rows.
    """
    every_nth = slice(None, None, n)
    return df.iloc[:, every_nth]

# example: five copies of X pasted side by side
# columns would be: col1, col2, col3, col1, col2, col3, col1, col2, col3, col1, col2, col3, col1, col2, col3
# kept with n = 2:  col1,       col3,       col2,       col1,       col3,       col2,       col1,       col3

Y = pd.concat([X] * 5, axis=1)
return_every_n(Y, 2)

In [None]:
def delete_list(df, todelete):
    """
    Remove every column labelled ``todelete`` from a pandas dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. It is not modified.
    todelete : label or list of labels
        Column name(s) to remove. When a name occurs several times in
        ``df.columns``, every occurrence is removed.

    Returns
    -------
    pandas.DataFrame
        New frame without the requested columns.

    Raises
    ------
    KeyError
        If a requested label is not among the columns.
    """
    # drop() takes labels directly; looking the column up first breaks when the
    # label is unique, because the lookup then yields a Series without .columns
    return df.drop(todelete, axis=1)

# demo: assuming the cells ran in order, Y holds five side-by-side copies of X,
# so all five 'col3' columns are removed
delete_list(Y, 'col3')

In [None]:
def delete_list_lst(df, todelete):
    """
    Remove every column whose label is listed in ``todelete``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. It is not modified.
    todelete : list of labels, or a single string label
        Column names to remove. Repeated column names are removed in all
        their occurrences. A bare string is treated as one label.

    Returns
    -------
    pandas.DataFrame
        New frame holding the remaining columns in their original order.

    Raises
    ------
    KeyError
        If a requested label is not among the columns.
    """
    # a bare string must stay one label; list() would split it into characters
    labels = [todelete] if isinstance(todelete, str) else list(todelete)
    # drop() already removes every matching column and keeps the other labels,
    # so no column lookup or relabelling is needed
    return df.drop(labels, axis=1)

# demo: remove every 'col1' and 'col3' column from Y, leaving only the 'col2' columns
delete_list_lst(Y, ['col1', 'col3'])

In [None]:
def keep_list_lst(df, tokeep):
    """
    Counterpart of delete_list_lst(): return only the columns of ``df``
    whose labels are listed in ``tokeep``.

    This is the same column selection that reorder_cols() performs.
    """
    return df[tokeep]

# demo: keep only the 'col1' and 'col3' columns of Y
keep_list_lst(Y, ['col1', 'col3'])

# Lambda and apply statements on dataframes

In [None]:
# using apply + a function:
def get_name(row):
    """
    Return the part of row['name'] that comes before the first underscore.

    Example:
    'ENST00000237247.6_cds_2_0_chr1_67091530_f' -> 'ENST00000237247.6'
    """
    transcript_id, _sep, _rest = row['name'].partition('_')
    return transcript_id

# read the tab-delimited file, naming its six columns explicitly
bed_file = '/projects/ps-yeolab4/genomes/hg19/hg19.introns.bed.txt'
df = pd.read_table(
    bed_file,
    names=['chrom', 'start', 'end', 'name', 'score', 'strand'],
)
# axis=1 passes each row to get_name, which reads that row's 'name' entry
df['name'] = df.apply(get_name, axis=1)
df.head()

In [None]:
# equivalent lambda function, applied to the 'name' column only:
bed_file = '/projects/ps-yeolab4/genomes/hg19/hg19.introns.bed.txt'
df = pd.read_csv(
    bed_file,
    sep='\t',
    names=['chrom', 'start', 'end', 'name', 'score', 'strand'],
)
df['name'] = df['name'].apply(lambda x: x.split('_')[0])
df.head()

# Finding all rows with a NaN

In [None]:
# example dataframe: the second and third rows each contain one missing value
# (np.nan is used because the np.NaN alias was removed in NumPy 2.0)
df = pd.DataFrame([range(3), [0, np.nan, 0], [0, 0, np.nan], range(3), range(3)])
df

In [None]:
# any(axis=1) flags each row holding at least one null; keep only those rows
df.loc[df.isnull().any(axis=1)]

# Splitting a delimited column and transforming the dataframe into one row per split value:

In [None]:
delim = ','
# one gene per row; the 'transcript' column packs several ids into one delimited string
X = pd.DataFrame(
    [
        ['ENSG1', 'ENSTA,ENSTB,ENSTC', 'some_value1'],
        ['ENSG2', 'ENSTD,ENSTE', 2],
        ['ENSG3', 'ENSTF', 'some_value3'],
    ],
    columns=['gene', 'transcript', 'some_other'],
)
X

In [None]:
# 'explodes' a comma-delimited dataframe based on a column:
# split the transcripts into a padded wide frame indexed by the columns to keep,
# then stack it so that every transcript lands on its own row
Y = pd.DataFrame(X.transcript.str.split(delim).tolist(),index=[X['gene'],X['some_other']]).stack()
# remove any padding left over for genes with fewer transcripts
# (a no-op when stack() has already discarded it)
Y = Y.dropna()
# column 0 holds the split-out transcript ids, so it has to sit in the
# 'transcript' position for the labels assigned below to match the data
Y = Y.reset_index()[['gene', 0, 'some_other']]
Y.columns = ['gene','transcript','some_other']
Y

In [None]:
# generic function for doing the above:

def explode(df, delim, col_to_split, cols_to_keep):
    """
    Explode a dataframe by splitting one column on a delimiter and
    producing one row for each piece.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. It is not modified.
    delim : str
        Delimiter passed to ``Series.str.split``.
    col_to_split : str
        Name of the column in ``df`` that holds the delimited strings.
    cols_to_keep : list of pandas.Series
        Columns to repeat next to every piece, e.g. ``[df['gene']]``.
        They are paired with the rows of ``df`` by position, and the
        ``name`` of each Series becomes an output column name.

    Returns
    -------
    pandas.DataFrame
        Columns ``[col_to_split] + names of cols_to_keep`` with a fresh
        0..n-1 index. Rows follow the input order, and the pieces of one
        input row stay in their original order.
    """
    keep_names = [kept.name for kept in cols_to_keep]
    pieces_per_row = df[col_to_split].str.split(delim)
    # Build the long table directly. Going through a padded wide frame and
    # stack() only works while stack() silently discards the padding.
    rows = [
        (piece,) + tuple(record[1:])
        for record in zip(pieces_per_row, *cols_to_keep)
        for piece in record[0]
    ]
    return pd.DataFrame(rows, columns=[col_to_split] + keep_names)

# demo: one output row per transcript; the columns come back as transcript, gene, some_other
explode(X, ',', 'transcript', [X['gene'],X['some_other']])

# Mask one dataframe with another

In [None]:
# example dataframe (the values that will be masked)
X = pd.DataFrame(
    [
        [1, 2, 3],
        [4, 5, 6],
        ['duck', 'duck', 'goose'],
        [0, 0, 0],
    ],
    columns=['col1', 'col2', 'col3'],
)
X

In [None]:
# masking dataframe: 1 marks a cell to keep, and every 0 is turned into NaN
Y = pd.DataFrame(
    [[1, 0, 0], [1, 1, 1], [1, 0, 1], [0, 0, 0]],
    columns=['col1', 'col2', 'col3'],
).replace(0, np.nan)
Y

In [None]:
# mask the example dataframe with the masking dataframe:
# keep the values of X wherever Y is not NaN; every other cell becomes NaN
X.where(Y.notnull())

# Save CSV file as tabbed

In [None]:
# index=False: leave out the row index (the bolded column in the notebook display)
# header=False: leave out the column names (the bolded row in the notebook display)
X.to_csv(
    '/home/bay001/scratch/test_tabbed_tsv.tsv',
    sep='\t',
    index=False,
    header=False,
)

# Compare dataframes

In [None]:
# two random 6x4 integer frames (values 0 through 14) sharing the columns A-D
dfA = pd.DataFrame(
    np.random.randint(low=0, high=15, size=(6, 4)),
    columns=['A', 'B', 'C', 'D'],
)
dfB = pd.DataFrame(
    np.random.randint(low=0, high=15, size=(6, 4)),
    columns=['A', 'B', 'C', 'D'],
)

In [None]:
dfA

In [None]:
dfB

In [None]:
from collections import defaultdict

# Each dict maps a column name to the list of row labels that fall in that category.
a_not_b = defaultdict(list)   # dfA is positive where dfB is exactly 0
b_not_a = defaultdict(list)   # dfB is positive where dfA is exactly 0
a_over_b = defaultdict(list)  # dfA is larger than dfB (other cases)
b_over_a = defaultdict(list)  # dfB is larger than dfA (other cases)

for row in dfA.index:
    for col in dfA.columns:
        a_val = dfA.loc[row, col]
        b_val = dfB.loc[row, col]
        if a_val > 0 and b_val == 0:
            bucket = a_not_b
        elif b_val > 0 and a_val == 0:
            bucket = b_not_a
        elif a_val > b_val:
            bucket = a_over_b
        elif b_val > a_val:
            bucket = b_over_a
        else:
            # cells matching none of the conditions (e.g. equal values) are not recorded
            continue
        bucket[col].append(row)

In [None]:
# print each column followed by the rows where dfA exceeded dfB
for key, rows in a_over_b.items():
    print("{} -> {}".format(key, rows))