# Small cookbook for manipulating pandas dataframes

In [1]:
import pandas as pd
import numpy as np

In [2]:
# example dataframe
X = pd.DataFrame([[1,2,3],[4,5,6],['duck','duck','goose'],[0,0,0]])
X.columns = ['col1','col2','col3']
X

Unnamed: 0,col1,col2,col3
0,1,2,3
1,4,5,6
2,duck,duck,goose
3,0,0,0


# Basic column and row manipulations

In [3]:
def last_to_first(df):
    """
    This function moves the last column to the first.
    """
    cols = list(df)
    cols.insert(0, cols.pop(cols.index(cols[-1])))
    return df.loc[:, cols]

last_to_first(X)

Unnamed: 0,col3,col1,col2
0,3,1,2
1,6,4,5
2,goose,duck,duck
3,0,0,0


In [4]:
def first_to_last(df):
    """
    This function moves the first column to the last
    """
    cols = list(df)
    cols.append(cols.pop(cols.index(cols[0])))
    return df.loc[:, cols]

first_to_last(X)

Unnamed: 0,col2,col3,col1
0,2,3,1
1,5,6,4
2,duck,goose,duck
3,0,0,0


In [5]:
def reorder_cols(df, cols):
    """
    This function reorders pandas columns
    according to the 'cols' param
    """
    return df[cols]

reorder_cols(X,['col3','col2','col1'])

Unnamed: 0,col3,col2,col1
0,3,2,1
1,6,5,4
2,goose,duck,duck
3,0,0,0


In [6]:
def return_every_n(df, n):
    """
    Takes a dataframe, returns only the nth column
    """
    return df.iloc[:, ::n]

# example dataframe where 5 are pasted together:
# columns would be: col1, col2, col3, col1, col2, col3, col1, col2, col3, col1, col2, col3, col1, col2, col3 
# new cols every 2: col1,       col3,       col2,       col1,       col3,       col2,       col1,       col3

Y = pd.concat([X,X,X,X,X],axis=1)
return_every_n(Y,2)

Unnamed: 0,col1,col3,col2,col1.1,col3.1,col2.1,col1.2,col3.2
0,1,3,2,1,3,2,1,3
1,4,6,5,4,6,5,4,6
2,duck,goose,duck,duck,goose,duck,duck,goose
3,0,0,0,0,0,0,0,0


In [7]:
def delete_list(df, todelete):
    """
    Removes every instance of 'todelete' column from a pandas dataframe
    """
    return df.drop(df.loc[:,todelete].head(0).columns, axis=1)

delete_list(Y, 'col3')

Unnamed: 0,col1,col2,col1.1,col2.1,col1.2,col2.2,col1.3,col2.3,col1.4,col2.4
0,1,2,1,2,1,2,1,2,1,2
1,4,5,4,5,4,5,4,5,4,5
2,duck,duck,duck,duck,duck,duck,duck,duck,duck,duck
3,0,0,0,0,0,0,0,0,0,0


In [8]:
def delete_list_lst(df, todelete):
    """
    same function as above (delete_list) but instead takes in a list
    """
    colnames = list(df.columns)
    colnames = list(filter(lambda x: x not in todelete, colnames))
    dx = df.drop(df.loc[:,todelete],axis=1)
    dx.columns = colnames
    return dx

delete_list_lst(Y, ['col1', 'col3'])

Unnamed: 0,col2,col2.1,col2.2,col2.3,col2.4
0,2,2,2,2,2
1,5,5,5,5,5
2,duck,duck,duck,duck,duck
3,0,0,0,0,0


In [9]:
def keep_list_lst(df, tokeep):
    """
    Same function as above (delete_list) but instead keeps every column in tokeep
    Functionally also the same as reorder_cols()
    """
    dx = df[tokeep]
    return dx

keep_list_lst(Y, ['col1', 'col3'])

Unnamed: 0,col1,col1.1,col1.2,col1.3,col1.4,col3,col3.1,col3.2,col3.3,col3.4
0,1,1,1,1,1,3,3,3,3,3
1,4,4,4,4,4,6,6,6,6,6
2,duck,duck,duck,duck,duck,goose,goose,goose,goose,goose
3,0,0,0,0,0,0,0,0,0,0


# Lambda and apply statements on dataframes

In [10]:
# using apply + a function:
def get_name(row):
    """
    Splits a row['name'] column to return just the transcript ID:
    
    df['ENST00000237247.6_cds_2_0_chr1_67091530_f']
    """
    return row['name'].split('_')[0]

bed_file = '/projects/ps-yeolab/genomes/hg19/hg19.introns.bed.txt'
df = pd.read_table(bed_file,names=['chrom','start','end','name','score','strand'])
df['name'] = df.apply(get_name, axis=1)
df.head()

Unnamed: 0,chrom,start,end,name,score,strand
0,chr1,66999090,66999928,ENST00000237247.6,0,+
1,chr1,67000051,67091529,ENST00000237247.6,0,+
2,chr1,67091593,67098752,ENST00000237247.6,0,+
3,chr1,67098777,67099762,ENST00000237247.6,0,+
4,chr1,67099846,67105459,ENST00000237247.6,0,+


In [11]:
# equivalent lambda function:
bed_file = '/projects/ps-yeolab/genomes/hg19/hg19.introns.bed.txt'
df = pd.read_table(bed_file,names=['chrom','start','end','name','score','strand'])
df['name'] = df['name'].apply(lambda x: x.split('_')[0])
df.head()

Unnamed: 0,chrom,start,end,name,score,strand
0,chr1,66999090,66999928,ENST00000237247.6,0,+
1,chr1,67000051,67091529,ENST00000237247.6,0,+
2,chr1,67091593,67098752,ENST00000237247.6,0,+
3,chr1,67098777,67099762,ENST00000237247.6,0,+
4,chr1,67099846,67105459,ENST00000237247.6,0,+


# finding all rows with a nan

In [12]:
# example dataframe:
df = pd.DataFrame([range(3), [0, np.NaN, 0], [0, 0, np.NaN], range(3), range(3)])
df

Unnamed: 0,0,1,2
0,0,1.0,2.0
1,0,,0.0
2,0,0.0,
3,0,1.0,2.0
4,0,1.0,2.0


In [13]:
df[df.isnull().any(axis=1)]

Unnamed: 0,0,1,2
1,0,,0.0
2,0,0.0,


# Splitting a list and transforming dataframes based on delim column splits:

In [14]:
delim = ','
X = pd.DataFrame([['ENSG1','ENSTA,ENSTB,ENSTC','some_value1'],['ENSG2','ENSTD,ENSTE',2],['ENSG3','ENSTF','some_value3']])
X.columns = ['gene','transcript','some_other']
X

Unnamed: 0,gene,transcript,some_other
0,ENSG1,"ENSTA,ENSTB,ENSTC",some_value1
1,ENSG2,"ENSTD,ENSTE",2
2,ENSG3,ENSTF,some_value3


In [15]:
# 'explodes' a comma-delimited dataframe based on a column
Y = pd.DataFrame(X.transcript.str.split(delim).tolist(),index=[X['gene'],X['some_other']]).stack()
Y = Y.reset_index()[[0, 'gene','some_other']]
Y.columns = ['gene','transcript','some_other']
Y

Unnamed: 0,gene,transcript,some_other
0,ENSTA,ENSG1,some_value1
1,ENSTB,ENSG1,some_value1
2,ENSTC,ENSG1,some_value1
3,ENSTD,ENSG2,2
4,ENSTE,ENSG2,2
5,ENSTF,ENSG3,some_value3


In [16]:
# generic function for doing the above:

def explode(df, delim, col_to_split, cols_to_keep):
    """
    explodes a dataframe by splitting a column on a delimiter, and 
    producing one row for each split. 
    """
    cols_to_keep_list = [y.name for y in cols_to_keep]
    dx = pd.DataFrame(df[col_to_split].str.split(delim).tolist(),index=cols_to_keep).stack()
    dx = dx.reset_index()[[0] + cols_to_keep_list]
    dx.columns = [col_to_split] + cols_to_keep_list
    return dx

explode(X, ',', 'transcript', [X['gene'],X['some_other']])

Unnamed: 0,transcript,gene,some_other
0,ENSTA,ENSG1,some_value1
1,ENSTB,ENSG1,some_value1
2,ENSTC,ENSG1,some_value1
3,ENSTD,ENSG2,2
4,ENSTE,ENSG2,2
5,ENSTF,ENSG3,some_value3


# Mask one dataframe with another

In [17]:
# example dataframe
X = pd.DataFrame([[1,2,3],[4,5,6],['duck','duck','goose'],[0,0,0]])
X.columns = ['col1','col2','col3']
X

Unnamed: 0,col1,col2,col3
0,1,2,3
1,4,5,6
2,duck,duck,goose
3,0,0,0


In [18]:
# masking dataframe
Y = pd.DataFrame([[1,0,0],[1,1,1],[1,0,1],[0,0,0]])
Y.columns = ['col1','col2','col3']
Y = Y.replace(0, np.nan)
Y

Unnamed: 0,col1,col2,col3
0,1.0,,
1,1.0,1.0,1.0
2,1.0,,1.0
3,,,


In [19]:
# masking example dataframe with masking dataframe (keep only values that aren't NaN in X)
X[~np.isnan(Y)]

Unnamed: 0,col1,col2,col3
0,1,,
1,4,5.0,6
2,duck,,goose
3,,,
