Skip to content

DataFrame related tools for acquisition, parsing, plotting etc

Notifications You must be signed in to change notification settings

ajh1143/Pandas-DataFrame-Tools

Repository files navigation

Pandas DataFrame Tools

Pandas DF related tools for acquisition, parsing, plotting etc


Create Custom Data Types

Generate new categorical data tyoes, option for ordered values.

def categorical_datatype(levels, ordered):
    """ Create new categorical datatype
    :param levels: Category Levels, list/tuple form.
    :param ordered: Boolean choice, determeines if input list is in hierarchical order of entry.
    :return: New categorical datatype w/ levels and ordering choice.
    """
    if isinstance(type(levels), list):
        category_name = CategoricalDtype(levels, ordered=ordered)
        return category_name
    else:
        try:
            levels = list(levels)
        except ValueError:
            print("Please enter a valid, non-empty data structure, list/tuple of levels")
        else:
            category_name = CategoricalDtype(levels, ordered=ordered)
            return category_name

Sorting Indexes

Sort indexes of multiindex array. Native solution sort_index(by='attribute') currently deprecated.

import pandas as pd
from pandas.api.types import CategoricalDtype

def sort_mixed_dtypes(df, indexes):
    """ Sort indexes in multiindexed dataframe, helpful for multiple custom CategoricalDatatypes
    :DataFrame df: Target dataframe
    :List indexes: Index positions to sort
    :return: DataFrame with sorted index
    """
    return df.reset_index().set_index(indexes).sort_index()

Matching/Filtering

def match_case_dataframe(df, target_labels, target_column):
    """ Search a dataframe column for specific values/strings, if match return them in a new dataframe.
        Ex. Search Column A for all rows containing 'Word'
        :param df: Pandas DataFrame containing target dataset
        :param target_labels: List of labels to be matched to observations
        :param target_column: Name of column to be assayed
        :return filtered_df : DataFrame containing observations matching target labels
    """
    if isinstance(target_labels, str):
        filtered_df = df[df[target_column].isin([target_labels])]
        print('Results:')
        find_length(filtered_df, target_labels)
        print(filtered_df.head())
        return filtered_df

    else:
        filtered_df = df[df[target_column].isin(target_labels)]
        print('Results:')
        find_length(filtered_df, target_labels)
        print(filtered_df.head())
        return filtered_df


def find_length(processed_df, target):
    """ Report how many rows matched target keyword.
        :param processed_df: Filtered Pandas DataFrame containing target dataset
        :param target: Target label previously passed to 'match_case_dataframe'
        :return:
    """
    matches = len(processed_df)

    if matches <= 1:
        print('{} row returned containing \'{}\''.format(matches, target))
    else:
        print('{} rows returned containing \'{}\''.format(matches, ", ".join(target)))

Concatenate

Combine a list of dataframes into single unit, display head and shape.

def concat(df_list):
    df_output = pd.concat(df_list)
    print(df_output.shape)
    print(df_output.head())
    return df_output

Data Integrity

Impute NaN With Mean

Replace NaN observations in specified column with mean of populated column

def replace_with_mean(df, colname):
    #Generate mean values of specified column
    col_mean = np.mean(df[colname])
    #Add mean values to DataFrame.colname
    df[colname] = df[colname].fillna(col_mean)
    print(df.info())
    return df

Assert Missing

Test if dataframe has missing data via Assert statement

def check_missing_data(df):
    #Test assertion of null values
    assert pd.notnull(df).all().all()

Remove Duplicate Entries

Drop duplicates

def drop_dups(df, cols):
    #Generate a pre-processed representation of specified column
    pre_col =  pd.DataFrame(df[cols])
    #Check column info
    print(pre_col.info())
    #Drop any duplicate rows within pre_col
    post_col = pre_col.drop_duplicates()
    #Check if successful
    print(post_col.info())
    #Return the processed DataFrame
    return post_col

Merge

Many-To-Many

def merge_many(t1, t2, t3,  t1_left, t2_right, t3_left, tkey_right):
    #Merge T1 and T2 into first_merge table
    first_merge = pd.merge(left=t1, right=t2, left_on=t1_left, right_on=t2_right)
    #Merge first_merge and T3 into second_merge
    second_merge = pd.merge(left=first_merge, right=t3, left_on=t3_left, right_on=tkey_right)
    #Return 3 merged tables into a single table
    return second_merge

Plotting

Histogram

import pandas as pd
import matplotlib.pyplot as plt

#--------------------
#Function: hist_columns()
#Purpose: Easily generate a Histogram pyplot from a DataFrame and list of columns 
#Inputs: Pandas DataFrame, Log condition(T/F), Rotation value, single or variable column names
#Outputs: Histogram plots of given column in Pandas DataFrame
#--------------------

def hist_columns(df, log, rot, *args):
    # For each column name given, check if log axes chosen, plot histogram
    for arg in args:
        #Check if user passed a logarithmic scale designation
        if log == "True":
            #Create Histogram plot
            df[arg].plot(kind='hist', logx=True, logy=True, rot=rot)
            #Add x-axis label
            plt.xlabel(str(arg))
            #Show plot
            plt.show()
        #No log scale specified
        else:
            #Create Histogram plot
            df[arg].plot(kind='hist', logx=False, logy=False, rot=rot)
            #Add x-axis label
            plt.xlabel(str(arg))
            #Show plot
            plt.show()

Scatter Plot

#--------------------
#Purpose: Generate a scatter plot from given datasets 
#Inputs: X-axis data, Y-axis data, X-Axis Label, Y-Axis Label, Graph Title
#Outputs: Populated Scatter Plot with Labels/Title and Legend
#--------------------

def make_scatter(x, y, xlab, ylab, gtitle):
    plt.scatter(x = x, y = y)
    plt.xlabel(xlab)
    plt.ylabel(ylab)
    plt.title(gtitle)
    plt.legend(loc=1)
    plt.grid()
    plt.show()

Time Series Line Plot

#--------------------
#Purpose: Generates a TimeSeries Line Plot with labels
#Inputs: Pandas DataFrame, Independent Variable, Dependent Variable, X-Label, Y-Label, Graph Title
#Outputs: TimeSeries LinePlot
#--------------------

def make_timeseries(df, x, y, xlab, ylab, gtitle):
    #Generate a line plot
    df.plot(kind='line', x=x, y=y)
    #Add X&Y axis labels
    plt.xlabel(xlab)
    plt.ylabel(ylab)
    #Add graph title
    plt.title(gtitle)
    #Add Legend in position 1
    plt.legend(loc=1)
    #Add gridlines to graph
    plt.grid()
    #Show plot
    plt.show()