#### Exploration and Visualization Pipeline
##### Generic Functions created for use with any dataframe.

##### 13th May 2018

In [142]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import numpy as np
import matplotlib
%matplotlib inline

In [5]:
# Load 'experiment.csv' (path relative to the current working directory) into
# the module-level dataframe `df`.
df = pd.read_csv('experiment.csv')

# EXPLORING DATA

In [87]:
def exploring_overview(df):
    '''
    Prints a high level overview of the dataset.

    Input:
        df: a pandas dataframe

    Return:
        None. Prints, in order: the column labels, the number of rows and
        columns, the report produced by df.info(), and the summary
        statistics from df.describe().
    '''
    print ('LIST OF COLUMNS: ')
    print (df.columns)
    print ()

    n_rows, n_columns = df.shape
    print ('NUMBER OF ROWS ARE: ', n_rows)
    print ( 'NUMBER OF COLUMNS ARE: ', n_columns)
    print ()

    print ('SOME BASIC INFORMATION ABOUT THE ATTRIBUTES: ')
    # df.info() writes its report to stdout itself and returns None, so
    # wrapping it in print() would append a stray "None" line.
    df.info()
    print ()

    print ('DESCRIPTION OF THE DATASET:')
    print (df.describe())
    
    

In [48]:
def grouping_by_feature (feature, df):
    '''
    For a given feature, groups the data by the distinct values of that feature
    and prints the mean of every other numeric column within each group.
        example: for a variable 'SeriousDlqin2yrs', where two values exist i.e. 0 and 1,
            the printed table has one column per value (0 and 1) and one row per
            numeric variable, showing how each variable's mean varies across the groups.

    Input:
        feature: column name (or list of column names) to group by
        df: a pandas dataframe

    Return:
        None. Prints the transposed table of group means.
    '''
    # numeric_only=True skips text/object columns; without it pandas >= 2.0
    # raises TypeError instead of silently dropping them.
    group_means = df.groupby(feature).mean(numeric_only=True)
    print (group_means.transpose())

In [64]:
def comparing_across_two_features (feature1, feature2, df):
    '''
    For any two features, prints a cross-tabulation of the data across those
    two features: one row per value of feature1, one column per value of
    feature2, each cell holding the number of rows with that combination.

    Input:
        feature1: column name (string) used for the rows of the table
        feature2: column name (string) used for the columns of the table
        df: a pandas dataframe

    Return:
        None. Prints the cross-tabulation, or a short message if either
        column does not exist in the dataframe.
    '''
    # Look the columns up on the dataframe itself; building the strings
    # 'df.<name>' and handing those to crosstab never reaches the data.
    try:
        first_column = df[feature1]
        second_column = df[feature2]
    except KeyError as missing:
        print ('Cannot compare features, column not found:', missing)
        return

    print (pd.crosstab(first_column, second_column))

In [68]:
def summing_nulls_in_dataset (df):
    '''
    Prints, for every column of the dataframe, the number of null values it contains.

    Input:
        df: a pandas dataframe

    Return:
        None. The per-column null counts are printed.
    '''
    null_mask = df.isnull()
    nulls_per_column = null_mask.sum()
    print (nulls_per_column)

In [85]:
def counting_in_a_variable(feature, df):
    '''
    Prints how many rows fall into each distinct value of one feature.

    Input:
        feature: column name
        df: a pandas dataframe

    Return:
        None. The value counts of df[feature] are printed.
    '''
    selected_column = df[feature]
    category_counts = selected_column.value_counts()
    print (category_counts)


In [167]:
def counting_uniques(df):
    '''
    For the given dataframe, prints the number of unique values in each feature.
    Then draws a bar plot of those counts.

    Input:
        df: a pandas dataframe

    Return:
        None. The counts are printed and the bar plot is drawn through the
        pandas plotting interface.
    '''
    # Compute the counts once and reuse them for both the printout and the plot.
    unique_counts = df.nunique()
    print (unique_counts)
    # Draw the plot without print(): printing the call's return value only
    # emits the repr of the plotting object, which carries no information.
    unique_counts.plot.bar()


# VISUALIZATIONS

In [100]:
def draw_correlation_matrix (df, title):
    '''
    Creates a heatmap that shows the correlations between the numeric variables in a dataframe.

    Input:
        df: a dataframe
        title: name of the correlation_matrix

    Return:
        None. Draws a heatmap of the correlations on axes obtained from
        plt.axes() and sets the given title on them.

    Code based on: https://stackoverflow.com/questions/29432629/correlation-matrix-using-pandas
    '''
    ax = plt.axes()
    # Restrict to numeric/boolean columns before correlating: pandas >= 2.0
    # raises on text columns instead of silently ignoring them.
    numeric_columns = df.select_dtypes(include=['number', 'bool'])
    corr = numeric_columns.corr()
    tick_labels = corr.columns.values
    sns.heatmap(corr,
                xticklabels=tick_labels,
                yticklabels=tick_labels, ax = ax)
    ax.set_title (title)

In [136]:
def plotting_curve (dataframe, column, title):
    '''
    Given a dataframe, a column name, and a title,
        displays a plot of that dataframe column distribution.

    Input:
        dataframe
        column: column name (string)
        title: string

    Return:
        None. Displays a distribution of that variable; if the column cannot
        be plotted, prints a message describing the failure instead of raising.

    Inspired by:
        https://seaborn.pydata.org/generated/seaborn.distplot.html
    '''
    try:
        values = dataframe[column]
        # NOTE(review): distplot is deprecated in recent seaborn releases;
        # kept here so the rendered plot stays the same as before.
        ax = sns.distplot(values)
        ax.set_title(title)
        plt.show()
    except Exception as error:
        # Best-effort helper: a column that cannot be plotted must not abort
        # the caller, but the failure is reported rather than hidden.
        print ('Could not plot distribution of', repr(column), '-', repr(error))

In [172]:
def making_pie (df, feature):
    '''
    Gives a pie plot of data in any feature: one slice per distinct value,
    sized by the number of rows holding that value.

    Input:
        df: a pandas dataframe
        feature: column name

    Return:
        None. Draws the pie plot; if it cannot be drawn, prints a message
        describing the failure instead of raising.
    '''
    try:
        category_sizes = df.groupby([feature]).size()
        category_sizes.plot.pie()
    except Exception as error:
        # Best-effort helper: keep going on failure, but say what went wrong
        # instead of silently producing nothing.
        print ('Could not draw pie plot for', repr(feature), '-', repr(error))

In [165]:
def plotting_bar (df, feature):
    '''
    Draws a bar graph with one bar per category of the feature,
    where the bar height is the number of rows in that category.

    Input:
        df: a pandas dataframe
        feature: column name
    '''
    rows_per_category = df.groupby([feature]).size()
    rows_per_category.plot.bar()

In [210]:
def plotting_top_10_bar_plot  (df, feature):
    '''
    Draws a bar plot of the 10 most common values of a given feature,
    with bars ordered from least to most frequent.

    Input:
        df: a pandas dataframe
        feature: column name
    '''
    category_sizes = df.groupby([feature]).size()
    # Ascending sort, then keep the last ten entries, i.e. the largest groups.
    ten_most_common = category_sizes.sort_values().tail(10)
    ten_most_common.plot.bar()