In [None]:
## 1. Load libraries and set env 
import warnings
warnings.filterwarnings('ignore')

## data structures and manipulation 
import pandas as pd
import numpy as np 

## visualization 
import seaborn as sns
import matplotlib.pyplot as plt 

## particular datatypes manipulation 
from dateutil.parser import parse

## statistical analysis - Ho-testing and glm Regression 
import statsmodels.api as sm
import statsmodels.formula.api as smf
## patsy is a Python library for describing statistical models and building Design Matrices using R-like formulas.
import patsy 

import scipy.stats as sci 
import math 


## Shared Env flags/config 

%matplotlib inline 
from IPython.display import set_matplotlib_formats
#set_matplotlib_formats('pdf', 'png')


## plot styles 
plt.style.use('seaborn-white')
plt.style.use('ggplot') 
sns.set(color_codes=True)


## allow multiple outputs per cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


## Getting R in the mix =D  << TODO: put R libs in a seperate Rscript file and load it from here<<< Excited!!!!!
# %load_ext rpy2.ipython 
# %R require(ggplot2); require(tidyr); require(dplyr) 
%load_ext rmagic

In [2]:
### Formatting preambles
#plt.rcParams['savefig.dpi'] = 75

#plt.rcParams['figure.autolayout'] = False
#plt.rcParams['figure.figsize'] = 10, 6
#plt.rcParams['axes.labelsize'] = 18
#plt.rcParams['axes.titlesize'] = 20
#plt.rcParams['font.size'] = 16
#plt.rcParams['lines.linewidth'] = 2.0
#plt.rcParams['lines.markersize'] = 8
#plt.rcParams['legend.fontsize'] = 14

#plt.rcParams['text.usetex'] = True
#plt.rcParams['font.family'] = "serif"
#plt.rcParams['font.serif'] = "cm"
#plt.rcParams['text.latex.preamble'] = r"\usepackage{subdepth}, \usepackage{type1cm}"

In [5]:
# help(patsy)
## Functions 

def plotObject():
    """Create a fresh 10x7 inch figure holding a single axes.

    Returns:
        tuple: ``(figure, axes)`` exactly as produced by ``plt.subplots``.
    """
    figure, axes = plt.subplots(figsize=(10, 7))
    return figure, axes


def sampleStructure(dtset, colz=None, grp=None, rnd=1, fx=None):
    """Aggregate a DataFrame, optionally on a column subset and/or per group.

    Args:
        dtset: DataFrame to aggregate.
        colz: list of column names to keep; a falsy value keeps every column.
        grp: group-by key; a falsy value aggregates the frame as a whole.
            When both ``colz`` and ``grp`` are given, ``grp`` is appended to
            ``colz`` so the key column is available for grouping.
        rnd: number of decimals the result is rounded to.
        fx: aggregation function(s) handed to ``.agg``; defaults to
            ``[np.mean, sum]`` when falsy.

    Returns:
        The rounded aggregation result.
    """
    aggregators = fx or [np.mean, sum]

    # Narrow to the requested columns first (keeping the key when grouping).
    if colz:
        frame = dtset[colz + [grp]] if grp else dtset[colz]
    else:
        frame = dtset

    target = frame.groupby(grp) if grp else frame
    return target.agg(aggregators).round(rnd)


def zDescribe(dtset, colz=None, rnd=2):
    """Build a compact summary table (Sum, Avg, Min, Max, SD) for the given columns.

    Fixes over the previous version:
      * ``dtset.columns`` is an attribute, not a method, so calling it raised.
      * the statistics were read from a global ``og_only_dataset`` instead of
        the ``dtset`` argument.
      * the "SD" row reported the maximum instead of the standard deviation.

    Args:
        dtset: DataFrame to summarise.
        colz: column name or list of column names; ``None`` or an empty list
            means every column of ``dtset``. The selected columns are expected
            to be numeric, since mean/std are computed on them.
        rnd: number of decimals each statistic is rounded to.

    Returns:
        DataFrame with one row per statistic ("Sum", "Avg", "Min", "Max", "SD")
        and one column per selected variable.
    """
    if isinstance(colz, str):
        # A bare column name would yield scalars, which cannot form a frame.
        colz = [colz]
    elif colz is None or len(colz) == 0:
        colz = list(dtset.columns)

    subset = dtset[colz]
    return pd.DataFrame({
        "Sum": subset.sum().round(rnd),
        "Avg": subset.mean().round(rnd),
        "Min": subset.min().round(rnd),
        "Max": subset.max().round(rnd),
        "SD": subset.std().round(rnd),
    }).T


    
## TODO: with margins + fq% 
def summarizer(dtset, colz=None, fxz=None, incdz=None):
    """Return ``describe()`` output with quintile percentiles, rounded to 2 decimals.

    Previously ``incdz`` was unconditionally overwritten with ``'all'``, so the
    caller could never restrict the described dtypes. It now only defaults to
    ``'all'`` when not supplied.

    Args:
        dtset: DataFrame to describe.
        colz: column name or list of names to restrict to; falsy means all columns.
        fxz: unused; kept for interface compatibility.
        incdz: value forwarded to ``describe(include=...)``; defaults to ``'all'``.

    Returns:
        The rounded ``describe`` result, with percentiles 20/40/50/60/80.
    """
    pct = [.2, .4, .5, .6, .8]
    if incdz is None:
        incdz = 'all'

    target = dtset[colz] if colz else dtset
    return target.describe(percentiles=pct, include=incdz).round(2)


## TODO: update to sns and with group by
def plotDistz(dtset, colz=None, grp=None, stk=False):
    """Draw histograms of ``dtset`` on a new figure from ``plotObject``.

    Args:
        dtset: DataFrame to plot.
        colz: column(s) passed positionally to ``hist``; falsy plots all columns.
        grp: group-by key; when truthy ``hist`` is called on ``dtset.groupby(grp)``.
        stk: forwarded to ``hist`` as ``stacked``.

    Returns:
        Whatever the underlying ``hist`` call returns.
    """
    fig, ax = plotObject()

    # Plot either the grouped view or the raw frame with the same hist call.
    source = dtset.groupby(grp) if grp else dtset
    if colz:
        return source.hist(colz, stacked=stk, ax=ax)
    return source.hist(stacked=stk, ax=ax)

    #if( grpz ) return dtset.plot( colz, kind="hist", legend=True, alpha=0.4, ax=ax ) 
    #return dtset.hist(colz,stacked=True,ax=ax)


def plotBoxz(dtset, colz=None, grp=None):
    """Draw box plots of ``dtset`` on a new figure from ``plotObject``.

    The grp-only case used to pass ``grp`` as the first positional argument of
    ``DataFrame.boxplot``, which is ``column``; it therefore tried to plot the
    grouping column itself instead of grouping by it. ``grp`` is now always
    passed as ``by``.

    Args:
        dtset: DataFrame to plot.
        colz: column name or list of names to plot; falsy plots all columns.
        grp: column(s) to group by; falsy draws ungrouped boxes.

    Returns:
        Whatever ``DataFrame.boxplot`` returns for the chosen arguments.
    """
    fig, ax = plotObject()

    if colz and grp:
        return dtset.boxplot(column=colz, by=grp, ax=ax)
    if colz:
        return dtset.boxplot(column=colz, ax=ax)
    if grp:
        return dtset.boxplot(by=grp, ax=ax)
    return dtset.boxplot(ax=ax)

def fqTablez(dtset, grp, colz=None, margz=True, pct=False):
    """Cross-tabulate each selected column against a grouping variable.

    Fixes over the previous version:
      * ``dtset.columns`` is an attribute, not a method, so calling it raised.
      * the tables were built from a global ``dataset`` instead of ``dtset``.
      * a column-name ``grp`` was handed to ``pd.crosstab`` as a raw string;
        it is now resolved to ``dtset[grp]``.
      * ``pct`` was ignored; when true, counts are converted to percentages of
        the number of rows in ``dtset``.

    Args:
        dtset: DataFrame holding the variables.
        grp: column name in ``dtset``, or array-like of group values aligned
            with ``dtset`` (passed through to ``pd.crosstab`` unchanged).
        colz: column name or list of names to tabulate; ``None`` or empty
            means every column of ``dtset``.
        margz: forwarded to ``pd.crosstab`` as ``margins``.
        pct: when true, each table is expressed as
            ``(count / len(dtset)).round(4) * 100``.

    Returns:
        Series with one crosstab DataFrame per entry of ``colz``, in order.
    """
    if isinstance(colz, str):
        colz = [colz]
    elif colz is None or len(colz) == 0:
        colz = list(dtset.columns)

    grp_values = dtset[grp] if isinstance(grp, str) else grp

    def _tabulate(col):
        table = pd.crosstab(dtset[col], grp_values, margins=margz)
        if pct:
            table = (table / len(dtset)).round(4) * 100
        return table

    return pd.Series(colz).apply(_tabulate)
    
def frequenciez(dtset, colz=None, grp=None, flipaxis=False, margz=True):
    """Draw a seaborn count plot for each selected column.

    The previous version called ``dtset.columns()`` when ``colz`` was omitted;
    ``columns`` is an attribute, so that raised. The eight near-identical
    branches are also collapsed into a single code path.

    Note: ``sns.countplot`` is called without an explicit axes, so the plots
    are drawn wherever seaborn/matplotlib place them by default.

    Args:
        dtset: DataFrame passed to seaborn as ``data``.
        colz: column name or list of names to plot; ``None`` or empty means
            every column of ``dtset``.
        grp: column used as ``hue``; omitted from the call when falsy.
        flipaxis: when true the variable is mapped to ``y`` (horizontal bars),
            otherwise to ``x``.
        margz: unused; kept for interface compatibility.

    Returns:
        Series holding the return value of ``sns.countplot`` for each column.
    """
    if colz is None or len(colz) == 0:
        colz = list(dtset.columns)

    axis_name = "y" if flipaxis else "x"

    def _countplot(col):
        kwargs = {axis_name: col, "data": dtset}
        if grp:
            kwargs["hue"] = grp
        return sns.countplot(**kwargs)

    return pd.Series(colz).apply(_countplot)

    

def corrz(dtset, colz, mthd="pearson"):
    """Correlation matrix of the selected columns, rounded to three decimals.

    Args:
        dtset: source DataFrame.
        colz: columns to correlate.
        mthd: correlation method forwarded to ``DataFrame.corr``.

    Returns:
        The rounded correlation matrix.
    """
    selected = dtset[colz]
    matrix = selected.corr(method=mthd)
    return matrix.round(3)


def plotCorrz(cor_outz):
    """Plot the lower triangle of a correlation matrix as a seaborn heatmap.

    ``np.bool`` was removed from NumPy (it raises ``AttributeError`` on
    NumPy >= 1.24), so the mask is now built with the builtin ``bool``.

    Args:
        cor_outz: square correlation matrix (e.g. the output of ``corrz``).

    Returns:
        tuple: ``(cor_outz, axes)`` where ``axes`` is what ``sns.heatmap`` returns.
    """
    # Mask the upper triangle (diagonal included) so each pair is shown once.
    mask = np.zeros_like(cor_outz, dtype=bool)
    mask[np.triu_indices_from(mask)] = True

    # Dedicated square figure for the heatmap.
    f, ax = plt.subplots(figsize=(7, 7))

    # Diverging palette centred on zero correlation.
    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    # NOTE(review): vmax=.3 caps the colour scale at 0.3, so stronger
    # correlations all get the same colour; confirm this is intended.
    #TODO: pretty print tables : cor_outz,
    heat = sns.heatmap(cor_outz, mask=mask, cmap=cmap, vmax=.3, center=0,
                       square=True, linewidths=.5, cbar_kws={"shrink": .5},
                       ax=ax)
    return (cor_outz, heat)

## TODO: generalize to smalls 
def quintilize(dtset, col, nbGrpz=5, labelzPrefix="Quint"):
    """Add a quantile-bucket column ``"Quints_<col>"`` to ``dtset`` in place.

    The previous version always generated ``nbGrpz`` labels while asking
    ``pd.qcut`` to drop duplicate bin edges. Whenever edges were dropped
    (heavily tied data) the label count no longer matched the bin count and
    ``qcut`` raised ``ValueError``. The number of labels is now derived from
    the bins that actually remain.

    Args:
        dtset: DataFrame to modify; a new column is written into it.
        col: name of the numeric column to bucket.
        nbGrpz: requested number of quantile groups.
        labelzPrefix: prefix of the generated labels (``<prefix>_1``, ...).

    Returns:
        None. The result is stored in ``dtset["Quints_" + col]``.
    """
    # First pass: find out how many bins survive duplicate-edge removal.
    _, edges = pd.qcut(dtset[col], nbGrpz, retbins=True, duplicates='drop')
    nb_bins = len(edges) - 1

    labelz = [labelzPrefix + "_" + str(i + 1) for i in range(nb_bins)]
    dtset["Quints_" + col] = pd.qcut(dtset[col], nbGrpz, labels=labelz, duplicates='drop')
    

def checkMissing(dtset):
    """Report the frame's shape and the number of missing values per column.

    Args:
        dtset: DataFrame to inspect.

    Returns:
        tuple: ``(shape_caption, dtset.shape, missing_caption, counts)`` where
        ``counts`` is a Series of null counts indexed by column name.
    """
    def _null_count(column):
        # Summing the boolean null mask counts the missing entries.
        return sum(column.isnull())

    missing_per_column = dtset.apply(_null_count, axis=0)
    return (
        "\nShape # records, # variables",
        dtset.shape,
        "\n\n\n Counts of Missing Values \n",
        missing_per_column,
    )






In [3]:
## Datasets path
from pathlib import Path, PureWindowsPath
import re

#DATA_DIR = Path("~/Google\ Drive/xRepos/datasets") 
# Path("~").replace('\\', '/') 
#DATA_DIR = re.sub(r'\\', r'/',  ) + "Google\ Drive/xRepos/datasets" 
#re.sub("[\\]", "/", Path( "~")) 


In [37]:
# https://gist.github.com/conormm/fd8b1980c28dd21cfaf6975c86c74d07  --- R to Py tricks