<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Load-the-libraries" data-toc-modified-id="Load-the-libraries-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Load the libraries</a></span></li><li><span><a href="#Pandas-utils" data-toc-modified-id="Pandas-utils-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Pandas utils</a></span><ul class="toc-item"><li><span><a href="#Style" data-toc-modified-id="Style-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Style</a></span></li></ul></li><li><span><a href="#Load-the-data" data-toc-modified-id="Load-the-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Load the data</a></span></li><li><span><a href="#Pandas-Utilities" data-toc-modified-id="Pandas-Utilities-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Pandas Utilities</a></span><ul class="toc-item"><li><span><a href="#Reduce-dataframe-memory:-df_shrink" data-toc-modified-id="Reduce-dataframe-memory:-df_shrink-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Reduce dataframe memory: df_shrink</a></span></li><li><span><a href="#Create-datetime-columns" data-toc-modified-id="Create-datetime-columns-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Create datetime columns</a></span></li><li><span><a href="#cont-cat-split" data-toc-modified-id="cont-cat-split-4.3"><span class="toc-item-num">4.3&nbsp;&nbsp;</span>cont cat split</a></span></li></ul></li></ul></div>

# Load the libraries

In [1]:
import numpy as np
import pandas as pd
import os,sys,time,re
import seaborn as sns

# Let wide frames show up to 100 columns before pandas truncates the display.
pd.options.display.max_columns = 100

In [2]:
# my local library
import sys
from pathlib import Path
import platform

# Locate the personal `bhishan` package, which lives in a different folder
# depending on the operating system.
system_name = platform.system()
if system_name == 'Windows':
    p = Path("~/OneDrive - AmerisourceBergen(ABC)/bhishan").expanduser()
elif system_name == 'Darwin':
    p = Path.home() / "Dropbox/a00_Bhishan_Modules"
else:
    # Any other OS (e.g. Linux) has no known folder. `p` used to be left
    # undefined here, so the sys.path line below died with a NameError;
    # instead skip the path tweak and let the import use the existing sys.path.
    p = None

# Guard against duplicates so re-running the cell does not keep growing sys.path.
if p is not None and str(p) not in sys.path:
    sys.path.append(str(p))
from bhishan import bp

# Pandas utils

## Style

In [3]:
# Ask `bp` to list its own methods, filtered with contains='highlight';
# the table below shows the matching helper names.
bp.show_methods(bp,contains='highlight')

Unnamed: 0,0,1,2
0,highlight_code,highlight_getsource,


# Load the data

In [4]:
# Print the names of the example datasets seaborn reports as available.
# print() keeps the list on one line instead of the one-item-per-line rich repr.
print(sns.get_dataset_names())

['anagrams', 'anscombe', 'attention', 'brain_networks', 'car_crashes', 'diamonds', 'dots', 'dowjones', 'exercise', 'flights', 'fmri', 'geyser', 'glue', 'healthexp', 'iris', 'mpg', 'penguins', 'planets', 'seaice', 'taxis', 'tips', 'titanic']


In [5]:
# Load seaborn's `titanic` example dataset as the working frame and preview
# its first five rows.
df = sns.load_dataset(name='titanic')
df.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [6]:
# Give every row a synthetic timestamp: consecutive days starting 2021-01-01,
# one per row, so the datetime utilities below have a column to work on.
df['date'] = pd.date_range(start='2021-01-01', periods=len(df))
df.head(n=2)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,date
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,2021-01-01
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,2021-01-02


# Pandas Utilities

## Reduce dataframe memory: df_shrink

In [7]:
# Baseline column dtypes of `df`, shown for comparison with the dtypes
# reported after bp.df_shrink in the next cell.
df.dtypes

survived                int64
pclass                  int64
sex                    object
age                   float64
sibsp                   int64
parch                   int64
fare                  float64
embarked               object
class                category
who                    object
adult_male               bool
deck                 category
embark_town            object
alive                  object
alone                    bool
date           datetime64[ns]
dtype: object

In [8]:
# Run bp.df_shrink on the frame and inspect the resulting dtypes. The result is
# stored under a descriptive name so it cannot be confused with, or silently
# overwritten by, other derived frames in this notebook.
df_shrunk = bp.df_shrink(df)
df_shrunk.dtypes

survived                 int8
pclass                   int8
sex                  category
age                   float32
sibsp                    int8
parch                    int8
fare                  float32
embarked             category
class                category
who                  category
adult_male               bool
deck                 category
embark_town          category
alive                category
alone                    bool
date           datetime64[ns]
dtype: object

## Create datetime columns

In [9]:
# Preview `df`, including its `date` column, before deriving date-part columns.
df.head(2)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,date
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,2021-01-01
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,2021-01-02


In [10]:
# Derive calendar feature columns from `date` with bp.add_datepart and preview
# the result (the output shows Year, Month, Week, ..., Elapsed in place of
# `date`). Stored under a descriptive name so it cannot be confused with other
# derived frames in this notebook.
df_datepart = bp.add_datepart(df,'date')
df_datepart.head(2)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,Year,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,2021,1,53,1,4,1,False,True,False,True,False,True,1609459200
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,2021,1,53,2,5,2,False,False,False,False,False,False,1609545600


In [11]:
# Re-check the original frame: the output still shows the `date` column and no
# date-part columns, i.e. the bp.add_datepart call above left `df` unchanged.
df.head(2)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,date
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,2021-01-01
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,2021-01-02


## cont cat split

In [12]:
# Split the column names of `df` into two lists. Per the output below, the
# first list holds 'age' and 'fare' and the second holds every other column,
# including the datetime `date` column.
# NOTE(review): max_card=20 is presumably a cardinality threshold and
# dep_var=None presumably means no target column is excluded - confirm against
# bp.cont_cat_split.
bp.cont_cat_split(df, max_card=20, dep_var=None)

(['age', 'fare'],
 ['survived',
  'pclass',
  'sex',
  'sibsp',
  'parch',
  'embarked',
  'class',
  'who',
  'adult_male',
  'deck',
  'embark_town',
  'alive',
  'alone',
  'date'])