# A Data Frame Tutorial 

## Scope 

- Subclassing `pandas.DataFrame` and restricting its allowed columns with a class decorator


In [3]:
%load_ext autoreload
%autoreload 2
import os
import sys
import time
import warnings
import datetime 
print("Last updated on ", time.asctime())

Last updated on  Fri Nov 16 10:13:12 2018


### Notebook configuration

In [4]:
%matplotlib inline
import matplotlib.pyplot as plt

plt.rcParams["figure.figsize"] = 10, 8
plt.rcParams["font.size"     ] = 14

In [5]:
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib
import tables            as tb
import random
import glob
import warnings
sns.set()

### Read DF

In [6]:
ls

dataFrames.ipynb  sample_data.csv


In [73]:
df = pd.read_csv('./sample_data.csv', index_col=0)
df

Unnamed: 0,state,color,food,age,height,score
Jane,NY,blue,Steak,30,165,4.6
Niko,TX,green,Lamb,2,70,8.3
Aaron,FL,red,Mango,12,120,9.0
Penelope,AL,white,Apple,4,80,3.3
Dean,AK,gray,Cheese,32,180,1.8
Christina,TX,black,Melon,33,172,9.5
Cornelia,TX,red,Beans,69,150,2.2


# Subclassing

### Simple case: just provide a type

In [86]:
class MyDF(pd.DataFrame):
    """A ``pandas.DataFrame`` subclass whose derived frames keep the subclass type."""

    @property
    def _constructor(self):
        """Return the class pandas should use when it builds a new frame from this one.

        pandas looks up the attribute named ``_constructor`` (not ``_construct``);
        without it, operations such as column selection or ``head()`` return a
        plain ``DataFrame`` instead of ``MyDF``.
        """
        return MyDF


### This wraps df without copying it (i.e., another view of the same data)

In [87]:
mdf = MyDF(df)

In [88]:
mdf

Unnamed: 0,state,color,food,age,height,score
Jane,NY,blue,Steak,30,165,4.6
Niko,BAN,green,Lamb,2,70,8.3
Aaron,BIL,red,Mango,12,120,9.0
Penelope,ORA,white,Apple,4,80,3.3
Dean,NEW,gray,Cheese,32,180,1.8
Christina,MAD,black,Melon,33,172,9.5
Cornelia,CAT,red,Beans,69,150,2.2


### It has the type MyDF

In [77]:
type(mdf)

__main__.MyDF

### Columns can be accessed as attributes

In [89]:
mdf.state

Jane          NY
Niko         BAN
Aaron        BIL
Penelope     ORA
Dean         NEW
Christina    MAD
Cornelia     CAT
Name: state, dtype: object

#### Or by string

In [90]:
mdf['state']

Jane          NY
Niko         BAN
Aaron        BIL
Penelope     ORA
Dean         NEW
Christina    MAD
Cornelia     CAT
Name: state, dtype: object

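#### This is a view of df: if we change it, we also change the original df (with pandas Copy-on-Write enabled, the default from pandas 3.0, the original would be left untouched)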

In [91]:
mdf.state=['VAL','BAN','BIL','ORA','NEW','MAD','CAT']

In [92]:
mdf

Unnamed: 0,state,color,food,age,height,score
Jane,VAL,blue,Steak,30,165,4.6
Niko,BAN,green,Lamb,2,70,8.3
Aaron,BIL,red,Mango,12,120,9.0
Penelope,ORA,white,Apple,4,80,3.3
Dean,NEW,gray,Cheese,32,180,1.8
Christina,MAD,black,Melon,33,172,9.5
Cornelia,CAT,red,Beans,69,150,2.2


In [93]:
df

Unnamed: 0,state,color,food,age,height,score
Jane,VAL,blue,Steak,30,165,4.6
Niko,BAN,green,Lamb,2,70,8.3
Aaron,BIL,red,Mango,12,120,9.0
Penelope,ORA,white,Apple,4,80,3.3
Dean,NEW,gray,Cheese,32,180,1.8
Christina,MAD,black,Melon,33,172,9.5
Cornelia,CAT,red,Beans,69,150,2.2


In [97]:
mdf.loc['Jane','state']='KKK'

In [98]:
mdf

Unnamed: 0,state,color,food,age,height,score
Jane,KKK,blue,Steak,30,165,4.6
Niko,BAN,green,Lamb,2,70,8.3
Aaron,BIL,red,Mango,12,120,9.0
Penelope,ORA,white,Apple,4,80,3.3
Dean,NEW,gray,Cheese,32,180,1.8
Christina,MAD,black,Melon,33,172,9.5
Cornelia,CAT,red,Beans,69,150,2.2


In [99]:
df

Unnamed: 0,state,color,food,age,height,score
Jane,KKK,blue,Steak,30,165,4.6
Niko,BAN,green,Lamb,2,70,8.3
Aaron,BIL,red,Mango,12,120,9.0
Penelope,ORA,white,Apple,4,80,3.3
Dean,NEW,gray,Cheese,32,180,1.8
Christina,MAD,black,Melon,33,172,9.5
Cornelia,CAT,red,Beans,69,150,2.2


### Constructing from a subset of df creates a copy, i.e., a different object

In [100]:
mdf = MyDF(df[['state','color']])

In [101]:
mdf

Unnamed: 0,state,color
Jane,KKK,blue
Niko,BAN,green
Aaron,BIL,red
Penelope,ORA,white
Dean,NEW,gray
Christina,MAD,black
Cornelia,CAT,red


In [102]:
mdf.state=['AAA','BBB','CCC','DDD','EEE','FFF','GGG']

In [103]:
mdf

Unnamed: 0,state,color
Jane,AAA,blue
Niko,BBB,green
Aaron,CCC,red
Penelope,DDD,white
Dean,EEE,gray
Christina,FFF,black
Cornelia,GGG,red


In [104]:
df

Unnamed: 0,state,color,food,age,height,score
Jane,KKK,blue,Steak,30,165,4.6
Niko,BAN,green,Lamb,2,70,8.3
Aaron,BIL,red,Mango,12,120,9.0
Penelope,ORA,white,Apple,4,80,3.3
Dean,NEW,gray,Cheese,32,180,1.8
Christina,MAD,black,Melon,33,172,9.5
Cornelia,CAT,red,Beans,69,150,2.2


In [105]:
mdf.loc['Jane','state']='NY'

In [106]:
mdf

Unnamed: 0,state,color
Jane,NY,blue
Niko,BBB,green
Aaron,CCC,red
Penelope,DDD,white
Dean,EEE,gray
Christina,FFF,black
Cornelia,GGG,red


In [107]:
df

Unnamed: 0,state,color,food,age,height,score
Jane,KKK,blue,Steak,30,165,4.6
Niko,BAN,green,Lamb,2,70,8.3
Aaron,BIL,red,Mango,12,120,9.0
Penelope,ORA,white,Apple,4,80,3.3
Dean,NEW,gray,Cheese,32,180,1.8
Christina,MAD,black,Melon,33,172,9.5
Cornelia,CAT,red,Beans,69,150,2.2


### Construct from a dictionary (and expand the df)

In [108]:
mdf = MyDF(dict(state=df.state.values, color=df.color.values, age=[55,56,57,48,12,13,78]), index=df.index)

In [109]:
mdf

Unnamed: 0,state,color,age
Jane,KKK,blue,55
Niko,BAN,green,56
Aaron,BIL,red,57
Penelope,ORA,white,48
Dean,NEW,gray,12
Christina,MAD,black,13
Cornelia,CAT,red,78


In [110]:
mdf.state

Jane         KKK
Niko         BAN
Aaron        BIL
Penelope     ORA
Dean         NEW
Christina    MAD
Cornelia     CAT
Name: state, dtype: object

In [111]:
mdf.state=['NY','TX','FL','AL','AK','TX','TX']

In [112]:
mdf

Unnamed: 0,state,color,age
Jane,NY,blue,55
Niko,TX,green,56
Aaron,FL,red,57
Penelope,AL,white,48
Dean,AK,gray,12
Christina,TX,black,13
Cornelia,TX,red,78


In [113]:
df

Unnamed: 0,state,color,food,age,height,score
Jane,KKK,blue,Steak,30,165,4.6
Niko,BAN,green,Lamb,2,70,8.3
Aaron,BIL,red,Mango,12,120,9.0
Penelope,ORA,white,Apple,4,80,3.3
Dean,NEW,gray,Cheese,32,180,1.8
Christina,MAD,black,Melon,33,172,9.5
Cornelia,CAT,red,Beans,69,150,2.2


In [131]:
df.index

Index(['Jane', 'Niko', 'Aaron', 'Penelope', 'Dean', 'Christina', 'Cornelia'], dtype='object')

### A DF decorator

In [123]:
class Column:
    """Marker: a class attribute bound to ``Column`` declares an allowed column name."""


class Complain(Exception):
    """Raised when a frame is constructed with columns that were not declared."""


def SpecificDataFrame(cls):
    """Class decorator turning a plain class into a column-restricted DataFrame.

    Every attribute of ``cls`` whose value is the ``Column`` class itself is
    collected as an allowed column name; all other attributes are carried over
    unchanged. The returned class has the same name as ``cls``, inherits from
    ``pd.DataFrame`` followed by the original bases of ``cls``, and raises
    ``Complain`` from ``__init__`` when the constructed frame contains a column
    that was not declared.

    Because ``_constructor`` returns the new class, frames that pandas derives
    from an instance are built through the same ``__init__`` and are therefore
    validated as well.

    Parameters
    ----------
    cls : type
        The declaration class to convert.

    Returns
    -------
    type
        The new ``pd.DataFrame`` subclass.
    """
    # These descriptors are bound to the original class and do not apply to
    # instances of the new one (copying them breaks e.g. ``vars(instance)``).
    not_copied = {"__dict__", "__weakref__"}

    namespace = {}
    allowed_columns = set()
    for name, value in cls.__dict__.items():
        if name in not_copied:
            continue
        if value is Column:
            allowed_columns.add(name)
        else:
            namespace[name] = value

    new_cls = type(cls.__name__, (pd.DataFrame,) + cls.__bases__, namespace)

    def __init__(self, *args, **kwds):
        super(new_cls, self).__init__(*args, **kwds)
        illegal_columns = set(self.columns) - allowed_columns
        if illegal_columns:
            # str() keeps non-string labels from breaking the join; sorting
            # makes the message independent of set iteration order.
            listed = ', '.join(sorted(map(str, illegal_columns)))
            raise Complain(f"{cls.__name__} does not admit columns: {listed}")

    # pandas reads ``_constructor`` as an attribute and then calls the result
    # with the new data, so it must be a property, not a plain method.
    new_cls._constructor = property(lambda self: new_cls)
    new_cls.__init__     = __init__

    return new_cls



### Now one is able to define the columns of the DF

In [124]:
@SpecificDataFrame
class MyDF:
    # Each name bound to ``Column`` declares one column the frame may hold.
    state = color = country = Column

In [125]:
mdf = MyDF(dict(state=df.state.values, color=df.color.values, country=['UK','FIN','SP','GER','USA','FRAN','POL']))

In [126]:
mdf

Unnamed: 0,state,color,country
0,KKK,blue,UK
1,BAN,green,FIN
2,BIL,red,SP
3,ORA,white,GER
4,NEW,gray,USA
5,MAD,black,FRAN
6,CAT,red,POL


In [127]:
mdf.state

0    KKK
1    BAN
2    BIL
3    ORA
4    NEW
5    MAD
6    CAT
Name: state, dtype: object

### Now we get a `Complain` exception if we try to construct with undeclared columns

In [129]:
mdf = MyDF(df)

Complain: MyDF does not admit columns: food, score, age, height

In [130]:
mdf = MyDF(dict(state=df.state.values, color=df.color.values, age=[55,56,57,48,12,13,78]))

Complain: MyDF does not admit columns: age

### Outlook

1. Subclassing allows one to create a new type from a DF.
2. Using the `SpecificDataFrame` decorator, one can declare the columns of the DF.
3. The decorator also protects against undeclared columns.