In [None]:
import numpy as np
import pandas as pd

# `Series`

- `Series` -- 1D labeled array that can hold any data type.
- **Index** -- Axis labels
- Create a `Series` with `s = pd.Series(data, index=index)`, where `data` can be a:
  - Python dict
  - `ndarray`
  - Scalar value

In [None]:
# Create Series from ndarray
# Values come from a seeded generator so they are identical on every Restart & Run All
s = pd.Series(np.random.default_rng(0).standard_normal(5), index=['a', 'b', 'c', 'd', 'e'])
s

In [None]:
# Show the index (axis labels) of the Series
s.index

In [None]:
# Create Series from dict
# Dict keys act as labels; the explicit index fixes the order, and 'd'
# (which is not a key of the dict) shows up as a missing value
d = dict(b=1, a=0, c=2)

pd.Series(d, index=list('abcd'))

In [None]:
# Create Series from scalar value
# The scalar is repeated once for every label in the index
pd.Series(5.0, index=list('abcde'))

## `Series` is ndarray-like

In [None]:
# Access the first element of the Series by integer position
s.iloc[0]

In [None]:
# Slice the first three elements of the Series by integer position
s.iloc[:3]

In [None]:
# Use Boolean indexing to keep only the elements greater than the Series median
s[s > s.median()]

In [None]:
# Use numpy functions on Series (here, the element-wise exponential)
np.exp(s)

In [None]:
# Check the dtype of the values held by the Series
s.dtype

In [None]:
# Get the array of values held by the Series via its `array` attribute
s.array

In [None]:
# Convert the Series values to a NumPy ndarray
s.to_numpy()

## `Series` is dict-like

In [None]:
# Access a value by its index label
s['a']

In [None]:
# Set a value by its index label, then display the modified Series
s['e'] = 12
s

In [None]:
# Check if index label exists ('e' is one of the labels of s)
'e' in s

In [None]:
# Check if index label exists ('f' is not one of the labels of s)
'f' in s

In [None]:
# Indexing with a label that does not exist (the commented-out `s['f']` below) raises an exception.
# The `get` method avoids the exception and returns the given default value (here NaN)
# when the label is not found.
# s['f']
s.get('f', np.nan)

## Vectorized operations and label alignment with `Series`

In [None]:
# Vectorized add: the Series is added to itself element by element
s + s

In [None]:
# Vectorized multiply: every element is multiplied by the scalar 2
s * 2

In [None]:
# Vectorized exponentiation using the NumPy exp function
np.exp(s)

In [None]:
# Perform computations on two Series without worrying about whether they have the same labels
# The operands are aligned on label before adding; a label present in only one
# operand (the first and the last label of s here) yields NaN in the result.
# Positional slices go through .iloc, like the other positional examples in this notebook.
s.iloc[1:] + s.iloc[:-1]

## Name attribute

In [None]:
# Name Series
# Values come from a seeded generator so they are identical on every Restart & Run All
s = pd.Series(np.random.default_rng(1).standard_normal(5), name='something')
s

In [None]:
# Show the name of the Series
s.name

In [None]:
# Rename Series; the result of rename is bound to a new variable, s2
s2 = s.rename('different')
s2

# `DataFrame`

- 2D structure with columns of potentially different types
- Like a spreadsheet or SQL table, or a dict of `Series` objects
- A `DataFrame` can be created from:
  - Dict of...
    - 1D `ndarray`s
    - Lists
    - Dicts
    - `Series`
  - 2D `ndarray`s
  - `Series`
  - `DataFrame`

## Creating a `DataFrame`

### From dict of `Series`

In [None]:
# Creating a dict of Series
# 'one' has labels a-c while 'two' has labels a-d, so the two Series only partly overlap
d = dict(
    one=pd.Series([1.0, 2.0, 3.0], index=list('abc')),
    two=pd.Series([1.0, 2.0, 3.0, 4.0], index=list('abcd')),
)
d

In [None]:
# Creating a DataFrame from a dict of Series
# The dict keys ('one', 'two') supply the column names
df = pd.DataFrame(d)
df

In [None]:
# Creating a DataFrame from a dict of Series
# Specifying index values: only rows 'd', 'b', 'a' are requested, in that order
df = pd.DataFrame(d, index=['d', 'b', 'a'])
df

In [None]:
# Creating a DataFrame from a dict of Series
# Specifying index values and column names
# Note that 'three' is requested as a column although it is not a key of d
df = pd.DataFrame(d, index=['d', 'b', 'a'], columns=['two', 'three'])
df

In [None]:
# Index (row) labels of the DataFrame
df.index

In [None]:
# Column names of the DataFrame
df.columns

### From dict of `ndarray`s/lists

In [None]:
# Create dict of lists
# Each key names a column; both lists hold four values
d = dict(
    one=[1.0, 2.0, 3.0, 4.0],
    two=[4.0, 3.0, 2.0, 1.0],
)

d

In [None]:
# Create DataFrame from dict of lists (no index given)
df = pd.DataFrame(d)
df

In [None]:
# Create DataFrame from dict of lists, specifying index
# One label is supplied for each of the four list positions
df = pd.DataFrame(d, index=['a', 'b', 'c', 'd'])
df

### From structured or record array

In [None]:
# Create structured data array
# Two zero-initialised records with fields A ('i4'), B ('f4') and C ('S10', a
# fixed-width 10-byte string). 'S10' is spelled out because the 'a10' form is a
# deprecated alias for the same bytes dtype.
data = np.zeros((2,), dtype=[('A', 'i4'), ('B', 'f4'), ('C', 'S10')])
data

In [None]:
# Set structured data values: one (A, B, C) tuple per record
data[:] = [(1, 2., 'Hello'), (2, 3., 'World')]
data

In [None]:
# Create DataFrame from structured data array
df = pd.DataFrame(data)
df

In [None]:
# Create DataFrame from structured data array, specifying index
# One row label is given for each of the two records
df = pd.DataFrame(data, index=['first', 'second'])
df

In [None]:
# Create DataFrame from structured data array, specifying columns
# The fields are requested in the order C, A, B
df = pd.DataFrame(data, columns=['C', 'A', 'B'])
df

### From a list of dicts

In [None]:
# Create list of dicts
# The second dict carries an extra key ('c') that the first one lacks
data2 = [
    dict(a=1, b=2),
    dict(a=5, b=10, c=20),
]
data2

In [None]:
# Create DataFrame from list of dicts (each dict supplies one row)
df = pd.DataFrame(data2)
df

In [None]:
# Create DataFrame from list of dicts, specifying columns
# Only columns 'a' and 'b' are requested; key 'c' is left out
pd.DataFrame(data2, columns=['a', 'b'])

### From a dict of tuples

In [None]:
# Create a tuples dictionary
# Outer keys are 2-tuples, one per column; inner keys are 2-tuples, one per row.
# Every outer key must be distinct: a repeated key in a dict literal silently
# replaces the earlier entry, which would drop a whole column.
d = {
    ('a', 'b'): {
        ('A', 'B'): 1,
        ('A', 'C'): 2
    },
    ('a', 'a'): {
        ('A', 'C'): 3,
        ('A', 'B'): 4
    },
    ('a', 'c'): {
        ('A', 'B'): 5,
        ('A', 'C'): 6
    },
    ('b', 'a'): {
        ('A', 'C'): 7,
        ('A', 'B'): 8
    },
    ('b', 'b'): {
        ('A', 'D'): 9,
        ('A', 'B'): 10
    }
}

d

In [None]:
# Create MultiIndex DataFrame from tuples dictionary
df = pd.DataFrame(d)
df

In [None]:
# Show the row index of the DataFrame built from the tuples dictionary
df.index

In [None]:
# Show the column index of the DataFrame built from the tuples dictionary
df.columns

### From a `Series`

In [None]:
# Create a Series
# Values 0, 1, 2 labelled 'a', 'b', 'c'; the Series is named 'series'
s = pd.Series(range(3), index=['a', 'b', 'c'], name='series')
s

In [None]:
# Create a DataFrame from a single (named) Series
df = pd.DataFrame(s)
df

### From a list of `namedtuple`s

In [None]:
# Create a namedtuple
from collections import namedtuple

# Tuple type with two fields, x and y
Point = namedtuple('Point', ['x', 'y'])
Point(0, 0)

In [None]:
# Create DataFrame from list of namedtuples (one Point per row)
df = pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)])
df

In [None]:
# Create DataFrame from mixed list of namedtuples
# Point3D has three fields (x, y, z); the last list element is a two-field Point
Point3D = namedtuple('Point3D', 'x y z')
df = pd.DataFrame([Point3D(0, 0, 0), Point3D(0, 3, 0), Point(2, 3)])
df

### From a list of `dataclasses`


In [None]:
# Make a dataclass
# NOTE: this rebinds the name `Point`, which an earlier cell bound to a namedtuple
from dataclasses import make_dataclass

# Dataclass with two int-annotated fields, x and y
Point = make_dataclass('Point', fields=[('x', int), ('y', int)])
Point(0, 0)

In [None]:
# Make a DataFrame from a list of dataclass instances (one Point per row)
df = pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)])
df

### Alternate constructors

In [None]:
# Make a dict of lists, keyed by label
d = {
    'A': [1, 2, 3],
    'B': [4, 5, 6],
}

d

In [None]:
# Make a DataFrame from the dict with the from_dict constructor (default orientation)
df = pd.DataFrame.from_dict(d)

df

In [None]:
# Make a DataFrame from the dict, using keys as row labels (orient='index')
# and specifying column labels for the three values in each list
df = pd.DataFrame.from_dict(d, orient='index', columns=['one', 'two', 'three'])
df

In [None]:
# Make a structured array with two records
# Fields: A ('i4'), B ('f4') and C ('S10', a fixed-width 10-byte string).
# 'S10' is spelled out because the 'a10' form is a deprecated alias for the same bytes dtype.
data = np.zeros((2,), dtype=[('A', 'i4'), ('B', 'f4'), ('C', 'S10')])

# Set structured array values: one (A, B, C) tuple per record
data[:] = [(1, 2., 'Hello'), (2, 3., 'World')]
data

In [None]:
# Create a DataFrame from the structured array with the from_records constructor
df = pd.DataFrame.from_records(data)
df

## Column selection, addition, deletion

In [None]:
# Create a DataFrame
# Column 'one' has labels a-c and column 'two' has labels a-d,
# so row 'd' has no value in column 'one'
one = pd.Series([1.0, 2.0, 3.0], index=list('abc'))
two = pd.Series([1.0, 2.0, 3.0, 4.0], index=list('abcd'))
df = pd.DataFrame({'one': one, 'two': two})
del one, two  # keep the notebook namespace identical: only df is left behind

df

In [None]:
# Access a column of a DataFrame by its name
df['one']

In [None]:
# Add columns to a DataFrame
# 'three' is the element-wise product of 'one' and 'two';
# 'flag' is True where 'one' is greater than 2
df['three'] = df['one'] * df['two']
df['flag'] = df['one'] > 2

df

In [None]:
# Delete columns from a DataFrame
# `del` removes 'two'; `pop` removes 'three' and its return value is kept in `three`
del df['two']
three = df.pop('three')

df

In [None]:
# Create a column from a scalar value (the string 'bar' is assigned to column 'foo')
df['foo'] = 'bar'
df

In [None]:
# Create a column from a Series with a different index
# Only the first two rows of 'one' are selected (by position, via .iloc, like the
# other positional examples in this notebook); the assignment aligns on the
# DataFrame's index, so the remaining rows of 'one_trunc' are left missing
df['one_trunc'] = df['one'].iloc[:2]
df

In [None]:
# Insert a new column at a specific location
# Column 'bar' takes the values of column 'one' and is placed at position 1
df.insert(1, 'bar', df['one'])
df

## Assigning new columns in method chains

In [None]:
# Create DataFrame from iris dataset
# The CSV is read from the relative path 'data/iris.csv'; head() previews the first rows
iris = pd.read_csv('data/iris.csv')
iris.head()

In [None]:
%%timeit
# Time adding a new column derived from existing columns
# sepal_ratio is computed up front from the columns of `iris` and passed to assign()
iris.assign(sepal_ratio=iris['sepal_width'] / iris['sepal_length']).head()

In [None]:
%%timeit
# Time adding a new column derived from existing columns using a lambda function
# The lambda receives the DataFrame as `x` and returns the new column
iris.assign(sepal_ratio=lambda x: (x['sepal_width'] / x['sepal_length'])).head()

In [None]:
# Create and plot new columns using lambda functions
# Lambda functions in .assign() are computed on the DataFrame calling .assign()
# (here, the result of the sepal_length > 5 query rather than the full iris frame)
(
    iris.query('sepal_length > 5')
    .assign(
        sepal_ratio=lambda x: x.sepal_width / x.sepal_length,
        petal_ratio=lambda x: x.petal_width / x.petal_length
    )
    .plot(kind='scatter', x='sepal_ratio', y='petal_ratio')
)

In [None]:
# Create a column dependent on existing columns, and a column dependent on the new column
dfa = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

# C is built from the existing columns A and B;
# D refers to C, which is created earlier in the same assign() call
dfa.assign(
    C=lambda frame: frame['A'] + frame['B'],
    D=lambda frame: frame['A'] + frame['C'],
)

## Indexing/selection

- Return `Series`:
  - Select column: `df[col]`
  - Select row by label: `df.loc[label]`
  - Select row by integer location: `df.iloc[loc]`

- Return `DataFrame`:
  - Slice rows: `df[5:10]`
  - Select rows by Boolean vector: `df[bool_vec]`

In [None]:
# Show the current DataFrame before selecting rows from it
df

In [None]:
# Select a row by label (the row labelled 'b')
df.loc['b']

In [None]:
# Select a row by integer location (position 2, i.e. the third row)
df.iloc[2]

## Data alignment and arithmetic

- `DataFrame` objects automatically align on both the columns and the index (row labels)
- Resulting objects have the union of the column and row labels of the input objects

In [None]:
# Create 10 x 4 DataFrame
# Values come from a seeded generator so they are identical on every Restart & Run All
df = pd.DataFrame(np.random.default_rng(2).standard_normal((10, 4)), columns=['A', 'B', 'C', 'D'])
df

In [None]:
# Create 7 x 3 DataFrame
# Values come from a seeded generator so they are identical on every Restart & Run All
df2 = pd.DataFrame(np.random.default_rng(3).standard_normal((7, 3)), columns=['A', 'B', 'C'])
df2

In [None]:
# Add two differently-sized DataFrames
# The operands are aligned on both row and column labels before adding
df + df2


- Default behavior of `DataFrame`-`Series` operations:
  - `Series` index is aligned to `DataFrame` columns
  - The `Series` name plays no part in the alignment; the operation is broadcast row-wise, i.e. applied to every row of the `DataFrame`

In [None]:
# Show the index of the Series obtained by selecting the first row of the DataFrame
df.iloc[0].index

In [None]:
# Show the name of the Series obtained by selecting the first row of the DataFrame
df.iloc[0].name

In [None]:
# Show DataFrame columns, for comparison with the row Series index
df.columns

In [None]:
# Show DataFrame index (row labels)
df.index

In [None]:
# Perform operation between DataFrame and Series:
# subtract the first row of the DataFrame from the DataFrame
df - df.iloc[0]

- Arithmetic scalar operations are broadcast element-wise

In [None]:
# Multiply the DataFrame by 5, then add 2 (scalars applied element-wise)
df * 5 + 2

In [None]:
# Divide the scalar 1 by the DataFrame (element-wise reciprocal)
1 / df

In [None]:
# Raise every element of the DataFrame to the 4th power
df ** 4

- Boolean operators are broadcast element-wise

In [None]:
# Create DataFrames with binary values
# dtype=bool turns the 0/1 entries into False/True.
# NOTE: df2 rebinds the name an earlier cell used for the 7 x 3 DataFrame.
df1 = pd.DataFrame(dict(a=[1, 0, 1], b=[0, 1, 1]), dtype=bool)
df2 = pd.DataFrame(dict(a=[0, 1, 1], b=[1, 1, 0]), dtype=bool)

In [None]:
# Take the element-wise and (&) of the two Boolean DataFrames
df1 & df2

In [None]:
# Take the element-wise or (|) of the two Boolean DataFrames
df1 | df2

In [None]:
# Take the element-wise exclusive or (^) of the two Boolean DataFrames
df1 ^ df2

In [None]:
# Take the element-wise not (~) of a Boolean DataFrame
~df1

## Transposing

In [None]:
# Show original DataFrame (before transposing)
df

In [None]:
# Show transpose of DataFrame via the T attribute
df.T

## `DataFrame` interoperability with `NumPy` functions

In [None]:
# Call numpy function (element-wise exponential) on DataFrame, return DataFrame
np.exp(df)

In [None]:
# Call numpy function on DataFrame, return ndarray (labels are not part of the result)
np.asarray(df)

- `Series` objects can be used in `NumPy` universal functions (`ufunc`s)
- The `Series` objects will be aligned on their indices before the `ufunc` is performed

In [None]:
# Create the first of two Series that share the same set of index labels
s1 = pd.Series([1, 2, 3], index=list('abc'))
s1

In [None]:
# Create the second Series: same labels as s1, but listed in a different order (b, a, c)
s2 = pd.Series([1, 3, 5], index=list('bac'))
s2

In [None]:
# Perform universal function on two Series with the same index labels
# (s1 and s2 list those labels in different orders)
np.remainder(s1, s2)

In [None]:
# Create a Series whose labels (b, c, d) only partly overlap those of s1
s3 = pd.Series([2, 4, 6], index=list('bcd'))
s3


In [None]:
# Perform universal function on Series with different index values
# (label 'a' appears only in s1 and label 'd' only in s3)
np.remainder(s1, s3)

In [None]:
# Perform binary universal function on Series and Index objects
idx = pd.Index([4, 5, 6])
s = pd.Series([1, 2, 3])

# Element-wise maximum of the Series values and the Index values
np.maximum(s, idx)