In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Display the result of every expression statement in a cell, not only the
# last one; the cells below rely on this to show several values per cell.
InteractiveShell.ast_node_interactivity = "all"

import numpy as np
import pandas as pd


# Introduction to Python *-* pandas

---

<br>
EDAA

Albert Ruiz

## Agenda

* Introduction to pd.Series
* Introduction to pd.DataFrame
* Essential functionality
* Loading and storage

<h1 class="center_text">Introduction to pd.Series</h1>

## What is a pd.Series?

A series is a one-dimensional array-like object containing a sequence of *values* and an associated array of *labels* used as index.

In [None]:
# Built from a plain list, a Series gets a default integer index (0, 1, 2)
obj = pd.Series(data=[10, 20, 30])
obj

# Passing `index` attaches a custom label to each value
obj = pd.Series(data=[10, 20, 30], index=list("abc"))
obj

# Neither side is restricted: labels may be numbers and values may be strings
obj = pd.Series(data=list("abc"), index=[10, 20, 30])
obj

## Basic selection

Compared with NumPy arrays, you can use labels when selecting values.

In [None]:
# Series with the default integer index (0, 1, 2)
obj = pd.Series([10, 20, 30])

# A single key returns the scalar stored under that label
f"Accessing a single value: {obj[2]}"

# A slice returns a new Series with the selected values.
# Plain string here: an f-string without placeholders is needless.
"Accessing a set of values:"
obj[1:3]

# Series with custom string labels
obj = pd.Series([10, 20, 30], index=['a', 'b', 'c'])

# A single label returns the scalar stored under it
f"Accessing a single value by label: {obj['c']}"

# A list of labels returns a new Series, in the order the labels are given
"Accessing a set of values by labels:"
obj[['b', 'c']]


## pd.Series attributes

All series have the following attributes:

* `dtype` - Return the dtype object of the underlying data.
* `hasnans` - Return if I have any nans; enables various perf speedups.
* `iat` - Access a single value for a row/column pair by integer position.
* `index` - The index (labels) of the Series.
* `is_monotonic_increasing` - Return True if values in the object are monotonically increasing (the older `is_monotonic` alias was removed in pandas 2.0).
* `is_monotonic_decreasing` - Return True if values in the object are monotonic_decreasing.
* `is_unique` - Return True if values in the object are unique.
* `loc` - Access a group of rows and columns by label(s) or a boolean array.
* `ndim` - Number of dimensions of the underlying data.
* `shape` - Return a tuple of the shape of the underlying data.
* `size` - Return the number of elements in the underlying data.
* `values` - Return Series as ndarray or ndarray-like depending on the dtype.

You can find the full list of attributes in this [link](https://pandas.pydata.org/docs/reference/api/pandas.Series.html).

## Using pd.Series attributes

In [None]:
obj = pd.Series([10, 20, 30], index=['a', 'b', 'c'])

# Underlying data and its labels
f"Values: {obj.values}"
f"Type: {type(obj.values)}"
f"Indexes: {obj.index}"

# Scalar access: `iat` takes an integer position, `loc` takes a label
f"Accessing by integer position: {obj.iat[2]}"
f"Accessing by label: {obj.loc['c']}"

# `Series.is_monotonic` was removed in pandas 2.0; `is_monotonic_increasing`
# is the equivalent attribute and also exists in older versions.
f"Is monotonic: {obj.is_monotonic_increasing}"

# True when no value is repeated
f"Is unique: {obj.is_unique}"

# Dimension, shape and size
f"Number of dimensions: {obj.ndim}"
f"Shape: {obj.shape}"
f"Size: {obj.size}"

## pd.Series methods

All series have the following methods:

* `abs()` - Return a Series with absolute numeric value of each element.
* `add()` - Add value to series, element-wise.
* `mul()` and `div()` - Multiply/Divide by value or series, element wise.
* `pow()` - Return exponential power of series and value or series.
* `all()` - Return whether all elements are True.
* `any()` - Return whether any elements are True.
* `append()` - Concatenate series (removed in pandas 2.0; use `pd.concat()` instead).
* `argmax()` and `argmin()` - Return the int position of the largest/smallest value.
* `max()` and `min()` - Return the maximum/minimum value.
* `sum()` - Return the sum of the values.
* `mean()` and `median()` - Return the mean and the median of the values.

You can find the full list of methods in this [link](https://pandas.pydata.org/docs/reference/api/pandas.Series.html).

## Using pd.Series methods

In [None]:
obj = pd.Series(data=[1, 5, 2], index=list("abc"))

# A scalar argument is applied to every element
"Multiply by value:"
obj.mul(other=2)

# A Series argument is multiplied element by element, matched on labels
"Multiply by series:"
obj.mul(other=obj)

# Aggregations reduce the Series to a single number
f"Sum: {obj.sum()}"
f"Mean: {obj.mean()}"
f"Median: {obj.median()}"

# Finding the maximum value and the integer position where it occurs
f"Max: {obj.max()}"
f"Max value is at: {obj.argmax()}"

## Using NumPy functions

NumPy functions and NumPy-like operations (filtering, scalar multiplication, applying maths functions...) can be used with pd.Series. Index-value links are preserved.

In [None]:
obj = pd.Series(data=[1, 5, 2], index=list("abc"))

# NumPy-style operators work directly on a Series and keep its labels
"Boolean filter:"
obj[obj >= 2]

"Element wise multiplication:"
obj * 2

# NumPy functions accept a Series as input
"Power:"
np.power(obj, 3)

"Flip:"
np.flip(obj)

<h1 class="center_text">Introduction to pd.DataFrame</h1>

## What is a pd.DataFrame?

A DataFrame represents a rectangular table of data.

It contains an ordered collection of columns, each of which can be a different value type (numeric, string, boolean, etc.).

In [None]:
# One key per column; all the lists have the same length (one entry per row)
data = dict(
    name=["Max", "Sarah", "John"],
    surname=["Rockatansky", "Connor", "McClane"],
    sex=["M", "F", "M"],
    age=[35, 25, 40],
)

df = pd.DataFrame(data=data)
df

## Constructors

In [None]:
# A dict maps each column name to the list of values in that column
"From dict:"
df = pd.DataFrame(
    data={
        "name": ["Max", "Sarah", "John"],
        "surname": ["Rockatansky", "Connor", "McClane"],
        "sex": ["M", "F", "M"],
        "age": [35, 25, 40],
    }
)
df

# Each inner list is one row; with no `columns` given, columns are numbered
"From list of lists (or list of tuples):"
df = pd.DataFrame(
    data=[
        ["Max", "Rockatansky", "M", 35],
        ["Sarah", "Connor", "F", 25],
        ["John", "McClane", "M", 40],
    ]
)
df

## Custom indexes

By default, rows are numbered. You can also define a label for each row:

In [None]:
# `index` supplies one label per row, replacing the default 0..n-1 numbering
df = pd.DataFrame(
    data={
        "name": ["Max", "Sarah", "John"],
        "surname": ["Rockatansky", "Connor", "McClane"],
        "sex": ["M", "F", "M"],
        "age": [35, 25, 40],
    },
    index=list("abc"),
)
df

<h1 class="center_text">Essential functionality</h1>

### *Note*

This section only covers functions used with pd.DataFrames. However, most of them can also be used with pd.Series.

## Reindexing

Reindexing means creating a new object with the data conformed to a new index.

In [None]:
"Initial dataframe:"
df_1 = pd.DataFrame(
    data={
        "name": ["Max", "Sarah", "John"],
        "surname": ["Rockatansky", "Connor", "McClane"],
        "sex": ["M", "F", "M"],
        "age": [35, 25, 40],
    },
    index=list("abc"),
)
df_1

# Existing labels in a new order: rows are rearranged to match
"Reindexing:"
df_2 = df_1.reindex(index=["a", "c", "b"])
df_2

# A label missing from the original ("d") yields a row of missing values
"Reindexing and adding a new index:"
df_3 = df_1.reindex(index=["a", "c", "d", "b"])
df_3

## Resetting the index

In [None]:
"Initial dataframe:"
df_1 = pd.DataFrame(
    {"name": ["Max", "Sarah", "John"],
     "surname": ["Rockatansky", "Connor", "McClane"],
     "sex": ["M", "F", "M"],
     "age": [35, 25, 40]},
    index=["a", "b", "c"]
)
df_1

# drop=False (the default) keeps the old labels as a new "index" column
"Resetting without dropping:"
df_2 = df_1.reset_index(drop=False)
df_2

# drop=True discards the old labels entirely
"Resetting with dropping:"
df_3 = df_1.reset_index(drop=True)
df_3

## Dropping rows and columns

In [None]:
"Initial dataframe:"
df_1 = pd.DataFrame(
    data={
        "name": ["Max", "Sarah", "John"],
        "surname": ["Rockatansky", "Connor", "McClane"],
        "sex": ["M", "F", "M"],
        "age": [35, 25, 40],
    },
    index=list("abc"),
)
df_1

# `index=` names the row labels to remove; df_1 itself is left untouched
"Dropping rows:"
df_2 = df_1.drop(index=["c", "b"])
df_2

# `columns=` is the keyword spelling of drop(labels, axis='columns')
"Dropping columns:"
df_3 = df_1.drop(columns=["sex", "surname"])
df_3

## Indexing

Indexing into a DataFrame is for retrieving one or more columns.

In [None]:
"Initial dataframe:"
df = pd.DataFrame(
    data=np.arange(12).reshape((3, 4)),
    index=["france", "italy", "slovakia"],
    columns=["one", "two", "three", "four"],
)
df

# A single column name returns that column as a Series
"One column:"
df["three"]

# A list of names returns a DataFrame with the columns in the requested order
"Multiple columns:"
df[["three", "one"]]

## Filtering: conditional indexing

In [None]:
"Initial dataframe:"
df = pd.DataFrame(
    data=np.arange(12).reshape((3, 4)),
    index=["france", "italy", "slovakia"],
    columns=["one", "two", "three", "four"],
)
df

# A boolean Series used as the key keeps only the rows where it is True
"One condition:"
df[df["two"] >= 5]

# Combine masks with | and &; each comparison needs its own parentheses
"Multiple conditions (or):"
df[(df["two"] >= 5) | (df["three"] >= 8)]

"Multiple conditions (and):"
df[(df["two"] >= 5) & (df["three"] >= 8)]

## Selection with loc

`loc` is a special indexing operator to select a subset of rows and columns by label.

In [None]:
"Initial dataframe:"
df = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    columns=["one", "two", "three", "four"],
    index=["france", "italy", "slovakia"]
)
df

# A single row label returns that row as a Series
"Selecting by row:"
df.loc["italy"]

# A (row label, column label) pair returns a scalar
f"Selecting by pair row-col: {df.loc['italy', 'three']}"

# Lists of labels on both axes return a sub-DataFrame
"Selecting multiple rows and cols:"
df.loc[["france", "italy"], ["one", "four"]]

## Selecting with iloc

`iloc` is a special indexing operator to select a subset of rows and columns by position.

In [None]:
"Initial dataframe:"
df = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    columns=["one", "two", "three", "four"],
    index=["france", "italy", "slovakia"]
)
df

# A single integer position returns that row as a Series
"Selecting by row:"
df.iloc[1]

# A (row position, column position) pair returns a scalar
f"Selecting by pair row-col: {df.iloc[1, 2]}"

# Lists of positions on both axes return a sub-DataFrame
"Selecting multiple rows and cols:"
df.iloc[[0, 1], [0, 3]]

## Selecting with loc */* iloc with slicing

Both `loc` and `iloc` work with slices

In [None]:
"Initial dataframe:"
df = pd.DataFrame(
    data=np.arange(12).reshape((3, 4)),
    index=["france", "italy", "slovakia"],
    columns=["one", "two", "three", "four"],
)
df

# Label slices include both endpoints
"loc with slicing:"
df.loc[:"italy", "two":"four"]

# Position slices exclude the stop position, like ordinary Python slices
"iloc with slicing:"
df.iloc[:2, 1:4]

## Select single scalar with at

`at` is a special indexing operator to select a single scalar by row and column label.

In [None]:
"Initial dataframe:"
df = pd.DataFrame(
    data=np.arange(12).reshape((3, 4)),
    index=["france", "italy", "slovakia"],
    columns=["one", "two", "three", "four"],
)
df

# `at` takes exactly one row label and one column label
f"Selecting single scalar: {df.at['slovakia', 'two']}"

## Select single scalar with iat

`iat` is a special indexing operator to select a single scalar by row and column position.

In [None]:
"Initial dataframe:"
df = pd.DataFrame(
    data=np.arange(12).reshape((3, 4)),
    index=["france", "italy", "slovakia"],
    columns=["one", "two", "three", "four"],
)
df

# `iat` takes exactly one row position and one column position
f"Selecting single scalar: {df.iat[2, 1]}"

## Arithmetic methods with fill values (1/4)

In [None]:
"Initial dataframes:"
# 3x3 frame with columns a-c
df_1 = pd.DataFrame(
    data=np.arange(9).reshape((3, 3)),
    columns=list("abc"),
)
df_1

# 4x4 frame with columns a-d: one extra row and one extra column
df_2 = pd.DataFrame(
    data=np.arange(16).reshape((4, 4)),
    columns=list("abcd"),
)
df_2

# Operands are aligned on labels; cells present in only one frame become NaN
"Adding with operator:"
df_1 + df_2

"Adding with add() method:"
df_1.add(other=df_2)

## Arithmetic methods with fill values (2/4)

Other options:

* `-` operator and `sub` method for subtraction
* `*` operator and `mul` method for multiplication
* `/` operator and `div` method for division
* `//` operator and `floordiv` method for floor division
* `**` operator and `pow` method for exponentiation

## Arithmetic methods with fill values (3/4)

Some examples:

In [None]:
# Reuses df_1 and df_2 created in the cell that adds the two frames
"Multiplying with operator:"
df_1 * df_2

# Method form of the same element-wise multiplication
"Multiplying with mul() method:"
df_1.mul(other=df_2)

## Arithmetic methods with fill values (4/4)

Arithmetic methods have the `fill_value` parameter:

In [None]:
# Reuses df_1 and df_2 created in the cell that adds the two frames.
# fill_value stands in for a cell missing from one of the two operands.
"Multiplying with mul() method:"
df_1.mul(other=df_2, fill_value=0)

"Exponentiation with pow() method:"
df_1.pow(other=df_2, fill_value=0)

## Function application

It is possible to apply a function on one-dimensional arrays to each column or row:

In [None]:
# Small helper used to demonstrate DataFrame.apply
def f(x):
    """Return the spread (maximum minus minimum) of the values in `x`."""
    highest = x.max()
    lowest = x.min()
    return highest - lowest

"Initial dataframe:"
df = pd.DataFrame(
    data=np.arange(12).reshape((3, 4)),
    index=["france", "italy", "slovakia"],
    columns=["one", "two", "three", "four"],
)
df

# With the default axis, `f` receives one column at a time
"Apply to each column:"
df.apply(f)

# axis=1 is the numeric spelling of axis='columns': `f` receives one row at a time
"Apply to each row:"
df.apply(f, axis=1)

## Lambda functions

Functions to apply can be defined on the fly as lambda functions:

In [None]:
"Initial dataframe:"
df = pd.DataFrame(
    data=np.arange(12).reshape((3, 4)),
    index=["france", "italy", "slovakia"],
    columns=["one", "two", "three", "four"],
)
df

# The lambda receives one column at a time (default axis)
"Apply to each column:"
df.apply(lambda col: col.max() - col.min())

# axis=1 is the numeric spelling of axis='columns': one row at a time
"Apply to each row:"
df.apply(lambda row: row.max() - row.min(), axis=1)

## Sorting by index

In [None]:
"Initial dataframe:"
df = pd.DataFrame(
    data=np.arange(12).reshape((3, 4)),
    index=["france", "italy", "slovakia"],
    columns=["one", "two", "three", "four"],
)
df

# Row labels in descending order
"After sorting by row index:"
df = df.sort_index(ascending=False)
df

# Column labels in ascending order, applied on top of the row sort above;
# axis=1 is the numeric spelling of axis='columns'
"After sorting by column index:"
df = df.sort_index(axis=1)
df

## Sorting by value

In [None]:
"Initial dataframe:"
df = pd.DataFrame(
    {"name": ["Max", "Sarah", "John"],
     "surname": ["Rockatansky", "Connor", "McClane"],
     "sex": ["M", "F", "M"],
     "age": [35, 25, 40]}
)
df

# Rows are ordered by the values of a single column
"Sorting by one column:"
df = df.sort_values(by="sex")
df

# Ties in the first column are broken by the next column in the list
"Sorting by multiple columns:"
df = df.sort_values(by=["sex", "name"])
df

## Ranking

Ranking assigns ranks from one through the number of valid data points in an array.

In [None]:
"Initial dataframe:"
df = pd.DataFrame(
    {"name": ["Max", "Sarah", "John"],
     "surname": ["Rockatansky", "Connor", "McClane"],
     "sex": ["M", "F", "M"],
     "age": [35, 25, 40]}
)
df

# Each value is replaced by its rank within its own column
"Ranking:"
df.rank()

In [None]:
## Loading and storage