In [None]:
%load_ext nb_black

# Data types

## Setup

In [None]:
import pandas as pd

## Creation

Creation of an example DataFrame (starting from a dictionary of dictionaries):

In [None]:
# Example data as a mapping of column name -> {country -> value}.
# Some entries are missing (an absent key, or an explicit None), so the
# DataFrame built from this mapping contains missing values.
data = dict(
    Capital=dict(
        Spain="Madrid",
        Belgium="Brussels",
        France="Paris",
        Italy="Roma",
        Germany="Berlin",
        Portugal="Lisbon",
        Norway="Oslo",
        Greece="Athens",
    ),
    Population=dict(
        Spain=46_733_038,
        Belgium=11_449_656,
        France=67_076_000,
        Italy=60_390_560,
        Germany=83_122_889,
        Portugal=10_295_909,
        Norway=5_391_369,
        Greece=10_718_565,
    ),
    # Only the monarchies have an entry here
    Monarch=dict(
        Spain="Felipe VI",
        Belgium="Philippe",
        Norway="Harald V",
    ),
    Area=dict(
        Spain=505_990,
        Belgium=30_688,
        France=640_679,
        Italy=301_340,
        Germany=None,
        Portugal=92_212,
        Norway=385_207,
        Greece=131_957,
    ),
    Currency=dict(
        Spain="EUR",
        Belgium="EUR",
        France="EUR",
        Italy="EUR",
        Germany="EUR",
        Portugal=None,
        Norway="NOK",
        Greece="EUR",
    ),
    # ISO-formatted date strings; unknown dates are None
    Formation=dict(
        Spain="1715-06-09",
        Belgium="1830-10-04",
        France="1792-09-22",
        Italy=None,
        Germany=None,
        Portugal=None,
        Norway=None,
        Greece=None,
    ),
)

In [None]:
# Build the example DataFrame from the nested dictionary; the details of this
# step are not the focus for now.
df = pd.DataFrame(data=data)

Apple stock data, taken from the [`matplotlib` sample datasets](https://github.com/matplotlib/sample_data/blob/master/aapl.csv):

In [None]:
# Read the Apple stock prices from the local CSV file.
# The remaining preparation steps are left disabled for now:
apple = pd.read_csv(filepath_or_buffer="AAPL.csv")
# apple["Date"] = apple["Date"].astype("datetime64[ns]")
# apple = apple.set_index("Date")
# apple = apple.sort_index()

## Demo 1: Check the memory usage

In [None]:
df

Check the memory usage of a DataFrame:

In [None]:
# deep=True also measures the contents of object (e.g. string) columns;
# dividing the total number of bytes by 1024 expresses it in kB.
memory_kb = df.memory_usage(index=True, deep=True).sum() / 1024
print(f"Memory used: {memory_kb:.1f} kB")

In [None]:
# Same measurement as a byte total, scaled by 1024 * 1024 to express it in MB.
memory_mb = df.memory_usage(index=True, deep=True).sum() / (1024 * 1024)
print(f"Memory used: {memory_mb:.1f} MB")

<div class="alert alert-info">

<b>Note:</b> The memory usage should be <b>at most 10% of the available RAM</b> for <code>pandas</code> to remain fast.

</div>

## Exercise 1

In [None]:
apple.head()

Check the memory usage of the `apple` DataFrame:

## Demo 2: Change the data type of a column

In [None]:
df

Add a new "Density" column:

In [None]:
# Row-wise ratio: population divided by area
df["Density"] = df["Population"].div(df["Area"])

Check the data types:

In [None]:
df.dtypes

<div class="alert alert-info">

<b>Note:</b> If there are missing values in a column of integers, <code>pandas</code> defaults to using floats (which support <code>NaN</code>, used for missing values).

</div>

<div class="alert alert-success">

<b>Best Practice:</b> Always use integers over floats whenever possible.

</div>

Change the data type of a column:

In [None]:
df["Area"]

In [None]:
df["Area"].astype("Int64")

In [None]:
# Convert "Area" to the nullable integer dtype ("Int64", capital I),
# the integer type to use when a column has missing values.
df = df.astype({"Area": "Int64"})

In [None]:
df

In [None]:
df.dtypes

In [None]:
# Store the capitals with the dedicated "string" dtype
df = df.astype({"Capital": "string"})

In [None]:
# Store the monarchs with the dedicated "string" dtype
df = df.astype({"Monarch": "string"})

In [None]:
df

In [None]:
df.dtypes

In [None]:
# The currency codes repeat across rows, so store them as a "category"
df = df.astype({"Currency": "category"})

In [None]:
df

In [None]:
df.dtypes

In [None]:
# Turn the ISO date strings into real timestamps
df = df.astype({"Formation": "datetime64[ns]"})

In [None]:
df

In [None]:
df.dtypes

<div class="alert alert-success">

<b>Best Practice:</b> Always use the following data types:
    <ul>
        <li>For strings: 
            <ul>
                <li><b>category</b> (<code>.astype("category")</code>) if possible;</li>
                <li><b>string</b> (<code>.astype("string")</code>) otherwise;</li>
            </ul>
        <li>For integers:
            <ul>
                <li><b>int64</b> (<code>.astype("int64")</code>) if there are no missing values;</li>
                <li><b>Int64</b> (<code>.astype("Int64")</code>) otherwise;</li>
            </ul>
        <li>For floats: <b>float64</b> (<code>.astype("float")</code>)</li>
        <li>For dates: <b>datetime64[ns]</b> (<code>.astype("datetime64[ns]")</code>)</li>
    </ul>
</div>

<div class="alert alert-info">

<b>Note:</b> Missing values are represented with either <code>NaN</code>, <code>NaT</code>, or <code>&#60;NA&#62;</code>, depending on the data type of the column.

</div>

## Exercise 2

In [None]:
apple.head()

Check the data types of the `apple` DataFrame:

Correct the data type of the "Date" column:

Check the data types of the `apple` DataFrame again:

## Demo 3: Check the memory saved

Save the original memory usage:

In [None]:
# Keep the figures measured earlier so they can be compared with new ones
memory_kb_original, memory_mb_original = memory_kb, memory_mb

Check the memory usage of a DataFrame:

In [None]:
# deep=True also measures the contents of object (e.g. string) columns;
# dividing the total number of bytes by 1024 expresses it in kB.
memory_kb = df.memory_usage(index=True, deep=True).sum() / 1024
print(f"Memory used: {memory_kb:.1f} kB")

In [None]:
# Same measurement as a byte total, scaled by 1024 * 1024 to express it in MB.
memory_mb = df.memory_usage(index=True, deep=True).sum() / (1024 * 1024)
print(f"Memory used: {memory_mb:.1f} MB")

Calculate the memory saved:

In [None]:
# Saved measurement divided by the current one: a value above 1 means less memory is used now
print(f"The memory was reduced by a factor of {memory_kb_original / memory_kb:.1f}")

In [None]:
# Same ratio computed from the MB figures; the unit cancels out in the division
print(f"The memory was reduced by a factor of {memory_mb_original / memory_mb:.1f}")

<div class="alert alert-info">

<b>Note:</b> The memory usage should be <b>at most 10% of the available RAM</b> for <code>pandas</code> to remain fast.

</div>

## Exercise 3

In [None]:
apple.head()

Save the original memory usage:

Check the memory usage of the `apple` DataFrame:

Calculate the memory saved:

## Demo 4: `.str` accessor

In [None]:
# A plain Python string, used to show a built-in str method
name = "JC"

In [None]:
name.lower()

In [None]:
df

Check the data types:

In [None]:
df.dtypes

Modify a string column thanks to the `.str` accessor:

In [None]:
df["Capital"]

In [None]:
df["Capital"].str

In [None]:
df["Capital"].str.upper()

## Exercise 4

In [None]:
apple.head()

Check the data types:

Identify a column stored as `datetime`:

While `string` columns have a `.str` accessor, `datetime` columns have a `.dt` accessor:

Find a method of the `.dt` accessor that returns the date in the ISO format: