In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

# Data types

## Setup

In [2]:
import pandas as pd

<IPython.core.display.Javascript object>

## Creation

Creation of an example DataFrame (starting from a dictionary of dictionaries):

In [59]:
data = {
    "Capital": {
        "Spain": "Madrid",
        "Belgium": "Brussels",
        "France": "Paris",
        "Italy": "Roma",
        "Germany": "Berlin",
        "Portugal": "Lisbon",
        "Norway": "Oslo",
        "Greece": "Athens",
    },
    "Population": {
        "Spain": 46733038,
        "Belgium": 11449656,
        "France": 67076000,
        "Italy": 60390560,
        "Germany": 83122889,
        "Portugal": 10295909,
        "Norway": 5391369,
        "Greece": 10718565,
    },
    "Monarch": {
        "Spain": "Felipe VI",
        "Belgium": "Philippe",
        "Norway": "Harald V",
    },
    "Area": {
        "Spain": 505990,
        "Belgium": 30688,
        "France": 640679,
        "Italy": 301340,
        "Germany": None,
        "Portugal": 92212,
        "Norway": 385207,
        "Greece": 131957,
    },
    "Currency": {
        "Spain": "EUR",
        "Belgium": "EUR",
        "France": "EUR",
        "Italy": "EUR",
        "Germany": "EUR",
        "Portugal": None,
        "Norway": "NOK",
        "Greece": "EUR",
    },
    "Formation": {
        "Spain": "1715-06-09",
        "Belgium": "1830-10-04",
        "France": "1792-09-22",
        "Italy": None,
        "Germany": None,
        "Portugal": None,
        "Norway": None,
        "Greece": None,
    },
}

<IPython.core.display.Javascript object>

In [60]:
# For now, let's forget about these steps:
df = pd.DataFrame(data)

<IPython.core.display.Javascript object>

Apple stock data, taken from the [`matplotlib` sample datasets](https://github.com/matplotlib/sample_data/blob/master/aapl.csv)

In [39]:
# For now, let's forget about these steps:
apple = pd.read_csv("AAPL.csv")
# apple["Date"] = apple["Date"].astype("datetime64[ns]")
# apple = apple.set_index("Date")
# apple = apple.sort_index()

<IPython.core.display.Javascript object>

## Demo 1: Check the memory usage

In [6]:
df

Unnamed: 0,Capital,Population,Monarch,Area,Currency,Formation
Spain,Madrid,46733038,Felipe VI,505990.0,EUR,1715-06-09
Belgium,Brussels,11449656,Philippe,30688.0,EUR,1830-10-04
France,Paris,67076000,,640679.0,EUR,1792-09-22
Italy,Roma,60390560,,301340.0,EUR,
Germany,Berlin,83122889,,,EUR,
Portugal,Lisbon,10295909,,92212.0,,
Norway,Oslo,5391369,Harald V,385207.0,NOK,
Greece,Athens,10718565,,131957.0,EUR,


<IPython.core.display.Javascript object>

Check the memory usage of a DataFrame:

In [7]:
memory_kb = df.memory_usage(deep=True).sum() / (1024 ** 1)
print(f"Memory used: {memory_kb:.1f} kB")

Memory used: 2.2 kB


<IPython.core.display.Javascript object>

In [8]:
memory_mb = df.memory_usage(deep=True).sum() / (1024 ** 2)
print(f"Memory used: {memory_mb:.1f} MB")

Memory used: 0.0 MB


<IPython.core.display.Javascript object>

<div class="alert alert-info">

<b>Note:</b> The memory usage should be <b>at most 10% of the available RAM</b> for <code>pandas</code> to remain fast.

</div>

## Exercise 1

In [9]:
apple.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
0,2008-10-14,116.26,116.4,103.14,104.08,70749800,104.08
1,2008-10-13,104.55,110.53,101.02,110.26,54967000,110.26
2,2008-10-10,85.7,100.0,85.0,96.8,79260700,96.8
3,2008-10-09,93.35,95.8,86.6,88.74,57763700,88.74
4,2008-10-08,85.91,96.33,85.68,89.79,78847900,89.79


<IPython.core.display.Javascript object>

Check the memory usage of a DataFrame:

In [40]:
memory_kb = apple.memory_usage(deep=True).sum() / (1024 ** 1)
print(f"Memory used: {memory_kb:.1f} kB")
memory_mb = apple.memory_usage(deep=True).sum() / (1024 ** 2)
print(f"Memory used: {memory_mb:.1f} MB")

Memory used: 683.0 kB
Memory used: 0.7 MB


<IPython.core.display.Javascript object>

## Demo 2: Change the data type of a column

In [12]:
df

Unnamed: 0,Capital,Population,Monarch,Area,Currency,Formation
Spain,Madrid,46733038,Felipe VI,505990.0,EUR,1715-06-09
Belgium,Brussels,11449656,Philippe,30688.0,EUR,1830-10-04
France,Paris,67076000,,640679.0,EUR,1792-09-22
Italy,Roma,60390560,,301340.0,EUR,
Germany,Berlin,83122889,,,EUR,
Portugal,Lisbon,10295909,,92212.0,,
Norway,Oslo,5391369,Harald V,385207.0,NOK,
Greece,Athens,10718565,,131957.0,EUR,


<IPython.core.display.Javascript object>

Add a new "Density" column:

In [13]:
df["Density"] = df["Population"] / df["Area"]

<IPython.core.display.Javascript object>

Check the data types:

In [14]:
df.dtypes

Capital        object
Population      int64
Monarch        object
Area          float64
Currency       object
Formation      object
Density       float64
dtype: object

<IPython.core.display.Javascript object>

<div class="alert alert-info">

<b>Note:</b> If there are missing values in a column of integers, <code>pandas</code> defaults to using floats (which support <code>NaN</code>, used for missing values).

</div>

<div class="alert alert-success">

<b>Best Practice:</b> Always use integers over floats whenever possible.

</div>

Change the data type of a column:

In [15]:
df["Area"]

Spain       505990.0
Belgium      30688.0
France      640679.0
Italy       301340.0
Germany          NaN
Portugal     92212.0
Norway      385207.0
Greece      131957.0
Name: Area, dtype: float64

<IPython.core.display.Javascript object>

In [16]:
df["Area"].astype("Int64")

Spain       505990
Belgium      30688
France      640679
Italy       301340
Germany       <NA>
Portugal     92212
Norway      385207
Greece      131957
Name: Area, dtype: Int64

<IPython.core.display.Javascript object>

In [17]:
df["Area"] = df["Area"].astype("Int64")

<IPython.core.display.Javascript object>

In [18]:
df

Unnamed: 0,Capital,Population,Monarch,Area,Currency,Formation,Density
Spain,Madrid,46733038,Felipe VI,505990.0,EUR,1715-06-09,92.359608
Belgium,Brussels,11449656,Philippe,30688.0,EUR,1830-10-04,373.098801
France,Paris,67076000,,640679.0,EUR,1792-09-22,104.695175
Italy,Roma,60390560,,301340.0,EUR,,200.406717
Germany,Berlin,83122889,,,EUR,,
Portugal,Lisbon,10295909,,92212.0,,,111.654763
Norway,Oslo,5391369,Harald V,385207.0,NOK,,13.996031
Greece,Athens,10718565,,131957.0,EUR,,81.227711


<IPython.core.display.Javascript object>

In [19]:
df.dtypes

Capital        object
Population      int64
Monarch        object
Area            Int64
Currency       object
Formation      object
Density       float64
dtype: object

<IPython.core.display.Javascript object>

In [20]:
df["Capital"] = df["Capital"].astype("string")

<IPython.core.display.Javascript object>

In [21]:
df["Monarch"] = df["Monarch"].astype("string")

<IPython.core.display.Javascript object>

In [22]:
df

Unnamed: 0,Capital,Population,Monarch,Area,Currency,Formation,Density
Spain,Madrid,46733038,Felipe VI,505990.0,EUR,1715-06-09,92.359608
Belgium,Brussels,11449656,Philippe,30688.0,EUR,1830-10-04,373.098801
France,Paris,67076000,,640679.0,EUR,1792-09-22,104.695175
Italy,Roma,60390560,,301340.0,EUR,,200.406717
Germany,Berlin,83122889,,,EUR,,
Portugal,Lisbon,10295909,,92212.0,,,111.654763
Norway,Oslo,5391369,Harald V,385207.0,NOK,,13.996031
Greece,Athens,10718565,,131957.0,EUR,,81.227711


<IPython.core.display.Javascript object>

In [23]:
df.dtypes

Capital        string
Population      int64
Monarch        string
Area            Int64
Currency       object
Formation      object
Density       float64
dtype: object

<IPython.core.display.Javascript object>

In [24]:
df["Currency"] = df["Currency"].astype("category")

<IPython.core.display.Javascript object>

In [25]:
df

Unnamed: 0,Capital,Population,Monarch,Area,Currency,Formation,Density
Spain,Madrid,46733038,Felipe VI,505990.0,EUR,1715-06-09,92.359608
Belgium,Brussels,11449656,Philippe,30688.0,EUR,1830-10-04,373.098801
France,Paris,67076000,,640679.0,EUR,1792-09-22,104.695175
Italy,Roma,60390560,,301340.0,EUR,,200.406717
Germany,Berlin,83122889,,,EUR,,
Portugal,Lisbon,10295909,,92212.0,,,111.654763
Norway,Oslo,5391369,Harald V,385207.0,NOK,,13.996031
Greece,Athens,10718565,,131957.0,EUR,,81.227711


<IPython.core.display.Javascript object>

In [26]:
df.dtypes

Capital         string
Population       int64
Monarch         string
Area             Int64
Currency      category
Formation       object
Density        float64
dtype: object

<IPython.core.display.Javascript object>

In [27]:
df["Formation"] = df["Formation"].astype("datetime64[ns]")

<IPython.core.display.Javascript object>

In [28]:
df

Unnamed: 0,Capital,Population,Monarch,Area,Currency,Formation,Density
Spain,Madrid,46733038,Felipe VI,505990.0,EUR,1715-06-09,92.359608
Belgium,Brussels,11449656,Philippe,30688.0,EUR,1830-10-04,373.098801
France,Paris,67076000,,640679.0,EUR,1792-09-22,104.695175
Italy,Roma,60390560,,301340.0,EUR,NaT,200.406717
Germany,Berlin,83122889,,,EUR,NaT,
Portugal,Lisbon,10295909,,92212.0,,NaT,111.654763
Norway,Oslo,5391369,Harald V,385207.0,NOK,NaT,13.996031
Greece,Athens,10718565,,131957.0,EUR,NaT,81.227711


<IPython.core.display.Javascript object>

In [29]:
df.dtypes

Capital               string
Population             int64
Monarch               string
Area                   Int64
Currency            category
Formation     datetime64[ns]
Density              float64
dtype: object

<IPython.core.display.Javascript object>

<div class="alert alert-success">

<b>Best Practice:</b> Always use the following data types:
    <ul>
        <li>For strings: 
            <ul>
                <li><b>category</b> (<code>.astype("category")</code>) if possible;</li>
                <li><b>string</b> (<code>.astype("string")</code>) otherwise;</li>
            </ul>
        <li>For integers:
            <ul>
                <li><b>int64</b> (<code>.astype("int")</code>) if there are no missing values;</li>
                <li><b>Int64</b> (<code>.astype("Int64")</code>) otherwise;</li>
            </ul>
        <li>For floats: <b>float64</b> (<code>.astype("float")</code>)</li>
        <li>For dates: <b>datetime64[ns]</b> (<code>.astype("datetime64[ns]")</code>)</li>
    </ul>
</div>

<div class="alert alert-info">

<b>Note:</b> Missing values are represented with either <code>NaN</code>, <code>NaT</code>, or <code>&#60;NA&#62;</code>, depending on the data type of the column.

</div>

## Exercise 2

In [30]:
apple.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
0,2008-10-14,116.26,116.4,103.14,104.08,70749800,104.08
1,2008-10-13,104.55,110.53,101.02,110.26,54967000,110.26
2,2008-10-10,85.7,100.0,85.0,96.8,79260700,96.8
3,2008-10-09,93.35,95.8,86.6,88.74,57763700,88.74
4,2008-10-08,85.91,96.33,85.68,89.79,78847900,89.79


<IPython.core.display.Javascript object>

Check the data types of the `apple` DataFrame:

In [31]:
apple.dtypes

Date          object
Open         float64
High         float64
Low          float64
Close        float64
Volume         int64
Adj Close    float64
dtype: object

<IPython.core.display.Javascript object>

Correct the data type of the "Date" column:

In [42]:
apple.Date = apple.Date.astype("datetime64[ns]")

<IPython.core.display.Javascript object>

Check the data types of the `apple` DataFrame:

In [33]:
apple.dtypes

Date         datetime64[ns]
Open                float64
High                float64
Low                 float64
Close               float64
Volume                int64
Adj Close           float64
dtype: object

<IPython.core.display.Javascript object>

## Demo 3: Check the memory saved

Save the original memory usage:

In [34]:
memory_kb_original = memory_kb
memory_mb_original = memory_mb

<IPython.core.display.Javascript object>

Check the memory usage of a DataFrame:

In [35]:
memory_kb = df.memory_usage(deep=True).sum() / (1024 ** 1)
print(f"Memory used: {memory_kb:.1f} kB")

Memory used: 2.2 kB


<IPython.core.display.Javascript object>

In [36]:
memory_mb = df.memory_usage(deep=True).sum() / (1024 ** 2)
print(f"Memory used: {memory_mb:.1f} MB")

Memory used: 0.0 MB


<IPython.core.display.Javascript object>

Calculate the memory saved:

In [37]:
print(f"The memory was reduced by a factor of {memory_kb_original / memory_kb:.1f}")

The memory was reduced by a factor of 317.5


<IPython.core.display.Javascript object>

In [38]:
print(f"The memory was reduced by a factor of {memory_mb_original / memory_mb:.1f}")

The memory was reduced by a factor of 317.5


<IPython.core.display.Javascript object>

<div class="alert alert-info">

<b>Note:</b> The memory usage should be <b>at most 10% of the available RAM</b> for <code>pandas</code> to remain fast.

</div>

## Exercise 3

In [None]:
apple.head()

Save the original memory usage:

In [41]:
memory_kb_original = memory_kb
memory_mb_original = memory_mb

<IPython.core.display.Javascript object>

Check the memory usage of a DataFrame:

In [46]:
memory_kb = apple.memory_usage(deep=True).sum() / (1024 ** 1)
print(f"Memory used: {memory_kb:.1f} kB")
memory_mb = apple.memory_usage(deep=True).sum() / (1024 ** 2)
print(f"Memory used: {memory_mb:.1f} MB")

Memory used: 332.7 kB
Memory used: 0.3 MB


<IPython.core.display.Javascript object>

Calculate the memory saved:

In [47]:
print(f"The memory was reduced by a factor of {memory_kb_original / memory_kb:.1f}")

The memory was reduced by a factor of 2.1


<IPython.core.display.Javascript object>

In [48]:
print(f"The memory was reduced by a factor of {memory_mb_original / memory_mb:.1f}")

The memory was reduced by a factor of 2.1


<IPython.core.display.Javascript object>

## Demo 4: `.str` accessor

In [52]:
name = "Bea"

<IPython.core.display.Javascript object>

In [53]:
name.lower()

'bea'

<IPython.core.display.Javascript object>

In [54]:
df

Unnamed: 0,Capital,Population,Monarch,Area,Currency,Formation,Density
Spain,Madrid,46733038,Felipe VI,505990.0,EUR,1715-06-09,92.359608
Belgium,Brussels,11449656,Philippe,30688.0,EUR,1830-10-04,373.098801
France,Paris,67076000,,640679.0,EUR,1792-09-22,104.695175
Italy,Roma,60390560,,301340.0,EUR,NaT,200.406717
Germany,Berlin,83122889,,,EUR,NaT,
Portugal,Lisbon,10295909,,92212.0,,NaT,111.654763
Norway,Oslo,5391369,Harald V,385207.0,NOK,NaT,13.996031
Greece,Athens,10718565,,131957.0,EUR,NaT,81.227711


<IPython.core.display.Javascript object>

Check the data types:

In [55]:
df.dtypes

Capital               string
Population             int64
Monarch               string
Area                   Int64
Currency            category
Formation     datetime64[ns]
Density              float64
dtype: object

<IPython.core.display.Javascript object>

Modify a string column thanks to the `.str` accessor:

In [61]:
df["Capital"]

Spain         Madrid
Belgium     Brussels
France         Paris
Italy           Roma
Germany       Berlin
Portugal      Lisbon
Norway          Oslo
Greece        Athens
Name: Capital, dtype: object

<IPython.core.display.Javascript object>

In [62]:
df["Capital"].str

<pandas.core.strings.accessor.StringMethods at 0x26827e07580>

<IPython.core.display.Javascript object>

In [63]:
df["Capital"].str.upper()

Spain         MADRID
Belgium     BRUSSELS
France         PARIS
Italy           ROMA
Germany       BERLIN
Portugal      LISBON
Norway          OSLO
Greece        ATHENS
Name: Capital, dtype: object

<IPython.core.display.Javascript object>

## Exercise 4

In [65]:
apple.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
0,2008-10-14,116.26,116.4,103.14,104.08,70749800,104.08
1,2008-10-13,104.55,110.53,101.02,110.26,54967000,110.26
2,2008-10-10,85.7,100.0,85.0,96.8,79260700,96.8
3,2008-10-09,93.35,95.8,86.6,88.74,57763700,88.74
4,2008-10-08,85.91,96.33,85.68,89.79,78847900,89.79


<IPython.core.display.Javascript object>

Check the data types:

In [66]:
apple.dtypes

Date         datetime64[ns]
Open                float64
High                float64
Low                 float64
Close               float64
Volume                int64
Adj Close           float64
dtype: object

<IPython.core.display.Javascript object>

Identify a column stored as `datetime`:

In [67]:
apple.Date

0      2008-10-14
1      2008-10-13
2      2008-10-10
3      2008-10-09
4      2008-10-08
          ...    
6076   1984-09-13
6077   1984-09-12
6078   1984-09-11
6079   1984-09-10
6080   1984-09-07
Name: Date, Length: 6081, dtype: datetime64[ns]

<IPython.core.display.Javascript object>

While `string` columns have a `.str` accessor, `datetime` columns have a `.dt` accessor:

In [68]:
apple.Date.dt

<pandas.core.indexes.accessors.DatetimeProperties object at 0x0000026827E048B0>

<IPython.core.display.Javascript object>

Find a method of the `.dt` accessor that returns the date in the ISO format:

In [71]:
apple.Date.dt.isocalendar()

Unnamed: 0,year,week,day
0,2008,42,2
1,2008,42,1
2,2008,41,5
3,2008,41,4
4,2008,41,3
...,...,...,...
6076,1984,37,4
6077,1984,37,3
6078,1984,37,2
6079,1984,37,1


<IPython.core.display.Javascript object>

In [73]:
apple[apple.Date.dt.month == 2]

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
158,2008-02-29,129.29,130.21,124.80,125.02,44838600,125.02
159,2008-02-28,127.20,132.20,125.77,129.91,57794800,129.91
160,2008-02-27,118.23,123.05,118.09,122.96,52683500,122.96
161,2008-02-26,117.64,121.09,115.44,119.15,53746000,119.15
162,2008-02-25,118.59,120.17,116.66,119.74,44884800,119.74
...,...,...,...,...,...,...,...
5974,1985-02-07,30.00,30.37,29.87,29.87,8793600,3.41
5975,1985-02-06,30.00,30.00,30.00,30.00,6980000,3.42
5976,1985-02-05,29.50,30.00,29.50,29.50,6824800,3.37
5977,1985-02-04,29.25,29.37,29.25,29.25,7801600,3.34


<IPython.core.display.Javascript object>