In [1]:
import pandas as pd

In [2]:
drinks = pd.read_csv("http://bit.ly/drinksbycountry")

In [3]:
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [5]:
drinks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       193 non-null    object 
 1   beer_servings                 193 non-null    int64  
 2   spirit_servings               193 non-null    int64  
 3   wine_servings                 193 non-null    int64  
 4   total_litres_of_pure_alcohol  193 non-null    float64
 5   continent                     193 non-null    object 
dtypes: float64(1), int64(3), object(2)
memory usage: 9.2+ KB


#### It tells the memory usage without looking at the object type fields

In [7]:
drinks.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       193 non-null    object 
 1   beer_servings                 193 non-null    int64  
 2   spirit_servings               193 non-null    int64  
 3   wine_servings                 193 non-null    int64  
 4   total_litres_of_pure_alcohol  193 non-null    float64
 5   continent                     193 non-null    object 
dtypes: float64(1), int64(3), object(2)
memory usage: 30.5 KB


#### It tells the actual memory usage

In [9]:
# For getting memory usage for individual fields in bytes

drinks.memory_usage()

Index                            128
country                         1544
beer_servings                   1544
spirit_servings                 1544
wine_servings                   1544
total_litres_of_pure_alcohol    1544
continent                       1544
dtype: int64

In [10]:
#For getting actual memory usage including object fields

drinks.memory_usage(deep = True)

Index                             128
country                         12588
beer_servings                    1544
spirit_servings                  1544
wine_servings                    1544
total_litres_of_pure_alcohol     1544
continent                       12332
dtype: int64

#### Note: Here the object fields are using more memory than int & float fields

In [11]:
drinks.memory_usage(deep = True).sum()

31224

# How to reduce memory usage in data frame?

#### This can be done by storing object fields into integer, so the memory usage will be efficient

In [12]:
sorted(drinks.continent.unique())

['Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America']

In [13]:
drinks['continent'] = drinks.continent.astype('category')

In [14]:
drinks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   country                       193 non-null    object  
 1   beer_servings                 193 non-null    int64   
 2   spirit_servings               193 non-null    int64   
 3   wine_servings                 193 non-null    int64   
 4   total_litres_of_pure_alcohol  193 non-null    float64 
 5   continent                     193 non-null    category
dtypes: category(1), float64(1), int64(3), object(1)
memory usage: 8.1+ KB


#### Now the dtype is changed to category and the memory usage is lesser than before

In [16]:
drinks.continent.head()

0      Asia
1    Europe
2    Africa
3    Europe
4    Africa
Name: continent, dtype: category
Categories (6, object): [Africa, Asia, Europe, North America, Oceania, South America]

But it still looks the same with six categories mentioned there

In [17]:
# We can see their real storage in int type 
# It is similar to string method so using cat attribute additionally

drinks.continent.cat.codes.head()

0    1
1    2
2    0
3    2
4    0
dtype: int8

In [19]:
# Actual memory usage 

drinks.memory_usage(deep=True)

Index                             128
country                         12588
beer_servings                    1544
spirit_servings                  1544
wine_servings                    1544
total_litres_of_pure_alcohol     1544
continent                         744
dtype: int64

#### The continent memory usage is lesser than country field's usage as it is still in object type

In [20]:
drinks.country = drinks.country.astype('category')

In [21]:
drinks.memory_usage(deep = True)

Index                             128
country                         18094
beer_servings                    1544
spirit_servings                  1544
wine_servings                    1544
total_litres_of_pure_alcohol     1544
continent                         744
dtype: int64

In [22]:
drinks.country.cat.categories

Index(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antigua & Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria',
       ...
       'United Arab Emirates', 'United Kingdom', 'Uruguay', 'Uzbekistan',
       'Vanuatu', 'Venezuela', 'Vietnam', 'Yemen', 'Zambia', 'Zimbabwe'],
      dtype='object', length=193)

#### As we can see the memory usage of country field is greater than before, as it's having 193 unique values

#### So use only categorical transformation if the fields is having few unique values

In [38]:
df = pd.DataFrame({'Id':[10,11,12,13],'quality':['good','very good','good','execellent']})

In [39]:
df.head()

Unnamed: 0,Id,quality
0,10,good
1,11,very good
2,12,good
3,13,execellent


In [40]:
df.sort_values('quality')

Unnamed: 0,Id,quality
3,13,execellent
0,10,good
2,12,good
1,11,very good


### Here we see the quality field is alphabetically ordered , what if we want to order them logically ?

#### Like good << very good << excellent

In [41]:
cat_dtype = pd.api.types.CategoricalDtype(categories=['good','very good','excellent'], ordered=True)

df['quality'] = df.quality.astype(cat_dtype)

In [42]:
df.quality

0         good
1    very good
2         good
3          NaN
Name: quality, dtype: category
Categories (3, object): [good < very good < excellent]

#### Now it tells the order of this categorical field

In [43]:
# If we sort this field, it sorts according to the logical order provided

df.sort_values('quality').head(10)

Unnamed: 0,Id,quality
0,10,good
2,12,good
1,11,very good
3,13,


In [46]:
# We can also do comparison in this order 

df.loc[df['quality']<'excellent',:]

Unnamed: 0,Id,quality
0,10,good
1,11,very good
2,12,good
