## Make a pandas DataFrame smaller and faster

In [None]:
import pandas as pd

In [None]:
# download the "drinks by country" CSV into a DataFrame and preview the first rows
drinks = pd.read_csv('http://bit.ly/drinksbycountry')
drinks.head()

In [None]:
# summary of the DataFrame: column names, non-null counts, dtypes and memory usage
drinks.info()   # shallow estimate: the contents of object columns are not inspected, so the total is a lower bound

# memory_usage='deep' inspects the object columns too and reports the real memory usage
drinks.info(memory_usage='deep')

In [None]:
# memory usage in bytes of the index and of each column (Series);
# deep=True includes the actual contents of object columns
drinks.memory_usage(deep=True)

### Note: object columns take more memory than the other data types, while integers are among the most memory-efficient. If we could store the data as integers, we could reduce its size.

### But then we would need a lookup table that maps each integer back to the original value. pandas offers the category dtype for exactly that.

In [None]:
# Goal: store the continent column as integers instead of strings.
# The category dtype does this for us: the unique continents are sorted and
# each one is assigned an integer code 0, 1, 2, ...

# Before the conversion the dtype of continent is object (check with drinks.dtypes);
# after it, the dtype is category.
drinks['continent'] = drinks['continent'].astype('category')
drinks.dtypes

In [None]:
# the displayed data still looks the same; only the underlying storage of continent changed
drinks.head()

In [None]:
# check the memory usage again; the continent column should now be much smaller
# than it was as an object column
drinks.memory_usage(deep=True)

### Categorising with a specified logical order

In [None]:
# build a small example DataFrame: an ID column and a text quality rating
df = pd.DataFrame(
    {
        'ID': [100, 101, 102, 103],
        'quality': ['good', 'very good', 'good', 'excellent'],
    }
)
df

In [None]:
# sorting by quality while it is still a plain string column orders the rows
# alphabetically (excellent, good, very good), not by how good the rating is
df.sort_values('quality')

In [None]:
# define ordered categories: good < very good < excellent
# NOTE: passing categories= and ordered= straight to astype('category', ...) was
# deprecated in pandas 0.21 and removed in 0.25 (it now raises TypeError), so the
# ordering is described with a CategoricalDtype and that dtype is passed to astype.
df['quality'] = df.quality.astype(
    pd.api.types.CategoricalDtype(
        categories=['good', 'very good', 'excellent'],
        ordered=True,
    )
)
df.quality

# notice the categories are listed in the logical order we specified

In [None]:
# with an ordered category dtype, sorting follows the logical category order
# instead of alphabetical order
df.sort_values('quality')

In [None]:
# comparisons also follow the category order, so we can keep only the rows
# whose quality ranks above 'good'
df.loc[df['quality'] > 'good']

### Passing `categories` and `ordered` to `astype` is deprecated. The new way uses `CategoricalDtype` =>

In [None]:
from pandas.api.types import CategoricalDtype

# describe the ordered categories once as a dtype, then cast the column to it
qualityCategory = CategoricalDtype(
    categories=['good', 'very good', 'excellent'],
    ordered=True,
)
df['quality'] = df['quality'].astype(qualityCategory)
df['quality']

In [None]:
# keep only the rows whose quality ranks above 'very good' in the category order
df.loc[df['quality'] > 'very good']