# AUTONORMALIZE DEMO 
Using Autonormalize to normalize a Kaggle dataset about food purchasing.

In [1]:
import os

import pandas as pd
import autonormalize as an

In [2]:
# Load the raw CSV from the downloads folder under the current working directory.
csv_path = os.path.join(os.getcwd(), 'autonormalize/downloads/food.csv')
raw_food_df = pd.read_csv(csv_path, encoding='latin1')

# Keep only the first ten columns; every column from position 10 onward is dropped.
food_df = raw_food_df.drop(columns=raw_food_df.columns[10:])

# Report the trimmed frame's dimensions, then preview its first three rows.
n_rows, n_cols = food_df.shape
print("Rows:", n_rows)
print("Columns:", n_cols)
food_df.head(3)

Rows: 21477
Columns: 10


Unnamed: 0,Area Abbreviation,Area Code,Area,Item Code,Item,Element Code,Element,Unit,latitude,longitude
0,AFG,2,Afghanistan,2511,Wheat and products,5142,Food,1000 tonnes,33.94,67.71
1,AFG,2,Afghanistan,2805,Rice (Milled Equivalent),5142,Food,1000 tonnes,33.94,67.71
2,AFG,2,Afghanistan,2513,Barley and products,5521,Feed,1000 tonnes,33.94,67.71


This dataset has 21477 rows, and we've cut it down to 10 columns. As you can see, there are many data dependencies between the columns, so the table should clearly be split up. For example, Area, Area Code, and Area Abbreviation should all be dependent on each other.

In [3]:
# Value handed to find_dependencies as its second positional argument.
# NOTE(review): presumably the accuracy threshold for approximate dependencies -- confirm
# against the autonormalize API before relying on this reading.
approx_threshold = 0.96
deps_approx = an.find_dependencies(food_df, approx_threshold)

100%|██████████| 10/10 [00:02<00:00,  3.80it/s]


In [4]:
# Derive groupings from the discovered dependencies, then print each grouping
# followed by the value of its get_prim_key(), with a tilde rule before each one.
separator = '\n~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~\n'
groupings = an.normalize_dependencies(deps_approx)
for grouping in groupings:
    print(separator)
    print(grouping)
    print(grouping.get_prim_key())


~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~

 --> Area Code
 --> Item Code
 --> Item
 {Element}  --> Element Code
 {Element Code}  --> Element
['Item', 'Item Code', 'Element', 'Area Code']

~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~

 {Area Abbreviation}  {Area}  --> Area Code
 {Area}  {Area Code}  --> Area Abbreviation
 {Area Abbreviation}  {Area Code}  --> Area
 {Area Abbreviation}  {Area}  {Area Code}  --> latitude
 {Area Code}  {Area}  {Area Code}  --> longitude
['Area Code']

~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~

 --> latitude
 {latitude}  --> Unit
['latitude']


In [5]:
# Split food_df according to the groupings computed above. The result is indexed
# by position (new_dfs[0], new_dfs[1], new_dfs[2]) in the cells that follow.
new_dfs = an.split_dataframe(food_df, groupings)

In [6]:
# Preview only the first ten rows instead of rendering the whole table as cell output.
new_dfs[0].head(10)

Unnamed: 0,Area Code,Item Code,Item,Element Code,Element
0,2,2511,Wheat and products,5142,Food
1,2,2805,Rice (Milled Equivalent),5142,Food
2,2,2513,Barley and products,5521,Feed
3,2,2513,Barley and products,5142,Food
4,2,2514,Maize and products,5521,Feed
5,2,2514,Maize and products,5142,Food
6,2,2517,Millet and products,5142,Food
7,2,2520,"Cereals, Other",5142,Food
8,2,2531,Potatoes and products,5142,Food
9,2,2536,Sugar cane,5521,Feed


In [7]:
# Preview only the first ten rows instead of rendering the whole table as cell output.
new_dfs[1].head(10)

Unnamed: 0,Area Abbreviation,Area Code,Area,latitude,longitude
0,ARM,1,Armenia,40.07,45.04
1,AFG,2,Afghanistan,33.94,67.71
2,ALB,3,Albania,41.15,20.17
3,DZA,4,Algeria,28.03,1.66
4,AGO,7,Angola,-11.20,17.87
5,ATG,8,Antigua and Barbuda,17.06,-61.80
6,ARG,9,Argentina,-38.42,-63.62
7,AUS,10,Australia,-25.27,133.78
8,AUT,11,Austria,47.52,14.55
9,AZE,12,Bahamas,25.03,-77.40


In [8]:
# Preview only the first ten rows instead of rendering the whole table as cell output.
new_dfs[2].head(10)

Unnamed: 0,Unit,latitude
0,1000 tonnes,-40.90
1,1000 tonnes,-38.42
2,1000 tonnes,-35.68
3,1000 tonnes,-32.52
4,1000 tonnes,-30.56
5,1000 tonnes,-29.61
6,1000 tonnes,-26.52
7,1000 tonnes,-25.27
8,1000 tonnes,-23.44
9,1000 tonnes,-22.96
