### Manage data

In [1]:
import pandas as pd
import numpy as np

from pathlib import Path 
path = Path().absolute().parent.parent.parent.parent / 'resources' / 'data' / 'bestsellers with categories.csv'

In [2]:
data = pd.read_csv(path).head(10)  # read the first 10 rows
data

Unnamed: 0,Name,Author,User Rating,Reviews,Price,Year,Genre
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,8,2016,Non Fiction
1,11/22/63: A Novel,Stephen King,4.6,2052,22,2011,Fiction
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,15,2018,Non Fiction
3,1984 (Signet Classics),George Orwell,4.7,21424,6,2017,Fiction
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,7665,12,2019,Non Fiction
5,A Dance with Dragons (A Song of Ice and Fire),George R. R. Martin,4.4,12643,11,2011,Fiction
6,A Game of Thrones / A Clash of Kings / A Storm...,George R. R. Martin,4.7,19735,30,2014,Fiction
7,A Gentleman in Moscow: A Novel,Amor Towles,4.7,19699,15,2017,Fiction
8,"A Higher Loyalty: Truth, Lies, and Leadership",James Comey,4.7,5983,3,2018,Non Fiction
9,A Man Called Ove: A Novel,Fredrik Backman,4.6,23848,8,2016,Fiction


### Drop


In [3]:
data.drop(['Price', 'Genre'], axis=1, inplace=True)  # inplace = True : modify the original data
# axis = 0 : drop rows
# axis = 1 : drop columns

In [4]:
data.drop(range(1, 10, 2), axis=0, inplace=True)  # drop rows with odd index
data.drop_duplicates(subset=['Name'], keep='first', inplace=True)  # keep the first occurence (drop duplicates)

In [5]:
data

Unnamed: 0,Name,Author,User Rating,Reviews,Year
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,2016
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,2018
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,7665,2019
6,A Game of Thrones / A Clash of Kings / A Storm...,George R. R. Martin,4.7,19735,2014
8,"A Higher Loyalty: Truth, Lies, and Leadership",James Comey,4.7,5983,2018


### Concat

In [6]:
### add row
new_row = {'Name': 'New Book', 'Author': 'New Author', 'User Rating': 4.5, 'Reviews': 1000, 'Year': 2020, 'New Column': 1}
# data = data.append(new_row, ignore_index=True)  # add a row to the end of the data (will be deprecated in the future)
data = pd.concat([data, pd.DataFrame([new_row])])

# duplicate rows
pd.concat([data, data], ignore_index=True)  # ignore_index = True : reset index

Unnamed: 0,Name,Author,User Rating,Reviews,Year,New Column
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,2016,
1,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,2018,
2,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,7665,2019,
3,A Game of Thrones / A Clash of Kings / A Storm...,George R. R. Martin,4.7,19735,2014,
4,"A Higher Loyalty: Truth, Lies, and Leadership",James Comey,4.7,5983,2018,
5,New Book,New Author,4.5,1000,2020,1.0
6,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,2016,
7,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,2018,
8,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,7665,2019,
9,A Game of Thrones / A Clash of Kings / A Storm...,George R. R. Martin,4.7,19735,2014,


### Merge

In [7]:
izq = pd.DataFrame({'key' : ['k0', 'k1', 'k2','k3'], 'A' : ['A0', 'A1', 'A2','A3'], 'B': ['B0', 'B1', 'B2','B3']})
der = pd.DataFrame({'key_2' : ['k0', 'k1', 'k2','k3'], 'C' : ['C0', 'C1', 'C2','C3'], 'D': ['D0', 'D1', 'D2','D3']})

In [8]:
# izq.merge(der, on='key')  # on='column_name' : join on the key column

# left_on='column_name' : join on the key column of the left data
# right_on='column_name' : join on the key column of the right data
izq.merge(der, left_on='key', right_on='key_2')

Unnamed: 0,key,A,B,key_2,C,D
0,k0,A0,B0,k0,C0,D0
1,k1,A1,B1,k1,C1,D1
2,k2,A2,B2,k2,C2,D2
3,k3,A3,B3,k3,C3,D3


In [9]:
izq = pd.DataFrame({'key' : ['k0', 'k1', 'k2','k3'],'A' : ['A0', 'A1', 'A2','A3'],'B': ['B0', 'B1', 'B2','B3']})
der = pd.DataFrame({'key_2' : ['k0', 'k1', 'k2',np.nan], 'C' : ['C0', 'C1', 'C2','C3'], 'D': ['D0', 'D1', 'D2','D3']})  # same data but with NaN

In [10]:
# izq.merge(der, left_on='key', right_on='key_2')  # Inner Join (default) == how='inner'

# how='left' : use only keys from left data
# how='right' : use only keys from right data
izq.merge(der, left_on='key', right_on='key_2', how='left')

Unnamed: 0,key,A,B,key_2,C,D
0,k0,A0,B0,k0,C0,D0
1,k1,A1,B1,k1,C1,D1
2,k2,A2,B2,k2,C2,D2
3,k3,A3,B3,,,


### Join (index matching)

In [11]:
izq = pd.DataFrame({'A': ['A0','A1','A2'], 'B':['B0','B1','B2']}, index=['k0','k1','k2'])
der =pd.DataFrame({'C': ['C0','C1','C2'], 'D':['D0','D1','D2']}, index=['k0','k2','k3']) 

In [12]:
# izq.join(der)  # Only use the index to join (how='inner')
izq.join(der, how='left')  # how='left' : use only keys from left data
izq.join(der, how='right')  # how='right' : use only keys from right data
izq.join(der, how='outer')  # how='outer' : use union of keys from both data

Unnamed: 0,A,B,C,D
k0,A0,B0,C0,D0
k1,A1,B1,,
k2,A2,B2,C1,D1
k3,,,C2,D2


### [Apply](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.apply.html)

In [20]:
data['New Column'] = data['User Rating'].apply(lambda x: 'yesn\'t' if x > 4.7 else 'don\'t')  # apply a function to each element of a column
data

Unnamed: 0,Name,Author,User Rating,Reviews,Year,New Column
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,2016,don't
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,2018,don't
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,7665,2019,yesn't
6,A Game of Thrones / A Clash of Kings / A Storm...,George R. R. Martin,4.7,19735,2014,don't
8,"A Higher Loyalty: Truth, Lies, and Leadership",James Comey,4.7,5983,2018,don't
0,New Book,New Author,4.5,1000,2020,don't
