# pandas basics

In [1]:
import pandas as pd

## Create Dataframe

### From array-like data (a list of tuples)

In [38]:
# Each tuple is one record: (name, age, city).
students = [
    ('Jack', 34, 'Sydney'),
    ('Riti', 30, 'Delhi'),
    ('Aadi', 16, 'New York'),
]

# Build the frame from the records, naming the columns and labelling the
# rows 'a', 'b', 'c' instead of using the default 0..n-1 positions.
df1 = pd.DataFrame(
    data=students,
    index=['a', 'b', 'c'],
    columns=['Name', 'Age', 'City'],
)

In [39]:
# A bare expression on the last line of a cell is rendered as a table.
df1

Unnamed: 0,Name,Age,City
a,Jack,34,Sydney
b,Riti,30,Delhi
c,Aadi,16,New York


### From a dict of columns

In [36]:
# Column-oriented input: each key is a column name mapped to that column's
# values, listed in row order.
data = dict(
    Name=['Jack', 'Riti', 'Aadi'],
    Age=[34, 30, 16],
    City=['Sydney', 'Delhi', 'New York'],
)
# The index labels are attached to the rows in the order given.
df2 = pd.DataFrame(data=data, index=['a', 'b', 'c'])

In [37]:
# Display the dict-built frame; it shows the same rows and columns as df1.
df2

Unnamed: 0,Name,Age,City
a,Jack,34,Sydney
b,Riti,30,Delhi
c,Aadi,16,New York


## Select rows and columns

### Select one column as a Series

In [40]:
# Indexing with a single column label returns that column as a Series.
df1['Name']

a    Jack
b    Riti
c    Aadi
Name: Name, dtype: object

In [41]:
# Same column via .loc: ':' keeps every row, 'Name' picks the column label.
df1.loc[:, 'Name']

a    Jack
b    Riti
c    Aadi
Name: Name, dtype: object

### Select multiple columns as a dataframe

In [43]:
# Indexing with a list of labels returns a DataFrame holding those columns.
df1[['Name', 'Age']]

Unnamed: 0,Name,Age
a,Jack,34
b,Riti,30
c,Aadi,16


In [44]:
# Same selection via .loc: ':' keeps every row, the list picks the columns.
df1.loc[:, ['Name', 'Age']]

Unnamed: 0,Name,Age
a,Jack,34
b,Riti,30
c,Aadi,16


### Select one row as a Series

In [47]:
# Label of the row to pull out.
index = 'a'
# .loc with a single row label and ':' for the columns yields a Series whose
# index is the frame's column names.
row = df1.loc[index, :]
# Print the values and then the type, one per line.
print(row, type(row), sep='\n')

Name      Jack
Age         34
City    Sydney
Name: a, dtype: object
<class 'pandas.core.series.Series'>


In [51]:
# .iloc is position-based: 0 is the first row, ':' keeps every column.
df1.iloc[0, :]

Name      Jack
Age         34
City    Sydney
Name: a, dtype: object

### Select multiple rows by index as a dataframe

In [50]:
# Row labels to select.
indices = ['a', 'b']
# Passing a list of labels to .loc returns a DataFrame with those rows.
rows = df1.loc[indices, :]
rows

Unnamed: 0,Name,Age,City
a,Jack,34,Sydney
b,Riti,30,Delhi


In [52]:
# Positional slice 0:2 selects the first two rows (the end position is
# excluded); ':' keeps every column.
df1.iloc[0:2,:]

Unnamed: 0,Name,Age,City
a,Jack,34,Sydney
b,Riti,30,Delhi


## Concatenating objects

In [85]:
# Every cell is '<column><n>' for n = 0..3, so column 'A' holds A0..A3 and so
# on. The rows are labelled 1..4 rather than the default 0..3.
df1 = pd.DataFrame(
    {col: [col + str(n) for n in range(4)] for col in 'ABCD'},
    index=[1, 2, 3, 4],
)

In [86]:
# Every cell is '<column><n>' with n taken from (4, 1, 6, 7), over columns
# A..E. No index is passed, so the rows get the default labels 0..3.
df2 = pd.DataFrame(
    {col: [col + str(n) for n in (4, 1, 6, 7)] for col in 'ABCDE'}
)

In [87]:
# The frames to concatenate, in the order they will be combined.
frames = [df1, df2]

In [89]:
# Stack the frames vertically. axis=0 and join='outer' are pd.concat's
# defaults, spelled out here: 'outer' keeps the union of the columns, so rows
# coming from a frame without column E are left empty there.
df3 = pd.concat(frames, axis=0, join='outer')
df3

Unnamed: 0,A,B,C,D,E
1,A0,B0,C0,D0,
2,A1,B1,C1,D1,
3,A2,B2,C2,D2,
4,A3,B3,C3,D3,
0,A4,B4,C4,D4,E4
1,A1,B1,C1,D1,E1
2,A6,B6,C6,D6,E6
3,A7,B7,C7,D7,E7


In [90]:
# Same vertical stack, but join='inner' keeps only the columns present in
# every frame, so column E is dropped.
df3 = pd.concat(frames, join='inner', axis=0)
df3

Unnamed: 0,A,B,C,D
1,A0,B0,C0,D0
2,A1,B1,C1,D1
3,A2,B2,C2,D2
4,A3,B3,C3,D3
0,A4,B4,C4,D4
1,A1,B1,C1,D1
2,A6,B6,C6,D6
3,A7,B7,C7,D7


In [92]:
# axis=1 places the frames side by side, aligning rows on the index.
# join='outer' keeps the union of the index labels; a label missing from one
# frame leaves that frame's columns empty in the row.
df3 = pd.concat(frames, axis=1, join='outer')
df3

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,D.1,E
0,,,,,A4,B4,C4,D4,E4
1,A0,B0,C0,D0,A1,B1,C1,D1,E1
2,A1,B1,C1,D1,A6,B6,C6,D6,E6
3,A2,B2,C2,D2,A7,B7,C7,D7,E7
4,A3,B3,C3,D3,,,,,


In [93]:
# Side-by-side again, but join='inner' keeps only the index labels present in
# both frames (1, 2 and 3 here).
df3 = pd.concat(frames, axis=1, join='inner')
df3

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,D.1,E
1,A0,B0,C0,D0,A1,B1,C1,D1,E1
2,A1,B1,C1,D1,A6,B6,C6,D6,E6
3,A2,B2,C2,D2,A7,B7,C7,D7,E7


## Merging objects

### Merging on 1 key column

In [98]:
# Left table: one row per key, carrying columns A and B.
left = pd.DataFrame(
    [
        ('K0', 'A0', 'B0'),
        ('K1', 'A1', 'B1'),
        ('K2', 'A2', 'B2'),
        ('K3', 'A3', 'B3'),
    ],
    columns=['key', 'A', 'B'],
)

# Right table: the same four keys, carrying columns C and D.
right = pd.DataFrame(
    [
        ('K0', 'C0', 'D0'),
        ('K1', 'C1', 'D1'),
        ('K2', 'C2', 'D2'),
        ('K3', 'C3', 'D3'),
    ],
    columns=['key', 'C', 'D'],
)

In [100]:
# Inner join on the shared 'key' column: a key is kept only when it appears
# in both frames (all four keys do here).
result = pd.merge(left, right, how='inner', on='key')
result

Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,C0,D0
1,K1,A1,B1,C1,D1
2,K2,A2,B2,C2,D2
3,K3,A3,B3,C3,D3


### Merging on multiple key columns

In [102]:
# Left table: rows are (key1, key2, A, B).
left = pd.DataFrame(
    [
        ('K0', 'K0', 'A0', 'B0'),
        ('K0', 'K1', 'A1', 'B1'),
        ('K1', 'K0', 'A2', 'B2'),
        ('K2', 'K1', 'A3', 'B3'),
    ],
    columns=['key1', 'key2', 'A', 'B'],
)

# Right table: rows are (key1, key2, C, D). The pair (K1, K0) occurs twice.
right = pd.DataFrame(
    [
        ('K0', 'K0', 'C0', 'D0'),
        ('K1', 'K0', 'C1', 'D1'),
        ('K1', 'K0', 'C2', 'D2'),
        ('K2', 'K0', 'C3', 'D3'),
    ],
    columns=['key1', 'key2', 'C', 'D'],
)

Inner Merge

In [106]:
# Inner join on the (key1, key2) pair: a row is kept only when both key
# values match on each side. Left row (K1, K0) matches two right rows, so it
# appears twice in the result.
result = pd.merge(left, right, how='inner', on=['key1', 'key2'])
result

Unnamed: 0,key1,key2,A,B,C,D
0,K0,K0,A0,B0,C0,D0
1,K1,K0,A2,B2,C1,D1
2,K1,K0,A2,B2,C2,D2


Outer Merge

In [107]:
# Outer join: every key pair from either frame is kept; the columns of the
# side that has no match for a pair are left empty in that row.
result = pd.merge(left, right, how='outer', on=['key1', 'key2'])
result

Unnamed: 0,key1,key2,A,B,C,D
0,K0,K0,A0,B0,C0,D0
1,K0,K1,A1,B1,,
2,K1,K0,A2,B2,C1,D1
3,K1,K0,A2,B2,C2,D2
4,K2,K1,A3,B3,,
5,K2,K0,,,C3,D3


Left Merge

In [108]:
# Left join: every row of `left` is kept; C and D are empty where `right`
# has no matching (key1, key2) pair.
result = pd.merge(left, right, how='left', on=['key1', 'key2'])
result

Unnamed: 0,key1,key2,A,B,C,D
0,K0,K0,A0,B0,C0,D0
1,K0,K1,A1,B1,,
2,K1,K0,A2,B2,C1,D1
3,K1,K0,A2,B2,C2,D2
4,K2,K1,A3,B3,,


Right Merge

In [109]:
# Right join: every row of `right` is kept; A and B are empty where `left`
# has no matching (key1, key2) pair.
result = pd.merge(left, right, how='right', on=['key1', 'key2'])
result

Unnamed: 0,key1,key2,A,B,C,D
0,K0,K0,A0,B0,C0,D0
1,K1,K0,A2,B2,C1,D1
2,K1,K0,A2,B2,C2,D2
3,K2,K0,,,C3,D3


Validate data: find the rows of `new_df` whose `name` differs from the row with the same `id` in `old_df`

In [143]:
# Reference snapshot: rows are (id, name).
old_df = pd.DataFrame(
    [(1, 'US'), (2, 'EU'), (3, 'Asia')],
    columns=['id', 'name'],
)

# Updated snapshot: same ids, but the name for id 2 has changed.
new_df = pd.DataFrame(
    [(1, 'US'), (2, 'Germany'), (3, 'Asia')],
    columns=['id', 'name'],
)

In [146]:
# For every row of old_df, pick the rows of new_df that carry the same id but
# a different name. The pieces are gathered in a list and joined with a single
# pd.concat: DataFrame.append was removed in pandas 2.0, and growing a frame
# inside a loop copies it again on every iteration.
changed_parts = [
    new_df.loc[(new_df['id'] == old_id) & (new_df['name'] != old_name)]
    for old_id, old_name in zip(old_df['id'], old_df['name'])
]
# pd.concat raises on an empty list, so fall back to an empty frame that still
# has new_df's columns when old_df has no rows.
df = pd.concat(changed_parts) if changed_parts else new_df.iloc[:0]
print(df)

  id     name
1  2  Germany


In [154]:
# df2 repeats the first and last rows of df1; only row (3, 4) is unique.
df1 = pd.DataFrame({'a': [1, 3, 5], 'b': [2, 4, 6]})
df2 = pd.DataFrame({'a': [1, 5], 'b': [2, 6]})
# Stack both frames, then discard every copy of any row that occurs more than
# once (keep=False). What remains are the rows found in only one frame.
df3 = (
    pd.concat([df1, df2])
    .drop_duplicates(keep=False)
)
df3

Unnamed: 0,a,b
1,3,4
