In [25]:
import numpy as np
import pandas as pd

## Missing Values

In [11]:
# Three columns that each contain exactly one missing value (np.nan).
d = {
    'A': [1, 2, np.nan],
    'B': [1, np.nan, 3],
    'C': [1, 2, np.nan],
}

In [12]:
# Build the example frame from d: dict keys become columns, list items rows.
df = pd.DataFrame(data=d)

In [13]:
# Display the frame; NaN marks the entries given as np.nan in d.
df

Unnamed: 0,A,B,C
0,1.0,1.0,1.0
1,2.0,,2.0
2,,3.0,


In [14]:
# Drop every row that has at least one NaN (how='any' is the default).
# Returns a new frame; df itself is left untouched.
df.dropna(axis='index')

Unnamed: 0,A,B,C
0,1.0,1.0,1.0


In [15]:
# Drop every column that has at least one NaN. Each of A, B and C has one,
# so no columns survive and only the index is left.
df.dropna(axis='columns')

0
1
2


In [17]:
# Keep only the rows that have at least two non-NaN values.
df.dropna(axis='index', thresh=2)

Unnamed: 0,A,B,C
0,1.0,1.0,1.0
1,2.0,,2.0


In [19]:
# Replace every NaN with the placeholder string 'A'.
# Returns a new frame; df itself is left untouched.
df.fillna('A')

Unnamed: 0,A,B,C
0,1,1,1
1,2,A,2
2,A,3,A


In [22]:
# Impute the missing entry in column A with the column mean (mean() skips
# NaN). The result is assigned back instead of calling fillna(inplace=True)
# on the df['A'] selection: that form is chained assignment, which pandas
# deprecates and which no longer updates df under Copy-on-Write.
df['A'] = df['A'].fillna(df['A'].mean())

In [23]:
# Display df again: the NaN in column A has been replaced by the column mean.
df

Unnamed: 0,A,B,C
0,1.0,1.0,1.0
1,2.0,,2.0
2,1.5,3.0,


## Group By

In [30]:
# Toy sales records, two people per company, used by the group-by examples.
data = dict(
    Company=['GOOGLE', 'GOOGLE', 'MICROSOFT', 'MICROSOFT', 'FB', 'FB'],
    PERSON=['SAM', 'ABBY', 'LOIUS', 'VABESSA', 'Carl', 'SARAH'],
    SALES=[200, 120, 340, 124, 243, 350],
)

In [36]:
# One row per person, with columns Company, PERSON and SALES taken from data.
companyDF = pd.DataFrame(data=data)

In [38]:
# Group the rows by company; aggregations are computed lazily on this object.
byComp = companyDF.groupby(by='Company')

In [39]:
# Mean of the numeric columns per company. numeric_only=True skips the text
# column PERSON, which pandas >= 2.0 no longer drops automatically and would
# otherwise raise a TypeError on.
byComp.mean(numeric_only=True)

Unnamed: 0_level_0,SALES
Company,Unnamed: 1_level_1
FB,296.5
GOOGLE,160.0
MICROSOFT,232.0


In [44]:
# Total sales for FB. numeric_only=True keeps the text column PERSON out of
# the sum; pandas >= 2.0 would otherwise concatenate the names into the result.
byComp.sum(numeric_only=True).loc['FB']

SALES    593
Name: FB, dtype: int64

In [47]:
# Number of non-null entries in each column, per company.
companyDF.groupby(by='Company').count()

Unnamed: 0_level_0,PERSON,SALES
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,2,2
GOOGLE,2,2
MICROSOFT,2,2


In [49]:
# Summary statistics per company, transposed so the companies become columns.
companyDF.groupby(by='Company').describe().transpose()

Unnamed: 0,Company,FB,GOOGLE,MICROSOFT
SALES,count,2.0,2.0,2.0
SALES,mean,296.5,160.0,232.0
SALES,std,75.660426,56.568542,152.735065
SALES,min,243.0,120.0,124.0
SALES,25%,269.75,140.0,178.0
SALES,50%,296.5,160.0,232.0
SALES,75%,323.25,180.0,286.0
SALES,max,350.0,200.0,340.0


## Merging and Concatenating

### Preprocessing

In [50]:
# First example frame: columns A-D hold '<column><row>' strings for rows 0-3.
df1 = pd.DataFrame(
    {col: [col + str(row) for row in range(0, 4)] for col in 'ABCD'},
    index=[0, 1, 2, 3],
)

In [51]:
# Second example frame: same columns A-D, continuing with rows 4-7.
df2 = pd.DataFrame(
    {col: [col + str(row) for row in range(4, 8)] for col in 'ABCD'},
    index=[4, 5, 6, 7],
)

In [52]:
# Third example frame: same columns A-D, continuing with rows 8-11.
df3 = pd.DataFrame(
    {col: [col + str(row) for row in range(8, 12)] for col in 'ABCD'},
    index=[8, 9, 10, 11],
)

In [53]:
# Display df1 (rows indexed 0-3).
df1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [54]:
# Display df2 (rows indexed 4-7).
df2

Unnamed: 0,A,B,C,D
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7


In [55]:
# Display df3 (rows indexed 8-11).
df3

Unnamed: 0,A,B,C,D
8,A8,B8,C8,D8
9,A9,B9,C9,D9
10,A10,B10,C10,D10
11,A11,B11,C11,D11


### Concatenation

In [59]:
# Stack the three frames row-wise. Their indexes (0-3, 4-7, 8-11) do not
# overlap, so the result has 12 uniquely indexed rows.
pd.concat([df1, df2, df3], axis='index')

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7
8,A8,B8,C8,D8
9,A9,B9,C9,D9
10,A10,B10,C10,D10
11,A11,B11,C11,D11


In [77]:
# Two frames with the same 'key' values and the same index (1-4):
# left carries columns A/B, right carries columns C/D.
left = pd.DataFrame(
    dict(
        key=['K0', 'K1', 'K2', 'K3'],
        A=['A0', 'A1', 'A2', 'A3'],
        B=['B0', 'B1', 'B2', 'B3'],
    ),
    index=[1, 2, 3, 4],
)

right = pd.DataFrame(
    dict(
        key=['K0', 'K1', 'K2', 'K3'],
        C=['C0', 'C1', 'C2', 'C3'],
        D=['D0', 'D1', 'D2', 'D3'],
    ),
    index=[1, 2, 3, 4],
)

In [78]:
# Place the frames side by side, aligned on their shared index. Both 'key'
# columns are kept, so the result has two columns named 'key'.
pd.concat([left, right], axis='columns')

Unnamed: 0,A,B,key,C,D,key.1
1,A0,B0,K0,C0,D0,K0
2,A1,B1,K1,C1,D1,K1
3,A2,B2,K2,C2,D2,K2
4,A3,B3,K3,C3,D3,K3


In [79]:
# Inner join on the 'key' column: one row for each key found in both frames,
# with a single shared 'key' column in the result.
left.merge(right, how='inner', on='key')

Unnamed: 0,A,B,key,C,D
0,A0,B0,K0,C0,D0
1,A1,B1,K1,C1,D1
2,A2,B2,K2,C2,D2
3,A3,B3,K3,C3,D3


## Joining

In [81]:
# join() aligns on the index, so move 'key' into the index of both frames.
# Rebinding (instead of inplace=True) behind the column check lets this cell
# be re-run without a KeyError once 'key' is already the index.
if 'key' in left.columns:
    left = left.set_index('key')
if 'key' in right.columns:
    right = right.set_index('key')
left.join(right)

Unnamed: 0_level_0,A,B,C,D
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
K0,A0,B0,C0,D0
K1,A1,B1,C1,D1
K2,A2,B2,C2,D2
K3,A3,B3,C3,D3
