## While analysing data we often face a problem of missing values in our data. Let's see how can we deal with that in python with pandas library.

In [1]:
import numpy as np
import pandas as pd

In [2]:
d={'A':[1,2,np.nan],'B':[5,np.nan,np.nan],'C':[1,2,3]}

In [3]:
df=pd.DataFrame(d)

In [4]:
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [5]:
df.dropna(axis=1)

Unnamed: 0,C
0,1
1,2
2,3


In [6]:
df.dropna()

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [7]:
#it keeps row 1, as it has at lest 2 not NaN entry
df.dropna(thresh=2)

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2


In [8]:
#replacing missing values
df['A'].fillna(value=df['A'].mean())

0    1.0
1    2.0
2    1.5
Name: A, dtype: float64

In [9]:
#replacing with a string
df.fillna(value="value")

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,value,2
2,value,value,3


# Groupby
## Using group by method to light a group rows of data together and call aggregate functions
### Groupby allows you to group together rows based off of a column and perform an aggregate function on them

In [10]:
import numpy as np
import pandas as pd

In [11]:
data={'Company':['Gp','Gp','Teletalk','Teletal','Robi','Robi'],
                'Person':['Alam','Mim','Tina','Rimi','Rafi','Pranto'],
                'Sales':[200,170,225,439,410,324]}

In [12]:
df=pd.DataFrame(data)

In [13]:
df

Unnamed: 0,Company,Person,Sales
0,Gp,Alam,200
1,Gp,Mim,170
2,Teletalk,Tina,225
3,Teletal,Rimi,439
4,Robi,Rafi,410
5,Robi,Pranto,324


In [14]:
df_new=df.groupby('Company')

In [15]:
df_new.mean()

Unnamed: 0_level_0,Sales
Company,Unnamed: 1_level_1
Gp,185.0
Robi,367.0
Teletal,439.0
Teletalk,225.0


In [16]:
df_new.sum
df_new.std

<bound method GroupBy.std of <pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fdcb79ea790>>

In [17]:
df_new.sum().loc['Gp']

Sales    370
Name: Gp, dtype: int64

In [18]:
df.groupby('Company').sum().loc['Gp']

Sales    370
Name: Gp, dtype: int64

In [19]:
df.groupby('Company').count() 

Unnamed: 0_level_0,Person,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
Gp,2,2
Robi,2,2
Teletal,1,1
Teletalk,1,1


In [20]:
df.groupby('Company').max() 

Unnamed: 0_level_0,Person,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
Gp,Mim,200
Robi,Rafi,410
Teletal,Rimi,439
Teletalk,Tina,225


In [21]:
df.groupby('Company').describe().transpose()['Gp']

Sales  count      2.000000
       mean     185.000000
       std       21.213203
       min      170.000000
       25%      177.500000
       50%      185.000000
       75%      192.500000
       max      200.000000
Name: Gp, dtype: float64