# Advanced Pandas 

The learning objectives are:
* Methods
* Missing data


In [1]:
import pandas as pd
import numpy as np

## Common dataframe operations



### To Create a Dataframe

In [25]:
# creating using dictionary
df = pd.DataFrame({'A1':['x', 'x', 'y', 'z', 'zz'],'B1': [1, 3.43, 2, 7.54, 11], 'C1':['a32', 'rty', 'xy54', 'lm54', '4.3adf'], 'D1':[123, 456, 789, -654, -321]})
df

Unnamed: 0,A1,B1,C1,D1
0,x,1.0,a32,123
1,x,3.43,rty,456
2,y,2.0,xy54,789
3,z,7.54,lm54,-654
4,zz,11.0,4.3adf,-321


### shape
`.shape` return a tuple of the dataframe dimensionality

In [14]:
print(df.shape)

df.info()


(5, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A1      5 non-null      object 
 1   B1      5 non-null      float64
 2   C1      5 non-null      object 
 3   D1      5 non-null      int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 288.0+ bytes


### set_index
`.set_index()` change the index to an existing column

In [15]:
df = df.set_index(df['A1'])
df

Unnamed: 0_level_0,A1,B1,C1,D1
A1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
x,x,1.0,a32,123
x,x,3.43,rty,456
y,y,2.0,xy54,789
z,z,7.54,lm54,-654
zz,zz,11.0,4.3adf,-321


### drop
`.drop()` drop specified labels row or column wise

In [16]:
df = df.drop('A1', axis=1) # axis = 1 is to drop column A1, axis = 0 is for rows
df

Unnamed: 0_level_0,B1,C1,D1
A1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
x,1.0,a32,123
x,3.43,rty,456
y,2.0,xy54,789
z,7.54,lm54,-654
zz,11.0,4.3adf,-321


### reset_index
`reset_index()` reset the index 

In [21]:
df = df.reset_index()
df

ValueError: cannot insert level_0, already exists

In [22]:
df

Unnamed: 0,level_0,index,B1,C1,D1
0,0,0,1.0,a32,123
1,1,1,3.43,rty,456
2,2,2,2.0,xy54,789
3,3,3,7.54,lm54,-654
4,4,4,11.0,4.3adf,-321


### unique
`.unique()` return an array of unique values in Series object

In [23]:
df['B1'].unique()

array([ 1.  ,  3.43,  2.  ,  7.54, 11.  ])

### nunique
`.nunique()` return number of unique values

In [26]:
df['A1'].nunique()

4

### value counts
`.value_counts()` a Series method that returns an object containing the count of all unique values

In [27]:
df['D1'].value_counts()

 123    1
 456    1
 789    1
-654    1
-321    1
Name: D1, dtype: int64

### sort values
`.sort_values()` sort dataframe along either axis

In [28]:
df.sort_values(by='C1')

Unnamed: 0,A1,B1,C1,D1
4,zz,11.0,4.3adf,-321
0,x,1.0,a32,123
3,z,7.54,lm54,-654
1,x,3.43,rty,456
2,y,2.0,xy54,789


### isnull
`.isnull()` detect missing values for an array like object, returns a boolean array, by default detects None or np.nan objects as missing

[official working with missing values pandas doc](https://pandas.pydata.org/pandas-docs/stable/missing_data.html)

In [29]:
df.isnull()

Unnamed: 0,A1,B1,C1,D1
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False


### dropna
`.dropna()` remove missing values

In [30]:
df.dropna()

Unnamed: 0,A1,B1,C1,D1
0,x,1.0,a32,123
1,x,3.43,rty,456
2,y,2.0,xy54,789
3,z,7.54,lm54,-654
4,zz,11.0,4.3adf,-321


### example with missing values

In [38]:
df_nan = pd.DataFrame({'A1':[None, 'a2', 'a3', 'a4', 'a5'],'B1': [1, 3.43, np.nan, 7.54, 11], 'C1':['a32', 'rty', 'xy54', np.nan, '4.3adf'], 'D1':[123, 456, 789, -654, -321]})
df_nan

Unnamed: 0,A1,B1,C1,D1
0,,1.0,a32,123
1,a2,3.43,rty,456
2,a3,,xy54,789
3,a4,7.54,,-654
4,a5,11.0,4.3adf,-321


In [33]:
df_nan.isnull()

Unnamed: 0,A1,B1,C1,D1
0,True,False,False,False
1,False,False,False,False
2,False,True,False,False
3,False,False,True,False
4,False,False,False,False


In [34]:
df_nan.isnull().sum()

A1    1
B1    1
C1    1
D1    0
dtype: int64

In [39]:
# will drop all rows with missing values
df_nan.dropna()

Unnamed: 0,A1,B1,C1,D1
1,a2,3.43,rty,456
4,a5,11.0,4.3adf,-321


### fill in missing values
`.fillna()` fill in missing value with given object

In [40]:
df_nan.fillna('new value')

Unnamed: 0,A1,B1,C1,D1
0,new value,1.0,a32,123
1,a2,3.43,rty,456
2,a3,new value,xy54,789
3,a4,7.54,new value,-654
4,a5,11.0,4.3adf,-321


In [41]:
df_nan['B1'].fillna(df_nan['B1'].mean())

0     1.0000
1     3.4300
2     5.7425
3     7.5400
4    11.0000
Name: B1, dtype: float64