### Data Transformation using .groupby().transform

In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Location of the restaurant tips dataset.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where the file exists at exactly this location.
RESTAURANT_CSV = "E:/DataCamp/Writing efficient code with pandas/restaurant_data.csv"
res_data = pd.read_csv(RESTAURANT_CSV)

In [4]:
# Preview the first rows to check which columns were loaded
res_data.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [5]:
# Non-null value count of every column, split by smoker status
res_data.groupby('smoker').count()

Unnamed: 0_level_0,total_bill,tip,sex,day,time,size
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
No,151,151,151,151,151,151
Yes,93,93,93,93,93,93


### Data Transformation

#### 1. Z score normalisation / transformation

In [6]:
def zscore(x):
    """Return ``x`` centred on its mean and scaled by its standard deviation.

    ``x`` is expected to support ``.mean()`` and ``.std()`` (e.g. a pandas
    Series holding one column of one group). The result has the same shape
    as ``x``.
    """
    centred = x - x.mean()
    return centred / x.std()

In [7]:
# Standardise each numeric column within its `time` group (Lunch / Dinner).
# The numeric columns are selected explicitly: the z-score arithmetic is not
# defined for the string columns (sex, smoker, day). Older pandas versions
# silently dropped those columns, while pandas >= 2.0 raises a TypeError.
res_transformed = res_data.groupby('time')[['total_bill', 'tip', 'size']].transform(zscore)

In [10]:
# Preview the z-scored values
res_transformed.head()

Unnamed: 0,total_bill,tip,size
0,-0.416446,-1.457045,-0.692873
1,-1.143855,-1.004475,0.405737
2,0.023282,0.276645,0.405737
3,0.315339,0.144355,-0.692873
4,0.41488,0.353234,1.504347


In [48]:
# (rows, columns) of the original frame, for comparison with the transformed frame
res_data.shape

(244, 7)

In [49]:
# (rows, columns) of the transformed frame; the row count should match res_data
res_transformed.shape

(244, 3)

In [52]:
# Sanity check: after z-scoring, every group should have mean 0 and
# standard deviation 1. The transformed frame no longer contains `time`,
# so it is grouped by the `time` column of the original frame.
res_check = res_transformed.groupby(res_data['time'])
res_check.mean().round(3)

Unnamed: 0_level_0,total_bill,tip,size
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dinner,-0.0,0.0,-0.0
Lunch,0.0,0.0,0.0


In [51]:
# Per-group standard deviation of the z-scored columns (expected to be 1)
res_check.std()

Unnamed: 0_level_0,total_bill,tip,size
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dinner,1.0,1.0,1.0
Lunch,1.0,1.0,1.0


#### 2. min-max normalisation / transformation

In [None]:
#For example, to rescale students' weight data spanning from 160 pounds to 200 pounds,
#you subtract 160 from each student's weight and divide the result by 40 (200 - 160),
#so that 160 pounds maps to 0 and 200 pounds maps to 1

In [13]:
def min_max_tr(x):
    """Rescale ``x`` linearly so that its minimum maps to 0 and its maximum to 1.

    ``x`` is expected to support ``.min()`` and ``.max()`` (e.g. a pandas
    Series holding one column of one group).
    """
    return (x - x.min()) / (x.max() - x.min())


# Min-max scale each numeric column within its `time` group.
# The numeric columns are selected explicitly: subtraction is not defined for
# the string columns (sex, smoker, day). Older pandas versions silently
# dropped those columns, while pandas >= 2.0 raises a TypeError.
res_min_max_transformed = (
    res_data
    .groupby('time')[['total_bill', 'tip', 'size']]
    .transform(min_max_tr)
)
res_min_max_transformed.head()

Unnamed: 0,total_bill,tip,size
0,0.291579,0.001111,0.2
1,0.152283,0.073333,0.4
2,0.375786,0.277778,0.4
3,0.431713,0.256667,0.2
4,0.450775,0.29,0.6


#### 3. transforming values to probabilities

In [None]:
#Apply a probability distribution function to a pandas DataFrame with group-related parameters
#by transforming the tip variable to probabilities.
#The transformation will be an exponential transformation. The exponential distribution is defined as
# λ * e^(−λ * x)
#where λ (lambda) is the mean of the group that the observation x belongs to

In [16]:
def exp_tr(x):
    """Evaluate lam * exp(-lam * x) element-wise, where lam is the mean of ``x``."""
    lam = x.mean()
    return np.exp(-x * lam) * lam


# Transform each tip using the mean tip of its own `time` group as lam
res_exp_transformed = (
    res_data
    .groupby(['time'])['tip']
    .transform(exp_tr)
)
res_exp_transformed.head()

0    0.135141
1    0.017986
2    0.000060
3    0.000108
4    0.000042
Name: tip, dtype: float64

### Missing value imputation

In [None]:
# Show only the rows that contain at least one missing value.
# Indexing with the element-wise mask `pd.isna(res_data)` would instead return
# a frame of the original shape in which every cell is NaN.
res_data[res_data.isna().any(axis=1)]

#### Filling missing value with mean

In [None]:
def missing_tr(x):
    """Return ``x`` with its missing values replaced by the mean of ``x``.

    ``x.median()`` can be used instead of ``x.mean()`` as the fill value.
    """
    fill_value = x.mean()
    return x.fillna(fill_value)

In [None]:
# Fill missing total_bill values with the mean total_bill of the row's group.
# res_data has no 'class' column (grouping by it raises a KeyError), so the
# rows are grouped by 'time', the grouping key used elsewhere in this notebook.
res_data_missing_filled = res_data.groupby('time')['total_bill'].transform(missing_tr)

### Data Filtration

In [None]:
def filter_tr(x):
    """Return True when the mean ``total_bill`` of the group ``x`` exceeds 20."""
    mean_bill = x['total_bill'].mean()
    return mean_bill > 20


# Keep only the rows that belong to a day whose mean total_bill is above 20
res_data_filtered = res_data.groupby('day').filter(filter_tr)

#Uses
#filtering based on the number of missing values for a feature
#filtering based on the mean value of a feature

### Identify Columns with NA values

In [14]:
# Small example frame with one missing physics score.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df = pd.DataFrame({
    "name": ['Abhishek', 'Shrasti', 'Anvit'],
    "physics": [99, np.nan, 95],
    "maths": [100, 60, 99],
})

In [15]:
# Rows that contain a null value in at least one column
df.loc[df.isna().any(axis="columns")]

Unnamed: 0,name,physics,maths
1,Shrasti,,60


In [None]:
# Which columns have null values: null count per column, largest first
null_counts = df.isna().sum()
null_counts.sort_values(ascending=False)