# ML Preprocess
## Contents
- Load Data
- Data Information
- Classifying catagorical and numerican datasets
- Data Wrangling
- Label Encoding
- Scale Data
    - Standard Scaler
    - Normalize
- Save Clean Data (data versions)

# Load Data

In [1]:
#importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
import plotly.express as px
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler, LabelEncoder
sns.set()
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import preprocessing
import plotly.graph_objects as go
import warnings
warnings.filterwarnings("ignore")

In [2]:
# importing script modules from ../scripts
# Adding scripts path
import sys, os
sys.path.insert(0, '../scripts/')
from data_preProcessing import data_preProcessing_script
from data_manipulator import DataCleaner
from data_exploration import exploration

In [3]:
df = pd.read_csv("../data/processed_incl_missing.csv")
df.head()

Unnamed: 0,game_key,campaign_id,type,width,height,creative_id,auction_id,geo_country,site_name,platform_os,...,Cost Centre,currency,Buy Rate (CPE),Volume Agreed,Gross Cost/Budget,Agency Fee,Percentage,Net Cost,browser_eng_Date,browser_eng_Time
0,adunit-facebook-conversational-commerce-phase-...,l5kk3r3,impression,0,0,x83byc8a,fd74243f-6606-4830-a0ef-dd12f66ec6f5,Thailand,www.wuxiaworld.com,6,...,SGP,USD,0.28,242185.0,67811.8,0.0,15.0,57640.03,2021-01-01,19:55:20.291000
1,adunit-facebook-conversational-commerce-phase-...,l5kk3r3,impression,0,0,x83byc8a,fd74243f-6606-4830-a0ef-dd12f66ec6f5,Thailand,www.wuxiaworld.com,6,...,SGP,USD,0.28,242185.0,67811.8,0.0,15.0,57640.03,2021-01-01,19:55:20.298000
2,adunit-facebook-conversational-commerce-phase-...,l5kk3r3,impression,0,0,tf2htrrm,b3af878b-fd1a-4c6f-91a2-4e3670d2fda5,Thailand,www.prachachat.net,6,...,SGP,USD,0.28,242185.0,67811.8,0.0,15.0,57640.03,2021-01-05,00:21:39.693000
3,adunit-facebook-conversational-commerce-phase-...,l5kk3r3,impression,0,0,x83byc8a,b7c22590-e784-43cf-874d-a661ad99601f,Thailand,www.prachachat.net,6,...,SGP,USD,0.28,242185.0,67811.8,0.0,15.0,57640.03,2021-01-06,09:30:44.188000
4,adunit-facebook-conversational-commerce-phase-...,l5kk3r3,impression,0,0,x83byc8a,b7c22590-e784-43cf-874d-a661ad99601f,Thailand,www.prachachat.net,6,...,SGP,USD,0.28,242185.0,67811.8,0.0,15.0,57640.03,2021-01-06,09:30:45.047000


# Data Information

In [4]:
preprocess = data_preProcessing_script(df)
preprocess.show_data_information()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99946 entries, 0 to 99945
Data columns (total 32 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   game_key                             99946 non-null  object 
 1   campaign_id                          99946 non-null  object 
 2   type                                 99946 non-null  object 
 3   width                                99946 non-null  int64  
 4   height                               99944 non-null  object 
 5   creative_id                          99946 non-null  object 
 6   auction_id                           99946 non-null  object 
 7   geo_country                          99946 non-null  object 
 8   site_name                            99946 non-null  object 
 9   platform_os                          99946 non-null  int64  
 10  device_type                          99946 non-null  object 
 11  browser                     

In [5]:
print('unique column values')
df.apply(lambda x: len(x.unique())).sort_values(ascending=False).head(33)

unique column values


browser_eng_Time                       99453
auction_id                             83071
site_name                               5779
creative_id                              312
game_key                                 180
browser_eng_Date                         160
campaign_name                             46
campaign_id                               46
browser                                   30
Submission Date                           27
Description                               27
startdate                                 22
enddate                                   21
Gross Cost/Budget                         17
Net Cost                                  16
Volume Agreed                             16
Campaign Objectives                       15
Serving Location(s)                       14
geo_country                               12
Buy Rate (CPE)                            11
kpis                                       9
height                                     8
currency  

# Classifying catagorical and numerical Datasets

In [6]:
# getting dataframe columns list
cols = df.columns.tolist()

In [7]:
# numerical values
num_cols = df._get_numeric_data().columns
num_cols

Index(['width', 'platform_os', 'Buy Rate (CPE)', 'Volume Agreed',
       'Gross Cost/Budget', 'Agency Fee', 'Percentage', 'Net Cost'],
      dtype='object')

In [8]:
# catagorical values
list(set(cols) - set(num_cols))

['Serving Location(s)',
 'browser_eng_Time',
 'Submission Date',
 'currency',
 'height',
 'enddate',
 'creative_id',
 'Placement(s)',
 'auction_id',
 'device_type',
 'site_name',
 'geo_country',
 'Description',
 'type',
 'campaign_name',
 'browser_eng_Date',
 'campaign_id',
 'browser',
 'Cost Centre',
 'Campaign Objectives',
 'startdate',
 'Black/white/audience list included?',
 'game_key',
 'kpis']

# Data Wrangling

In [9]:
df.head()

Unnamed: 0,game_key,campaign_id,type,width,height,creative_id,auction_id,geo_country,site_name,platform_os,...,Cost Centre,currency,Buy Rate (CPE),Volume Agreed,Gross Cost/Budget,Agency Fee,Percentage,Net Cost,browser_eng_Date,browser_eng_Time
0,adunit-facebook-conversational-commerce-phase-...,l5kk3r3,impression,0,0,x83byc8a,fd74243f-6606-4830-a0ef-dd12f66ec6f5,Thailand,www.wuxiaworld.com,6,...,SGP,USD,0.28,242185.0,67811.8,0.0,15.0,57640.03,2021-01-01,19:55:20.291000
1,adunit-facebook-conversational-commerce-phase-...,l5kk3r3,impression,0,0,x83byc8a,fd74243f-6606-4830-a0ef-dd12f66ec6f5,Thailand,www.wuxiaworld.com,6,...,SGP,USD,0.28,242185.0,67811.8,0.0,15.0,57640.03,2021-01-01,19:55:20.298000
2,adunit-facebook-conversational-commerce-phase-...,l5kk3r3,impression,0,0,tf2htrrm,b3af878b-fd1a-4c6f-91a2-4e3670d2fda5,Thailand,www.prachachat.net,6,...,SGP,USD,0.28,242185.0,67811.8,0.0,15.0,57640.03,2021-01-05,00:21:39.693000
3,adunit-facebook-conversational-commerce-phase-...,l5kk3r3,impression,0,0,x83byc8a,b7c22590-e784-43cf-874d-a661ad99601f,Thailand,www.prachachat.net,6,...,SGP,USD,0.28,242185.0,67811.8,0.0,15.0,57640.03,2021-01-06,09:30:44.188000
4,adunit-facebook-conversational-commerce-phase-...,l5kk3r3,impression,0,0,x83byc8a,b7c22590-e784-43cf-874d-a661ad99601f,Thailand,www.prachachat.net,6,...,SGP,USD,0.28,242185.0,67811.8,0.0,15.0,57640.03,2021-01-06,09:30:45.047000


In [10]:
df.columns

Index(['game_key', 'campaign_id', 'type', 'width', 'height', 'creative_id',
       'auction_id', 'geo_country', 'site_name', 'platform_os', 'device_type',
       'browser', 'campaign_name', 'Submission Date', 'Description',
       'Campaign Objectives', 'kpis', 'Placement(s)', 'startdate', 'enddate',
       'Serving Location(s)', 'Black/white/audience list included?',
       'Cost Centre', 'currency', 'Buy Rate (CPE)', 'Volume Agreed',
       'Gross Cost/Budget', 'Agency Fee', 'Percentage', 'Net Cost',
       'browser_eng_Date', 'browser_eng_Time'],
      dtype='object')

In [12]:
# unwanted column for ml model
columns_to_drop = ['game_key', 'campaign_id', 'creative_id', 'auction_id', 'campaign_name', 'height', 'width', 
                   'Submission Date', 'Description', 'Campaign Objectives', 'Black/white/audience list included?', 'Agency Fee']

In [13]:
df_copy = df.drop(columns_to_drop, axis=1)

In [14]:
df_copy.head()

Unnamed: 0,type,geo_country,site_name,platform_os,device_type,browser,kpis,Placement(s),startdate,enddate,Serving Location(s),Cost Centre,currency,Buy Rate (CPE),Volume Agreed,Gross Cost/Budget,Percentage,Net Cost,browser_eng_Date,browser_eng_Time
0,impression,Thailand,www.wuxiaworld.com,6,Mobile,Chrome,Engagement Rate,320x480 (Fullscreen mobile / interstitial)\n30...,2021-07-12,2021-09-08,Singapore,SGP,USD,0.28,242185.0,67811.8,15.0,57640.03,2021-01-01,19:55:20.291000
1,impression,Thailand,www.wuxiaworld.com,6,Mobile,Mobile Safari UI/WKWebView,Engagement Rate,320x480 (Fullscreen mobile / interstitial)\n30...,2021-07-12,2021-09-08,Singapore,SGP,USD,0.28,242185.0,67811.8,15.0,57640.03,2021-01-01,19:55:20.298000
2,impression,Thailand,www.prachachat.net,6,Mobile,Chrome Mobile,Engagement Rate,320x480 (Fullscreen mobile / interstitial)\n30...,2021-07-12,2021-09-08,Singapore,SGP,USD,0.28,242185.0,67811.8,15.0,57640.03,2021-01-05,00:21:39.693000
3,impression,Thailand,www.prachachat.net,6,Mobile,Mobile Safari,Engagement Rate,320x480 (Fullscreen mobile / interstitial)\n30...,2021-07-12,2021-09-08,Singapore,SGP,USD,0.28,242185.0,67811.8,15.0,57640.03,2021-01-06,09:30:44.188000
4,impression,Thailand,www.prachachat.net,6,Mobile,Chrome,Engagement Rate,320x480 (Fullscreen mobile / interstitial)\n30...,2021-07-12,2021-09-08,Singapore,SGP,USD,0.28,242185.0,67811.8,15.0,57640.03,2021-01-06,09:30:45.047000


In [15]:
# getting dataframe columns list
cols = df_copy.columns.tolist()

In [16]:
# numerical values
num_cols = df_copy._get_numeric_data().columns
num_cols

Index(['platform_os', 'Buy Rate (CPE)', 'Volume Agreed', 'Gross Cost/Budget',
       'Percentage', 'Net Cost'],
      dtype='object')

In [17]:
# catagorical values
list(set(cols) - set(num_cols))

['device_type',
 'Serving Location(s)',
 'browser_eng_Date',
 'browser',
 'browser_eng_Time',
 'Cost Centre',
 'type',
 'enddate',
 'site_name',
 'geo_country',
 'startdate',
 'kpis',
 'Placement(s)',
 'currency']

# Label Encoding

In [21]:
from sklearn.preprocessing import LabelEncoder

In [22]:
cols =['enddate',
 'Placement(s)',
 'Cost Centre',
 'geo_country',
 'site_name',
 'browser',
 'device_type',
 'type',
 'currency',
 'Serving Location(s)',
 'browser_eng_Date',
 'browser_eng_Time',
 'startdate',
 'kpis','platform_os']
#
# Encode labels of multiple columns at once
#
df_copy[cols] = df_copy[cols].apply(LabelEncoder().fit_transform)
#
# Print head
#
df_copy.head()

Unnamed: 0,type,geo_country,site_name,platform_os,device_type,browser,kpis,Placement(s),startdate,enddate,Serving Location(s),Cost Centre,currency,Buy Rate (CPE),Volume Agreed,Gross Cost/Budget,Percentage,Net Cost,browser_eng_Date,browser_eng_Time
0,2,8,5700,5,0,4,6,1,20,19,7,2,6,0.28,242185.0,67811.8,15.0,57640.03,23,75060
1,2,8,5700,5,0,19,6,1,20,19,7,2,6,0.28,242185.0,67811.8,15.0,57640.03,23,75061
2,2,8,4805,5,0,5,6,1,20,19,7,2,6,0.28,242185.0,67811.8,15.0,57640.03,27,1544
3,2,8,4805,5,0,18,6,1,20,19,7,2,6,0.28,242185.0,67811.8,15.0,57640.03,28,45867
4,2,8,4805,5,0,4,6,1,20,19,7,2,6,0.28,242185.0,67811.8,15.0,57640.03,28,45868


# Scale Data 

### Standard Scaler

In [23]:
scaler = DataCleaner(df_copy, deep=True)
normalizer = DataCleaner(df_copy, deep=True)

In [24]:
cols =['enddate',
 'Placement(s)',
 'Cost Centre',
 'geo_country',
 'site_name',
 'browser',
 'device_type',
 'type',
 'currency',
 'Serving Location(s)',
 'browser_eng_Date',
 'browser_eng_Time',
 'startdate',
 'kpis','platform_os']

In [25]:
df_cc = df_copy.drop(cols, axis=1)

In [26]:
df_cc.head()

Unnamed: 0,Buy Rate (CPE),Volume Agreed,Gross Cost/Budget,Percentage,Net Cost
0,0.28,242185.0,67811.8,15.0,57640.03
1,0.28,242185.0,67811.8,15.0,57640.03
2,0.28,242185.0,67811.8,15.0,57640.03
3,0.28,242185.0,67811.8,15.0,57640.03
4,0.28,242185.0,67811.8,15.0,57640.03


In [27]:
# Using StandardScaler to standardize the all columns
scale_list = df_cc.columns.to_list()
scaler.standardize_columns(scale_list)

Unnamed: 0,type,geo_country,site_name,platform_os,device_type,browser,kpis,Placement(s),startdate,enddate,Serving Location(s),Cost Centre,currency,Buy Rate (CPE),Volume Agreed,Gross Cost/Budget,Percentage,Net Cost,browser_eng_Date,browser_eng_Time
0,2,8,5700,5,0,4,6,1,20,19,7,2,6,-1.454603,2.29958,-0.087472,0.968693,-0.172359,23,75060
1,2,8,5700,5,0,19,6,1,20,19,7,2,6,-1.454603,2.29958,-0.087472,0.968693,-0.172359,23,75061
2,2,8,4805,5,0,5,6,1,20,19,7,2,6,-1.454603,2.29958,-0.087472,0.968693,-0.172359,27,1544
3,2,8,4805,5,0,18,6,1,20,19,7,2,6,-1.454603,2.29958,-0.087472,0.968693,-0.172359,28,45867
4,2,8,4805,5,0,4,6,1,20,19,7,2,6,-1.454603,2.29958,-0.087472,0.968693,-0.172359,28,45868
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99941,2,5,5625,4,0,18,6,1,12,10,7,2,4,2.396858,-0.45197,-0.629410,0.968693,-0.640576,37,65931
99942,2,5,5625,4,0,18,6,1,12,10,7,2,4,2.396858,-0.45197,-0.629410,0.968693,-0.640576,37,65935
99943,2,5,5625,4,0,18,6,1,12,10,7,2,4,2.396858,-0.45197,-0.629410,0.968693,-0.640576,37,65938
99944,1,5,5625,4,0,18,6,1,12,10,7,2,4,2.396858,-0.45197,-0.629410,0.968693,-0.640576,37,65939


### Normalize

In [28]:
normalizer.normalize_column(scale_list)

Unnamed: 0,type,geo_country,site_name,platform_os,device_type,browser,kpis,Placement(s),startdate,enddate,Serving Location(s),Cost Centre,currency,Buy Rate (CPE),Volume Agreed,Gross Cost/Budget,Percentage,Net Cost,browser_eng_Date,browser_eng_Time
0,2,8,5700,5,0,4,6,1,20,19,7,2,6,0.000001,0.938628,0.262816,0.000058,0.223394,23,75060
1,2,8,5700,5,0,19,6,1,20,19,7,2,6,0.000001,0.938628,0.262816,0.000058,0.223394,23,75061
2,2,8,4805,5,0,5,6,1,20,19,7,2,6,0.000001,0.938628,0.262816,0.000058,0.223394,27,1544
3,2,8,4805,5,0,18,6,1,20,19,7,2,6,0.000001,0.938628,0.262816,0.000058,0.223394,28,45867
4,2,8,4805,5,0,4,6,1,20,19,7,2,6,0.000001,0.938628,0.262816,0.000058,0.223394,28,45868
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99941,2,5,5625,4,0,18,6,1,12,10,7,2,4,0.000012,0.770678,0.485527,0.000287,0.412698,37,65931
99942,2,5,5625,4,0,18,6,1,12,10,7,2,4,0.000012,0.770678,0.485527,0.000287,0.412698,37,65935
99943,2,5,5625,4,0,18,6,1,12,10,7,2,4,0.000012,0.770678,0.485527,0.000287,0.412698,37,65938
99944,1,5,5625,4,0,18,6,1,12,10,7,2,4,0.000012,0.770678,0.485527,0.000287,0.412698,37,65939


# Save Clean Data (data versioned - dvc)

In [29]:
scaler = DataCleaner(scaler.df)
scaler.save_clean_data('../data/scaled_data.csv')

In [30]:
normalizer = DataCleaner(normalizer.df)
normalizer.save_clean_data('../data/norm_data.csv')