In [1]:
## Import Libraries
import sys
import os
sys.path.append(os.path.abspath(os.path.join('..')))
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np 
from pandas.api.types import is_string_dtype, is_numeric_dtype
%matplotlib inline

In [2]:
# Relative path of the impression-log CSV that is loaded into Ilog_df below.
CSV_PATH = "../data/impression_log.csv"

In [3]:
def read_proccessed_data(csv_path):
    """Load the CSV file at ``csv_path`` into a pandas DataFrame.

    Prints "file read as csv" and returns the DataFrame when the file is
    read. If the file does not exist, prints "file not found" and returns
    None, so callers should check the result before using it.
    """
    try:
        loaded_frame = pd.read_csv(csv_path)
    except FileNotFoundError:
        print("file not found")
        return None
    print("file read as csv")
    return loaded_frame

In [4]:
def get_data_info(Ilog_df: pd.DataFrame):
    """Print the row count, the column count and the ``info()`` summary.

    Returns the value of ``Ilog_df.info()``.
    """
    n_rows = Ilog_df.shape[0]
    n_cols = Ilog_df.shape[1]

    print(f"Number of rows: {n_rows}")
    print(f"Number of columns: {n_cols}")

    info_result = Ilog_df.info()
    return info_result

In [5]:
def get_statistics_info(Ilog_df: pd.DataFrame):
    """Return the ``describe(include='all')`` summary table of ``Ilog_df``.

    ``include='all'`` asks pandas to summarise every column rather than
    only the numeric ones.
    """
    summary_table = Ilog_df.describe(include='all')
    return summary_table

In [6]:
# Load the impression log, then show its structure, summary statistics and first rows.
Ilog_df = read_proccessed_data(CSV_PATH)
get_data_info(Ilog_df)

# Only the last expression of a cell is rendered automatically, so the
# statistics table is passed to display() (provided by the IPython kernel);
# as a bare call in the middle of the cell its result was discarded.
display(get_statistics_info(Ilog_df))
Ilog_df.head()

file read as csv
Number of rows: 100000
Number of columns: 24
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 24 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   Unnamed: 0          100000 non-null  object 
 1   LogEntryTime        100000 non-null  object 
 2   AdvertiserId        100000 non-null  object 
 3   CampaignId          100000 non-null  object 
 4   AdGroupId           100000 non-null  object 
 5   AudienceID          96546 non-null   object 
 6   CreativeId          100000 non-null  object 
 7   AdFormat            100000 non-null  object 
 8   Frequency           100000 non-null  int64  
 9   Site                100000 non-null  object 
 10  FoldPosition        100000 non-null  int64  
 11  Country             100000 non-null  object 
 12  Region              99999 non-null   object 
 13  City                99999 non-null   object 
 14  DeviceType          100

Unnamed: 0.1,Unnamed: 0,LogEntryTime,AdvertiserId,CampaignId,AdGroupId,AudienceID,CreativeId,AdFormat,Frequency,Site,...,DeviceType,OSFamily,OS,Browser,DeviceMake,AdvertiserCurrency,click,engagement,video-end,video-start
0,00006a06-14e2-47d9-b999-fbeeac67dd2b,2021-09-25 00:22:37,868ko1s,t29si1w,e9qf2dm,0,9wkrkl5j,300x250,0,scrabblewordfinder.org,...,4,5.0,173.0,7.0,Apple,0.006409,0,0,0,0
1,00007898-29c9-43f1-82e5-43c5a856d0f4,2021-09-25 02:31:35,868ko1s,fiwemi8,8m750eh,0,zhre4utp,300x250,0,www.thecoli.com,...,4,5.0,173.0,7.0,Apple,0.002778,0,0,0,0
2,0000d06f-5239-4123-af41-bd1c0e0e8d48,2021-09-25 20:12:52,868ko1s,awbu4q4,oropida,lnqjhqm,ab6spdyi,300x250,0,www.today.com,...,4,6.0,156.0,6.0,Samsung,0.008947,0,0,0,0
3,0000d623-aaf1-4be8-81f7-eb6c96b30aee,2021-09-22 16:04:12,868ko1s,t29si1w,e9qf2dm,0,2spj6krt,300x250,0,www.cbssports.com,...,4,5.0,101.0,7.0,Apple,0.007673,0,0,0,0
4,0000e836-9778-4ea9-b686-9e7bf26f90bd,2021-09-22 14:33:59,868ko1s,awbu4q4,oropida,lnqjhqm,ql1q5nq9,300x250,0,backroadramblers.com,...,4,5.0,101.0,7.0,Apple,0.008852,0,1,1,1


## Missing Values

In [7]:
def percent_missing(df):
    """Return the percentage of missing cells in ``df``, rounded to 2 decimals.

    Args:
        df: DataFrame to inspect.

    Returns:
        The share of null cells over all cells of the frame, as a
        percentage. A frame with no cells yields 0.0.
    """
    # DataFrame.size is rows * columns. It replaces np.product, which was
    # deprecated in NumPy 1.25 and removed in NumPy 2.0.
    total_cells = df.size
    if total_cells == 0:
        # Nothing to count; avoids a 0/0 division.
        return 0.0
    total_missing = int(df.isnull().sum().sum())
    return round((total_missing / total_cells) * 100, 2)
# Report the share of missing cells across the whole impression log.
overall_missing_pct = percent_missing(Ilog_df)
print("The Impression_log data dataset contains", overall_missing_pct, "%", "missing values.")

The Impression_log data dataset contains 0.15 % missing values.


## Handling Missing Values

In [8]:
def percent_missing_for_col(df, col_name: str):
    """Return the percentage of null entries in column ``col_name`` of ``df``.

    The value is rounded to two decimals. A column without any rows
    yields 0.0.
    """
    column = df[col_name]
    n_rows = len(column)
    if n_rows <= 0:
        return 0.0

    n_missing = column.isnull().sum()
    return round((n_missing / n_rows) * 100, 2)

In [9]:
# One row per Ilog_df column: the column name and its percentage of nulls.
columns = Ilog_df.columns.tolist()
null_percent_df = pd.DataFrame({
    'column': columns,
    'null_percent': [percent_missing_for_col(Ilog_df, name) for name in columns],
})

In [10]:
# Show the columns ordered from the highest to the lowest null percentage.
null_percent_df.sort_values('null_percent', ascending=False)

Unnamed: 0,column,null_percent
5,AudienceID,3.45
18,DeviceMake,0.01
17,Browser,0.01
16,OS,0.01
0,Unnamed: 0,0.0
13,City,0.0
22,video-end,0.0
21,engagement,0.0
20,click,0.0
19,AdvertiserCurrency,0.0


### I used the forward-fill method to fill the missing values

In [11]:
# Forward-fill each column: a null takes the last non-null value above it.
# Series.ffill() replaces fillna(method='ffill'), whose `method` argument
# is deprecated since pandas 2.1.
for column_name in ('AudienceID', 'DeviceMake', 'Browser', 'OS'):
    Ilog_df[column_name] = Ilog_df[column_name].ffill()

In [12]:
# Forward-fill each column: a null takes the last non-null value above it.
# Series.ffill() replaces fillna(method='ffill'), whose `method` argument
# is deprecated since pandas 2.1.
for column_name in ('OSFamily', 'Region', 'City'):
    Ilog_df[column_name] = Ilog_df[column_name].ffill()

In [13]:
#checking after handling the missing values
def percent_missing(df):
    """Return the percentage of missing cells in ``df``, rounded to 2 decimals.

    NOTE(review): this re-declares the ``percent_missing`` helper already
    defined in the "Missing Values" section; consider deleting this copy
    and reusing that definition.

    Args:
        df: DataFrame to inspect.

    Returns:
        The share of null cells over all cells of the frame, as a
        percentage. A frame with no cells yields 0.0.
    """
    # DataFrame.size is rows * columns. It replaces np.product, which was
    # deprecated in NumPy 1.25 and removed in NumPy 2.0.
    total_cells = df.size
    if total_cells == 0:
        # Nothing to count; avoids a 0/0 division.
        return 0.0
    total_missing = int(df.isnull().sum().sum())
    return round((total_missing / total_cells) * 100, 2)
# Re-compute the overall share of missing cells now that the fills have run.
remaining_missing_pct = percent_missing(Ilog_df)
print("The Impression_log data dataset contains", remaining_missing_pct, "%", "missing values.")

The Impression_log data dataset contains 0.0 % missing values.


Remove duplicate rows

In [14]:
# Drop rows that are exact duplicates across all columns. The result is
# rebound explicitly instead of using inplace=True, which pandas guidance
# discourages. The surviving rows keep their original index labels.
Ilog_df = Ilog_df.drop_duplicates()

In [15]:
# Summarise the frame again after the missing-value fill and de-duplication cells.
Ilog_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99999 entries, 0 to 99999
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          99999 non-null  object 
 1   LogEntryTime        99999 non-null  object 
 2   AdvertiserId        99999 non-null  object 
 3   CampaignId          99999 non-null  object 
 4   AdGroupId           99999 non-null  object 
 5   AudienceID          99999 non-null  object 
 6   CreativeId          99999 non-null  object 
 7   AdFormat            99999 non-null  object 
 8   Frequency           99999 non-null  int64  
 9   Site                99999 non-null  object 
 10  FoldPosition        99999 non-null  int64  
 11  Country             99999 non-null  object 
 12  Region              99999 non-null  object 
 13  City                99999 non-null  object 
 14  DeviceType          99999 non-null  int64  
 15  OSFamily            99999 non-null  float64
 16  OS  

In [16]:
# Write the cleaned frame to CSV; index=False keeps the row index out of the file.
PROCESSED_CSV_PATH = "../data/processed.csv"
Ilog_df.to_csv(PROCESSED_CSV_PATH, index=False)