### Import Modules

In [3]:
import pandas as pd
import sys
sys.path.append("../scripts")
from data_cleaner import DataCleaner

In [18]:
# Load the A/B-test export and create the cleaning helper used further down.
raw_csv_path = "../data/AdSmartABdata.csv"
cleaner = DataCleaner()
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,auction_id,experiment,date,hour,device_make,platform_os,browser,yes,no
0,0008ef63-77a7-448b-bd1e-075f42c55e39,exposed,2020-07-10,8,Generic Smartphone,6,Chrome Mobile,0,0
1,000eabc5-17ce-4137-8efe-44734d914446,exposed,2020-07-07,10,Generic Smartphone,6,Chrome Mobile,0,0
2,0016d14a-ae18-4a02-a204-6ba53b52f2ed,exposed,2020-07-05,2,E5823,6,Chrome Mobile WebView,0,1
3,00187412-2932-4542-a8ef-3633901c98d9,control,2020-07-03,15,Samsung SM-A705FN,6,Facebook,0,0
4,001a7785-d3fe-4e11-a344-c8735acacc2c,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,0,0


In [19]:
# Inspect column dtypes and non-null counts of the frame just loaded.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8077 entries, 0 to 8076
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   auction_id   8077 non-null   object
 1   experiment   8077 non-null   object
 2   date         8077 non-null   object
 3   hour         8077 non-null   int64 
 4   device_make  8077 non-null   object
 5   platform_os  8077 non-null   int64 
 6   browser      8077 non-null   object
 7   yes          8077 non-null   int64 
 8   no           8077 non-null   int64 
dtypes: int64(4), object(5)
memory usage: 568.0+ KB


#### Use cleaner pipeline and clean the data

In [20]:
# Run the project's cleaning pipeline and inspect the resulting schema.
df = cleaner.run_pipeline(df)
df.info()
# The info() output shows that the data is cleaned and good to go.

# NOTE: this overwrites the same CSV that the notebook reads as its input, so
# re-running the notebook from the top will load whichever version was saved
# last rather than the original export.
# index=False keeps the RangeIndex out of the file, so reloading it does not
# produce a spurious "Unnamed: 0" column.
df.to_csv('../data/AdSmartABdata.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1243 entries, 0 to 1242
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   experiment   1243 non-null   object
 1   hour         1243 non-null   int64 
 2   platform_os  1243 non-null   int64 
 3   browser      1243 non-null   object
 4   day_of_week  1243 non-null   object
 5   brand        1243 non-null   object
 6   response     1243 non-null   int64 
dtypes: int64(3), object(4)
memory usage: 68.1+ KB


[Instructions](#instructions)
### Steps to follow for versioning the data after making changes
#### Run the following commands in terminal
##### Repeat the steps below for every version you create, changing the commit message and tag (`v1`, `v2`, …) each time
<ul>
    <li><i>dvc add data/AdSmartABdata.csv</i></li>
    <li><i>git add data/AdSmartABdata.csv.dvc</i></li>
    <li><i>git commit -m "Cleaned data saved as version 1"</i></li>
    <li><i>git tag -a 'v1' -m "Cleaned data"</i></li>
    <li><i>dvc push</i></li>
</ul>


#### Group based on browsers

In [15]:
# Count the non-null values in every column for each browser.
df.groupby('browser').count()

Unnamed: 0_level_0,experiment,hour,platform_os,day_of_week,brand,response
browser,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Chrome,2,2,2,2,2,2
Chrome Mobile,695,695,695,695,695,695
Chrome Mobile WebView,227,227,227,227,227,227
Chrome Mobile iOS,1,1,1,1,1,1
Facebook,156,156,156,156,156,156
Mobile Safari,14,14,14,14,14,14
Mobile Safari UI/WKWebView,3,3,3,3,3,3
Samsung Internet,145,145,145,145,145,145


In [13]:
# Select the five browsers with the most rows.
browser_top_five = (
    df.groupby('browser')['experiment']
    .count()
    .nlargest(5)
)
browser_top_five

browser
Chrome Mobile            695
Chrome Mobile WebView    227
Facebook                 156
Samsung Internet         145
Mobile Safari             14
Name: experiment, dtype: int64

In [14]:
# Write one CSV per top-five browser; `paths` records where each file went.
paths = []
for browser in browser_top_five.index:
    # A boolean mask compares against the value directly, so browser names
    # containing quotes cannot break the filter the way an f-string query can.
    browser_df = (
        df[df['browser'] == browser]
        .drop(columns=['browser'])  # constant within one subset
        .reset_index(drop=True)
    )
    # Browser names can contain spaces and '/' (e.g. "Mobile Safari
    # UI/WKWebView"); replace both so the name is a single valid file name.
    file_stem = browser.replace(' ', '_').replace('/', '_').lower()
    csv_path = f"../data/{file_stem}_browser.csv"
    browser_df.to_csv(csv_path, index=False)
    paths.append(csv_path)

In [21]:
# Map each top-five browser name to its own subset of `df`, re-indexed from 0.
# The `browser` column is kept in these subsets.
# A boolean mask is used instead of an f-string query so that browser names
# containing quotes cannot break the filter expression.
browser_names = {
    browser: df[df['browser'] == browser].reset_index(drop=True)
    for browser in browser_top_five.index
}

browser_names.keys()

dict_keys(['Chrome Mobile', 'Chrome Mobile WebView', 'Facebook', 'Samsung Internet', 'Mobile Safari'])

In [22]:
# Pick the Chrome Mobile subset. Direct indexing raises a KeyError naming the
# key if it is missing, instead of returning None and failing later on .head().
browser_df = browser_names['Chrome Mobile']
browser_df.head()

Unnamed: 0,experiment,hour,platform_os,browser,day_of_week,brand,response
0,exposed,16,6,Chrome Mobile,Saturday,generic,1
1,exposed,8,6,Chrome Mobile,Monday,generic,0
2,control,15,6,Chrome Mobile,Friday,generic,0
3,exposed,2,6,Chrome Mobile,Friday,generic,0
4,control,15,6,Chrome Mobile,Friday,generic,1


In [23]:
# Create version 2 (Chrome Mobile subset).
# index=False keeps the RangeIndex out of the file, so reloading it does not
# produce a spurious "Unnamed: 0" column.
browser_df.to_csv('../data/AdSmartABdata.csv', index=False)

Follow the <a id="instructions">instructions</a> above to version and tag this file correctly.

In [24]:
# Pick the Chrome Mobile WebView subset. Direct indexing raises a KeyError
# naming the key if it is missing, instead of returning None and failing later
# on .head().
browser_df = browser_names['Chrome Mobile WebView']
browser_df.head()

Unnamed: 0,experiment,hour,platform_os,browser,day_of_week,brand,response
0,exposed,2,6,Chrome Mobile WebView,Sunday,generic,0
1,exposed,6,6,Chrome Mobile WebView,Sunday,generic,0
2,exposed,20,6,Chrome Mobile WebView,Thursday,known brand,0
3,control,2,6,Chrome Mobile WebView,Monday,known brand,0
4,exposed,11,6,Chrome Mobile WebView,Saturday,known brand,1


In [25]:
# Create version 3 (Chrome Mobile WebView subset).
# index=False keeps the RangeIndex out of the file, so reloading it does not
# produce a spurious "Unnamed: 0" column.
browser_df.to_csv('../data/AdSmartABdata.csv', index=False)

In [26]:
# Pick the Facebook subset. Direct indexing raises a KeyError naming the key
# if it is missing, instead of returning None and failing later on .head().
browser_df = browser_names['Facebook']
browser_df.head()

Unnamed: 0,experiment,hour,platform_os,browser,day_of_week,brand,response
0,control,4,6,Facebook,Wednesday,known brand,1
1,control,15,6,Facebook,Friday,generic,0
2,exposed,13,6,Facebook,Thursday,known brand,1
3,control,20,6,Facebook,Thursday,known brand,1
4,control,15,6,Facebook,Friday,known brand,0


In [27]:
# Create version 4 (Facebook subset).
# index=False keeps the RangeIndex out of the file, so reloading it does not
# produce a spurious "Unnamed: 0" column.
browser_df.to_csv('../data/AdSmartABdata.csv', index=False)

In [28]:
# Pick the Samsung Internet subset. Direct indexing raises a KeyError naming
# the key if it is missing, instead of returning None and failing later on
# .head().
browser_df = browser_names['Samsung Internet']
browser_df.head()

Unnamed: 0,experiment,hour,platform_os,browser,day_of_week,brand,response
0,control,15,6,Samsung Internet,Friday,known brand,0
1,exposed,20,6,Samsung Internet,Thursday,known brand,0
2,exposed,3,6,Samsung Internet,Tuesday,known brand,0
3,exposed,0,6,Samsung Internet,Thursday,known brand,0
4,control,15,6,Samsung Internet,Friday,known brand,0


In [29]:
# Create version 5 (Samsung Internet subset).
# index=False keeps the RangeIndex out of the file, so reloading it does not
# produce a spurious "Unnamed: 0" column.
browser_df.to_csv('../data/AdSmartABdata.csv', index=False)

In [30]:
# Pick the Mobile Safari subset. Direct indexing raises a KeyError naming the
# key if it is missing, instead of returning None and failing later on .head().
browser_df = browser_names['Mobile Safari']
browser_df.head()

Unnamed: 0,experiment,hour,platform_os,browser,day_of_week,brand,response
0,control,19,6,Mobile Safari,Friday,known brand,0
1,exposed,8,5,Mobile Safari,Friday,known brand,0
2,control,15,5,Mobile Safari,Friday,known brand,0
3,control,3,5,Mobile Safari,Saturday,known brand,1
4,exposed,3,5,Mobile Safari,Sunday,known brand,1


In [31]:
# Create version 6 (Mobile Safari subset).
# index=False keeps the RangeIndex out of the file, so reloading it does not
# produce a spurious "Unnamed: 0" column.
browser_df.to_csv('../data/AdSmartABdata.csv', index=False)

#### Group based on OS platform

In [32]:
# List the distinct platform_os codes present in the data.
df.platform_os.unique()

array([6, 5])

In [35]:
# Map each platform_os code (in order of first appearance) to its own subset
# of `df`, re-indexed from 0.
os_names = {
    os_code: df[df['platform_os'] == os_code].reset_index(drop=True)
    for os_code in df['platform_os'].unique()
}
os_names.keys()

dict_keys([6, 5])

In [38]:
# Pick the subset for platform_os 6. Direct indexing raises a KeyError naming
# the key if it is missing, instead of returning None and failing later on
# .head().
os_df = os_names[6]
os_df.head()

Unnamed: 0,experiment,hour,platform_os,browser,day_of_week,brand,response
0,exposed,2,6,Chrome Mobile WebView,Sunday,generic,0
1,exposed,16,6,Chrome Mobile,Saturday,generic,1
2,exposed,8,6,Chrome Mobile,Monday,generic,0
3,control,4,6,Facebook,Wednesday,known brand,1
4,control,15,6,Chrome Mobile,Friday,generic,0


In [39]:
# Create version 7 (platform_os 6 subset).
# index=False keeps the RangeIndex out of the file, so reloading it does not
# produce a spurious "Unnamed: 0" column.
os_df.to_csv('../data/AdSmartABdata.csv', index=False)

In [40]:
# Pick the subset for platform_os 5. Direct indexing raises a KeyError naming
# the key if it is missing, instead of returning None and failing later on
# .head().
os_df = os_names[5]
os_df.head()

Unnamed: 0,experiment,hour,platform_os,browser,day_of_week,brand,response
0,exposed,8,5,Mobile Safari,Friday,known brand,0
1,control,15,5,Mobile Safari,Friday,known brand,0
2,control,15,5,Mobile Safari UI/WKWebView,Friday,known brand,0
3,control,3,5,Mobile Safari,Saturday,known brand,1
4,exposed,3,5,Mobile Safari,Sunday,known brand,1


In [41]:
# Create version 8 (platform_os 5 subset); version 7 is the platform_os 6 file.
# index=False keeps the RangeIndex out of the file, so reloading it does not
# produce a spurious "Unnamed: 0" column.
os_df.to_csv('../data/AdSmartABdata.csv', index=False)