In [ ]:
# standard imports
import json
from pathlib import Path
from itertools import chain

# third party imports
import numpy as np
import pandas as pd

In [ ]:
# constants
# directory that is globbed for the input *.ftr feather files
DATAFRAMES = Path('./dataframes')
# directory the processed feather files are written to
PROCESSED = Path('./processed')

In [ ]:
# load every feather file found in DATAFRAMES, keyed by its file name
# without the extension
dfs = {}
for feather_path in DATAFRAMES.glob('*.ftr'):
    dfs[feather_path.stem] = pd.read_feather(feather_path)

In [ ]:
def check_nulls(df):
    """Print the number and fraction of missing values in each column of ``df``.

    The printed table is indexed by column name: ``missing`` is the count of
    null entries and ``missing_%`` is that count divided by the number of rows
    (a fraction, not a percentage). Nothing is returned.
    """
    null_counts = df.isna().sum()
    row_count = len(df.index)
    summary = pd.DataFrame({
        'missing': null_counts,
        'missing_%': null_counts / row_count,
    })
    print(summary)

In [ ]:
def export_to_feather(df, output_path):
    """Write ``df`` to a Feather file at ``output_path``.

    The frame's index is first turned into an ordinary column with
    ``reset_index()``, so the written file contains that extra column
    (named ``index`` for an unnamed index) next to the data columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist.
    output_path : str or path-like
        Destination file. When it is a string or path-like object, missing
        parent directories are created before writing; any other object is
        handed to ``DataFrame.to_feather`` untouched.
    """
    import os

    # a missing output directory would otherwise make the write fail
    if isinstance(output_path, (str, os.PathLike)):
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    df.reset_index().to_feather(output_path)

## Page Inits Processing

In [ ]:
# page-initialisation table, i.e. the frame loaded from the file whose stem is 'PageInits'
pi = dfs['PageInits']

In [ ]:
# preview the first rows of the page-init table
pi.head()

Unnamed: 0,index,browserRequest.screenHeight,browserRequest.screenPixelDepth,browserRequest.screenWidth,campaignReferrer,deviceVentileGroup,pageId,pageInitializationId,productCategoryType,productPath,publisherUserId,request.requestedAt,segmentationType,request.nextRequestedAt,request.lastRequestedAt,request.timeDifference,sessionId
0,8936,960,0,1280,a88b7dcd1a9e3e17770bbaa6d7515b31a2d7e85d,10,flight.home,aa6621a4-2d8b-4c4e-be91-f0dbe23eb3d0,FLIGHTS,FLIGHTS,00001d27aa482ad2b982296c016365eed6441502,2019-10-17 09:54:25.819,LOW_CONVERTING,NaT,NaT,,1
1,4946,912,24,1368,d0941e68da8f38151ff86a61fc59f7c5cf9fcaa2,2,flight.home,ec4281fa-dd7a-4d82-ab43-ff5bbcebe5d6,FLIGHTS,FLIGHTS,000030a6d85ad2effe150c581b1735ce243ab4ab,2019-08-05 02:44:42.940,LOW_CONVERTING,2019-08-05 02:45:10.157,NaT,,1
2,4965,912,24,1368,d0941e68da8f38151ff86a61fc59f7c5cf9fcaa2,0,flight.list,20397260-1a09-4e54-86a0-e37085858ad1,FLIGHTS,FLIGHTS,000030a6d85ad2effe150c581b1735ce243ab4ab,2019-08-05 02:45:10.157,LOW_CONVERTING,2019-10-20 17:11:21.537,2019-08-05 02:44:42.940,0.453617,1
3,14251,912,24,1368,d0941e68da8f38151ff86a61fc59f7c5cf9fcaa2,14,flight.home,b5df6141-a133-49b2-8c55-ff4ed4fe3c58,FLIGHTS,FLIGHTS,000030a6d85ad2effe150c581b1735ce243ab4ab,2019-10-20 17:11:21.537,LOW_CONVERTING,2019-10-20 17:12:37.846,2019-08-05 02:45:10.157,110306.189667,2
4,14264,912,24,1368,d0941e68da8f38151ff86a61fc59f7c5cf9fcaa2,0,flight.list,f6acbd41-21ad-4eac-b3c0-cba9fadbb87b,FLIGHTS,FLIGHTS,000030a6d85ad2effe150c581b1735ce243ab4ab,2019-10-20 17:12:37.846,LOW_CONVERTING,2019-10-20 17:14:11.924,2019-10-20 17:11:21.537,1.271817,2


In [ ]:
# report missing-value counts and fractions per column
check_nulls(pi)

                                 missing  missing_%
index                                  0   0.000000
browserRequest.screenHeight            0   0.000000
browserRequest.screenPixelDepth        0   0.000000
browserRequest.screenWidth             0   0.000000
campaignReferrer                       0   0.000000
deviceVentileGroup                     0   0.000000
pageId                                 0   0.000000
pageInitializationId                   0   0.000000
productCategoryType                    0   0.000000
productPath                            0   0.000000
publisherUserId                        0   0.000000
request.requestedAt                    0   0.000000
segmentationType                       0   0.000000
request.nextRequestedAt           529213   0.218794
request.lastRequestedAt           529213   0.218794
request.timeDifference            529213   0.218794
sessionId                              0   0.000000


### Aggregate Session Information to Join back

In [ ]:
# get session information: one row per (user, session) holding the time of
# the session's first request
session_info = pi.groupby(['publisherUserId','sessionId']).agg({'request.requestedAt': 'min'}).reset_index()

# order every user's sessions chronologically so that shift(-1) below picks
# the next session in time rather than merely the next sessionId
session_info = (session_info
                .sort_values(['publisherUserId', 'request.requestedAt', 'sessionId'])
                .reset_index(drop=True))

# create new column for next sessions (NaT for a user's last session)
session_info['nextSession'] = session_info.groupby('publisherUserId')['request.requestedAt'].shift(-1)

# create session timedelta in whole days (NaN when there is no next session)
session_info['daysToNextSession'] = (session_info['nextSession'] - session_info['request.requestedAt']).dt.days

In [ ]:
# create target labels: 1 when the next session starts within N days, else 0
# (a missing daysToNextSession compares False and is therefore labelled 0)
days_to_next = session_info['daysToNextSession']
for horizon in (15, 30, 45, 60):
    session_info[f'returnIn{horizon}Days'] = days_to_next.le(horizon).astype(int)

In [ ]:
# rename the aggregated request time: it now holds the session's start time
session_info = session_info.rename({'request.requestedAt': 'sessionTime'}, axis='columns')

In [ ]:
# list the session-level column names as an array
session_info.columns.values

array(['publisherUserId', 'sessionId', 'sessionTime', 'nextSession',
       'daysToNextSession', 'returnIn15Days', 'returnIn30Days',
       'returnIn45Days', 'returnIn60Days'], dtype=object)

In [ ]:
# attach the session-level columns to every page-init row and leave out the
# saved row index column
pi_merged = pd.merge(
    pi.drop(columns='index'),
    session_info,
    how='inner',
    on=['publisherUserId', 'sessionId'],
)

In [ ]:
# list the merged column names as an array
pi_merged.columns.values

array(['browserRequest.screenHeight', 'browserRequest.screenPixelDepth',
       'browserRequest.screenWidth', 'campaignReferrer',
       'deviceVentileGroup', 'pageId', 'pageInitializationId',
       'productCategoryType', 'productPath', 'publisherUserId',
       'request.requestedAt', 'segmentationType',
       'request.nextRequestedAt', 'request.lastRequestedAt',
       'request.timeDifference', 'sessionId', 'sessionTime',
       'nextSession', 'daysToNextSession', 'returnIn15Days',
       'returnIn30Days', 'returnIn45Days', 'returnIn60Days'], dtype=object)

In [ ]:
# preview the first rows of the merged page-init table
pi_merged.head()

Unnamed: 0,browserRequest.screenHeight,browserRequest.screenPixelDepth,browserRequest.screenWidth,campaignReferrer,deviceVentileGroup,pageId,pageInitializationId,productCategoryType,productPath,publisherUserId,...,request.lastRequestedAt,request.timeDifference,sessionId,sessionTime,nextSession,daysToNextSession,returnIn15Days,returnIn30Days,returnIn45Days,returnIn60Days
0,960,0,1280,a88b7dcd1a9e3e17770bbaa6d7515b31a2d7e85d,10,flight.home,aa6621a4-2d8b-4c4e-be91-f0dbe23eb3d0,FLIGHTS,FLIGHTS,00001d27aa482ad2b982296c016365eed6441502,...,NaT,,1,2019-10-17 09:54:25.819,NaT,,0,0,0,0
1,912,24,1368,d0941e68da8f38151ff86a61fc59f7c5cf9fcaa2,2,flight.home,ec4281fa-dd7a-4d82-ab43-ff5bbcebe5d6,FLIGHTS,FLIGHTS,000030a6d85ad2effe150c581b1735ce243ab4ab,...,NaT,,1,2019-08-05 02:44:42.940,2019-10-20 17:11:21.537,76.0,0,0,0,0
2,912,24,1368,d0941e68da8f38151ff86a61fc59f7c5cf9fcaa2,0,flight.list,20397260-1a09-4e54-86a0-e37085858ad1,FLIGHTS,FLIGHTS,000030a6d85ad2effe150c581b1735ce243ab4ab,...,2019-08-05 02:44:42.940,0.453617,1,2019-08-05 02:44:42.940,2019-10-20 17:11:21.537,76.0,0,0,0,0
3,912,24,1368,d0941e68da8f38151ff86a61fc59f7c5cf9fcaa2,14,flight.home,b5df6141-a133-49b2-8c55-ff4ed4fe3c58,FLIGHTS,FLIGHTS,000030a6d85ad2effe150c581b1735ce243ab4ab,...,2019-08-05 02:45:10.157,110306.189667,2,2019-10-20 17:11:21.537,2019-10-20 18:04:43.568,0.0,1,1,1,1
4,912,24,1368,d0941e68da8f38151ff86a61fc59f7c5cf9fcaa2,0,flight.list,f6acbd41-21ad-4eac-b3c0-cba9fadbb87b,FLIGHTS,FLIGHTS,000030a6d85ad2effe150c581b1735ce243ab4ab,...,2019-10-20 17:11:21.537,1.271817,2,2019-10-20 17:11:21.537,2019-10-20 18:04:43.568,0.0,1,1,1,1


In [ ]:
# write the session-enriched page inits to the processed directory
export_to_feather(pi_merged, f'{PROCESSED}/pi.ftr')

## Conversions Processing

In [ ]:
# conversions table, i.e. the frame loaded from the file whose stem is 'Conversions'
cv = dfs['Conversions']

In [ ]:
# preview the first rows of the conversions table
cv.head()

Unnamed: 0,index,tripType,browserRequest.screenWidth,browserRequest.screenPixelDepth,travelDateStart,carPickupAirportCode,publisherUserId,pageId,carDropoffAirportCode,destination,...,travelDateEnd,origination,browserRequest.screenHeight,cookieSize,orderId,advancePurchaseRangeType,productCategoryType,request.requestedAt,request.requestId,daysFromTravel
0,12585,ONE_WAY,1680,24,2019-10-19 00:00:00+00:00,FLL,0002eaacc4a7f7c079e90bc88e9d3faea72c9dea,flight.confirmation,SEA,SEA,...,2019-10-19 00:00:00+00:00,FLL,1050,125,29bf57d0e19a38a4bdd12799d2ca5f818d7de878,WEEKEND_TRAVEL_LESS_THAN_OR_EQUAL_TO_21_DAYS,FLIGHTS,2019-10-17 00:42:13.787,4,1
1,32655,ROUND_TRIP,1920,24,2019-11-09 00:00:00+00:00,NYC,000360d0b013b1138f404e3a87761f01ac203bd5,flight.confirmation,CTU,CTU,...,2019-11-23 00:00:00+00:00,NYC,1080,2544,2a632e6f5abf68250473d3c0d64fab3887470064,WEEKEND_TRAVEL_GREATER_THAN_21_DAYS,FLIGHTS,2019-10-03 15:32:38.162,4,36
2,17503,ROUND_TRIP,1280,24,2019-09-20 00:00:00+00:00,SFO,00053a150ddbae7c9b1720f73166a1ea94d0be54,flight.confirmation,NYC,NYC,...,2019-09-22 00:00:00+00:00,SFO,720,122,c99a8f9e8580d5ef62bfe5699dea9893dab19f93,WEEKEND_TRAVEL_LESS_THAN_OR_EQUAL_TO_21_DAYS,FLIGHTS,2019-09-16 07:18:44.596,4,3
3,444,ONE_WAY,1280,24,2019-08-19 00:00:00+00:00,WAS,0006ac15bb4b5694e9acc380e2c48f87b15e19ee,flight.confirmation,BUF,BUF,...,2019-08-19 00:00:00+00:00,WAS,720,122,6b15e1013d386bd6aba6ac34487c22918f9a0d80,WEEKDAY_TRAVEL_LESS_THAN_OR_EQUAL_TO_21_DAYS,FLIGHTS,2019-08-01 00:17:09.628,4,17
4,466,ONE_WAY,1280,24,2019-08-21 00:00:00+00:00,BUF,0006ac15bb4b5694e9acc380e2c48f87b15e19ee,flight.confirmation,BOS,BOS,...,2019-08-21 00:00:00+00:00,BUF,720,122,54ec0741611b00fc1c8621eae6880ff4caa190c4,WEEKDAY_TRAVEL_LESS_THAN_OR_EQUAL_TO_21_DAYS,FLIGHTS,2019-08-02 03:51:56.537,4,18


In [ ]:
# list the conversion column names as an array
cv.columns.values

array(['index', 'tripType', 'browserRequest.screenWidth',
       'browserRequest.screenPixelDepth', 'travelDateStart',
       'carPickupAirportCode', 'publisherUserId', 'pageId',
       'carDropoffAirportCode', 'destination', 'travelers',
       'travelDateEnd', 'origination', 'browserRequest.screenHeight',
       'cookieSize', 'orderId', 'advancePurchaseRangeType',
       'productCategoryType', 'request.requestedAt', 'request.requestId',
       'daysFromTravel'], dtype=object)

In [ ]:
# drop the saved row index together with the origination and car-airport
# columns, none of which are referenced later in this notebook
cols_to_drop = ['index', 'origination', 'carDropoffAirportCode', 'carPickupAirportCode']
cv = cv.drop(columns=cols_to_drop)

In [ ]:
# add travel duration column: whole days between travel start and travel end
trip_length = cv['travelDateEnd'] - cv['travelDateStart']
cv['travelDuration'] = trip_length.dt.days

In [ ]:
# preview the conversions table after the column changes
cv.head()

Unnamed: 0,tripType,browserRequest.screenWidth,browserRequest.screenPixelDepth,travelDateStart,publisherUserId,pageId,destination,travelers,travelDateEnd,browserRequest.screenHeight,cookieSize,orderId,advancePurchaseRangeType,productCategoryType,request.requestedAt,request.requestId,daysFromTravel,travelDuration
0,ONE_WAY,1680,24,2019-10-19 00:00:00+00:00,0002eaacc4a7f7c079e90bc88e9d3faea72c9dea,flight.confirmation,SEA,1,2019-10-19 00:00:00+00:00,1050,125,29bf57d0e19a38a4bdd12799d2ca5f818d7de878,WEEKEND_TRAVEL_LESS_THAN_OR_EQUAL_TO_21_DAYS,FLIGHTS,2019-10-17 00:42:13.787,4,1,0
1,ROUND_TRIP,1920,24,2019-11-09 00:00:00+00:00,000360d0b013b1138f404e3a87761f01ac203bd5,flight.confirmation,CTU,1,2019-11-23 00:00:00+00:00,1080,2544,2a632e6f5abf68250473d3c0d64fab3887470064,WEEKEND_TRAVEL_GREATER_THAN_21_DAYS,FLIGHTS,2019-10-03 15:32:38.162,4,36,14
2,ROUND_TRIP,1280,24,2019-09-20 00:00:00+00:00,00053a150ddbae7c9b1720f73166a1ea94d0be54,flight.confirmation,NYC,1,2019-09-22 00:00:00+00:00,720,122,c99a8f9e8580d5ef62bfe5699dea9893dab19f93,WEEKEND_TRAVEL_LESS_THAN_OR_EQUAL_TO_21_DAYS,FLIGHTS,2019-09-16 07:18:44.596,4,3,2
3,ONE_WAY,1280,24,2019-08-19 00:00:00+00:00,0006ac15bb4b5694e9acc380e2c48f87b15e19ee,flight.confirmation,BUF,2,2019-08-19 00:00:00+00:00,720,122,6b15e1013d386bd6aba6ac34487c22918f9a0d80,WEEKDAY_TRAVEL_LESS_THAN_OR_EQUAL_TO_21_DAYS,FLIGHTS,2019-08-01 00:17:09.628,4,17,0
4,ONE_WAY,1280,24,2019-08-21 00:00:00+00:00,0006ac15bb4b5694e9acc380e2c48f87b15e19ee,flight.confirmation,BOS,2,2019-08-21 00:00:00+00:00,720,122,54ec0741611b00fc1c8621eae6880ff4caa190c4,WEEKDAY_TRAVEL_LESS_THAN_OR_EQUAL_TO_21_DAYS,FLIGHTS,2019-08-02 03:51:56.537,4,18,0


In [ ]:
# report missing-value counts and fractions per column
check_nulls(cv)

                                 missing  missing_%
tripType                               0        0.0
browserRequest.screenWidth             0        0.0
browserRequest.screenPixelDepth        0        0.0
travelDateStart                        0        0.0
publisherUserId                        0        0.0
pageId                                 0        0.0
destination                            0        0.0
travelers                              0        0.0
travelDateEnd                          0        0.0
browserRequest.screenHeight            0        0.0
cookieSize                             0        0.0
orderId                                0        0.0
advancePurchaseRangeType               0        0.0
productCategoryType                    0        0.0
request.requestedAt                    0        0.0
request.requestId                      0        0.0
daysFromTravel                         0        0.0
travelDuration                         0        0.0


In [ ]:
# rename the request timestamp so that it reads as the time of the conversion
cv = cv.rename({'request.requestedAt': 'conversionTime'}, axis='columns')

In [ ]:
# write the cleaned conversions to the processed directory
export_to_feather(cv, f'{PROCESSED}/cv.ftr')

## CSI Processing

In [ ]:
# client-side info table, i.e. the frame loaded from the file whose stem is 'ClientSideInfos'
csi = dfs['ClientSideInfos']

In [ ]:
# order the client-side events per user by their request time
csi = csi.sort_values(by=['publisherUserId', 'request.requestedAt'])

In [ ]:
# preview the first rows of the sorted client-side table
csi.head()

Unnamed: 0,index,customDimensions.documentWidth,pageId,customDimensions.viewportHeight,customDimensions.viewportWidth,customDimensions.documentHeight,customDimensions.label,pageInitializationId,publisherUserId,request.requestedAt,customDimensions.nextLabel
591986,591986,1351,flight.home,770,1368,3227,TRIP_TYPE,b5df6141-a133-49b2-8c55-ff4ed4fe3c58,000030a6d85ad2effe150c581b1735ce243ab4ab,2019-10-20 17:12:31.078000+00:00,TRIP_TYPE_ROUNDTRIP
592142,592142,1351,flight.home,770,1368,3227,TRIP_TYPE_ROUNDTRIP,b5df6141-a133-49b2-8c55-ff4ed4fe3c58,000030a6d85ad2effe150c581b1735ce243ab4ab,2019-10-20 17:12:31.089000+00:00,SEARCH
604746,604746,1351,flight.list,770,1368,5440,SEARCH,f6acbd41-21ad-4eac-b3c0-cba9fadbb87b,000030a6d85ad2effe150c581b1735ce243ab4ab,2019-10-20 17:14:14.085000+00:00,SEARCH
590789,590789,1351,flight.home,770,1368,3227,FLIGHT_ORIGIN,562ded75-98d1-4499-8b01-36cab349771f,000030a6d85ad2effe150c581b1735ce243ab4ab,2019-10-20 17:15:10.267000+00:00,FLIGHT_ORIGIN
590475,590475,1351,flight.home,770,1368,3227,FLIGHT_DESTINATION,562ded75-98d1-4499-8b01-36cab349771f,000030a6d85ad2effe150c581b1735ce243ab4ab,2019-10-20 17:15:10.305000+00:00,FLIGHT_DESTINATION


In [ ]:
# report missing-value counts and fractions per column
check_nulls(csi)

                                 missing  missing_%
index                                  0   0.000000
customDimensions.documentWidth         0   0.000000
pageId                                 0   0.000000
customDimensions.viewportHeight        0   0.000000
customDimensions.viewportWidth         0   0.000000
customDimensions.documentHeight        0   0.000000
customDimensions.label                 0   0.000000
pageInitializationId                   0   0.000000
publisherUserId                        0   0.000000
request.requestedAt                    0   0.000000
customDimensions.nextLabel        194705   0.135325


In [ ]:
# drop the saved row index and the pre-existing next-label column
csi = csi.drop(['index', 'customDimensions.nextLabel'], axis='columns')

In [ ]:
# preview the client-side table after dropping columns
csi.head()

Unnamed: 0,customDimensions.documentWidth,pageId,customDimensions.viewportHeight,customDimensions.viewportWidth,customDimensions.documentHeight,customDimensions.label,pageInitializationId,publisherUserId,request.requestedAt
591986,1351,flight.home,770,1368,3227,TRIP_TYPE,b5df6141-a133-49b2-8c55-ff4ed4fe3c58,000030a6d85ad2effe150c581b1735ce243ab4ab,2019-10-20 17:12:31.078000+00:00
592142,1351,flight.home,770,1368,3227,TRIP_TYPE_ROUNDTRIP,b5df6141-a133-49b2-8c55-ff4ed4fe3c58,000030a6d85ad2effe150c581b1735ce243ab4ab,2019-10-20 17:12:31.089000+00:00
604746,1351,flight.list,770,1368,5440,SEARCH,f6acbd41-21ad-4eac-b3c0-cba9fadbb87b,000030a6d85ad2effe150c581b1735ce243ab4ab,2019-10-20 17:14:14.085000+00:00
590789,1351,flight.home,770,1368,3227,FLIGHT_ORIGIN,562ded75-98d1-4499-8b01-36cab349771f,000030a6d85ad2effe150c581b1735ce243ab4ab,2019-10-20 17:15:10.267000+00:00
590475,1351,flight.home,770,1368,3227,FLIGHT_DESTINATION,562ded75-98d1-4499-8b01-36cab349771f,000030a6d85ad2effe150c581b1735ce243ab4ab,2019-10-20 17:15:10.305000+00:00


In [ ]:
# write the cleaned client-side events to the processed directory
export_to_feather(csi, f'{PROCESSED}/csi.ftr')

## Merge PI + CSI

In [ ]:
# join the client-side events onto their page inits by page-init id and user;
# CSI columns whose names clash with page-init columns get the '_csi' suffix
# NOTE(review): how='outer' also keeps rows that exist on only one side and
# fills the other side with NaN (the preview shows integer columns such as
# screenHeight rendered as floats) — confirm an outer rather than a left join
# is wanted here.
pi_csi = pi_merged.merge(csi, how='outer', on=['pageInitializationId','publisherUserId'], suffixes=('', '_csi'))
pi_csi.head()

Unnamed: 0,browserRequest.screenHeight,browserRequest.screenPixelDepth,browserRequest.screenWidth,campaignReferrer,deviceVentileGroup,pageId,pageInitializationId,productCategoryType,productPath,publisherUserId,...,returnIn30Days,returnIn45Days,returnIn60Days,customDimensions.documentWidth,pageId_csi,customDimensions.viewportHeight,customDimensions.viewportWidth,customDimensions.documentHeight,customDimensions.label,request.requestedAt_csi
0,960.0,0.0,1280.0,a88b7dcd1a9e3e17770bbaa6d7515b31a2d7e85d,10.0,flight.home,aa6621a4-2d8b-4c4e-be91-f0dbe23eb3d0,FLIGHTS,FLIGHTS,00001d27aa482ad2b982296c016365eed6441502,...,0.0,0.0,0.0,,,,,,,NaT
1,912.0,24.0,1368.0,d0941e68da8f38151ff86a61fc59f7c5cf9fcaa2,2.0,flight.home,ec4281fa-dd7a-4d82-ab43-ff5bbcebe5d6,FLIGHTS,FLIGHTS,000030a6d85ad2effe150c581b1735ce243ab4ab,...,0.0,0.0,0.0,,,,,,,NaT
2,912.0,24.0,1368.0,d0941e68da8f38151ff86a61fc59f7c5cf9fcaa2,0.0,flight.list,20397260-1a09-4e54-86a0-e37085858ad1,FLIGHTS,FLIGHTS,000030a6d85ad2effe150c581b1735ce243ab4ab,...,0.0,0.0,0.0,,,,,,,NaT
3,912.0,24.0,1368.0,d0941e68da8f38151ff86a61fc59f7c5cf9fcaa2,14.0,flight.home,b5df6141-a133-49b2-8c55-ff4ed4fe3c58,FLIGHTS,FLIGHTS,000030a6d85ad2effe150c581b1735ce243ab4ab,...,1.0,1.0,1.0,1351.0,flight.home,770.0,1368.0,3227.0,TRIP_TYPE,2019-10-20 17:12:31.078000+00:00
4,912.0,24.0,1368.0,d0941e68da8f38151ff86a61fc59f7c5cf9fcaa2,14.0,flight.home,b5df6141-a133-49b2-8c55-ff4ed4fe3c58,FLIGHTS,FLIGHTS,000030a6d85ad2effe150c581b1735ce243ab4ab,...,1.0,1.0,1.0,1351.0,flight.home,770.0,1368.0,3227.0,TRIP_TYPE_ROUNDTRIP,2019-10-20 17:12:31.089000+00:00


In [ ]:
# list the column names of the PI + CSI join as an array
pi_csi.columns.values

array(['browserRequest.screenHeight', 'browserRequest.screenPixelDepth',
       'browserRequest.screenWidth', 'campaignReferrer',
       'deviceVentileGroup', 'pageId', 'pageInitializationId',
       'productCategoryType', 'productPath', 'publisherUserId',
       'request.requestedAt', 'segmentationType',
       'request.nextRequestedAt', 'request.lastRequestedAt',
       'request.timeDifference', 'sessionId', 'sessionTime',
       'nextSession', 'daysToNextSession', 'returnIn15Days',
       'returnIn30Days', 'returnIn45Days', 'returnIn60Days',
       'customDimensions.documentWidth', 'pageId_csi',
       'customDimensions.viewportHeight',
       'customDimensions.viewportWidth',
       'customDimensions.documentHeight', 'customDimensions.label',
       'request.requestedAt_csi'], dtype=object)

In [ ]:
# drop redundant columns
# NOTE(review): 'request.requestedAt_csi' is the CSI-side request timestamp;
# once it is dropped, only the page-level 'request.requestedAt' remains for
# ordering rows — confirm the event timestamp is really not needed downstream.
pi_csi = pi_csi.drop(columns=['pageId_csi', 'request.requestedAt_csi'])

In [ ]:
# merge conversions; conversion columns whose names clash get the '_cv' suffix
# NOTE(review): the only join key is publisherUserId, so every row of a user is
# paired with every conversion of that user (many-to-many), and how='outer'
# also keeps users that appear on only one side — check the row count and
# memory use on the full data.
pi_csi_cv = pi_csi.merge(cv, how='outer', on=['publisherUserId'], suffixes=('', '_cv'))
pi_csi_cv.head()

Unnamed: 0,browserRequest.screenHeight,browserRequest.screenPixelDepth,browserRequest.screenWidth,campaignReferrer,deviceVentileGroup,pageId,pageInitializationId,productCategoryType,productPath,publisherUserId,...,travelDateEnd,browserRequest.screenHeight_cv,cookieSize,orderId,advancePurchaseRangeType,productCategoryType_cv,conversionTime,request.requestId,daysFromTravel,travelDuration
0,960.0,0.0,1280.0,a88b7dcd1a9e3e17770bbaa6d7515b31a2d7e85d,10.0,flight.home,aa6621a4-2d8b-4c4e-be91-f0dbe23eb3d0,FLIGHTS,FLIGHTS,00001d27aa482ad2b982296c016365eed6441502,...,NaT,,,,,,NaT,,,
1,912.0,24.0,1368.0,d0941e68da8f38151ff86a61fc59f7c5cf9fcaa2,2.0,flight.home,ec4281fa-dd7a-4d82-ab43-ff5bbcebe5d6,FLIGHTS,FLIGHTS,000030a6d85ad2effe150c581b1735ce243ab4ab,...,NaT,,,,,,NaT,,,
2,912.0,24.0,1368.0,d0941e68da8f38151ff86a61fc59f7c5cf9fcaa2,0.0,flight.list,20397260-1a09-4e54-86a0-e37085858ad1,FLIGHTS,FLIGHTS,000030a6d85ad2effe150c581b1735ce243ab4ab,...,NaT,,,,,,NaT,,,
3,912.0,24.0,1368.0,d0941e68da8f38151ff86a61fc59f7c5cf9fcaa2,14.0,flight.home,b5df6141-a133-49b2-8c55-ff4ed4fe3c58,FLIGHTS,FLIGHTS,000030a6d85ad2effe150c581b1735ce243ab4ab,...,NaT,,,,,,NaT,,,
4,912.0,24.0,1368.0,d0941e68da8f38151ff86a61fc59f7c5cf9fcaa2,14.0,flight.home,b5df6141-a133-49b2-8c55-ff4ed4fe3c58,FLIGHTS,FLIGHTS,000030a6d85ad2effe150c581b1735ce243ab4ab,...,NaT,,,,,,NaT,,,


In [ ]:
# list the column names of the PI + CSI + conversions join as an array
pi_csi_cv.columns.values

array(['browserRequest.screenHeight', 'browserRequest.screenPixelDepth',
       'browserRequest.screenWidth', 'campaignReferrer',
       'deviceVentileGroup', 'pageId', 'pageInitializationId',
       'productCategoryType', 'productPath', 'publisherUserId',
       'request.requestedAt', 'segmentationType',
       'request.nextRequestedAt', 'request.lastRequestedAt',
       'request.timeDifference', 'sessionId', 'sessionTime',
       'nextSession', 'daysToNextSession', 'returnIn15Days',
       'returnIn30Days', 'returnIn45Days', 'returnIn60Days',
       'customDimensions.documentWidth',
       'customDimensions.viewportHeight',
       'customDimensions.viewportWidth',
       'customDimensions.documentHeight', 'customDimensions.label',
       'tripType', 'browserRequest.screenWidth_cv',
       'browserRequest.screenPixelDepth_cv', 'travelDateStart',
       'pageId_cv', 'destination', 'travelers', 'travelDateEnd',
       'browserRequest.screenHeight_cv', 'cookieSize', 'orderId',
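       'advancePurchaseRangeType', 'productCategoryType_cv',
       'conversionTime', 'request.requestId', 'daysFromTravel',
       'travelDuration'], dtype=object)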

In [ ]:
# rows that carry a conversion, i.e. have an orderId after the merge
converters = pi_csi_cv[pi_csi_cv['orderId'].notnull()]
# filter converters that fit the pi window: keep only rows whose conversion
# time lies strictly between this page's request time and the next page's
# request time
# NOTE(review): comparisons against NaT evaluate to False, so rows without a
# 'request.nextRequestedAt' can never match — confirm conversions on such rows
# should be discarded.
# NOTE(review): rows with an orderId that fail this window test are removed
# here and are not part of non_converters either, so they do not reach the
# final dataset — confirm that dropping them is intended.
converters = converters.loc[(converters['conversionTime'] > converters['request.requestedAt']) &
                            (converters['conversionTime'] < converters['request.nextRequestedAt']), :]

# rows with no conversion attached at all
non_converters = pi_csi_cv[pi_csi_cv['orderId'].isnull()]

In [ ]:
# stack the window-matched converter rows on top of the non-converter rows
# and renumber the index from zero
pi_csi_cv_full = pd.concat((converters, non_converters), ignore_index=True)

In [ ]:
# list the column names of the combined frame as an array
pi_csi_cv_full.columns.values

array(['browserRequest.screenHeight', 'browserRequest.screenPixelDepth',
       'browserRequest.screenWidth', 'campaignReferrer',
       'deviceVentileGroup', 'pageId', 'pageInitializationId',
       'productCategoryType', 'productPath', 'publisherUserId',
       'request.requestedAt', 'segmentationType',
       'request.nextRequestedAt', 'request.lastRequestedAt',
       'request.timeDifference', 'sessionId', 'sessionTime',
       'nextSession', 'daysToNextSession', 'returnIn15Days',
       'returnIn30Days', 'returnIn45Days', 'returnIn60Days',
       'customDimensions.documentWidth',
       'customDimensions.viewportHeight',
       'customDimensions.viewportWidth',
       'customDimensions.documentHeight', 'customDimensions.label',
       'tripType', 'browserRequest.screenWidth_cv',
       'browserRequest.screenPixelDepth_cv', 'travelDateStart',
       'pageId_cv', 'destination', 'travelers', 'travelDateEnd',
       'browserRequest.screenHeight_cv', 'cookieSize', 'orderId',
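       'advancePurchaseRangeType', 'productCategoryType_cv',
       'conversionTime', 'request.requestId', 'daysFromTravel',
       'travelDuration'], dtype=object)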

In [ ]:
# drop the conversion-side copies of page-init columns ('_cv' suffix) and
# advancePurchaseRangeType
redundant_columns = [
    'browserRequest.screenWidth_cv',
    'browserRequest.screenPixelDepth_cv',
    'browserRequest.screenHeight_cv',
    'pageId_cv',
    'productCategoryType_cv',
    'advancePurchaseRangeType',
]
pi_csi_cv_full = pi_csi_cv_full.drop(columns=redundant_columns)

In [ ]:
# list the remaining column names as an array
pi_csi_cv_full.columns.values

array(['browserRequest.screenHeight', 'browserRequest.screenPixelDepth',
       'browserRequest.screenWidth', 'campaignReferrer',
       'deviceVentileGroup', 'pageId', 'pageInitializationId',
       'productCategoryType', 'productPath', 'publisherUserId',
       'request.requestedAt', 'segmentationType',
       'request.nextRequestedAt', 'request.lastRequestedAt',
       'request.timeDifference', 'sessionId', 'sessionTime',
       'nextSession', 'daysToNextSession', 'returnIn15Days',
       'returnIn30Days', 'returnIn45Days', 'returnIn60Days',
       'customDimensions.documentWidth',
       'customDimensions.viewportHeight',
       'customDimensions.viewportWidth',
       'customDimensions.documentHeight', 'customDimensions.label',
       'tripType', 'travelDateStart', 'destination', 'travelers',
       'travelDateEnd', 'cookieSize', 'orderId', 'conversionTime',
       'request.requestId', 'daysFromTravel', 'travelDuration'],
      dtype=object)

In [ ]:
# order the rows per user by the page-level request time
pi_csi_cv_full = pi_csi_cv_full.sort_values(by=['publisherUserId', 'request.requestedAt'])

In [ ]:
# converted flag: 1 when the row carries an orderId, otherwise 0
pi_csi_cv_full['converted'] = pi_csi_cv_full['orderId'].notna().astype(int)

In [ ]:
# next label: the label found on the following row of the same user session
# (missing on the last row of each session)
session_labels = pi_csi_cv_full.groupby(['publisherUserId', 'sessionId'])['customDimensions.label']
pi_csi_cv_full['nextLabel'] = session_labels.shift(-1)

# bigram: the current and the next label separated by a single space
label_and_space = pi_csi_cv_full['customDimensions.label'] + ' '
pi_csi_cv_full['bigramLabel'] = label_and_space + pi_csi_cv_full['nextLabel']

In [ ]:
# list the final column names as an array
pi_csi_cv_full.columns.values

array(['browserRequest.screenHeight', 'browserRequest.screenPixelDepth',
       'browserRequest.screenWidth', 'campaignReferrer',
       'deviceVentileGroup', 'pageId', 'pageInitializationId',
       'productCategoryType', 'productPath', 'publisherUserId',
       'request.requestedAt', 'segmentationType',
       'request.nextRequestedAt', 'request.lastRequestedAt',
       'request.timeDifference', 'sessionId', 'sessionTime',
       'nextSession', 'daysToNextSession', 'returnIn15Days',
       'returnIn30Days', 'returnIn45Days', 'returnIn60Days',
       'customDimensions.documentWidth',
       'customDimensions.viewportHeight',
       'customDimensions.viewportWidth',
       'customDimensions.documentHeight', 'customDimensions.label',
       'tripType', 'travelDateStart', 'destination', 'travelers',
       'travelDateEnd', 'cookieSize', 'orderId', 'conversionTime',
       'request.requestId', 'daysFromTravel', 'travelDuration',
       'converted', 'nextLabel', 'bigramLabel'], dtype=object)

In [ ]:
# write the final combined frame to the processed directory
export_to_feather(pi_csi_cv_full, f'{PROCESSED}/final_df.ftr')