In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Purpose
This notebook describes the typical activities carried out at the beginning of a project / thread when a customer shares new data. We try to understand the tables, columns and information flow. Typically we also look for data issues and confirm them with the respective owners for resolution. At the end of this activity, the data sources and their treatment are finalized. Code in this notebook will not be part of the production code.

This data is currently stored in the Tiger Databricks storage. The notebooks are configured to connect directly to the Databricks filestore and pull/save the relevant files, so it is not required to download the files.

Contact [code templates support](mailto:code-templates-support@tigeranalytics.com) for access to Databricks.

# Imports

In [2]:
# Standard Library Imports
import os
import os.path as op
import sys
import time
import warnings
import re
import random

# Third Party imports
import yaml
import hvplot
import pandas as pd
import numpy as np
import holoviews as hv
import panel as pn
from pyspark_dist_explore import (
    Histogram,
    hist,
    distplot,
    pandas_histogram
)
from IPython.display import (
    display,
    display_html
)

# Spark imports
from pyspark.sql import (
    types as DT,
    functions as F,
    Window
)
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.ml.tuning import (
    ParamGridBuilder,
    CrossValidator,
    CrossValidatorModel
)
from pyspark.ml.feature import (
    VectorAssembler,
    StandardScaler,
    StringIndexer,
    OneHotEncoderEstimator,
    Imputer
)
from pyspark.ml.evaluation import RegressionEvaluator




# Project Imports
from ta_lib.pyspark import (
    dp,
    features,
    model_gen,
    model_eval,
    utils,
    eda,
    context
)

# Notebook-wide options

# Display every column of a pandas DataFrame instead of truncating wide frames.
pd.options.display.max_columns = None

# Seed Python's built-in `random` module (other libraries are not seeded here).
random.seed(0)

# Initialise panel with the 'bokeh' extension.
pn.extension('bokeh')

# Suppress all warnings raised from this point onwards.
warnings.filterwarnings('ignore')



# Initialization

`config.yml` is used to store all the parameters required for the template

In [4]:
# Load the template-wide parameters from conf/config.yml, resolved relative
# to the current working directory.
config_path = op.join(os.getcwd(), 'conf', 'config.yml')
with open(config_path, 'r') as fp:
    # Use safe_load: calling `yaml.load(fp)` without a Loader is deprecated
    # since PyYAML 5.1 and raises a TypeError on PyYAML 6. The safe loader
    # only builds plain Python objects, which is all a config file needs.
    config = yaml.safe_load(fp)

# Echo the parsed configuration as the cell output.
config

{'all': {'core': 'default',
  'log_catalog': 'production',
  'data_catalog': 'local',
  'job_catalog': 'local'},
 'spark': {'spark.executer.cores': 4, 'spark.cores.max': 4}}

In [5]:
# Load the data catalog (file system, base paths and file names per data
# zone) from conf/data_catalog/remote.yml under the current working directory.
# Path components are passed separately so no separator is hard-coded.
data_config_path = op.join(os.getcwd(), 'conf', 'data_catalog', 'remote.yml')
with open(data_config_path, 'r') as fp:
    # Use safe_load: calling `yaml.load(fp)` without a Loader is deprecated
    # since PyYAML 5.1 and raises a TypeError on PyYAML 6. The safe loader
    # still converts YAML dates (e.g. `reference_date`) to `datetime.date`.
    data_config = yaml.safe_load(fp)

# Echo the parsed data catalog as the cell output.
data_config

{'reference_date': datetime.date(2020, 8, 31),
 'num_days_prediction': 7,
 'raw': {'filesystem': 'dbfs',
  'base_path': '/FileStore/tables/vacation_partitioned/',
  'call_data_path': 'dial_summary.parquet',
  'last_activity_data_path': 'customer_activity.parquet',
  'booking_data_path': 'class_labels.parquet',
  'consumer_data_path': 'customer.parquet',
  'web_data_path': 'itr_data_*.parquet'},
 'clean': {'filesystem': 'dbfs',
  'base_path': '/FileStore/tables/vacation_clean/',
  'call_data_path': 'dial_summary.parquet',
  'last_activity_data_path': 'customer_activity.parquet',
  'booking_data_path': 'class_labels.parquet',
  'consumer_data_path': 'customer.parquet',
  'web_data_path': 'itr_data_'},
 'processed': {'filesystem': 'dbfs',
  'base_path': '/FileStore/tables/spark_warehouse/',
  'train': 'train.parquet',
  'test': 'test.parquet',
  'preds': 'predictions.parquet'}}

## Create spark session

The `ta_lib.pyspark.context` module is used to build the Spark session, so that the Spark-related parameters in the config file are applied when the session is created.

In [6]:
%%time
# Build the Spark session through the project's context helper, handing it
# the configuration loaded from config.yml.
# NOTE(review): the config printed earlier in this notebook contains the key
# 'spark.executer.cores'; Spark's documented property is spelled
# 'spark.executor.cores' — confirm the key in config.yml is actually honoured.
session = context.CustomSparkSession(config)
session.CreateSparkSession()
# Expose the session handles as notebook-level names used by later cells.
spark = session.spark
sc = session.sc  # presumably the SparkContext — TODO confirm

Wall time: 13.4 s


# Background

The client is a cruise vacation provider whose goal is to predict whether a given customer will make a booking for a cruise vacation in the next 3 months, based on his/her previous web, call and booking activities.

# Data Read

### Call Data 

The call data contains the calls placed by a customer with details of originating and destination numbers. 

In [8]:
# Resolve the location of the call data inside the raw zone of the catalog.
raw_catalog = data_config['raw']
call_data_location = raw_catalog['base_path'] + raw_catalog['call_data_path']

# Read the call data via the project reader and print its schema.
df_call_data = dp.read_data(
    spark=spark,
    paths=[call_data_location],
    fs=raw_catalog['filesystem'],
)
df_call_data.printSchema()

root
 |-- originatingnumber: long (nullable = true)
 |-- dialednumber: long (nullable = true)
 |-- call_date: string (nullable = true)
 |-- country_code: string (nullable = true)
 |-- cel_class_code: string (nullable = true)
 |-- unq_add: string (nullable = true)
 |-- customer_id: long (nullable = true)



### Last Activity Data
This data contains details of previous web activities for each customer, including the type of activity and geographical details.

In [9]:
# Resolve the location of the last-activity data inside the raw zone.
raw_catalog = data_config['raw']
last_activity_location = (
    raw_catalog['base_path'] + raw_catalog['last_activity_data_path']
)

# Read the last-activity data via the project reader and print its schema.
df_last_activity_data = dp.read_data(
    spark=spark,
    paths=[last_activity_location],
    fs=raw_catalog['filesystem'],
)
df_last_activity_data.printSchema()

root
 |-- load_date: string (nullable = true)
 |-- consumer_first_name: string (nullable = true)
 |-- consumer_last_name: string (nullable = true)
 |-- email_address: string (nullable = true)
 |-- cel_email_permission_flag: string (nullable = true)
 |-- country_code: string (nullable = true)
 |-- state_code: string (nullable = true)
 |-- cel_class_code: string (nullable = true)
 |-- actvty_date: string (nullable = true)
 |-- actvty_type: string (nullable = true)
 |-- actvty_prod: string (nullable = true)
 |-- customer_id: long (nullable = true)



### Booking Data
The booking data contains previous booking details for customers who have travelled on vacation cruises, along with dates of booking, date of journey, mode of making the booking, demographic details of the customer, and booking tiers.

In [10]:
# Resolve the location of the booking data inside the raw zone.
raw_catalog = data_config['raw']
booking_location = raw_catalog['base_path'] + raw_catalog['booking_data_path']

# Read the booking data via the project reader and print its schema.
df_booking_data = dp.read_data(
    spark=spark,
    paths=[booking_location],
    fs=raw_catalog['filesystem'],
)
df_booking_data.printSchema()

root
 |-- booking_create_date: string (nullable = true)
 |-- booking_status_bk_level: string (nullable = true)
 |-- sailing_date: string (nullable = true)
 |-- curr_number_sail_nights: long (nullable = true)
 |-- meta_product_code: string (nullable = true)
 |-- cabin_class_sailed: string (nullable = true)
 |-- booking_channel: string (nullable = true)
 |-- choice_air_flag: string (nullable = true)
 |-- cruise_care_flag: string (nullable = true)
 |-- age: long (nullable = true)
 |-- gender_code: string (nullable = true)
 |-- dma: string (nullable = true)
 |-- country_code: string (nullable = true)
 |-- tier_at_sail: string (nullable = true)
 |-- family_flag_booking_level: string (nullable = true)
 |-- cel_class_code_sail: string (nullable = true)
 |-- customer_id: long (nullable = true)



### Consumer Related Data 

The customer data contains detailed information of customers' previous transactions, including months since last cruise, web event details (e.g. page clicks, page opens), share of various types of vacations, demographic data of customer etc

In [11]:
# Resolve the location of the consumer data inside the raw zone.
raw_catalog = data_config['raw']
consumer_location = raw_catalog['base_path'] + raw_catalog['consumer_data_path']

# Read the consumer data via the project reader and print its schema.
df_consumer_data = dp.read_data(
    spark=spark,
    paths=[consumer_location],
    fs=raw_catalog['filesystem'],
)
df_consumer_data.printSchema()

root
 |-- age: integer (nullable = true)
 |-- cel_class_code: string (nullable = true)
 |-- rci_class_code: string (nullable = true)
 |-- aza_class_code: string (nullable = true)
 |-- high_value_zip_desc: string (nullable = true)
 |-- et_active_flag: string (nullable = true)
 |-- jm_hml_segment_code: string (nullable = true)
 |-- last_cruise_months_ago: double (nullable = true)
 |-- click_to_open: double (nullable = true)
 |-- click_pct: double (nullable = true)
 |-- open_pct: double (nullable = true)
 |-- max_event_date: string (nullable = true)
 |-- cel_dma_desc: string (nullable = true)
 |-- cel_first_cruise_date: string (nullable = true)
 |-- cc_current_loyalty_tier_code: string (nullable = true)
 |-- booked_flag: string (nullable = true)
 |-- beach_relax_cruise_intention: double (nullable = true)
 |-- luxury_relax_cruise_intention: double (nullable = true)
 |-- natural_wonders_cruise_intenti: double (nullable = true)
 |-- new_metro_cruise_intention: double (nullable = true)
 |-- o

### Web Data
The web data contains the temporal details of page visits and web events for each customer.

In [12]:
# Resolve the location of the web data inside the raw zone. The catalog
# printed earlier shows this entry as a wildcard pattern
# ('itr_data_*.parquet'), so it presumably matches several files.
raw_catalog = data_config['raw']
web_location = raw_catalog['base_path'] + raw_catalog['web_data_path']

# Read the web data via the project reader and print its schema.
df_web_data = dp.read_data(
    spark=spark,
    paths=[web_location],
    fs=raw_catalog['filesystem'],
)
df_web_data.printSchema()

root
 |-- visit_date: string (nullable = true)
 |-- visit_yr: integer (nullable = true)
 |-- visit_mo: integer (nullable = true)
 |-- device_type_name: string (nullable = true)
 |-- sec_time_spent_on_nbr: integer (nullable = true)
 |-- page_view_count: integer (nullable = true)
 |-- geo_seg_country_name: string (nullable = true)
 |-- visit_type: string (nullable = true)
 |-- customer_id: long (nullable = true)



### Consolidating data objects in a dictionary

In [13]:
# Collect the five raw DataFrames under short dataset names so the discovery
# steps below can loop over them uniformly.
data = dict(
    call_data=df_call_data,
    last_activity_data=df_last_activity_data,
    booking_data=df_booking_data,
    consumer_data=df_consumer_data,
    web_data=df_web_data,
)

# Data Discovery

Given the raw data from data ingestion, we would now like to explore and learn more details about the data.

The output of the step would be a summary report and discussion of any pertinent findings.

## Shape of Data

In [14]:
%%time
# One tab per dataset, each showing the result of dp.get_shape for that frame.
shape_tabs = [(name, dp.get_shape(frame)) for name, frame in data.items()]
utils.display_as_tabs(shape_tabs)

Wall time: 8.66 s


## Clean Column Names

Standardize the column names of the dataframe. Converts camelcase into snakecase

In [15]:
%%time
# Replace every DataFrame in `data` with the output of dp.clean_columns,
# keeping the same dataset names.
data = dict(
    (name, dp.clean_columns(frame)) for name, frame in data.items()
)

# One tab per dataset listing the resulting column names.
column_tabs = [(name, frame.columns) for name, frame in data.items()]
utils.display_as_tabs(column_tabs)

Wall time: 81 ms


## Identification of columns types in the data

Obtaining the columns by different types of data (numerical, categorical, datelike and boolean)

In [16]:
%%time
# Map a display label to the `dp` helper that lists the columns of that kind.
# The first label was misspelt as 'nemerical'; it now follows the same
# `*_cols` naming as its siblings so the displayed summary reads consistently.
types = {
    'num_cols': dp.list_numerical_columns,
    'cat_cols': dp.list_categorical_columns,
    'date_cols': dp.list_datelike_columns,
    'bool_cols': dp.list_boolean_columns
}

# For every dataset, run each lister on its DataFrame and collect the results
# as (dataset name, {label: lister output}) pairs — one tab per dataset.
res = [
    (datakey, {typekey: typeval(dataval) for typekey, typeval in types.items()})
    for datakey, dataval in data.items()
]
utils.display_as_tabs(res)

Wall time: 5 ms


## Check for data consistency in Columns

The data consistency check looks for case-related inconsistencies in the values of an object (string) column.

> Example -  Having "APPLE" and "apple" as part of cell values in the same column is considered as an inconsistency

In [17]:
%%time
# One tab per dataset with the output of the column consistency check.
consistency_tabs = [
    (name, dp.check_column_data_consistency(frame))
    for name, frame in data.items()
]
utils.display_as_tabs(consistency_tabs)

Wall time: 2min 44s


## Columns Unique Values Summary

For each column, this step reports the number of distinct values and the ratio of that number to the total row count.

This helps in identifying any categorical features sneaking in as numerical columns

In [18]:
%%time
# One tab per dataset with the transposed column-values summary.
unique_value_tabs = [
    (name, eda.column_values_summary(frame).T)
    for name, frame in data.items()
]
utils.display_as_tabs(unique_value_tabs)

Wall time: 30.7 s


## Identification of Missing Values

This step summarizes the Number of Missing Values in each column of the data.

In [19]:
%%time
# One tab per dataset; the missing-value result is converted with .toPandas()
# before being displayed.
missing_value_tabs = [
    (name, dp.identify_missing_values(frame).toPandas())
    for name, frame in data.items()
]
utils.display_as_tabs(missing_value_tabs)

Wall time: 39.9 s


## Health Analysis of the data

This step generates a set of data analyses that could be useful to showcase to clients.

1. % of numerical columns in the data

2. % of missing values in the data

3. % of duplicated data points

In [20]:
%%time
# One tab per dataset with the health plot produced by eda.plot_health.
health_tabs = [(name, eda.plot_health(frame)) for name, frame in data.items()]
utils.display_as_tabs(health_tabs)

Wall time: 57.9 s


## Missing Values Plot

In [21]:
%%time
# One tab per dataset with the plot produced by eda.missing_plot.
missing_plot_tabs = [
    (name, eda.missing_plot(frame)) for name, frame in data.items()
]
utils.display_as_tabs(missing_plot_tabs)

Wall time: 38.9 s


## Missing data summary

In [22]:
%%time
# One tab per dataset with the output of eda.missing_value_summary.
missing_summary_tabs = [
    (name, eda.missing_value_summary(frame)) for name, frame in data.items()
]
utils.display_as_tabs(missing_summary_tabs)

Wall time: 37.6 s


## Cardinality check of tables wrt consumer data

In [23]:
%%time
# Compare the consumer table with every dataset on the `customer_id` key
# (eda.setanalyse presumably reports the overlap of key values — TODO confirm).
#
# The consumer frame is taken from `data` rather than from the raw
# `df_consumer_data`, so both sides of each comparison have gone through the
# same column-name cleaning as every other step in this section.
consumer_frame = data['consumer_data']
utils.display_as_tabs(
    [(k, eda.setanalyse(consumer_frame, v, "customer_id")) for k, v in data.items()]
)

Wall time: 1min 59s
