# AG Challenge

## Introduction

This notebook is structured as follows:
<br>__Data Analysis & Review__ - This section explains the data being used
<br>__Modelling__ - This section contains the predictive modelling
<br>__Results & Conclusions__ - Wrap up of all information discovered and learnt in this analysis

# Data Analysis & Review

## Code Preparation

In [None]:
from pathlib import Path
import datetime
import os
from enum import Enum
import pickle
from typing import Any

import pandas as pd
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt 
import ppscore as pps

from sklearn.impute import SimpleImputer


def cell_last_run():
    """
    Print the time at which the enclosing notebook cell was executed.

    Databricks notebooks show when each cell was last run; this helper
    reproduces that here so the order in which cells were (re-)run can be
    checked after making changes.
    """
    timestamp = datetime.datetime.now().strftime('%H:%M:%S %Y_%m_%d')
    print(f"cell run at {timestamp} \n")


cell_last_run()

#### Setup Snowflake and Useful Functions

In [None]:
cell_last_run()

class FileEnum(Enum):
    """
    Folder and file names used to build the data and artifact paths.

    The path helper functions in this notebook read these through ``.value``.
    """
    # top-level folder that holds all data
    data_parent_folder_name = 'data'
    # sub-folders selected by the 'raw' / 'processed' data types
    raw_data_folder_name = 'raw'
    original_data_folder_name = 'original'
    processed_data_folder_name = 'processed'
    # NOTE(review): empty placeholder - get_original_data_file() appends this to
    # the original-data folder, so presumably the real file name still needs filling in
    raw_data_filename = ''
    # csv file names for the two splits
    training_data_filename = 'train.csv'
    testing_data_filename = 'test.csv'
    # folders holding the two splits
    testing_folder_name = 'test'
    training_folder_name = 'train'
    # folder that pickled artifacts are written to / read from
    artifacts_folder_name = 'artifacts'
    labels_filename = 'labels'
    
def get_artifact_folder() -> Path:
    """
    Return the artifacts folder, located in the parent of the current
    working directory.
    """
    parent_of_cwd = Path(os.getcwd()).parent
    return parent_of_cwd / FileEnum.artifacts_folder_name.value
    
    
def get_data_type_folder(data_type:str):
    """
    Translate a data type into the name of the folder that holds it.

    :param data_type: either 'raw' or 'processed'
    :return: the matching folder name
    :raises ValueError: for any other data type
    """
    if data_type == 'raw':
        return FileEnum.raw_data_folder_name.value

    if data_type == 'processed':
        return FileEnum.processed_data_folder_name.value

    raise ValueError('data type not recognised')
    
def save_test_data(test_data: pd.DataFrame, data_type: str):
    """
    Write the test data to its csv file unless that file already exists.

    :param test_data: data to save
    :param data_type: 'raw' or 'processed', selects the destination folder
    """
    destination = get_test_data_file(data_type=data_type)

    # never overwrite a test set that has already been written
    if destination.is_file():
        print('data already saved, no action performed')
        return

    test_data.to_csv(destination)
        
def save_train_data(train_data: pd.DataFrame, data_type: str):
    """
    Write the training data to its csv file unless that file already exists.

    :param train_data: data to save
    :param data_type: 'raw' or 'processed', selects the destination folder
    :raises ValueError: if data_type is not recognised
    """
    # same path the loader reads from, so saving and loading cannot drift apart
    train_data_path = get_train_data_file(data_type=data_type)

    # never overwrite a training set that has already been written
    if train_data_path.is_file():
        print('data already saved, no action performed')
    else:
        # write the frame that was passed in (previously this referenced an
        # undefined `test_data` name and failed with a NameError)
        train_data.to_csv(train_data_path)
    
    
def get_data_folder_path() -> Path:
    """
    Return the data folder, assumed to be in the parent of the current
    working directory; if not please update this function
    """
    parent_of_cwd = Path(os.getcwd()).parent
    return parent_of_cwd / FileEnum.data_parent_folder_name.value

def get_original_data_file() -> Path:
    """
    Return the path of the original (un-split) data file, which lives under
    the raw/original sub-folders of the data folder.
    """
    original_folder = (
        get_data_folder_path()
        / FileEnum.raw_data_folder_name.value
        / FileEnum.original_data_folder_name.value
    )
    return original_folder / FileEnum.raw_data_filename.value
    

def get_train_data_file(data_type: str) -> Path:
    """
    Return the path of the training csv for the given data type.

    :param data_type: 'raw' or 'processed'
    """
    split_folder = (
        get_data_folder_path()
        / get_data_type_folder(data_type=data_type)
        / FileEnum.training_folder_name.value
    )
    return split_folder / FileEnum.training_data_filename.value
        

def get_test_data_file(data_type: str) -> Path:
    """
    Return the path of the testing csv for the given data type.

    :param data_type: 'raw' or 'processed'
    """
    split_folder = (
        get_data_folder_path()
        / get_data_type_folder(data_type=data_type)
        / FileEnum.testing_folder_name.value
    )
    return split_folder / FileEnum.testing_data_filename.value


def import_data(data_type: str, train_or_test: str = None) -> pd.DataFrame:
    """
    This function handles importing of data for training the model

    :param data_type: 'raw' or 'processed'
    :param train_or_test: 'train' or 'test' to load one of the splits (indexed
        by uid); leave empty (raw data only) to load the original un-split file
    :return pd.DataFrame: training data for model or curating data
    :raises ValueError: if data_type is not recognised, if processed data is
        requested without a split, or if the split is not 'train' or 'test'
    """

    if data_type not in ('raw', 'processed'):
        raise ValueError('data type not recognised')

    split_is_valid = train_or_test in ('train', 'test')

    # processed data only exists as train/test splits
    if data_type == 'processed' and not split_is_valid:
        raise ValueError('data must be train or test')

    # an unrecognised split used to fall through every branch and silently
    # return None for raw data; fail loudly instead
    if train_or_test and not split_is_valid:
        raise ValueError('data must be train or test')

    if train_or_test == 'train':
        return pd.read_csv(get_train_data_file(data_type=data_type), index_col=['uid'])
    if train_or_test == 'test':
        return pd.read_csv(get_test_data_file(data_type=data_type), index_col=['uid'])

    # no split requested: load the original file (uid stays a regular column)
    return pd.read_csv(get_original_data_file())
    

def create_train_test_sets(random_seed: int = 3142, training_set_proportion: float = 0.9):
    """
    Split the original data into train and test sets and save both as raw csv files.

    Nothing is done when the raw training file already exists.

    :param random_seed: seed passed to ``DataFrame.sample`` so the split is reproducible
    :param training_set_proportion: fraction of the rows sampled into the training set
    :return: None
    """

    DATA_TYPE = 'raw'

    if get_train_data_file(data_type=DATA_TYPE).is_file():
        print('data already split, no action performed')
        return None

    # get the data
    all_data: pd.DataFrame = import_data(data_type=DATA_TYPE)

    # get training data
    train_data = all_data.sample(frac=training_set_proportion, random_state=random_seed)

    # get testing data: every row whose uid was not sampled into the training set
    # (vectorised membership test rather than a python-level pass over each row)
    in_training_set = all_data['uid'].isin(train_data['uid'])
    test_data = all_data[~in_training_set]

    # ensure we have not dropped any data
    assert test_data.shape[0] + train_data.shape[0] == all_data.shape[0], \
        'train and test rows do not add up to the full data set - are the uids unique?'

    # set the uid as the index; done without inplace=True so the filtered
    # slice of all_data is never modified in place
    test_data = test_data.set_index(keys='uid', drop=True)
    train_data = train_data.set_index(keys='uid', drop=True)

    train_data.to_csv(get_train_data_file(data_type=DATA_TYPE))
    test_data.to_csv(get_test_data_file(data_type=DATA_TYPE))

    print('split complete.')
    
def save_artifacts(artifact: Any, filename: str):
    """
    Pickle an object into the artifacts folder.

    :param artifact: object to serialise
    :param filename: name of the file to create inside the artifacts folder
    """
    target = get_artifact_folder() / filename
    with open(target, 'wb') as handle:
        pickle.dump(artifact, handle)
        
def get_artifacts(filename: str) -> Any:
    """
    Load a pickled object from the artifacts folder.

    :param filename: name of the file inside the artifacts folder
    :return: the unpickled object
    """
    source = get_artifact_folder() / filename
    # NOTE(review): unpickling executes code from the file, so only load
    # artifacts that were produced by a trusted run of this notebook
    with open(source, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

    

#### Plotting functions

In [None]:
cell_last_run()

def pretty_plot(data: pd.DataFrame, x: str=None, y=None, title=None):
    """
    Draw ``data`` with ``DataFrame.plot`` on a wide (20 x 5) figure.

    :param data: frame to plot
    :param x: column for the x axis; when falsy the index is used instead
    :param y: passed to ``DataFrame.plot`` as ``y``
    :param title: optional title set on the current axes
    """
    plot_kwargs = {'y': y, 'figsize': (20, 5)}
    if x:
        plot_kwargs['x'] = x
    else:
        plot_kwargs['use_index'] = True
    data.plot(**plot_kwargs)

    if title:
        plt.gca().set_title(title)

    plt.legend(bbox_to_anchor=(0.8, 0.2, 0.2, -0.5))
    # NOTE(review): this opens a new, empty figure after the plot has been
    # drawn - confirm it is intentional
    plt.figure(figsize=(20, 5))
    


## Exploratory Data Analysis

### Initial review

I do not want to perform EDA on data in the test set; however, some idea of the data is needed in order to correctly identify a testing set. This is the purpose of the initial data review. Here I will get some idea of what the data looks like to decide how to split the data into train and test; traditional EDA techniques will then be applied only to the training data.

This approach reduces as much as possible any potential bias introduced into the preprocessing of the data.


In [None]:
cell_last_run()

# no train/test argument, so this loads the original, un-split data file
all_data: pd.DataFrame = import_data(data_type='raw')


# report the size of the full data set
print(f"number of rows = {all_data.shape[0]}")
print(f"number of columns = {all_data.shape[1]}")

# last expression of the cell: display the first 20 rows
all_data.head(20)

From a quick review of the data we can see we have a mixture of categorical variables, boolean variables, and some ordinal data. There is some missing data in some of the columns but that does not look to be a large issue. There are no dates attached to the data, so in this analysis we are not able to take the temporal aspect of the data into account. It may however be useful to know the date/time of this data in future, since distributional shift in both the predictors and labels can occur over time (and I suspect patterns of behaviour and spending during lockdown may not be representative of 'normal' times).

We have a uid column which may be useful to connect data in the future, obviously we do not want to put this into the model but it will be useful to keep around so this can be used as the index to the dataframe.

We have of the order of tens of thousands of rows, so there should be enough data for modelling. This might not be enough for a deep learning approach, so I will keep to more traditional modelling approaches. Given the amount of data we should be fine with removing 10% of the data for the test set.

### Split Data

In [None]:
cell_last_run()

# writes the raw train/test csv files; does nothing if the raw train file already exists
create_train_test_sets()

### EDA

Now the test data set has been removed, full EDA can be performed without the risk of biasing any preprocessing decisions. Next the training data will be imported and profiled using pandas profiling to investigate the data and see if any additional preprocessing is required.

In [None]:
cell_last_run()

# load the raw training split; import_data reads it with uid as the index
train_data = import_data(data_type='raw', train_or_test='train')

# last expression of the cell: display the first rows
train_data.head()

In [None]:
cell_last_run()

# build the pandas-profiling report for the training data only
profile = ProfileReport(train_data, title='Pandas Profiling Report')
# render the report as notebook widgets
profile.to_widgets()

Looking at the overview of the data first there is a small amount of missing cells and no duplicate rows so this data set looks quite rich. Although an imputer may be needed to deal with missing data - this can be investigated when looking at the individual features.


Investigating the warnings we see:


All the other features have warnings around the number of zeros - since these represent counts of real world numbers this is likely not a problem, although I will investigate them individually below.

Feature analysis (looking at the 'variable' tab of the profile widget):


Correlations:

Pandas profiling only produces correlations for variables whose dtypes are numeric (here the numeric encoding of week days has already caused an issue). Correlation is less well defined for binary variables (once categorical variables are one hot encoded each feature essentially becomes a binary feature). More work will be needed to understand the relationships between the categorical data.


 



### Interim Discussion of EDA

### remove unwanted features

In [None]:
cell_last_run()

# NOTE(review): placeholder - the only entry is an empty string, so the
# selection below presumably fails until the real column names are filled in
cols_to_keep: list = ['']

# keep only the selected columns of the training data
feature_data = train_data[cols_to_keep]


### Dealing with missing data

In [None]:
cell_last_run()

# replace NaN entries with the constant string 'missing'
# NOTE(review): a string fill value may not be accepted for numeric columns - confirm
imp = SimpleImputer(missing_values=float('nan'), strategy='constant', fill_value='missing')

# re-wrap the imputer output in a DataFrame carrying the original column names and index
feature_data_eda = pd.DataFrame(imp.fit_transform(feature_data), columns=feature_data.columns, index=feature_data.index)

# persist the fitted imputer in the artifacts folder
save_artifacts(imp, 'imputer.pkl')

# last expression of the cell: display the imputed data
feature_data_eda

### encoding weekdays

In [None]:
cell_last_run()

from sklearn.base import BaseEstimator

class WeekdayEncoder(BaseEstimator):
    """
    Replace the numeric codes in the ``weekday`` column with day names.

    The encoder is stateless: nothing is learnt in ``fit``; the fixed
    ``mapping`` is applied in ``transform``.
    """

    def __init__(self):
        # NOTE(review): assumes the data encodes 0 as sunday through to
        # 6 as saturday - confirm against the data source
        self.mapping = {0: 'sunday',
                        1: 'monday',
                        2: 'tuesday',
                        3: 'wednesday',
                        4: 'thursday',
                        5: 'friday',
                        6: 'saturday'}

    def fit(self, *args, **kwargs):
        """
        Nothing to learn; returns self (rather than None) so that calls can
        be chained, as scikit-learn estimators conventionally do.
        """
        return self

    def fit_transform(self, X, *args, **kwargs):
        """
        Equivalent to ``transform(X)``; extra arguments are ignored.
        """
        return self.transform(X)

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Return a copy of X whose ``weekday`` column holds day names.

        :param X: data containing a ``weekday`` column of codes
        :return: new DataFrame with ``weekday`` as the last column; codes
            missing from the mapping become None. X itself is not modified.
        """
        weekday = X['weekday'].apply(self.mapping.get)

        # drop() returns a new frame, so the caller's data is left untouched
        # (previously the column was dropped from X in place)
        encoded = X.drop(columns=['weekday'])

        # assigning the column aligns row by row, whereas merging on the
        # index would duplicate rows if the index were not unique
        encoded['weekday'] = weekday

        return encoded
        

In [None]:
cell_last_run()

wkd = WeekdayEncoder()

# persist the encoder in the artifacts folder
save_artifacts(wkd, 'weekday_transformer.pkl')

# swap the weekday codes for day names
feature_data_eda = wkd.transform(feature_data_eda)
# last expression of the cell: display the encoded data
feature_data_eda

### Investigating Relationships Between Categorical Features

Since pandas profiling only produces correlations for columns it perceives to be numerical, the relationships between the remaining features are currently unknown. Traditional correlation is not a very useful tool for categorical data; a tool that resolves this is the predictive power score - more information can be found here https://towardsdatascience.com/rip-correlation-introducing-the-predictive-power-score-3d90808b9598?gi=b742e6b11335

In [None]:
cell_last_run()

# predictive power scores for the feature pairs in the data
pps_matrix = pps.matrix(feature_data_eda)

# remove uninteresting scores (vectorised comparison instead of a per-element lambda)
pps_matrix = pps_matrix[pps_matrix['ppscore'] > 0.0001]

# last expression of the cell: display the strongest relationships first
pps_matrix.sort_values('ppscore', ascending=False)

From the predictive power scores above (ignoring the perfect scores of variables against themselves) none of the scores appear high enough for any concern. The closest relationship is country and language learnt, which I suspect is due to most non-English speaking countries electing to learn English, so some relationship there is anecdotally expected. Similarly, country and motivation have similar ppscores in both directions. Attribution and country also have a relationship in both directions - this could be due to explicit marketing strategies (or lack thereof in certain countries).

#### Save Data Ready for Inference

## EDA Conclusions