- all users are from US
- testing data contains only accounts that were created between 2014-07 and 2014-09
    - run a distribution test between date when account created and date of first booking
- training data does not contain date of first booking

In [1]:
import pdb
import csv
import pandas as pd
import os
from pprint import pprint as pp
from collections import defaultdict, Counter
from operator import itemgetter

import numpy as np
from scipy.stats import norm
import matplotlib.pyplot as plt
import matplotlib.dates as mpldt

In [2]:
from dataminer import Dataminer

In [3]:
# Short dataset key -> name of the CSV file that holds that dataset.
DATASET_PATHS = dict(
    train_users='train_users_2.csv',
    sessions='sessions.csv',
    countries='countries.csv',
    age_gender_bkts='age_gender_bkts.csv',
    test_users='test_users.csv',
)
# Registry for the loaded datasets; starts out empty.
DATASETS = dict()

In [4]:
class Preprocessor(object):
    """Loads raw CSV datasets into pandas DataFrames."""

    def __init__(self):
        pass

    def csv_path_to_dataframe(self, file):
        """Read a CSV file into a DataFrame indexed by its first column.

        Args:
            file: Path to the CSV file, or an open file-like object.

        Returns:
            A DataFrame whose column names come from the first row and whose
            index is the first CSV column (parsed as dates when possible).
        """
        # DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0.
        # read_csv with index_col=0 and parse_dates=True reproduces the
        # defaults from_csv used, so the resulting frames keep the same shape.
        return pd.read_csv(file, header=0, index_col=0, parse_dates=True)

In [5]:
def fit_to_norm(data, x=None):
    """Fit a normal distribution to ``data`` and plot it over a histogram.

    Args:
        data: Array-like of numeric samples to fit.
        x: Optional array-like of points at which to evaluate the fitted
            PDF. When omitted (or empty), 100 evenly spaced points spanning
            the histogram's x-axis limits are used.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    mu, std = norm.fit(data)

    # Plot the histogram, normalised to unit area so it shares a scale with
    # the PDF. `density` replaces the `normed` keyword, which matplotlib
    # deprecated in 2.1 and removed in 3.1.
    plt.hist(data, bins=25, density=True, alpha=0.6, color='g')

    # Plot the PDF.
    # Test for None/empty explicitly: `not x` raises ValueError for a
    # multi-element numpy array because its truth value is ambiguous.
    if x is None or np.size(x) == 0:
        xmin, xmax = plt.xlim()
        x = np.linspace(xmin, xmax, 100)

    p = norm.pdf(x, mu, std)
    plt.plot(x, p, 'k', linewidth=2)
    title = "Fit results: mu = %.2f,  std = %.2f" % (mu, std)
    plt.title(title)
    plt.show()

In [6]:
processor = Preprocessor()

# Load each dataset once, keyed by the name the rest of the notebook uses.
DATASETS['training'] = processor.csv_path_to_dataframe(DATASET_PATHS['train_users'])
DATASETS['age_gender_bkts'] = processor.csv_path_to_dataframe(DATASET_PATHS['age_gender_bkts'])
DATASETS['countries'] = processor.csv_path_to_dataframe(DATASET_PATHS['countries'])
DATASETS['sessions'] = processor.csv_path_to_dataframe(DATASET_PATHS['sessions'])
DATASETS['testing'] = processor.csv_path_to_dataframe(DATASET_PATHS['test_users'])

# `data` used to come from a second parse of the same training CSV. Copying
# the frame already in memory keeps it independent of DATASETS['training']
# without reading and parsing the file twice.
data = DATASETS['training'].copy()

In [7]:
# for feature in DATASETS['sessions'].columns:
#     set(DATASETS['sessions'][feature])

In [14]:
# count country labels
dataminer = Dataminer()
# Select the label column by name instead of `.ix[:, 14]`: the `.ix` indexer
# was removed in pandas 1.0, and a positional index silently breaks if the
# column order changes. The recorded output (destination codes such as
# 'NDF', 'US', 'FR') shows that position 14 was the destination label.
dataminer.count_labels(DATASETS['training']['country_destination'].values)

Counter({'NDF': 124543, 'US': 62376, 'other': 10094, 'FR': 5023, 'IT': 2835, 'GB': 2324, 'ES': 2249, 'CA': 1428, 'DE': 1061, 'NL': 762, 'AU': 539, 'PT': 217})


In [None]:
# TODO: create seasonality baseline predictor (this is a form of clustering using labelled data)
# group data by destination > gender > age group
# plot/group data by frequency count vs. first booking date
# observe periods of high/low season

In [None]:
# Alias the training users table and display its column names.
df = DATASETS['training']
df.columns

In [None]:
# Keep only rows that have both an account-creation date and a first-booking
# date, then display how many of those rows each affiliate provider has.
_df = df.dropna(subset=['date_account_created', 'date_first_booking'])
Counter(_df['affiliate_provider'])

In [None]:
# Time elapsed between account creation and first booking, one entry per
# row of _df (first-booking date minus account-creation date).
delta_days = list(
    pd.to_datetime(_df['date_first_booking'])
    - pd.to_datetime(_df['date_account_created'])
)

In [None]:
# Attach the booking delay to every row as a `delta_days` column.
# `assign` returns a new frame rather than writing into the frame produced by
# `dropna`, so pandas has no chained-assignment warning to raise. That removes
# the need for the blanket `warnings.filterwarnings("ignore")` used here
# before, which also hid every unrelated warning for the rest of the session.
_df = _df.assign(delta_days=pd.Series(delta_days, index=_df.index))

In [None]:
# Destination label -> rows of _df booked for that destination. The labels
# come from the unfiltered df, so a label with no rows left in _df maps to
# an empty frame.
_df_by_group = {
    country: _df[_df['country_destination'] == country]
    for country in set(df['country_destination'])
}

In [None]:
# Display the destination labels that have an entry in _df_by_group.
_df_by_group.keys()

In [None]:
# Two-level lookup: feature name -> feature value -> rows of _df that have
# that value for that feature.
_df_by_feature = defaultdict(lambda: defaultdict())
features = [
    'gender',
    'first_device_type',
    'signup_app',
    'language',
    'affiliate_channel',
    'affiliate_provider',
]
# features = ['age', 'gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel','affiliate_provider', 'first_affiliate_tracked', 'signup_app','first_device_type', 'first_browser']
feature_value_pairs = (
    (name, value) for name in features for value in set(_df[name])
)
for feature, group in feature_value_pairs:
    _df_by_feature[feature][group] = _df[_df[feature] == group]

In [None]:
# Display the feature names that have been split into per-value frames.
_df_by_feature.keys()

In [None]:
# Display the distinct affiliate_provider values found in _df.
_df_by_feature['affiliate_provider'].keys()

In [None]:
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
# Rows whose affiliate provider is 'other', grouped by destination; the bar
# chart shows the per-column non-null counts for each destination.
_df_plot = _df_by_feature['affiliate_provider']['other']
_df_plot.groupby([_df_plot.country_destination]).count().plot(kind='bar', legend=False)

In [None]:
# Same per-destination count chart, restricted to rows whose gender is 'MALE'.
_df_male = _df_by_feature['gender']['MALE']
_df_male.groupby([_df_male.country_destination]).count().plot(kind='bar', legend=False)

In [None]:
# Bar chart of how many US-destination rows there are per language. The
# grouping key is the full _df.language Series, matched to the US subset by
# index.
_df_by_group['US']['language'].groupby([_df.language]).count().plot(kind='bar')

In [None]:
# 2x2 grid of bar charts: number of rows per `first_browser` value for four
# destinations. The per-destination frames live in `_df_by_group`; the name
# `_df_by_country` used here before is never defined in this notebook and
# raised NameError.
feature = 'first_browser'
for position, country in enumerate(('US', 'CA', 'ES', 'AU'), start=1):
    # Series.plot draws into the current axes, so select the panel first.
    plt.subplot(2, 2, position)
    # Title each panel with its destination so the four charts can be told apart.
    _df_by_group[country][feature].groupby([_df[feature]]).count().plot(kind='bar', title=country)

In [None]:
# Full-size version of the US chart for the currently selected `feature`.
# Uses `_df_by_group`: `_df_by_country` is never defined in this notebook.
_df_by_group['US'][feature].groupby([_df[feature]]).count().plot(kind='bar')

In [None]:
# Number of CA-destination bookings per calendar month of the first booking.
# Two fixes: the per-destination frames live in `_df_by_group`
# (`_df_by_country` is never defined), and the booking date is converted with
# pd.to_datetime before using `.dt`, because the CSV loader only parses the
# index as dates and `.dt` is unavailable on a column of raw strings.
_df_by_group['CA']['date_first_booking'].groupby(
    [pd.to_datetime(_df['date_first_booking']).dt.month]
).count().plot(kind='bar')

In [None]:
# Bar chart of how many CA-destination rows there are per language. The
# grouping key is the full _df.language Series, matched to the CA subset by
# index.
_df_by_group['CA']['language'].groupby([_df.language]).count().plot(kind='bar')

In [None]:
# indicates that users with a language preference other than 'en' should be directed to other countries
# types of activities that they do

In [None]:
# TODO
# start new workbook
# run various off-the-shelf ML algorithms
    # random forest, logistic regression, svm, knn clustering
    # ann, boost
# (?) run only for that particular season of the year (using only samples within that test set)
# cross-validate and optimize