In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
training_data_file = './training_data.csv'
test_data_file = './test_data.csv'

# Both files are semicolon-delimited, so the reader options are shared.
csv_options = {'sep': ';'}

training_data = pd.read_csv(training_data_file, **csv_options)
# Reading stops after 2190030 rows: the parser reports
# "EOF inside string" starting at row 2190031 of the test file.
test_data = pd.read_csv(test_data_file, nrows=2190030, **csv_options)

# Data processing

In [None]:
def process(data):
    """Build the modelling feature frame from a raw data frame.

    The input is left untouched; a new frame is returned. The input must
    contain the columns ``id``, ``timestamp``, ``lastStart``,
    ``sourceGameId``, ``deviceType``, ``campaignId``, ``softwareVersion``,
    ``platform``, ``country``, ``connectionType`` and the six count columns
    listed in ``count_columns`` below. Any other column is passed through.

    Returns
    -------
    pandas.DataFrame
        The pass-through columns (``country`` as a categorical) followed by
        ``logLastStartH`` and one ``log<Count>`` column per count column.
        Rows that still contain a NaN in any remaining column are removed.
    """
    # Categorical columns with too many distinct values to be useful.
    high_cardinality = ['sourceGameId', 'deviceType']
    # Categorical columns whose label distribution differs too much between
    # the training and the test data.
    train_test_mismatch = ['campaignId', 'softwareVersion']
    # Features with very little apparent predictive power.
    weak_predictors = ['dayofweek', 'platform', 'connectionType']
    count_columns = ['startCount', 'viewCount', 'clickCount',
                     'installCount', 'startCount1d', 'startCount7d']
    categorical_columns = ['platform', 'country', 'connectionType']

    # Population-level model: the per-row identifier is not a feature.
    df = data.copy().drop(columns='id')

    # Derive candidate features from the timestamp and express the last
    # start relative to it (log of hours, with one second added so that a
    # zero gap stays finite).
    event_time = df['timestamp'].astype('datetime64[ns]')
    previous_start = df['lastStart'].astype('datetime64[ns]')
    df['dayofweek'] = event_time.dt.dayofweek.astype('category')
    hours_since_start = ((event_time - previous_start).dt.total_seconds() + 1) / 3600
    df['logLastStartH'] = np.log(hours_since_start)
    df = df.drop(columns=['timestamp', 'lastStart'] + high_cardinality + train_test_mismatch)

    # Logarithmic transforms: startCount -> logStartCount, and so on.
    for name in count_columns:
        log_name = 'log' + name[0].upper() + name[1:]
        df[log_name] = np.log(df[name] + 1)
    df = df.drop(columns=count_columns)

    # Set categorical dtypes (taken from the untouched input columns).
    for name in categorical_columns:
        df[name] = data[name].astype('category')

    df = df.drop(columns=weak_predictors)
    return df.dropna()

# Run the same feature pipeline on both raw frames; `tr` and `te` are the
# processed training and test frames used by every later cell.
tr = process(training_data)
te = process(test_data)

In [None]:
# Preview the first 20 rows of the processed training frame.
tr.head(20)

In [None]:
# Print the column summary (dtypes and non-null counts) of the processed training frame.
tr.info()

In [None]:
print("Check stats for continuous variables")
# Left as the cell's last expression so the summary table is rendered as output.
tr.describe()

In [None]:
print("Check number of categories for categorical variables")
# One line per categorical column: its name and how many distinct values occur.
categorical_columns = tr.select_dtypes(['category']).columns
for key in categorical_columns:
    distinct_values = tr[key].nunique(dropna=False)
    print(key, distinct_values)

In [None]:
print("Compare marginal distributions of training and test data")
# Columns compared by label overlap instead of by histogram.
label_overlap_columns = ['campaignId', 'softwareVersion', 'country']
for key in te.keys():
    print(key)
    if key not in tr:
        # Nothing to compare against; avoid a KeyError on tr[key].
        print("- Not present in training data")
        continue
    if key in label_overlap_columns:
        tr_set = set(tr[key].unique())
        te_set = set(te[key].unique())
        print("- Training {} keys of which {} also in test set".format(
            len(tr_set), len(tr_set.intersection(te_set))))
        print("- Test {} keys of which {} not in training set".format(
            len(te_set), len(te_set.difference(tr_set))))
    else:
        # Draw both histograms in one titled figure so it is clear which
        # panel is training data and which is test data.
        fig, (tr_ax, te_ax) = plt.subplots(1, 2, figsize=(10, 4))
        tr[key].hist(ax=tr_ax)
        tr_ax.set_title("Training: {}".format(key))
        te[key].hist(ax=te_ax)
        te_ax.set_title("Test: {}".format(key))
        plt.show()

# Model definition and feature selection

In [None]:
def joint_encode(data1, data2, always_encode=('country',)):
    """Integer-encode categorical columns with one schema shared by both frames.

    A column is encoded when it has ``category`` dtype in either frame, or
    when its name is listed in ``always_encode`` and it exists in at least
    one frame. For each such column the values of both frames are
    concatenated and converted to a single categorical, so a given label
    receives the same integer code in both outputs.

    Parameters
    ----------
    data1, data2 : pandas.DataFrame
        Frames to encode (e.g. training and test data). Not modified.
    always_encode : iterable of str, optional
        Column names encoded regardless of their dtype. Defaults to
        ``('country',)``, the column this function always encoded before it
        was generalised.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Copies of ``data1`` and ``data2`` with the selected columns replaced
        by integer codes. Missing values are encoded as ``-1``.
    """
    frames = (data1, data2)
    df1 = data1.copy()
    df2 = data2.copy()

    # Collect candidate columns in first-seen order without duplicates.
    candidates = []
    for frame in frames:
        candidates.extend(frame.select_dtypes(['category']).columns)
    candidates.extend(always_encode)
    cat_columns = [
        col for col in dict.fromkeys(candidates)
        if any(col in frame.columns for frame in frames)
    ]

    n1 = len(data1)
    for col in cat_columns:
        parts = [frame[col] for frame in frames if col in frame.columns]
        # Concatenating categoricals with different categories yields a
        # non-categorical column, so the result is always re-cast here.
        joint = pd.concat(parts, axis=0, ignore_index=True).astype('category')
        # Positional (not index-aligned) assignment: row i of the codes
        # belongs to row i of the frame, whatever the index labels are.
        codes = joint.cat.codes.to_numpy()
        if col in data1.columns:
            df1[col] = codes[:n1]
            codes = codes[n1:]
        if col in data2.columns:
            df2[col] = codes
    return df1, df2

# Encode the processed training and test frames together so both use the same encoding schema.
tre, tee = joint_encode(tr, te)

In [None]:
from sklearn.naive_bayes import CategoricalNB, GaussianNB
from sklearn.metrics import log_loss

class Predictor:
    """Naive Bayes predictor, as the probability of an install was requested.

    Categorical features are modelled with ``CategoricalNB`` and continuous
    features with ``GaussianNB``. When both models exist, their
    positive-class probabilities are averaged.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a binary 0/1 ``install`` column plus the feature columns.
        Categorical features are expected to be integer-encoded already
        (e.g. by ``joint_encode``).
    only_features : list of str, optional
        When non-empty, restrict the model to these features.
    not_features : list of str, optional
        When non-empty, exclude these features.
    train_frac : float
        Fraction of ``data`` sampled (fixed seed) for fitting; the remaining
        rows are kept in ``self.test``.
    cat_features, con_features : iterable of str, optional
        Explicit categorical / continuous feature names. When ``None`` the
        names are taken from the dtypes of the notebook-level frame ``tr``
        (``category`` and ``float64`` columns respectively), because the
        encoded ``data`` no longer carries the ``category`` dtype.

    Attributes
    ----------
    w0, w1 : float
        Evaluation weights for class 0 and class 1: each class is weighted
        by the relative frequency of the other, to offset class imbalance.
    train, test : pandas.DataFrame
        Fitting sample and held-out remainder of ``data``.
    catm, conm : fitted estimator or None
        ``None`` when no feature of the respective kind was selected.
    """

    def __init__(self, data, only_features=None, not_features=None, train_frac=0.8,
                 cat_features=None, con_features=None):
        # Set weights as the dataset is biased. The classes are counted
        # explicitly by label: value_counts() orders by frequency, so
        # unpacking it would swap the counts when class 1 is the majority.
        labels = data['install']
        c0 = int((labels == 0).sum())
        c1 = int((labels == 1).sum())
        if c0 + c1 == 0:
            raise ValueError("data contains no rows with install equal to 0 or 1")
        self.w0 = c1 / (c1 + c0)
        self.w1 = c0 / (c1 + c0)
        # Split (fixed seed so repeated runs use the same rows).
        self.train = data.sample(frac=train_frac, random_state=200)
        self.test = data.drop(self.train.index)
        # Features to use.
        if cat_features is None:
            cat_features = tr.select_dtypes(['category']).columns
        if con_features is None:
            con_features = tr.select_dtypes(['float64']).columns
        self.cat_features = self._select_features(cat_features, only_features, not_features)
        self.con_features = self._select_features(con_features, only_features, not_features)
        # Fit models.
        self._fit_categorical_model()
        self._fit_continuous_model()

    @staticmethod
    def _select_features(candidates, only_features, not_features):
        """Return the candidate names, filtered, as a plain list."""
        # The target must never be used as a feature, whatever its dtype.
        selected = [c for c in candidates if c != 'install']
        if only_features:
            selected = [c for c in selected if c in only_features]
        if not_features:
            selected = [c for c in selected if c not in not_features]
        return selected

    def _fit_categorical_model(self):
        """Fit ``CategoricalNB`` on the categorical features, if there are any."""
        if not self.cat_features:
            self.catm = None
            return
        self.catm = CategoricalNB()
        self.catm.fit(self.train[self.cat_features], self.train['install'])

    def _fit_continuous_model(self):
        """Fit ``GaussianNB`` on the continuous features, if there are any."""
        if not self.con_features:
            self.conm = None
            return
        self.conm = GaussianNB()
        self.conm.fit(self.train[self.con_features], self.train['install'])

    def pred(self, data):
        """Return the predicted install probability for each row of ``data``.

        Raises
        ------
        AssertionError
            If no feature was selected, so neither model was fitted.
        """
        if self.catm is not None and self.conm is not None:
            cat_pred = self.catm.predict_proba(data[self.cat_features])[:, 1]
            con_pred = self.conm.predict_proba(data[self.con_features])[:, 1]
            return (cat_pred + con_pred) / 2  # roughly equally accurate
        if self.catm is not None:
            return self.catm.predict_proba(data[self.cat_features])[:, 1]
        if self.conm is not None:
            return self.conm.predict_proba(data[self.con_features])[:, 1]
        # Raised explicitly: a bare `assert False` disappears under
        # `python -O`, which would make this method return None.
        raise AssertionError("no features selected: neither model was fitted")

    def pred_eval(self, data):
        """Print the class-weighted log loss of the predictions on ``data``.

        ``data`` must contain the ``install`` column.
        """
        pred = self.pred(data)
        # Rows with install == 1 get weight w1, all others w0.
        weights = np.where(data['install'] == 1, self.w1, self.w0)
        # `labels` keeps the loss defined when `data` holds a single class.
        print("- Loss {:.3f}".format(
            log_loss(data['install'], pred, sample_weight=weights, labels=[0, 1])))

        
all_features = tr.select_dtypes(['category', 'float64']).columns
print("All features")
p = Predictor(tre)
p.pred_eval(tre)

In [None]:
print("Individual feature predictivity")
for feature in all_features:
    print(feature)
    p = Predictor(tre, only_features=[feature])
    p.pred_eval(tre)

In [None]:
print("Effect of removing individual feature")
for feature in all_features:
    print(feature)
    p = Predictor(tre, not_features=[feature])
    p.pred_eval(tre)

# Generating predictions

In [None]:
# Fit on the encoded training frame with the default settings, then score the encoded test frame.
# NOTE(review): with the default train_frac=0.8 only an 80% sample of `tre` is used for
# fitting; consider train_frac=1.0 for the final model -- confirm intent.
p = Predictor(tre)
preds = p.pred(tee)
# Peek at the first ten predicted probabilities.
print(preds[:10])

In [None]:
# Label each prediction with the row label of the processed test frame it was computed for.
row_labels = te.index.values.tolist()
out_df = pd.Series(preds, index=row_labels, name='prob_install').to_frame()
out_df.head(20)

In [None]:
# Write the predictions to disk; with pandas' defaults the frame's index is written as the first column.
out_df.to_csv('test_preds.csv')