In [1]:
import os
import sys
# Treat the parent of the current working directory as the project root.
# NOTE(review): the root is derived from wherever the kernel currently is, and the
# os.chdir below moves the kernel there, so re-running this cell climbs one directory
# higher each time. Run it exactly once per kernel session.
root_dir = os.path.split(os.getcwd())[0]
src_dir = os.path.join(root_dir, "src")
# Make modules under <root>/src importable, without adding the path twice.
if src_dir not in sys.path:
    sys.path.append(src_dir)
# Work from the project root so that relative paths used later in the notebook
# (e.g. "src/models/stefan.py", "data/processed") resolve.
os.chdir(root_dir)

%load_ext ipyext.writeandexecute

'writeandexecute' magic loaded.


In [2]:
%%writeandexecute -i reportProgress src/models/stefan.py

import time
import datetime

# Module-level bookkeeping for reportProgress, keyed by task name.
# Each entry is a dict with "start" and "lastUpdate" (time.time() timestamps) and "lastCount".
progressInfo = {}

def formatTime(seconds):
    """Render a duration given in seconds as "<minutes>m <seconds>s".

    The duration is rounded to the nearest whole second before being split.
    """
    minutes, leftover = divmod(round(seconds), 60)
    return "{}m {}s".format(minutes, leftover)

def reportProgress(name, current, count, updateFrequency=15):
    """Print throttled progress messages for a long-running loop.

    Intended to be called once per iteration with the loop index.

    Args:
        name: label of the task; also the key of its state in ``progressInfo``.
        current: index of the iteration that is being processed.
        count: total number of iterations.
        updateFrequency: minimum number of seconds between two progress lines.

    Behaviour:
        * The first call for ``name`` prints "starting ...".
        * A call with ``current >= count - 1`` prints "finished ..." and discards the
          task's state, so that a later run under the same name gets a fresh
          "starting" message and a fresh timer. A task that starts and finishes in the
          same call prints both messages.
        * Otherwise a progress line is printed at most every ``updateFrequency``
          seconds. The remaining time is extrapolated from the speed since the
          previous progress line; if ``current`` has not advanced since then the
          remaining time is reported as "unknown" instead of dividing by zero.
    """
    now = time.time()
    state = progressInfo.get(name)
    if state is None:
        state = {"lastUpdate": now, "start": now, "lastCount": current}
        progressInfo[name] = state
        print("starting {}...".format(name))
        if current < count - 1:
            # Nothing has been measured yet, so there is no progress line to print.
            return

    if current >= count - 1:
        print("finished {}".format(name))
        # Drop the state so stale timestamps do not leak into a later run.
        progressInfo.pop(name, None)
        return

    sinceUpdate = now - state["lastUpdate"]
    if sinceUpdate < updateFrequency:
        return

    advanced = current - state["lastCount"]
    if advanced > 0:
        remaining = formatTime(sinceUpdate / advanced * (count - current))
    else:
        # No iterations completed since the last report: speed cannot be estimated.
        remaining = "unknown"
    print("computing {}, {}% done, {} elapsed, {} remaining".format(
        name,
        round(100.0 * current / count),
        formatTime(now - state["start"]),
        remaining))
    state["lastUpdate"] = now
    state["lastCount"] = current
        

In [3]:
%%writeandexecute -i createFeatures src/models/stefan.py

def createFeatures(user, day, userId3Visits, id3s, duration=7, sums=False):
    """Build the feature vector of ``user`` for the ``duration`` days preceding ``day``.

    Args:
        user: user identifier, first component of the keys of ``userId3Visits``.
        day: the day the features are built for; the day itself is excluded.
        userId3Visits: mapping ``(user, day) -> list of id3s``.
        id3s: the id3s to count, in the order of the returned features.
        duration: number of preceding days to look at (clamped so that no day
            below 0 is considered).
        sums: if true, append the total number of collected visits as a last feature.

    Returns:
        A list with, for every entry of ``id3s``, the number of times it occurs in
        the visit lists of the window, optionally followed by the total.
    """
    firstDay = max(0, day - duration)
    recentVisits = [
        visitedId3
        for pastDay in range(firstDay, day)
        if (user, pastDay) in userId3Visits
        for visitedId3 in userId3Visits[(user, pastDay)]
    ]
    features = [recentVisits.count(id3) for id3 in id3s]
    if sums:
        features.append(len(recentVisits))
    return features

In [4]:
%%writeandexecute -i findX1Samples src/models/stefan.py

def findX1Samples(id3s, userId3Visits, minDayTrain, maxDayTrain, users, duration, cutoff=10000, verbose=False, noVisitWindow=21):
    """Collect positive training samples per id3.

    A sample for an id3 is the feature vector of a (user, day) on which the user
    visited that id3 without having visited it on any of the ``noVisitWindow`` days
    before. Days are scanned from ``maxDayTrain`` down to ``minDayTrain`` (inclusive).

    Args:
        id3s: the id3s to collect samples for; also passed to ``createFeatures``.
        userId3Visits: mapping ``(user, day) -> list of id3s``.
        minDayTrain: first day to consider.
        maxDayTrain: last day to consider.
        users: users to scan, in order.
        duration: feature window length passed to ``createFeatures``.
        cutoff: maximum number of samples kept per id3.
        verbose: if true, report progress via ``reportProgress``.
        noVisitWindow: number of preceding days that must be free of the id3 for the
            visit to count as a sample.

    Returns:
        dict mapping every id3 to its list of feature vectors. The same feature list
        object is shared between all id3s sampled from one (user, day).
    """
    X1 = {id3: [] for id3 in id3s}
    for i, user in enumerate(users):
        if verbose: reportProgress("computation of X1 samples", i, len(users))
        for day in range(maxDayTrain, minDayTrain - 1, -1):
            # Days without any recorded visit cannot yield a sample; skip them before
            # doing any of the expensive work.
            if (user, day) not in userId3Visits:
                continue
            visitedToday = userId3Visits[(user, day)]
            # Look up once which of the preceding days have data at all.
            priorDays = [d for d in range(max(0, day - noVisitWindow), day) if (user, d) in userId3Visits]
            feat = None
            for id3 in id3s:
                if len(X1[id3]) >= cutoff or id3 not in visitedToday:
                    continue
                if any(id3 in userId3Visits[(user, d)] for d in priorDays):
                    continue
                if feat is None:
                    # Computed lazily: only when this (user, day) actually yields a sample.
                    feat = createFeatures(user, day, userId3Visits, id3s, duration=duration, sums=True)
                X1[id3].append(feat)
    return X1

In [5]:
%%writeandexecute -i findX0Samples src/models/stefan.py

from random import shuffle

def findX0Samples(X1, id3s, verbose=False):
    """Draw negative training samples per id3 from the positives of the other id3s.

    For every id3 the positive samples of all *other* id3s are pooled, shuffled with
    ``random.shuffle`` and truncated to the number of positives of the id3 itself, so
    that each id3 gets at most as many negatives as it has positives.

    Args:
        X1: dict mapping id3 -> list of positive samples; not modified.
        id3s: the id3s to build negatives for.
        verbose: if true, report progress via ``reportProgress``.

    Returns:
        dict mapping every id3 to its list of negative samples.
    """
    X0 = {}
    for i, id3 in enumerate(id3s):
        if verbose: reportProgress("computation of X0 samples", i, len(id3s))
        # Flatten in a single pass; sum(list_of_lists, []) copies the growing result
        # for every operand and is quadratic in the number of samples.
        rows = [row for other in id3s if not other == id3 for row in X1[other]]
        shuffle(rows)
        X0[id3] = rows[:len(X1[id3])]
    return X0

In [6]:
%%writeandexecute -i createRegressors src/models/stefan.py

from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor

def createRegressors(X0, X1, id3s, verbose=False):
    """Fit one small random forest per id3 on its negative and positive samples.

    Negatives (``X0[id3]``) get target 0 and positives (``X1[id3]``) target 1.
    An id3 that lacks either kind of sample gets no regressor.

    Args:
        X0: dict mapping id3 -> list of negative samples.
        X1: dict mapping id3 -> list of positive samples.
        id3s: the id3s to fit regressors for.
        verbose: if true, report progress via ``reportProgress``.

    Returns:
        dict mapping id3 -> fitted ``RandomForestRegressor``.
    """
    fitted = {}
    total = len(id3s)
    for position, id3 in enumerate(id3s):
        if verbose:
            reportProgress("fitting regressors", position, total)
        if not (len(X0[id3]) > 0 and len(X1[id3]) > 0):
            continue
        negatives = X0[id3]
        positives = X1[id3]
        samples = negatives + positives
        targets = [0] * len(negatives) + [1] * len(positives)
        forest = RandomForestRegressor(max_depth=3, n_estimators=3)
        fitted[id3] = forest.fit(samples, targets)
    return fitted

In [7]:
%%writeandexecute -i computePredictions src/models/stefan.py

def computePredictions(users, regressors, maxDayTrain, userId3Visits, duration, id3s, verbose=False, noVisitWindow=21):
    """Score, for every user, each id3 the user has not visited recently.

    Args:
        users: users to predict for.
        regressors: dict mapping id3 -> fitted regressor exposing ``predict``.
        maxDayTrain: the day the features are built for (the day itself is excluded
            from both the feature window and the recent-visit window).
        userId3Visits: mapping ``(user, day) -> list of id3s``.
        duration: feature window length passed to ``createFeatures``.
        id3s: id3s defining the feature layout passed to ``createFeatures``.
        verbose: if true, report progress via ``reportProgress``.
        noVisitWindow: id3s visited within this many days before ``maxDayTrain`` are
            not scored for the user.

    Returns:
        dict mapping user -> dict mapping id3 -> predicted value.
    """
    predictions = {user: {} for user in users}
    for i, user in enumerate(users):
        if verbose: reportProgress("predicting user behaviour", i, len(users))
        prof = createFeatures(user, maxDayTrain, userId3Visits, id3s, duration=duration, sums=True)
        # A set gives constant-time membership tests in the per-regressor loop below.
        visited = set()
        for d in range(max(0, maxDayTrain - noVisitWindow), maxDayTrain):
            if (user, d) in userId3Visits:
                visited.update(userId3Visits[(user, d)])
        userPredictions = predictions[user]
        for id3, regressor in regressors.items():
            if id3 not in visited:
                # predict expects a batch, so wrap the single profile and unwrap the result.
                userPredictions[id3] = regressor.predict([prof])[0]
    return predictions

In [8]:
%%writeandexecute -i extractTopPredictions src/models/stefan.py

import pandas as pd

def extractTopPredictions(predictions, users, topCount=-1, verbose=False, id3Count=5):
    """Select the users with the most confident predictions and their best id3s.

    For every user the ``id3Count`` highest-scoring id3s are picked; the sum of
    their scores is the user's certainty. The ``topCount`` users with the highest
    certainty are returned.

    Args:
        predictions: dict mapping user -> dict mapping id3 -> predicted value.
        users: the users to rank; each must be a key of ``predictions``.
        topCount: number of users to return; -1 selects ``len(predictions) // 20``.
        verbose: if true, print the resulting frame.
        id3Count: number of id3 columns per user.

    Returns:
        pandas DataFrame with the columns "user_id" and "id3_1" .. "id3_<id3Count>".
        Users with fewer than ``id3Count`` predictions get missing values in the
        trailing columns instead of raising an IndexError.
    """
    #sum up
    if topCount == -1:
        topCount = len(predictions) // 20
    #regressorScores = {id3: regressors[id3].score(X0[id3] + X1[id3], [0] * len(X0[id3]) + [1] * len(X1[id3])) for id3 in id3s}
    predictedId3s = {}
    certainty = {}
    for user in users:
        scores = predictions[user]
        best = sorted(scores.keys(), key=lambda id3: -scores[id3])[:id3Count]
        certainty[user] = sum(scores[id3] for id3 in best)
        # Pad so that every user provides a value for each id3 column.
        predictedId3s[user] = best + [None] * (id3Count - len(best))
    topUsers = sorted(users, key=lambda user: -certainty[user])[:topCount]

    #create df
    dfData = {"user_id": topUsers}
    for i in range(id3Count):
        dfData["id3_{}".format(i+1)] = [predictedId3s[user][i] for user in topUsers]
    ret = pd.DataFrame(data=dfData)
    if verbose: print(ret)
    return ret

In [9]:
%%writeandexecute -i userVisits src/models/stefan.py

from ediblepickle import checkpoint

def _userVisitsCacheKey(args, kwargs):
    """Name the checkpoint file after the number of non-null id3 values of the frame.

    NOTE(review): the key depends on this count only, so two different frames with
    the same count map to the same checkpoint file; also the frame must be passed
    positionally. Verify that this is acceptable for all callers.
    """
    frame = args[0]
    return "userDayVisits.{}rows".format(frame["id3"].count())


@checkpoint(work_dir="data/processed", key=_userVisitsCacheKey)
def userVisits(df):
    """Map every (user_id, date) pair of ``df`` to the list of distinct id3s seen in its rows.

    The result is checkpointed under "data/processed" by the ``checkpoint`` decorator.
    """
    id3PerUserDay = df.groupby(["user_id", "date"])["id3"]
    distinctId3s = id3PerUserDay.apply(lambda values: list(set(values)))
    return distinctId3s.to_dict()

In [None]:
%%writeandexecute -i predict src/models/stefan.py

def predict(train, trainUsers=1000, verbose=False, duration=2, minImpressions=200, cutoff=1000):
    """Run the whole pipeline: sample, fit per-id3 regressors, predict and rank users.

    Args:
        train: DataFrame with at least the columns "user_id", "date" and "id3".
        trainUsers: number of active users (taken from the front of the activity
            ranking) used to collect training samples.
        verbose: if true, print progress information.
        duration: feature window length in days.
        minImpressions: minimum number of rows a user needs in ``train`` to be
            considered active; only active users are sampled and predicted.
        cutoff: maximum number of positive samples collected per id3.

    Returns:
        The DataFrame produced by ``extractTopPredictions`` for the active users.
    """
    #find id3s
    if verbose: print("finding users and id3s...")
    # Taken from the unfiltered frame, i.e. including id3s only seen by inactive users.
    id3s = train["id3"].unique()

    #apply request filter
    if verbose: print("applying request filter...")
    vc = train["user_id"].value_counts()
    activeUsers = list(vc[vc >= minImpressions].index)
    train = train[train["user_id"].isin(activeUsers)]

    #compute lookup tables
    if verbose: print("computing lookup tables...")
    minDayTrain = train["date"].min()
    maxDayTrain = train["date"].max()
    userId3Visits = userVisits(train)

    #predict
    X1 = findX1Samples(id3s, userId3Visits, minDayTrain, maxDayTrain, activeUsers[:trainUsers], duration, cutoff=cutoff, verbose=verbose)
    X0 = findX0Samples(X1, id3s, verbose=verbose)
    regressors = createRegressors(X0, X1, id3s, verbose=verbose)
    predictions = computePredictions(activeUsers, regressors, maxDayTrain, userId3Visits, duration, id3s, verbose=verbose)
    df = extractTopPredictions(predictions, activeUsers, verbose=verbose)
    return df

## Sample Usage

In [None]:
from data.util import scoreSubmission

# Run predict() through the project's scoring helper, print the score and save the predictions.
# NOTE(review): scoreSubmission comes from data.util, which is not shown here; presumably it
# loads the data set, calls the given function on a training split and scores the result - confirm.
score, predictions = scoreSubmission(lambda train: predict(train, verbose=True), verbose=True)
print(score)
predictions.to_csv("data/output/stefan.csv")

loading data set...
computing predictions...
finding users and id3s...
applying request filter...
computing lookup tables...
starting computation of X1 samples...
computing computation of X1 samples, 0% done, 0m 19s elapsed, 161m 34s remaining
computing computation of X1 samples, 0% done, 0m 35s elapsed, 132m 10s remaining
computing computation of X1 samples, 1% done, 0m 53s elapsed, 95m 10s remaining
computing computation of X1 samples, 1% done, 1m 9s elapsed, 89m 19s remaining
computing computation of X1 samples, 1% done, 1m 24s elapsed, 83m 11s remaining
computing computation of X1 samples, 2% done, 1m 40s elapsed, 66m 36s remaining
computing computation of X1 samples, 2% done, 1m 55s elapsed, 61m 26s remaining
computing computation of X1 samples, 3% done, 2m 13s elapsed, 58m 40s remaining
computing computation of X1 samples, 3% done, 2m 30s elapsed, 53m 6s remaining
computing computation of X1 samples, 4% done, 2m 45s elapsed, 48m 37s remaining
computing computation of X1 samples, 