In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier, XGBRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import Imputer, MinMaxScaler
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.neighbors import KNeighborsClassifier
import utils
from importlib import reload

# Re-execute the already imported utils module so that edits made to it
# are picked up without restarting the kernel.
reload(utils)
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm


def split_data_targets(df):
    """Split a wide frame into a feature part and a target series.

    The last seven columns of ``df`` form the target window; every column
    before them is returned untouched as the feature frame.

    Returns
    -------
    tuple
        ``(features, targets)`` where ``features`` is ``df`` without its last
        seven columns and ``targets`` is ``get_first_buying`` applied to
        those last seven columns.
    """
    horizon = 7
    history = df.iloc[:, :-horizon]
    target_window = df.iloc[:, -horizon:]
    return history, get_first_buying(target_window)


def calc_accuracy(y, answers):
    """Return the share of entries of ``y`` matched by the cleaned ``answers``.

    ``answers`` are first limited to the range [0, 16] and then rounded with
    ``np.round``; the result is compared element-wise with ``y`` and the
    number of matches is divided by ``len(y)``.
    """
    floored = np.maximum(answers, 0)
    capped = np.minimum(floored, 16)
    hits = (y == np.round(capped))
    return hits.sum() / len(y)


def get_weights(columns, delta=1.2):
    """Return one normalised weight per entry of ``columns``.

    A weight table is built for the positions ``0 .. max(columns)``. Each
    position gets a week number (positions are grouped in blocks of seven,
    aligned so that the last block ends at ``max(columns)``), the week
    number is divided by the table length and raised to ``delta``, and the
    table is normalised to sum to one. The table is then indexed with
    ``columns``, so ``columns`` must contain non-negative integers.
    """
    day_count = max(columns) + 1
    shift = 7 - day_count % 7
    week_no = (np.arange(day_count) + shift) // 7 + 1
    raw = (week_no / len(week_no)) ** delta
    normalised = raw / raw.sum()
    return normalised[columns]


def get_fraction(df, numbers={0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, delta=1.2):
    """Return, per row, the weighted share of cells equal to each value.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide frame whose column labels are non-negative integers; they are
        passed to ``get_weights`` to obtain one weight per column.
    numbers : iterable
        Values to look for. A ``set``/``frozenset`` is processed in ascending
        order; any other iterable is processed in the order given.
    delta : float
        Exponent forwarded to ``get_weights``.

    Returns
    -------
    pandas.DataFrame
        One column per entry of ``numbers`` (positional labels 0, 1, ...),
        holding the sum of the column weights of the cells equal to that
        value.
    """
    # Callers select columns of the result by position (e.g. ``values[:, 1:]``),
    # so the column order must be deterministic. Sets have no guaranteed
    # iteration order, hence they are sorted explicitly.
    if isinstance(numbers, (set, frozenset)):
        numbers = sorted(numbers)
    # The weights do not depend on the value being counted: compute them once.
    weights = get_weights(df.columns.values, delta=delta)
    fractions = [((df == value) * weights).sum(axis=1) for value in numbers]
    return pd.concat(fractions, axis=1)


def get_first_buying(df):
    """Return, for every row, the first non-zero value in column order.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame; columns are scanned from left to right.

    Returns
    -------
    pandas.Series
        Indexed like ``df``. Holds the first non-zero cell of each row, or 0
        for rows that contain no non-zero cell. The dtype follows the data
        (integer input gives integers, float input gives floats).

    Notes
    -----
    Works on any input, including a frame with no non-zero cell at all or
    with no columns, and decides "has a purchase" from the cells themselves
    rather than from the row sum, so rows whose values cancel out are kept.
    """
    values = df.values
    nonzero = values != 0
    if nonzero.shape[1] == 0:
        # Nothing to scan: every row is "no purchase".
        return pd.Series(0, index=df.index)
    # argmax on a boolean row gives the position of the first True
    # (and 0 for all-False rows, which are masked out below).
    first_pos = nonzero.argmax(axis=1)
    picked = values[np.arange(len(df)), first_pos]
    result = np.where(nonzero.any(axis=1), picked, 0)
    return pd.Series(result, index=df.index)


def get_last_buying_day(df):
    """Return how far back from the last column each row's last purchase is.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame whose column labels are numeric day numbers.

    Returns
    -------
    pandas.Series
        Indexed like ``df``. For a row whose last non-zero cell sits in the
        column labelled ``d`` the value is ``max(columns) - d + 1`` (so 1
        means "non-zero in the column with the largest label"). Rows without
        any non-zero cell get ``max(columns) + 1``.

    Notes
    -----
    Also works when ``df`` contains no non-zero cell at all, which happens
    when the caller masks the frame down to an amount that never occurs.
    """
    labels = np.asarray(df.columns)
    latest_label = labels.max()
    nonzero = df.values != 0
    # First True of the reversed row == last True of the original row.
    last_pos = nonzero.shape[1] - 1 - nonzero[:, ::-1].argmax(axis=1)
    # Rows with no purchase are mapped to day 0, giving max(columns) + 1 below.
    last_day = np.where(nonzero.any(axis=1), labels[last_pos], 0)
    return pd.Series(latest_label - last_day + 1, index=df.index)


def generate_full_factors(df, full_delta_mean=1.2, full_delta_std=0.6, full_delta_frac=1.2):
    """Build features computed over the whole history in ``df``.

    The result has one row per row of ``df`` and these columns:

    * ``full_mean_weight`` - row mean of the cells multiplied by the
      ``get_weights`` weights for ``full_delta_mean``;
    * ``full_std_weight`` - row standard deviation of the cells multiplied by
      the weights for ``full_delta_std``;
    * ``full_frac_<k>_weight`` (twelve columns) - output of ``get_fraction``
      with its default ``numbers`` and ``delta=full_delta_frac``;
    * ``full_nonzero_mode`` - 1 + position of the largest fraction among all
      fraction columns except the first one.
    """
    day_labels = df.columns.values
    fractions = get_fraction(df, delta=full_delta_frac)

    weighted_mean = (df * get_weights(day_labels, delta=full_delta_mean)).mean(axis=1)
    weighted_std = (df * get_weights(day_labels, delta=full_delta_std)).std(axis=1)
    # The first fraction column is skipped, hence the "+ 1" to map the
    # position back to a column number of ``fractions``.
    nonzero_mode = pd.DataFrame(
        np.argmax(fractions.values[:, 1:], axis=1) + 1, index=df.index)

    result = pd.concat([weighted_mean, weighted_std, fractions, nonzero_mode], axis=1)
    result.columns = [
        'full_mean_weight', 'full_std_weight', 'full_frac_0_weight',
        'full_frac_1_weight', 'full_frac_2_weight', 'full_frac_3_weight',
        'full_frac_4_weight', 'full_frac_5_weight', 'full_frac_6_weight',
        'full_frac_7_weight', 'full_frac_8_weight', 'full_frac_9_weight',
        'full_frac_10_weight', 'full_frac_11_weight', 'full_nonzero_mode'
    ]
    return result


def generate_nonzero_factors(df, nonzero_delta=1.2):
    """Build the single feature ``nonzero_mean_weight``.

    Zero cells are blanked out (turned into missing values), the remaining
    cells are multiplied by the ``get_weights`` weights for ``nonzero_delta``
    and averaged per row; missing cells do not take part in the mean.
    """
    is_purchase = df != 0
    purchases_only = df[is_purchase]
    day_weights = get_weights(purchases_only.columns.values, delta=nonzero_delta)
    weighted_mean = (purchases_only * day_weights).mean(axis=1)

    result = pd.concat([weighted_mean], axis=1)
    result.columns = ['nonzero_mean_weight']
    return result


def generate_n_last_week_factors(df, n=1):
    """Build features for the ``n``-th seven-day window counted from the end.

    The window is selected by label: from ``max(columns) - 7 * n + 1`` to
    ``max(columns) - 7 * (n - 1)``, both ends included. Output columns
    (``<n>`` is replaced by the value of ``n``):

    * ``last_week_<n>_mean`` - row mean over the window;
    * ``last_week_<n>_noncount`` - number of zero cells in the window;
    * ``last_week_<n>_nonzero_mean`` - row mean over non-zero cells only;
    * ``last_week_<n>_nonzero_mode`` - 1 + position of the largest
      ``get_fraction`` column, the first column excluded;
    * ``last_week_<n>_first_expense`` - ``get_first_buying`` of the window.
    """
    last_day = max(df.columns)
    window_start = last_day - 7 * n + 1
    window_end = last_day - 7 * (n - 1)
    week = df.loc[:, window_start:window_end]
    week_frac = get_fraction(week)

    parts = [
        week.mean(axis=1),
        (week == 0).sum(axis=1),
        week[week != 0].mean(axis=1),
        pd.DataFrame(np.argmax(week_frac.values[:, 1:], axis=1) + 1, index=df.index),
        get_first_buying(week),
    ]
    result = pd.concat(parts, axis=1)

    name_templates = (
        "last_week_{}_mean",
        "last_week_{}_noncount",
        "last_week_{}_nonzero_mean",
        "last_week_{}_nonzero_mode",
        "last_week_{}_first_expense",
    )
    result.columns = [template.format(n) for template in name_templates]
    return result


def generate_first_expense_factors(df, first_delta_frac=0.5, first_delta_mean=0.2):
    """Build features from the first non-zero value of every seven-column block.

    The columns of ``df`` are cut into consecutive blocks of seven, after
    dropping ``len(columns) % 7`` leading columns so that the blocks end at
    the last column. ``get_first_buying`` is applied to each block, giving a
    frame with one column per block; the features are computed on that frame:

    * ``first_expense_mean`` - row mean of the values multiplied by the
      ``get_weights`` weights for ``first_delta_mean``;
    * ``first_expense_frac_<k>_weight`` for k = 0..8 - ``get_fraction`` for
      the values 0..8;
    * ``first_expense_nonzero_mean`` - row mean over non-zero values only;
    * ``first_expense_nonzero_mode`` - 1 + position of the largest
      ``get_fraction`` column (default ``numbers``, ``delta=first_delta_frac``),
      the first column excluded.

    NOTE(review): ``first_delta_frac`` only influences the mode column; the
    nine fraction columns are computed with ``get_fraction``'s default
    ``delta``. Confirm whether that is intended.
    """
    all_days = df.columns
    leftover = len(all_days) % 7
    week_blocks = all_days[leftover:].values.reshape(-1, 7)

    weekly_first = [get_first_buying(df.loc[:, block]) for block in week_blocks]
    expense = pd.DataFrame(pd.concat(weekly_first, axis=1))

    mode_frac = get_fraction(expense, delta=first_delta_frac)
    block_weights = get_weights(expense.columns.values, delta=first_delta_mean)

    parts = [
        (expense * block_weights).mean(axis=1),
        get_fraction(expense, {0, 1, 2, 3, 4, 5, 6, 7, 8}),
        expense[expense != 0].mean(axis=1),
        pd.DataFrame(np.argmax(mode_frac.values[:, 1:], axis=1) + 1, index=df.index),
    ]
    result = pd.concat(parts, axis=1)
    result.columns = [
        "first_expense_mean", 'first_expense_frac_0_weight',
        'first_expense_frac_1_weight', 'first_expense_frac_2_weight',
        'first_expense_frac_3_weight', 'first_expense_frac_4_weight',
        'first_expense_frac_5_weight', 'first_expense_frac_6_weight',
        'first_expense_frac_7_weight', 'first_expense_frac_8_weight',
        "first_expense_nonzero_mean", "first_expense_nonzero_mode"
    ]
    return result


def generate_prob_week_factors(df, prob_delta=0.2):
    """Build the seven ``week_prob_<k>`` features.

    Steps, all per row:

    1. every non-zero cell is replaced by its ``get_weights`` weight for
       ``prob_delta`` (zero cells contribute 0);
    2. for each of seven slots the weights of every seventh column are
       summed, the slot's starting label being
       ``n % 7 + len(columns) % 7`` for ``n`` in 1..7;
    3. the seven sums are divided by their total (plus 1e-6);
    4. each share is multiplied by the product of ``1 - share`` over all
       preceding slots.

    NOTE(review): the slice in step 2 is label-based (``.loc``); which
    columns each slot picks up depends on whether the labels start at 0 or
    1 - confirm against the data.
    """
    weighted_hits = (df != 0) * get_weights(df.columns.values, delta=prob_delta)
    slot_sums = [
        weighted_hits.loc[:, n % 7 + len(weighted_hits.columns) % 7::7].sum(axis=1)
        for n in np.arange(1, 8)
    ]
    slots = pd.DataFrame(pd.concat(slot_sums, axis=1))

    row_total = slots.sum(axis=1).values.reshape(-1, 1)
    share = slots.values / (row_total + 1e-6)

    # Product of (1 - share) over the slots strictly before each slot;
    # the first slot has nothing before it, hence the column of ones.
    nothing_before = np.ones((len(share), 1))
    none_earlier = np.cumprod(1 - share, axis=1)[:, :-1]
    first_share = share * np.concatenate([nothing_before, none_earlier], axis=1)

    result = pd.DataFrame(first_share, index=df.index)
    result.columns = [
        "week_prob_1", "week_prob_2", "week_prob_3", "week_prob_4",
        "week_prob_5", "week_prob_6", "week_prob_7"
    ]
    return result


def generate_last_buying_factors(df):
    """Build features describing the most recent non-zero cells of each row.

    Output columns:

    * ``last_buying_sum`` - ``get_first_buying`` of the column-reversed
      frame, i.e. the last non-zero value of the row;
    * ``last_buying_day`` - ``get_last_buying_day`` of the frame;
    * ``last_buying_day_<k>`` for k = 3..8 - ``get_last_buying_day`` of the
      frame in which every cell different from ``k`` is set to zero.
    """
    parts = [
        get_first_buying(df.iloc[:, ::-1]),
        get_last_buying_day(df),
    ]
    for amount in range(3, 9):
        only_this_amount = df * (df == amount)
        parts.append(get_last_buying_day(only_this_amount))

    result = pd.concat(parts, axis=1)
    result.columns = [
        "last_buying_sum",
        "last_buying_day",
        "last_buying_day_3",
        "last_buying_day_4",
        "last_buying_day_5",
        "last_buying_day_6",
        "last_buying_day_7",
        "last_buying_day_8",
    ]
    return result


def generate_factors(df):
    """Run every feature builder on ``df`` and join the results column-wise.

    The eight builders are executed in a fixed order under a ``tqdm``
    progress bar that advances once per finished builder. The returned frame
    is the column-wise concatenation of their outputs in that same order.
    """
    builders = [
        lambda: generate_full_factors(df),
        lambda: generate_nonzero_factors(df),
        lambda: generate_n_last_week_factors(df),
        lambda: generate_n_last_week_factors(df, n=2),
        lambda: generate_n_last_week_factors(df, n=3),
        lambda: generate_first_expense_factors(df),
        lambda: generate_prob_week_factors(df),
        lambda: generate_last_buying_factors(df),
    ]
    blocks = []
    with tqdm(total=8) as bar:
        for build in builders:
            blocks.append(build())
            bar.update()
    return pd.concat(blocks, axis=1)

### загрузка

In [None]:
# Read the raw table and reshape it to wide form: one row per 'id', one column
# per 'date', cell value taken from 'sum'. (id, date) pairs that are absent
# from the file become 0.
df_data = pd.read_csv("train2.csv.xls").pivot(index='id', columns='date', values='sum').fillna(0)

### очистка

In [None]:
# Overwrite column 269 with the values of column 270 (in place).
# NOTE(review): the reason is not recorded in this notebook - presumably
# date 269 is anomalous in the raw data; confirm.
df_data.loc[:, 269] = df_data.loc[:, 270]

### разделение

In [None]:
# Chain of splits: each call separates its input into (features, targets) and
# the feature part becomes the input of the next split, producing four nested
# (history, target) pairs.
# NOTE(review): utils.split_data_targets is presumably the same function as the
# split_data_targets defined in this notebook (last 7 columns -> target);
# confirm against utils.py.
df_test, targets_test = utils.split_data_targets(df_data)
df_train, targets_train = utils.split_data_targets(df_test)
df_train_2, targets_train_2 = utils.split_data_targets(df_train)
df_train_3, targets_train_3 = utils.split_data_targets(df_train_2)

### извлечение факторов

In [None]:
# Feature matrix and label array for the first training window.
X_train = utils.generate_factors(df_train)
y_train = targets_train.values

In [None]:
# Feature matrix and label array for the second training window.
X_train_2 = utils.generate_factors(df_train_2)
y_train_2 = targets_train_2.values

In [None]:
# Feature matrix and label array for the third training window.
# NOTE(review): X_train_3 / y_train_3 are not used by the model cell visible
# in this notebook.
X_train_3 = utils.generate_factors(df_train_3)
y_train_3 = targets_train_3.values

In [None]:
# Feature matrix and label array for the window split off the full data set.
X_test = utils.generate_factors(df_test)
y_test = targets_test.values

In [None]:
# Features built from the complete history; there is no target for this
# window, it is the input for the final prediction.
X = utils.generate_factors(df_data)

### models

In [None]:
# Gradient-boosted classifier with a fixed random_state.
xgb = XGBClassifier(
    base_score=0.35, 
    colsample_bytree=1, learning_rate=0.01, max_delta_step=0,
    max_depth=5, n_estimators=750, n_jobs=-1, subsample=1.0, random_state=1134,
)
# Stack three of the labelled windows (test, train, train_2) row-wise into a
# single training set; the "test" window is included, so no labelled data is
# held out for evaluation in this cell.
X_conc = np.concatenate([X_test.values, X_train.values, X_train_2.values])
y_conc = np.concatenate([y_test, y_train, y_train_2])
xgb.fit(X_conc, y_conc)

In [None]:
# Predict one class label per row of the full-history feature matrix.
preds_xgb = xgb.predict(X.values)

In [None]:
# Two-row frame (ids, predictions) transposed into two columns and cast to int.
sub = pd.DataFrame([df_data.index, preds_xgb]).T.astype(int)

In [None]:
# Name the two columns of the submission frame.
sub.columns = ['id', 'sum']

In [None]:
# Write the submission to disk without the positional index column.
sub.astype(int).to_csv("sub_xgb.csv", index=False)