In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from xgboost import XGBClassifier, XGBRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import Imputer, MinMaxScaler
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.neighbors import KNeighborsClassifier
import utils
from importlib import reload

reload(utils)
%matplotlib inline

In [7]:
import numpy as np
import pandas as pd
from tqdm import tqdm


def split_data_targets(df):
    """Split a wide frame into history columns and a target from the last 7 columns.

    Args:
        df: DataFrame whose columns are ordered in time (one column per day).

    Returns:
        (history, target): ``df`` without its last 7 columns, and the Series
        produced by ``get_first_buying`` on those last 7 columns (the first
        non-zero value per row, 0 when all seven are zero).
    """
    horizon = 7
    history = df.iloc[:, :-horizon]
    held_out_week = df.iloc[:, -horizon:]
    return history, get_first_buying(held_out_week)


def calc_accuracy(y, answers):
    """Share of rows where the rounded, clipped prediction equals the label.

    Args:
        y: true labels.
        answers: raw predictions; they are clipped to [0, 16] and rounded
            before the comparison.

    Returns:
        Number of exact matches divided by ``len(y)``.
    """
    bounded = np.clip(answers, 0, 16)
    hits = (y == np.round(bounded))
    return hits.sum() / len(y)


def get_weights(columns, delta=1.2):
    """Per-day weights that grow with the week number, normalised to sum to 1.

    Day indices ``0 .. max(columns)`` are grouped into 7-day weeks aligned so
    that the final 7 indices form one complete week. Each day gets
    ``(week_number / n_days) ** delta``; the weights are normalised over all
    day indices and then looked up at ``columns``.

    Args:
        columns: integer day labels (array-like usable as a NumPy index).
        delta: exponent; 0 gives uniform weights, larger values favour
            recent weeks more strongly.

    Returns:
        NumPy array of weights, one per entry of ``columns``.
    """
    n_days = max(columns) + 1
    # Shift so that the week boundary falls right after the last day.
    shift = 7 - n_days % 7
    week_number = (np.arange(n_days) + shift) // 7 + 1
    raw = (week_number / week_number.size) ** delta
    return (raw / raw.sum())[columns]


def get_fraction(df, numbers={0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, delta=1.2):
    """Weighted share of days on which each value in ``numbers`` occurs.

    For every value ``v`` the result holds, per row, the sum of the day
    weights (see ``get_weights``) over the columns where ``df == v``.

    Args:
        df: DataFrame with integer day labels as columns.
        numbers: values to look for. A set is iterated in ascending order so
            that output column ``k`` always belongs to the k-th smallest
            value; any other iterable keeps its own order.
        delta: exponent forwarded to ``get_weights``.

    Returns:
        DataFrame indexed like ``df`` with one positional column (0, 1, ...)
        per value.
    """
    # Sets carry no ordering guarantee, yet callers index the result by
    # position (e.g. ``values[:, 1:]``), so make the order explicit.
    if isinstance(numbers, (set, frozenset)):
        numbers = sorted(numbers)
    # The weights depend only on the columns, not on the value being counted.
    day_weights = get_weights(df.columns.values, delta=delta)
    fractions = [((df == value) * day_weights).sum(axis=1) for value in numbers]
    return pd.concat(fractions, axis=1)


def get_first_buying(df):
    """Return, for each row, the first non-zero value in column order.

    Rows whose values sum to zero are reported as 0.

    Args:
        df: DataFrame of numeric values; columns are scanned left to right.

    Returns:
        Series indexed like ``df`` holding the first non-zero entry of each
        row, or 0 for rows without one.
    """
    first_nonzero = pd.Series(0, index=df.index)
    _df = df[df.sum(axis=1) != 0]
    # Without this guard an input with no purchases at all raised IndexError
    # (the original indexed position 0 of an empty array).
    if len(_df) == 0:
        return first_nonzero
    values = _df.values
    # argmax on a boolean row gives the position of its first True.
    first_col = (values != 0).argmax(axis=1)
    first_nonzero.loc[_df.index] = values[np.arange(len(values)), first_col]
    return first_nonzero


def get_last_buying_day(df):
    """Distance from the end of the frame to each row's last non-zero day.

    Args:
        df: DataFrame whose column labels are integer day numbers.

    Returns:
        Series indexed like ``df`` equal to ``max(columns) - last_day + 1``,
        where ``last_day`` is the label of the latest column with a non-zero
        value. Rows without any non-zero value use ``last_day = 0`` and
        therefore get ``max(columns) + 1``.
    """
    df_reverse = df.iloc[:, ::-1]
    day_labels = np.array(df_reverse.columns).reshape(1, -1)
    # Each cell holds its own day label where a purchase happened, else 0.
    df_days = np.repeat(day_labels, repeats=len(df), axis=0) * (df_reverse != 0)
    last_day = pd.Series(0, index=df.index)
    _df = df_days[df_days.sum(axis=1) != 0]
    # Guard: with no purchases anywhere the original indexed an empty array
    # and raised IndexError (e.g. when filtering on an amount nobody bought).
    if len(_df) > 0:
        values = _df.values
        # Columns are reversed, so the first non-zero cell is the latest day.
        first_col = (values != 0).argmax(axis=1)
        last_day.loc[_df.index] = values[np.arange(len(values)), first_col]
    return max(df_reverse.columns) - last_day + 1


def generate_full_factors(df, full_delta_mean=0.2, full_delta_std=1.0, full_delta_frac=2.2):
    """Whole-history features: weighted mean/std, value fractions and mode.

    Args:
        df: DataFrame with integer day labels as columns.
        full_delta_mean: ``get_weights`` exponent used for the mean.
        full_delta_std: ``get_weights`` exponent used for the std.
        full_delta_frac: ``get_weights`` exponent used for the fractions.

    Returns:
        DataFrame indexed like ``df`` with 15 columns: ``full_mean_weight``,
        ``full_std_weight``, ``full_frac_0_weight`` .. ``full_frac_11_weight``
        and ``full_nonzero_mode``.
    """
    day_labels = df.columns.values
    frac_block = get_fraction(df, delta=full_delta_frac)
    weighted_mean = (df * get_weights(day_labels, delta=full_delta_mean)).mean(axis=1)
    weighted_std = (df * get_weights(day_labels, delta=full_delta_std)).std(axis=1)
    # Ignore the first fraction column (value 0); +1 turns the position among
    # the remaining columns back into the value it stands for.
    nonzero_mode = pd.DataFrame(
        np.argmax(frac_block.values[:, 1:], axis=1) + 1, index=df.index)

    factors = pd.concat([weighted_mean, weighted_std, frac_block, nonzero_mode], axis=1)
    factors.columns = (
        ['full_mean_weight', 'full_std_weight']
        + ['full_frac_{}_weight'.format(value) for value in range(12)]
        + ['full_nonzero_mode']
    )
    return factors


def generate_nonzero_factors(df, nonzero_delta=1.6):
    """Weighted mean over the non-zero entries of each row.

    Zero cells are masked out (turned into NaN) before the weighted values
    are averaged, so they do not contribute to the mean.

    Args:
        df: DataFrame with integer day labels as columns.
        nonzero_delta: ``get_weights`` exponent.

    Returns:
        Single-column DataFrame ``nonzero_mean_weight`` indexed like ``df``.
    """
    purchases_only = df[df != 0]
    day_weights = get_weights(purchases_only.columns.values, delta=nonzero_delta)
    weighted_mean = (purchases_only * day_weights).mean(axis=1)
    return pd.DataFrame({'nonzero_mean_weight': weighted_mean})


def generate_n_last_week_factors(df, n=1):
    """Features of the n-th most recent 7-day window (n=1 is the last week).

    Args:
        df: DataFrame with integer day labels as columns.
        n: which week to look at, counted back from the latest column label.

    Returns:
        DataFrame indexed like ``df`` with columns ``last_week_{n}_mean``,
        ``last_week_{n}_noncount`` (number of zero days),
        ``last_week_{n}_nonzero_mean``, ``last_week_{n}_nonzero_mode`` and
        ``last_week_{n}_first_expense``.
    """
    last_day = max(df.columns)
    week_start = last_day - 7 * n + 1
    week_end = last_day - 7 * (n - 1)
    # Label-based slice: both ends are inclusive.
    week = df.loc[:, week_start:week_end]
    value_fractions = get_fraction(week)

    pieces = [
        week.mean(axis=1),
        (week == 0).sum(axis=1),
        week[week != 0].mean(axis=1),
        # Most frequent non-zero value: skip the zero column, shift back by 1.
        pd.DataFrame(np.argmax(value_fractions.values[:, 1:], axis=1) + 1, index=df.index),
        get_first_buying(week),
    ]
    factors = pd.concat(pieces, axis=1)
    suffixes = ("mean", "noncount", "nonzero_mean", "nonzero_mode", "first_expense")
    factors.columns = ["last_week_{}_{}".format(n, suffix) for suffix in suffixes]
    return factors


def generate_first_expense_factors(df, first_delta_frac=0.0, first_delta_mean=0.2):
    """Features built from the first non-zero value of every whole week.

    Leading columns are dropped so that the rest splits into 7-column weeks;
    ``get_first_buying`` is applied to each week and the per-week results
    form an intermediate frame from which the features are derived.

    Args:
        df: DataFrame with integer day labels as columns.
        first_delta_frac: ``get_weights`` exponent for the fractions that
            feed ``first_expense_nonzero_mode``.
        first_delta_mean: ``get_weights`` exponent for ``first_expense_mean``.

    Returns:
        DataFrame indexed like ``df`` with 12 columns: ``first_expense_mean``,
        ``first_expense_frac_0_weight`` .. ``first_expense_frac_8_weight``,
        ``first_expense_nonzero_mean`` and ``first_expense_nonzero_mode``.
    """
    n_leading = len(df.columns) % 7
    week_labels = df.columns[n_leading:].values.reshape(-1, 7)
    weekly_first = [get_first_buying(df.loc[:, labels]) for labels in week_labels]
    df_expense = pd.DataFrame(pd.concat(weekly_first, axis=1))

    mode_fractions = get_fraction(df_expense, delta=first_delta_frac)
    weighted_mean = (
        df_expense * get_weights(df_expense.columns.values, delta=first_delta_mean)
    ).mean(axis=1)
    # NOTE(review): these nine fraction columns use get_fraction's default
    # delta; first_delta_frac only influences the mode below. Confirm intended.
    value_fractions = get_fraction(df_expense, {0, 1, 2, 3, 4, 5, 6, 7, 8})
    nonzero_mean = df_expense[df_expense != 0].mean(axis=1)
    nonzero_mode = pd.DataFrame(
        np.argmax(mode_fractions.values[:, 1:], axis=1) + 1, index=df.index)

    df_factor = pd.concat(
        [weighted_mean, value_fractions, nonzero_mean, nonzero_mode], axis=1)
    df_factor.columns = (
        ["first_expense_mean"]
        + ['first_expense_frac_{}_weight'.format(value) for value in range(9)]
        + ["first_expense_nonzero_mean", "first_expense_nonzero_mode"]
    )
    return df_factor


def generate_prob_week_factors(df, prob_delta=0.2):
    """Chance that each of seven weekly slots holds the first purchase.

    Purchase indicators (``df != 0``) are weighted with ``get_weights`` and
    summed over every 7th column for each of seven slots. The slot totals are
    normalised per row into shares, and each share is multiplied by the
    product of ``1 - share`` over all earlier slots.

    Args:
        df: DataFrame with integer day labels as columns.
        prob_delta: ``get_weights`` exponent.

    Returns:
        DataFrame indexed like ``df`` with columns ``week_prob_1`` ..
        ``week_prob_7``.
    """
    weighted_presence = (df != 0) * get_weights(df.columns.values, delta=prob_delta)
    remainder = len(weighted_presence.columns) % 7
    # NOTE(review): the slice start is a column *label*; for slot 7 it equals
    # `remainder`, which is 0 when the column count is a multiple of 7 —
    # confirm that label exists / selects the intended day-of-week.
    slot_sums = [
        weighted_presence.loc[:, slot % 7 + remainder::7].sum(axis=1)
        for slot in np.arange(1, 8)
    ]
    slot_totals = pd.DataFrame(pd.concat(slot_sums, axis=1))

    # Small constant avoids division by zero for rows with no purchases.
    row_total = slot_totals.sum(axis=1).values.reshape(-1, 1) + 1e-6
    share = slot_totals.values / row_total
    # Probability that none of the earlier slots fired (1 for the first slot).
    none_before = np.concatenate(
        [np.ones((len(share), 1)), np.cumprod(1 - share, axis=1)[:, :-1]], axis=1)

    df_factor = pd.DataFrame(share * none_before, index=df.index)
    df_factor.columns = ["week_prob_{}".format(slot) for slot in range(1, 8)]
    return df_factor


def generate_last_buying_factors(df):
    """Recency features: latest non-zero value and days since last purchase.

    Args:
        df: DataFrame with integer day labels as columns.

    Returns:
        DataFrame indexed like ``df`` with columns ``last_buying_sum`` (first
        non-zero value when scanning columns from the end),
        ``last_buying_day`` (``get_last_buying_day`` on all purchases) and
        ``last_buying_day_3`` .. ``last_buying_day_8`` (the same measure
        restricted to cells equal to that amount).
    """
    tracked_amounts = range(3, 9)
    pieces = [
        get_first_buying(df.iloc[:, ::-1]),
        get_last_buying_day(df),
    ]
    # Keep only the cells equal to `amount`; everything else becomes 0.
    pieces.extend(get_last_buying_day(df * (df == amount)) for amount in tracked_amounts)

    df_factor = pd.concat(pieces, axis=1)
    df_factor.columns = (
        ["last_buying_sum", "last_buying_day"]
        + ["last_buying_day_{}".format(amount) for amount in tracked_amounts]
    )
    return df_factor


def generate_factors(
        df,
        full_delta_mean=1.2, full_delta_std=0.6, full_delta_frac=1.2,
        nonzero_delta=1.2,
        first_delta_frac=0.5, first_delta_mean=0.2,
        prob_delta=0.2
):
    """Build the full feature matrix by concatenating all factor groups.

    The groups are computed in a fixed order (full history, non-zero mean,
    last three weeks, first-expense, weekly probabilities, last-buying) while
    a tqdm bar advances once per group.

    Args:
        df: DataFrame with integer day labels as columns.
        full_delta_mean, full_delta_std, full_delta_frac: forwarded to
            ``generate_full_factors``.
        nonzero_delta: forwarded to ``generate_nonzero_factors``.
        first_delta_frac, first_delta_mean: forwarded to
            ``generate_first_expense_factors``.
        prob_delta: forwarded to ``generate_prob_week_factors``.

    Returns:
        DataFrame indexed like ``df`` with the columns of every group side by
        side.
    """
    builders = [
        lambda: generate_full_factors(df, full_delta_mean, full_delta_std, full_delta_frac),
        lambda: generate_nonzero_factors(df, nonzero_delta),
        lambda: generate_n_last_week_factors(df),
        lambda: generate_n_last_week_factors(df, n=2),
        lambda: generate_n_last_week_factors(df, n=3),
        lambda: generate_first_expense_factors(df, first_delta_frac, first_delta_mean),
        lambda: generate_prob_week_factors(df, prob_delta),
        lambda: generate_last_buying_factors(df),
    ]
    blocks = []
    with tqdm(total=8) as bar:
        for build in builders:
            blocks.append(build())
            bar.update()
    return pd.concat(blocks, axis=1)


### загрузка

In [8]:
# Long (id, date, sum) records -> wide frame: one row per id, one column per
# date; (id, date) pairs missing from the file become 0.
df_data = (
    pd.read_csv("data/train2.csv.xls")
    .pivot(index='id', columns='date', values='sum')
    .fillna(0)
)

### очистка

In [9]:
# Overwrite the column for day 269 with a copy of day 270 (all ids).
# NOTE(review): presumably day 269 is anomalous in the raw data — confirm why
# it is patched this way. The assignment mutates df_data in place.
df_data.loc[:, 269] = df_data.loc[:, 270]

### разделение

In [10]:
# Peel 7-day windows off the end of the timeline, one per call: each call
# returns the input without its last 7 columns plus the target derived from
# those 7 columns. The chain is order-dependent — every split consumes the
# history produced by the previous one.
# "test" is the most recent window; train, train_2 .. train_4 each end 7 days
# earlier than the one before.
df_test, targets_test = split_data_targets(df_data)
df_train, targets_train = split_data_targets(df_test)
df_train_2, targets_train_2 = split_data_targets(df_train)
df_train_3, targets_train_3 = split_data_targets(df_train_2)
df_train_4, targets_train_4 = split_data_targets(df_train_3)

### извлечение факторов

In [11]:
# Features and labels for the window that ends 7 days before the test window.
X_train = generate_factors(df_train)
y_train = targets_train.values

100%|██████████| 8/8 [00:20<00:00,  4.28s/it]


In [12]:
# Features and labels for the window that ends 14 days before the test window.
X_train_2 = generate_factors(df_train_2)
y_train_2 = targets_train_2.values

100%|██████████| 8/8 [00:21<00:00,  4.34s/it]


In [13]:
# Features and labels for the window that ends 21 days before the test window.
X_train_3 = generate_factors(df_train_3)
y_train_3 = targets_train_3.values

100%|██████████| 8/8 [00:20<00:00,  4.15s/it]


In [14]:
# Features and labels for the window that ends 28 days before the test window.
X_train_4 = generate_factors(df_train_4)
y_train_4 = targets_train_4.values

100%|██████████| 8/8 [00:19<00:00,  3.95s/it]


In [15]:
# Most recent labelled window: history is everything except the last 7 days
# of df_data, the label comes from those last 7 days.
X_test = generate_factors(df_test)
y_test = targets_test.values

100%|██████████| 8/8 [00:20<00:00,  4.22s/it]


In [16]:
# Features over the complete history; no label exists for this window — it is
# the input for the final predictions.
X = generate_factors(df_data)

100%|██████████| 8/8 [00:21<00:00,  4.44s/it]


In [1]:
# One shared hyper-parameter set for the four window models, so a later tweak
# cannot be applied to some of them and forgotten on the others.
# Requires the import cell (lightgbm as lgb) to have been run first.
lgbm_params = dict(
    n_estimators=500, max_depth=6, learning_rate=0.01,
    num_leaves=31, n_jobs=-1, random_state=1134,
)

# Oldest window first; each model is trained on a single 7-day-shifted window.
gbm_4 = lgb.LGBMClassifier(**lgbm_params)
gbm_4.fit(X_train_4.values, y_train_4)

gbm_3 = lgb.LGBMClassifier(**lgbm_params)
gbm_3.fit(X_train_3.values, y_train_3)

gbm_2 = lgb.LGBMClassifier(**lgbm_params)
gbm_2.fit(X_train_2.values, y_train_2)

gbm_1 = lgb.LGBMClassifier(**lgbm_params)
gbm_1.fit(X_train.values, y_train)

NameError: name 'lgb' is not defined

In [18]:
# Hold-out accuracy of every window model on the most recent labelled window.
# predict_proba columns follow each model's `classes_` order, so the arg-max
# column is mapped back to its label instead of assuming the labels are
# exactly 0..K-1 (a label missing from one training window would otherwise
# shift every later class by one).
gbm_preds_test_4 = gbm_4.predict_proba(X_test.values)
print(utils.calc_accuracy(y_test, gbm_4.classes_[np.argmax(gbm_preds_test_4, axis=1)]))

gbm_preds_test_3 = gbm_3.predict_proba(X_test.values)
print(utils.calc_accuracy(y_test, gbm_3.classes_[np.argmax(gbm_preds_test_3, axis=1)]))

gbm_preds_test_2 = gbm_2.predict_proba(X_test.values)
print(utils.calc_accuracy(y_test, gbm_2.classes_[np.argmax(gbm_preds_test_2, axis=1)]))

gbm_preds_test_1 = gbm_1.predict_proba(X_test.values)
print(utils.calc_accuracy(y_test, gbm_1.classes_[np.argmax(gbm_preds_test_1, axis=1)]))

0.399
0.399109090909
0.399927272727
0.398218181818


In [32]:
# Final model: trained on the most recent labelled window (the one used as
# hold-out above), so it has no unseen labelled data left to be scored on.
# random_state is pinned like the other four models for reproducible re-runs.
gbm_0 = lgb.LGBMClassifier(n_estimators=500, max_depth=6, learning_rate=0.01, num_leaves=31, n_jobs=-1, random_state=1134)
gbm_0.fit(X_test.values, y_test)

# Class probabilities of all five models on the full-history features.
gbm_preds_4 = gbm_4.predict_proba(X.values)
gbm_preds_3 = gbm_3.predict_proba(X.values)
gbm_preds_2 = gbm_2.predict_proba(X.values)
gbm_preds_1 = gbm_1.predict_proba(X.values)
gbm_preds_0 = gbm_0.predict_proba(X.values)

In [33]:
# Keep the first 11 probability columns of every model.
# NOTE(review): this assumes all five models share the same class order in
# those columns — confirm via each model's classes_.
p0, p1, p2, p3, p4 = (
    proba[:, :11]
    for proba in (gbm_preds_0, gbm_preds_1, gbm_preds_2, gbm_preds_3, gbm_preds_4)
)
# Weighted blend, newest model weighted most.
# NOTE(review): the weights add up to 1.2, so `p` is not a normalised
# probability; the arg-max is unaffected, the saved values are.
p = sum(
    weight * part
    for weight, part in ((0.5, p0), (0.3, p1), (0.3, p2), (0.05, p3), (0.05, p4))
)

In [42]:
# Column index of the largest blended score per row.
preds = p.argmax(axis=1)

In [34]:
# Persist the blended per-class scores (one row per id, in df_data order;
# the ids themselves are not written because index=False).
pdf = pd.DataFrame(p)
pdf.to_csv("notebooks/4lgbm_500.csv", index=False)

In [35]:
# Submission file: one row per id with the arg-max column of the blend as the
# predicted 'sum'.
sub = pd.DataFrame({
    'id': df_data.index,
    'sum': np.argmax(p, axis=1),
}).astype(int)
sub.to_csv("sub_4lgbm_500.csv", index=False)