In [1]:
# Library imports
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
try:
    # GridSearchCV has lived in sklearn.model_selection since scikit-learn 0.18.
    from sklearn.model_selection import GridSearchCV
except ImportError:
    # sklearn.grid_search was removed in scikit-learn 0.20; kept only as a
    # fallback for very old installations.
    from sklearn.grid_search import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

import lightgbm as lgbm



In [2]:
# Input CSV locations: the prepared feature tables and the additional per-email action logs.
train_prepared, test_prepared = 'train_prepared.csv', 'test_prepared.csv'
train_additional, test_additional = 'train_additional_data.csv', 'test_additional_data.csv'

In [None]:
# Sanity check on the prepared training table: is there any NaN at all,
# and what is the share of each value of the 'label' column?
df = pd.read_csv(train_prepared)
print(df.isna().values.any())
print(df['label'].value_counts(normalize=True))

In [None]:
# Sanity check on the additional training table: which columns contain NaN,
# and what is the share of each value of the 'action' column?
df_add = pd.read_csv(train_additional)
print(df_add.isnull().any())
print(df_add['action'].value_counts(normalize=True))

In [None]:
# Prepare the train_prepared data: add and drop some features, then join the additional data.
def train(file):
    """Load the prepared training table and derive the ``days_dif`` feature.

    Parameters
    ----------
    file : str or file-like
        CSV with at least the columns ``email``, ``label``, ``cut_date``,
        ``first_date`` and ``last_date``.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``email`` with ``label`` cast to int, the three date
        columns removed, and ``days_dif`` (``last_date - first_date`` in days,
        truncated to an integer) inserted at column position 5.
    """
    date_cols = ['cut_date', 'first_date', 'last_date']
    frame = pd.read_csv(file, parse_dates=date_cols).set_index('email')
    frame['label'] = frame['label'].astype(int)

    # Express the timedelta in days before truncating to an integer.
    activity_span = frame['last_date'] - frame['first_date']
    whole_days = (activity_span / np.timedelta64(1, 'D')).astype(int)

    frame = frame.drop(columns=date_cols)
    frame.insert(5, 'days_dif', whole_days)
    return frame

def train_add(file):
    """Load the additional training table and keep the latest record per email.

    Parameters
    ----------
    file : str or file-like
        CSV with at least the columns ``email``, ``calday``, ``action`` and
        ``trial``.

    Returns
    -------
    pandas.DataFrame
        One row per ``email`` (the index): the row with the greatest
        ``calday``, with ``trial`` NaNs replaced by 0, ``action`` encoded as
        CREATE -> 1 / DELETE -> 0, and ``calday`` dropped.

    Raises
    ------
    ValueError
        If ``action`` contains anything other than ``CREATE`` or ``DELETE``.
    """
    df = pd.read_csv(file)
    df = df.set_index('email')
    df['trial'] = df['trial'].fillna(0).astype(int)

    action_int = {'CREATE': 1, 'DELETE': 0}
    action_codes = df['action'].map(action_int)
    unmapped = action_codes.isna()
    if unmapped.any():
        # Fail with a readable message instead of the opaque NaN-to-int cast error.
        unknown = sorted(set(df['action'][unmapped.values].astype(str)))
        raise ValueError(
            "unexpected 'action' values (expected CREATE or DELETE): {}".format(unknown)
        )
    df['action'] = action_codes.astype(int)

    # A stable sort makes the result deterministic when an email has several
    # rows with the same calday: the one appearing last in the file wins.
    latest = df.sort_values('calday', kind='mergesort').groupby('email').tail(1)

    # Non-inplace drop: `latest` is derived from `df`, so mutating it in place
    # is the classic SettingWithCopy pattern.
    return latest.drop(columns=['calday'])

def train_merging(df, df_add):
    """Join the prepared table with the additional table on their indexes.

    Parameters
    ----------
    df : pandas.DataFrame
        Left table.
    df_add : pandas.DataFrame
        Right table; must share index values with ``df``.

    Returns
    -------
    pandas.DataFrame
        Inner join of both frames on the index, so only index values present
        in both inputs are kept.
    """
    # how='inner' is pandas' default; spelled out because rows missing from
    # either side are silently discarded.
    return pd.merge(df, df_add, how='inner', left_index=True, right_index=True)

def train_for_models(file_1, file_2):
    """Build the training feature matrix and target vector.

    Parameters
    ----------
    file_1 : str
        Path passed to ``train`` (the prepared table).
    file_2 : str
        Path passed to ``train_add`` (the additional table).

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the merged frame without the ``label``
        column and ``y`` is the ``label`` column.
    """
    # Load and merge once; previously both CSVs were read and merged twice,
    # once for X and once for y.
    merged = train_merging(train(file_1), train_add(file_2))
    X = merged.drop('label', axis=1)
    y = merged['label']

    return X, y

# Smoke check: build (X, y) from the training files and print both.
print(train_for_models(train_prepared, train_additional))

In [None]:
# Same preparation for the test data
def test(file):
    df = pd.read_csv(file, parse_dates=['cut_date', 'first_date', 'last_date'])
    df = df.set_index('email')
    days_dif = ((df['last_date'] - df['first_date']) / np.timedelta64(1, 'D')).astype(int)
    df.drop(columns=['cut_date', 'first_date', 'last_date'], axis=1, inplace=True)
    df.insert(5, 'days_dif', days_dif)    

    return df

def test_add(file):
    df = pd.read_csv(file)
    df = df.set_index('email')
    df['trial'] = df['trial'].fillna(0).astype('int32')
    action_int = {'CREATE': 1, 'DELETE': 0}
    df['action'] = df['action'].map(action_int).astype('int32')
    df_1 = df.sort_values('calday').groupby('email').tail(1)
    df_1.drop(columns=['calday'], axis=1, inplace=True)
    
    return df_1

def test_for_models(file_1, file_2):
    """Build the test feature matrix.

    Parameters
    ----------
    file_1 : str
        Path passed to ``test`` (the prepared table).
    file_2 : str
        Path passed to ``test_add`` (the additional table).

    Returns
    -------
    pandas.DataFrame
        Both tables merged on their indexes.
    """
    base_features = test(file_1)
    extra_features = test_add(file_2)
    return pd.merge(base_features, extra_features, left_index=True, right_index=True)

# Smoke check: build the test feature matrix from the test files and print it.
print(test_for_models(test_prepared, test_additional))

In [5]:
# Evaluate prediction quality with a k-nearest-neighbours classifier; hyperparameters are tuned with GridSearch
def knn_classifier(X, y, random_state=None):
    """Tune a k-nearest-neighbours classifier with a grid search and report ROC AUC.

    70% of the data is used for a 3-fold cross-validated grid search over
    ``n_neighbors``; the remaining 30% is used as a hold-out set.

    Parameters
    ----------
    X : array-like or pandas.DataFrame
        Feature matrix.
    y : array-like or pandas.Series
        Binary target.
    random_state : int or None, optional
        Seed for the train/hold-out split. ``None`` (the default) keeps the
        split random, as before.

    Returns
    -------
    KNeighborsClassifier
        The best estimator found by the search, refitted on the 70% training
        split.
    """
    knn_clf = KNeighborsClassifier()
    params_grid = {'n_neighbors': [3, 5, 7]}
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X, y, train_size=0.7, random_state=random_state
    )
    gs = GridSearchCV(knn_clf, params_grid, cv=3, verbose=2, scoring='roc_auc')
    gs.fit(X_train, y_train)
    print('KNeighbors Classifier Score:', gs.best_score_)
    print('KNeighbors Classifier Best Parameters:', gs.best_params_)

    # The hold-out split used to be created and then ignored. ROC AUC is only
    # defined when both classes are present, so guard against a one-class split.
    if np.unique(y_holdout).size > 1:
        holdout_proba = gs.predict_proba(X_holdout)[:, 1]
        print('KNeighbors Classifier Hold-out ROC AUC:', roc_auc_score(y_holdout, holdout_proba))

    # Return the tuned, fitted model rather than the untouched default estimator.
    return gs.best_estimator_

#X, y = train_for_models(train_prepared, train_additional)
#knn_classifier(X, y)

In [6]:
# Same for LightGBM. The score turned out to be significantly better
def lgbm_classifier(X, y, random_state=None):
    """Tune a LightGBM classifier with a grid search and report ROC AUC.

    70% of the data is used for a 3-fold cross-validated grid search over
    ``max_depth`` and ``learning_rate``; the remaining 30% is used as a
    hold-out set.

    Parameters
    ----------
    X : array-like or pandas.DataFrame
        Feature matrix.
    y : array-like or pandas.Series
        Binary target.
    random_state : int or None, optional
        Seed for the train/hold-out split. ``None`` (the default) keeps the
        split random, as before.

    Returns
    -------
    lightgbm.LGBMClassifier
        The best estimator found by the search, refitted on the 70% training
        split.
    """
    lgbm_clf = lgbm.LGBMClassifier(silent=False)
    params_grid = {
        'max_depth': [25, 50],
        'learning_rate': [0.01, 0.05],
    }
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X, y, train_size=0.7, random_state=random_state
    )
    gs = GridSearchCV(lgbm_clf, params_grid, cv=3, verbose=2, scoring='roc_auc')
    gs.fit(X_train, y_train)
    print('Light GBM Classifier Score:', gs.best_score_)
    print('Light GBM Classifier Best Parameters:', gs.best_params_)

    # The hold-out split used to be created and then ignored. ROC AUC is only
    # defined when both classes are present, so guard against a one-class split.
    if np.unique(y_holdout).size > 1:
        holdout_proba = gs.predict_proba(X_holdout)[:, 1]
        print('Light GBM Classifier Hold-out ROC AUC:', roc_auc_score(y_holdout, holdout_proba))

    # Return the tuned, fitted model rather than the untouched default estimator.
    return gs.best_estimator_

#X, y = train_for_models(train_prepared, train_additional)
#lgbm_classifier(X, y)

In [17]:
# Final prediction and submission
def final_model(train_prepared, train_additional, test_prepared, test_additional):
    """Train LightGBM on the training files and write the submission CSV.

    Parameters
    ----------
    train_prepared, train_additional : str
        Paths forwarded to ``train_for_models``.
    test_prepared, test_additional : str
        Paths forwarded to ``test_for_models``.

    Returns
    -------
    None
        The predictions are written to ``submission_uma_tech_task.csv``,
        indexed by ``email`` and rounded to 3 decimals.
    """
    X_train, y_train = train_for_models(train_prepared, train_additional)
    X_test = test_for_models(test_prepared, test_additional)

    # Align test columns to the training order. `days_dif` is inserted by
    # position while `label` is still present in the training frame, so the
    # two frames are not guaranteed to share a column order.
    X_test = X_test[X_train.columns]

    lgbm_model = lgbm.LGBMClassifier(silent=False, learning_rate=0.05, max_depth=25)
    lgbm_model.fit(X_train, y_train)

    # predict_proba columns follow `classes_`; for a 0/1 target column 1 is
    # P(label == 1), the score ROC AUC is computed on. Column 0 (used before)
    # is P(label == 0) and inverts the ranking.
    y_pred_lgbm = lgbm_model.predict_proba(X_test)[:, 1]

    # Same file layout as before: 'email' index plus a single column named 0.
    sub = pd.DataFrame(y_pred_lgbm, index=X_test.index.rename('email'))
    sub.round(3).to_csv('submission_uma_tech_task.csv')
    
# Run the full pipeline: train on the training files and write the submission CSV.
final_model(train_prepared, train_additional, test_prepared, test_additional)