In [None]:
import pandas as pd
import copy

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import GradientBoostingClassifier

import warnings
warnings.filterwarnings('ignore')

# =======================================================================================================
# преобразуем входной датасет:
def work_with_info(input_file, out_file):
    """Copy a text file, deleting every 'u', 'A', 'B', 'C' and 'D' character.

    Args:
        input_file: Path of the text file to read.
        out_file: Path of the file the stripped text is written to.

    Returns:
        ``out_file``, unchanged, so the result can be passed straight to a
        loader.
    """
    # A single translation table removes all unwanted characters in one pass.
    strip_table = str.maketrans('', '', 'uABCD')

    with open(input_file, 'r') as source:
        cleaned = source.read().translate(strip_table)

    with open(out_file, 'w') as target:
        target.write(cleaned)

    return out_file


# precision, recall, f1-score and accuracy
def report_about_work_model(y_train, y_pred, idx_test):
    """Print the classification report for the labels from ``idx_test`` onwards.

    Args:
        y_train: Sequence of true labels; only ``y_train[idx_test:]`` is
            compared with the predictions.
        y_pred: Predicted labels for that slice.
        idx_test: Position in ``y_train`` where the evaluated slice starts.
    """
    evaluated_truth = y_train[idx_test:]
    class_names = ['Female', 'Male']
    print(classification_report(evaluated_truth, y_pred, target_names=class_names))


def created_model(x_train_sparse, x_test_sparse, y_train, idx_test):
    """Evaluate a gradient-boosting classifier on held-out rows, then refit on all rows.

    Args:
        x_train_sparse: Feature rows for the evaluation fit; must line up with
            ``y_train[:idx_test]``.
        x_test_sparse: Feature rows held out for the printed report; must line
            up with ``y_train[idx_test:]``.
        y_train: Labels of the training rows followed by the held-out rows.
        idx_test: Position in ``y_train`` where the held-out labels start.

    Returns:
        The ``GradientBoostingClassifier`` fitted on the training and held-out
        rows together.
    """
    model = GradientBoostingClassifier()

    # Fit on the training part only so the held-out rows stay unseen for the
    # report below.
    model.fit(x_train_sparse, y_train[:idx_test])
    y_pred = model.predict(x_test_sparse)

    # Report on the held-out rows (precision / recall / f1-score).
    print('------------------------------------------\nОтчёт по работе модели:')
    report_about_work_model(y_train, y_pred, idx_test)
    print('------------------------------------------\n')

    # Final fit on every labelled row. Slicing ``x_train_sparse[:idx_test]``
    # here would only select the training part again, so the held-out rows are
    # stacked back explicitly before refitting.
    all_rows = _stack_feature_rows(x_train_sparse, x_test_sparse)
    model.fit(all_rows, y_train)

    return model


def _stack_feature_rows(top, bottom):
    """Return ``top`` and ``bottom`` joined row-wise, keeping the container kind."""
    if isinstance(top, pd.DataFrame):
        return pd.concat([top, bottom])
    if hasattr(top, 'tocsr'):
        # SciPy sparse matrices need their own stacking routine.
        from scipy import sparse
        return sparse.vstack([top, bottom], format='csr')
    import numpy as np
    return np.vstack([top, bottom])


def model_preparation(source_csv='product_new.csv',
                      cleaned_csv='dataset_with_gender.csv',
                      idx_all=33454,
                      holdout_size=2000):
    """Load the labelled dataset, split features/labels and train the model.

    Args:
        source_csv: Raw CSV handed to ``work_with_info`` for clean-up.
        cleaned_csv: Path the cleaned CSV is written to and then loaded from.
        idx_all: Number of leading rows that carry a ``gender`` label.
        holdout_size: Number of trailing labelled rows kept aside for the
            accuracy report.

    Returns:
        The model returned by ``created_model``.

    Raises:
        ValueError: If the CSV has fewer than ``idx_all`` rows, or if
            ``holdout_size`` is not strictly between 0 and ``idx_all``.
    """
    file_name = work_with_info(source_csv, cleaned_csv)
    new_df = pd.read_csv(file_name)

    if len(new_df) < idx_all:
        raise ValueError(
            f'expected at least {idx_all} labelled rows, got {len(new_df)}'
        )
    if not 0 < holdout_size < idx_all:
        raise ValueError(
            f'holdout_size must be between 1 and {idx_all - 1}, got {holdout_size}'
        )

    # Encode the target in one vectorised step: 'female' -> -1, anything else
    # -> +1. Element-wise chained assignment (``df['gender'][i] = ...``) is
    # slow and is not guaranteed to write back into the frame.
    is_female = new_df['gender'].iloc[:idx_all] == 'female'
    y_train = is_female.map({True: -1, False: 1})

    # Every remaining column is used as a feature.
    features = new_df.drop(['gender'], axis=1)

    # The last ``holdout_size`` labelled rows are reserved for the report.
    idx_test = idx_all - holdout_size
    x_train_sparse = features.iloc[:idx_test]
    x_test_sparse = features.iloc[idx_test:idx_all]

    return created_model(x_train_sparse, x_test_sparse, y_train, idx_test)


def predict_for_one_session(my_model, ses, df):
    """Predict with ``my_model`` for the rows of ``df`` that belong to one session.

    Args:
        my_model: Object exposing ``predict(rows)``.
        ses: Session identifier string; its first character is dropped and the
            remainder is parsed as an integer.
        df: DataFrame with a ``session_id`` column.

    Returns:
        Whatever ``my_model.predict`` returns for the matching rows.
    """
    # Drop the one-character prefix to get the numeric session id.
    session_number = int(ses[1:])
    matching_rows = df[df['session_id'] == session_number]
    return my_model.predict(matching_rows)
