# In [None]:
import datetime

import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
import dill


def filter_data(df):
    """Return a copy of ``df`` without the 'ID' and 'Preferred Element' columns.

    The input frame is not modified. pandas raises ``KeyError`` if either
    column is missing from ``df``.
    """
    unwanted = ['ID', 'Preferred Element']
    trimmed = df.copy()
    return trimmed.drop(columns=unwanted)


def outliers(df):
    """Cap extreme 'Height' values at the rounded 1.5 * IQR fences.

    Values strictly below ``q25 - 1.5 * iqr`` are replaced by that fence
    rounded to the nearest integer; values strictly above
    ``q75 + 1.5 * iqr`` are replaced by the rounded upper fence. The fences
    are computed from the 'Height' column of the frame that is passed in.

    Args:
        df: DataFrame containing a numeric 'Height' column.

    Returns:
        A new DataFrame with the capped 'Height' column. The input frame is
        left unmodified. If the fences cannot be computed (empty or all-NaN
        column), an unchanged copy is returned.
    """
    # Work on a copy so the caller's frame is never mutated in place.
    capped = df.copy()

    def calculate_outliers(series):
        """Return the (lower, upper) 1.5 * IQR fences of ``series``."""
        q25 = series.quantile(0.25)
        q75 = series.quantile(0.75)
        iqr = q75 - q25
        return q25 - 1.5 * iqr, q75 + 1.5 * iqr

    lower, upper = calculate_outliers(capped['Height'])

    # round(nan) raises ValueError; with no usable fences there is nothing to cap.
    if pd.isna(lower) or pd.isna(upper):
        return capped

    capped.loc[capped['Height'] < lower, 'Height'] = round(lower)
    capped.loc[capped['Height'] > upper, 'Height'] = round(upper)
    return capped


def main():
    """Train candidate classifiers, persist the best one, and write predictions.

    Steps:
      1. Read 'train.csv' and split it into features and the 'House' target.
      2. Build a preprocessing chain: drop unused columns, cap 'Height'
         outliers, impute/scale numeric columns, impute/one-hot encode
         object columns.
      3. Score each candidate model with 4-fold cross-validated accuracy and
         keep the pipeline with the highest mean score.
      4. Refit the winner on all training data and dump it, with metadata,
         to 'train_model.pkl' using dill.
      5. Predict on 'Kaggle_test.csv' and write 'prediction.csv' with the
         columns 'ID' and 'House'.

    Raises:
        RuntimeError: if no candidate produced a comparable CV score.
    """
    train_df = pd.read_csv('train.csv')
    X = train_df.drop('House', axis=1)
    y = train_df['House']

    # NOTE: both steps are stateless FunctionTransformers, so the outlier
    # fences are recomputed from whatever frame is being transformed.
    preprocessor = Pipeline(steps=[
        ('drop_columns', FunctionTransformer(filter_data)),
        ('outliers', FunctionTransformer(outliers))
    ])

    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # Ignore categories unseen during fit: a CV fold or the Kaggle test
        # set may contain a level absent from the training split, and the
        # default handle_unknown='error' would abort scoring/prediction.
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])

    transformer = ColumnTransformer(transformers=[
        ('numerical_transformer', numerical_transformer,
         make_column_selector(dtype_include=['int64', 'float64'])),
        ('categorical_transformer', categorical_transformer,
         make_column_selector(dtype_include=object))
    ])

    models = [
        DecisionTreeClassifier(),
        RandomForestClassifier(),
        LogisticRegression(),
        MLPClassifier(),
        SVC()
    ]

    best_pipe = None
    # Start below any attainable accuracy so a model scoring 0.0 can still win.
    best_score = float('-inf')
    for model in models:
        pipe = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('transformer', transformer),
            ('modeling', model)
        ])

        score = cross_val_score(pipe, X, y, cv=4, scoring='accuracy', error_score='raise')
        mean_score = score.mean()
        print(f'model:{type(model).__name__}, acc_mean:{mean_score:.4f}, acc_std:{score.std():.4f}')
        if mean_score > best_score:
            best_score = mean_score
            best_pipe = pipe

    if best_pipe is None:
        raise RuntimeError('no candidate model produced a valid cross-validation score')

    # cross_val_score fits clones only, so the winner is refit on all data here.
    best_pipe.fit(X, y)
    best_model_name = type(best_pipe.named_steps["modeling"]).__name__
    print(f'best model: {best_model_name}, accuracy: {best_score:.4f}')
    with open('train_model.pkl', 'wb') as file:
        dill.dump({
            'model': best_pipe,
            'metadata': {
                'name': 'train_model.pkl',
                'author': 'Bogomaz Ekaterina',
                'date': datetime.datetime.now(),
                'type': best_model_name,
                'accuracy': best_score
            }
        }, file)

    df_test = pd.read_csv('Kaggle_test.csv')
    # Separate name so the training target `y` is not overwritten.
    predictions = best_pipe.predict(df_test)
    predict_dict = pd.DataFrame({'ID': df_test.ID, 'House': predictions})
    predict_dict.to_csv('prediction.csv', index=False)


# Run the training workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()