In [None]:
# List the current working directory to see which files are available.
! ls

In [1]:
import json
import numpy as np

In [None]:
import os

import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# regression models
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor


In [None]:
from catboost import Pool, CatBoostRegressor

In [None]:
def load_dataset(path, is_test=False, y_col='UpLift'):
    """Read a CSV file and, unless it is a test file, split it into features and target.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file; passed straight to ``pd.read_csv``.
    is_test : bool, default False
        When true, the frame is returned whole, without any split.
    y_col : str, default 'UpLift'
        Name of the target column.

    Returns
    -------
    pandas.DataFrame
        The full frame, when ``is_test`` is true.
    tuple of (pandas.DataFrame, pandas.Series)
        ``(X, y)`` otherwise: ``X`` holds every column except ``y_col`` in their
        original order, ``y`` is the ``y_col`` column.

    Raises
    ------
    KeyError
        If ``is_test`` is false and ``y_col`` is not a column of the file.
    """
    df = pd.read_csv(path)
    if is_test:
        return df
    # Remove the target by name instead of assuming it is the last column:
    # a positional slice would leave the target inside X (and drop a real
    # feature) whenever the column order differs.
    return df.drop(columns=[y_col]), df[y_col]


def build_model(mdl):
    """Return a two-step pipeline: one-hot encoding followed by ``mdl``.

    The encoder is the first step, so it receives every input column; it is
    configured with ``handle_unknown='ignore'``. The estimator is registered
    under the step name ``"model"`` and the encoder under ``'onehot'``.
    """
    steps = [
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
        ("model", mdl),
    ]
    return Pipeline(steps)


def tune_model():
    """Grid-search the pipeline's hyperparameters with 5-fold cross-validation.

    Reads the notebook-level ``X_TRAIN`` / ``Y_TRAIN`` data, the ``MODEL``
    estimator and the ``GRID_PARAMS`` grid, then prints the best parameter
    combination and its cross-validated mean absolute error.
    """
    # X_TRAIN / Y_TRAIN already hold the loaded features and target, so they
    # are used directly; passing them to load_dataset() would treat the frame
    # as a file path and the target as the `is_test` flag.
    X_train, y_train = X_TRAIN, Y_TRAIN
    # build_model requires the estimator to wrap.
    model = build_model(MODEL)

    # The target is continuous, so "accuracy" is not a valid scorer here.
    # Negated MAE matches the metric used elsewhere in this notebook.
    gs = GridSearchCV(model, GRID_PARAMS, scoring="neg_mean_absolute_error",
                      n_jobs=-1, cv=5)
    gs.fit(X_train, y_train)

    print("Best Hyperparameters: {}".format(gs.best_params_))
    # best_score_ is the negated MAE; flip the sign to report the error itself.
    print("Best CV MAE: {:.4f}".format(-gs.best_score_))


def train_model(print_params=False):
    """Fit the pipeline on the training file and persist it with joblib.

    Loads features and target from the notebook-level ``PATH``, wraps the
    notebook-level ``MODEL`` with ``build_model``, applies ``PARAMS`` through
    ``set_params``, fits, and dumps the fitted pipeline to ``MODEL_NAME``.

    Parameters
    ----------
    print_params : bool, default False
        When true, print the pipeline's parameters before fitting.
    """
    X_train, y_train = load_dataset(PATH)

    # build_model requires the estimator to wrap; calling it with no argument
    # raises TypeError.
    model = build_model(MODEL)
    model.set_params(**PARAMS)

    if print_params:
        print(model.get_params())

    model.fit(X_train, y_train)

    joblib.dump(model, MODEL_NAME)


def test_model():
    """Load the persisted pipeline and print its MAE on held-out data.

    Reads the notebook-level ``X_TEST`` / ``Y_TEST`` and the model stored
    under ``MODEL_NAME``.
    """
    # NOTE(review): X_TEST / Y_TEST are not defined in the visible notebook;
    # they must be assigned (features and true targets) before calling this.
    # They are used directly, mirroring how X_TRAIN / Y_TRAIN hold loaded data:
    # passing them to load_dataset() would treat the features as a file path
    # and the targets as the `is_test` flag.
    X_test, y_test = X_TEST, Y_TEST
    # Only load model files you produced yourself: joblib.load unpickles
    # arbitrary objects.
    model = joblib.load(MODEL_NAME)

    y_pred = model.predict(X_test)

    # MAE is expressed in the target's own units, so it is reported as-is
    # rather than scaled by 100 and labelled as a percentage.
    print("MAE on the test set: {:.4f}".format(
        mean_absolute_error(y_test, y_pred)))

In [2]:
# submission prep 

def submission_prep(y_pred, offer_id_map_path, output_id=None):
    """Write predictions to a submission CSV ordered like the sample submission.

    Parameters
    ----------
    y_pred : sequence
        Predictions, aligned one-to-one with the Offer IDs obtained by
        iterating over the JSON content of ``offer_id_map_path``.
    offer_id_map_path : str
        Path to a JSON file holding the test Offer IDs in prediction order.
    output_id : optional
        Suffix of the output file (``./data/submission{output_id}.csv``).
        When ``None`` or an empty string, a random integer in [1000, 2000)
        is used instead. ``0`` is accepted as a regular suffix.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of Offer IDs, or
        if some row of the sample submission receives no prediction.
    KeyError
        If an Offer ID is not present in the sample submission.
    """
    if output_id is not None and output_id != '':
        output_path = f'./data/submission{output_id}.csv'
    else:
        output_path = f'./data/submission{np.random.randint(1000, 2000)}.csv'

    # Context manager so the JSON file handle is closed deterministically.
    with open(offer_id_map_path, 'r') as fin:
        offer_ids_test = json.load(fin)

    # zip() below would silently truncate to the shorter input, so a length
    # mismatch is rejected up front.
    if len(offer_ids_test) != len(y_pred):
        raise ValueError(
            f'Got {len(y_pred)} predictions for {len(offer_ids_test)} Offer IDs')

    sample_submission = pd.read_csv('./lentahack/20210521_sample_submission.csv')
    subm_offer_id_order = list(sample_submission['Offer_ID'])

    # Offer ID -> row position in the sample submission.
    order_map = {offer_id: pos for pos, offer_id in enumerate(subm_offer_id_order)}

    # Sized by the number of sample rows (not by the de-duplicated map) so
    # every stored position is a valid index.
    submission = [None] * len(subm_offer_id_order)
    for offer_id, pred in zip(offer_ids_test, y_pred):
        if offer_id not in order_map:
            raise KeyError(
                f'Offer_ID {offer_id!r} is not present in the sample submission')
        submission[order_map[offer_id]] = (offer_id, pred)

    # Fail with a clear message (and before creating the file) instead of
    # crashing on tuple-unpacking None while writing.
    missing = [oid for oid, row in zip(subm_offer_id_order, submission) if row is None]
    if missing:
        raise ValueError(
            f'{len(missing)} sample-submission row(s) received no prediction, '
            f'e.g. {missing[:5]}')

    print('Printing to', output_path)
    with open(output_path, 'w') as fout:
        print('Offer_ID,UpLift', file=fout)
        for o_id, y in submission:
            print(f'{o_id},{y}', file=fout)

In [None]:
# Baseline configuration: raw ("as is") train/test files and a default LightGBM model.
PATH = 'train_as_is.csv'
PATH_TEST = 'test_as_is.csv'
X_TRAIN, Y_TRAIN = load_dataset(PATH)
MODEL = LGBMRegressor()
# Explicit empty placeholders instead of IPython's `_` (last cell output),
# whose value depends on what happened to run before this cell.
# An empty grid / parameter dict keeps the estimator defaults; fill these in
# with pipeline parameter names (e.g. "model__<param>") before tuning.
GRID_PARAMS = {}
PARAMS = {}
MODEL_NAME = 'model_1'

In [None]:
# Load the training features and target from PATH for the quick baseline below.
X_tr, y_tr = load_dataset(PATH)

In [None]:
# Test frame read as-is; the file name is the same one PATH_TEST is set to in the config cell.
X_test = pd.read_csv('test_as_is.csv')

In [None]:
# Baseline pipeline (one-hot encoder + MODEL), built through the shared helper.
model = build_model(MODEL)

In [None]:
# Fit the baseline pipeline on the full training data.
model.fit(X_tr, y_tr)

In [None]:
# Sanity check: MAE measured on the same data the model was fitted on,
# so it is an optimistic figure rather than a generalisation estimate.
y_pred =  model.predict(X_tr)
mean_absolute_error(y_tr, y_pred)

In [None]:
# Predict on the test frame; this overwrites the training-set predictions held in y_pred.
y_pred = model.predict(X_test)

In [None]:
# Quick look: number of predictions and the first ten values.
len(y_pred), y_pred[:10]

## 1. Feature Set: Category Counts

In [None]:
# List the contents of the data directory.
!ls data

In [None]:
# Configuration for the category-count feature set.
PATH_TR = './data/train_category_ct.csv'
PATH_TEST = './data/test_category_ct.csv'
SEED = 42  # fixed seed so the train/validation split is the same on every run

X_all = pd.read_csv(PATH_TR)
X_tr_orig, y_tr_orig = load_dataset(PATH_TR)
X_test = load_dataset(PATH_TEST, is_test=True)

# Hold out 20% of the training rows for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_tr_orig, y_tr_orig, test_size=0.2, random_state=SEED)

MODEL = LGBMRegressor()
# Explicit empty placeholders instead of IPython's `_` (last cell output),
# whose value depends on what happened to run before this cell.
GRID_PARAMS = {}
PARAMS = {}
MODEL_NAME = 'model_LGBMRegressor'

#### LGBMRegressor

In [None]:
# LightGBM regressor with default settings behind the shared one-hot pipeline.
model = build_model(LGBMRegressor())

In [None]:
# Fit on the training split, then show (validation MAE, training MAE).
model.fit(X_train, y_train)
y_pred_val, y_pred_tr = model.predict(X_val), model.predict(X_train)
val_mae = mean_absolute_error(y_val, y_pred_val)
train_mae = mean_absolute_error(y_train, y_pred_tr)
val_mae, train_mae

In [None]:
# Refit on all training rows (the data before the train/validation split),
# predict the test frame and write the result to ./data/submission1.csv.
model.fit(X_tr_orig, y_tr_orig)
y_pred = model.predict(X_test)

submission_prep(y_pred, './data/test_Offer_ID_map_category_ct.json', 1)

In [None]:
# Additional candidate regressors, each with default settings and wrapped by build_model.
model1 = build_model(GradientBoostingRegressor()) 
model2 = build_model(AdaBoostRegressor()) 
model3 = build_model(XGBRegressor()) 
model4 = build_model(Lasso()) 

In [None]:
# Collect the candidates so they can be evaluated in a single loop.
models = [model1, model2, model3, model4]

In [None]:
# Index the pipeline by step name to display the wrapped estimator.
model1['model']

In [None]:
# Validation: fit every candidate on the training split and print the
# estimator followed by "validation MAE, training MAE".
for m in models:
    m.fit(X_train, y_train)
    y_pred_val, y_pred_tr = m.predict(X_val), m.predict(X_train)
    print(m["model"])
    val_mae = mean_absolute_error(y_val, y_pred_val)
    train_mae = mean_absolute_error(y_train, y_pred_tr)
    print(val_mae, train_mae)

In [None]:
# Show IPython help for CatBoostRegressor (interactive lookup only).
CatBoostRegressor?

In [None]:
# CatBoost regressor with default settings behind the shared one-hot pipeline.
model5 = build_model(CatBoostRegressor())

In [None]:
# Numbers noted from an earlier run (presumably validation MAE, training MAE):
# 3.811659175054392 2.3031921382709273
for m in [model5]:
    m.fit(X_train, y_train)
    y_pred_val, y_pred_tr = m.predict(X_val), m.predict(X_train)
    print(m["model"])
    val_mae = mean_absolute_error(y_val, y_pred_val)
    train_mae = mean_absolute_error(y_train, y_pred_tr)
    print(val_mae, train_mae)

In [None]:
# Refit the CatBoost pipeline on all training rows, predict the test frame
# and write the result to ./data/submission2.csv.
model5.fit(X_tr_orig, y_tr_orig)
y_pred = model5.predict(X_test)

submission_prep(y_pred, './data/test_Offer_ID_map_category_ct.json', 2)

## AzureML

In [None]:
!pip install --upgrade azureml-core azureml-sdk azureml-train-automl azureml-accel-models azureml-widgets azureml-opendatasets azureml-cli-common azureml-mlflow azureml-contrib-services
!pip install --upgrade nimbusml

In [3]:
from azureml.core.workspace import Workspace

In [2]:
# Check that the Azure ML configuration file is present.
!ls ./AzureML/configuration.yml

./AzureML/configuration.yml


In [4]:
# Create the Azure ML workspace in the existing resource group.
# exist_ok=True returns the existing workspace instead of failing, so the
# cell can be re-run after the workspace has been deployed once.
ws = Workspace.create(name='hackpromoworkspace8',
               subscription_id='<SUBSCRIPTION-ID>',
               resource_group='HackPromoLenta',
               create_resource_group=False,
               location='eastus2',
               exist_ok=True
               )

Deploying StorageAccount with name hackpromstoragee88382939.
Deploying KeyVault with name hackpromkeyvault6e86ad35.
Deploying AppInsights with name hackprominsightse478de9f.
Deployed AppInsights with name hackprominsightse478de9f. Took 10.55 seconds.
Deployed KeyVault with name hackpromkeyvault6e86ad35. Took 22.88 seconds.
Deployed StorageAccount with name hackpromstoragee88382939. Took 29.68 seconds.
Deploying Workspace with name hackpromoworkspace8.
Deployed Workspace with name hackpromoworkspace8. Took 72.52 seconds.


In [16]:
import logging

# Keyword settings that are unpacked into AutoMLConfig in a later cell.
automl_settings = {
    "iteration_timeout_minutes": 10,
    "experiment_timeout_hours": 1,
    "enable_early_stopping": True,
    "primary_metric": 'normalized_root_mean_squared_error',
    "featurization": 'auto',
    "verbosity": logging.INFO,
    "n_cross_validations": 5
}

In [20]:
# List the contents of the data directory.
!ls data

offers_df_agg.csv  submission4.csv		       train_category_ct.csv
submission1.csv    test_Offer_ID_map_category_ct.json  train_other_ofrs.csv
submission2.csv    test_category_ct.csv
submission3.csv    test_other_ofrs.csv


In [6]:
import pandas as pd
# Category-count feature set: read the train and test files as whole frames
# (no feature/target split).
PATH_TR = './data/train_category_ct.csv'
PATH_TEST = './data/test_category_ct.csv'
X_all = pd.read_csv(PATH_TR)
X_test = pd.read_csv(PATH_TEST)

In [21]:
# "Other offers" feature set: rebinds PATH_TR, PATH_TEST, X_all and X_test,
# replacing the category-count frames loaded in the previous cell.
# NOTE(review): cells that read X_all / X_test afterwards get whichever of the
# two loading cells ran last — confirm which feature set is intended.
PATH_TR = './data/train_other_ofrs.csv'
PATH_TEST = './data/test_other_ofrs.csv'
X_all = pd.read_csv(PATH_TR)
X_test = pd.read_csv(PATH_TEST)

In [17]:
from azureml.train.automl import AutoMLConfig
from azureml.core.experiment import Experiment

# Regression AutoML configuration: trains on X_all with "UpLift" as the label
# column; the remaining options come from automl_settings.
automl_config = AutoMLConfig(task='regression',
                             debug_log='automated_ml_errors.log',
                             training_data=X_all,
                             label_column_name="UpLift",
                             **automl_settings)

In [18]:
# Create the experiment in workspace `ws` and submit the AutoML configuration,
# printing progress output while it runs.
experiment = Experiment(ws, "uplift-cat-feats-MSRE-automl-1hr")
local_run = experiment.submit(automl_config, show_output=True)

No run_configuration provided, running on local with default configuration
Running in the active local environment.


Experiment,Id,Type,Status,Details Page,Docs Page
uplift-cat-feats-MSRE-automl-1hr,AutoML_1ec40eca-7359-42e2-9ed2-bda3a6463a97,automl,Preparing,Link to Azure Machine Learning studio,Link to Documentation


Current status: DatasetEvaluation. Gathering dataset statistics.
Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetFeaturizationCompleted. Completed fit featurizers and featurizing the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Missing feature values imputation
STATUS:       PASSED
DESCRIPTION:  No feature missing values were detected in the training data.
              Learn more about missing value imputation: https://aka.ms/AutomatedMLFeaturization

****************************************************************************************************

TYPE:         High cardinality feature detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed

INFO:interpret_community.common.explanation_utils:Using default datastore for uploads


Current status: EngineeredFeatureExplanations. Computation of engineered features completed
Current status: RawFeaturesExplanations. Computation of raw features started




Current status: RawFeaturesExplanations. Computation of raw features completed
Current status: BestRunExplainModel. Best run model explanations completed
****************************************************************************************************


In [10]:
# List the names currently defined in the interactive namespace.
%who

AutoMLConfig	 Experiment	 PATH_TEST	 PATH_TR	 Workspace	 X_all	 automl_config	 automl_settings	 experiment	 
local_run	 logging	 nimbusml	 os	 pd	 ws	 


In [11]:
from azureml.widgets import RunDetails
# Display the run-details widget for the submitted run.
RunDetails(local_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [15]:
# Show IPython help for best_run (interactive lookup only).
# NOTE(review): best_run is assigned in the cell below, so on a fresh
# top-to-bottom run this lookup happens before the name exists.
best_run?

In [19]:
# Retrieve the best run and its fitted model from the AutoML run, then print both.
best_run, fitted_model = local_run.get_output()
print(best_run)
print(fitted_model)

Run(Experiment: uplift-cat-feats-MSRE-automl-1hr,
Id: AutoML_1ec40eca-7359-42e2-9ed2-bda3a6463a97_30,
Type: None,
Status: Completed)
RegressionPipeline(pipeline=Pipeline(memory=None,
                                     steps=[('datatransformer',
                                             DataTransformer(enable_dnn=False, enable_feature_sweeping=True, feature_sweeping_config={}, feature_sweeping_timeout=86400, featurization_config=None, force_text_dnn=False, is_cross_validation=True, is_onnx_compatible=False, observer=None, task='regression', working_dir='/home/azureus...
)), ('elasticnet', ElasticNet(alpha=0.1061578947368421, copy_X=True, fit_intercept=True, l1_ratio=0.11421052631578947, max_iter=1000, normalize=False, positive=False, precompute=False, random_state=None, selection='cyclic', tol=0.0001, warm_start=False))], verbose=False)), ('18', Pipeline(memory=None, steps=[('maxabsscaler', MaxAbsScaler(copy=True)), ('decisiontreeregressor', DecisionTreeRegressor(ccp_alpha=0.0, cri

In [20]:
# Predict the test frame with the AutoML model and write ./data/submission1hr.csv.
# NOTE(review): the Offer_ID map used here is the category-count one; make sure
# X_test was loaded from the matching feature set so rows and IDs line up.
y_pred = fitted_model.predict(X_test)
submission_prep(y_pred, './data/test_Offer_ID_map_category_ct.json', '1hr')

Printing to ./data/submission1hr.csv
