In [42]:
# pip install nb_black
%load_ext nb_black

<IPython.core.display.Javascript object>

In [3]:
import pandas as pd
import numpy as np
import collections as c
import sklearn

from sklearn.preprocessing import MultiLabelBinarizer
import category_encoders as ce
from sklearn import preprocessing
from autosklearn.classification import AutoSklearnClassifier
from numpy import loadtxt
from numpy import sort
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import autosklearn
import copy
import pickle
import time

  from pandas import Int64Index as NumericIndex
  from pandas import MultiIndex, Int64Index


In [33]:
def type_separator(df: pd.DataFrame, print_results=True):
    set_separator = {}
    for col in df.columns:

        if type(df[col].dtype) == pd.CategoricalDtype:
            a = "cat"
        else:
            a = {
                np.dtype("object"): "cat",
                np.dtype("int"): "numerical",
                np.dtype("uint8"): "numerical",
                np.dtype("float"): "numerical",
            }[df[col].dtype]

        if a in list(set_separator.keys()):
            set_separator[a].append(col)
        else:
            set_separator[a] = [col]

    if print_results:
        for key, val in set_separator.items():
            print("type:", key, "columns:")
            for col in sorted(val):
                print(">", col)
            print("-" * 32)
        return None
    
    set_separator["all"] = list(df.columns)
    return set_separator

In [354]:
# Simple initial operations

df = pd.read_csv("./interim_data/df_completed_1_2_3.csv")

id_columns = ["CUSTOMER", "TEST_SET_ID", "IDX_CUSTOMER"]
unnecessary_reduced_cols = [
    "OFFER_TYPE_REDUCED_1",
    "OFFER_TYPE_REDUCED_2",
    "SALES_OFFICE_REDUCED",
]

to_be_dropped_cols = id_columns + unnecessary_reduced_cols
df = df.drop(to_be_dropped_cols, axis=1)

df = df[~np.isnan(df["OFFER_STATUS"])]
df["OFFER_STATUS"] = df["OFFER_STATUS"].astype(np.dtype("int"))

<IPython.core.display.Javascript object>

In [355]:
# new columns
df["ADDITIONAL_COST"] = df["OFFER_PRICE"] - df["MATERIAL_COST"] - df["SERVICE_COST"]
df["TOTAL_COST"] = df["MATERIAL_COST"] + df["SERVICE_COST"]

<IPython.core.display.Javascript object>

In [356]:
df_backup = df.copy()

<IPython.core.display.Javascript object>

In [357]:
type_to_col = type_separator(df, print_results=False)
df = df_backup.copy()
for col in type_to_col["cat"]:
    num_unq = len(df[col].unique())
    trimmed_col = col.strip().replace(" ", "_")
    if num_unq < 5:
        print(f"[INFO] Col:{col},num_of_unq:{num_unq}, applying 1-HOT encoding.")
        onehot_df = pd.get_dummies(df[col])
        onehot_df = onehot_df.add_prefix(trimmed_col + "_1HOTENC_")
        df = pd.concat((df, onehot_df), axis=1)
    elif num_unq >= 5:
        print(f"[INFO] Col:{col},num_of_unq:{num_unq}, applying BINARY encoding.")
        encoder = ce.BinaryEncoder(cols=[col])
        binenc_df = encoder.fit_transform(df[[col]])
        binenc_df.columns = [
            f"{trimmed_col}_BINENC_{i}" for i in range(len(binenc_df.columns))
        ]
        df = pd.concat((df, binenc_df), axis=1)
    else:
        print(f"[INFO] Col:{col},num_of_unq:{num_unq}, applying NOTHING.----------")

[INFO] Col:PRICE_LIST,num_of_unq:4, applying 1-HOT encoding.
[INFO] Col:TECH,num_of_unq:7, applying BINARY encoding.
[INFO] Col:OFFER_TYPE,num_of_unq:30, applying BINARY encoding.
[INFO] Col:BUSINESS_TYPE,num_of_unq:11, applying BINARY encoding.
[INFO] Col:SALES_LOCATION,num_of_unq:44, applying BINARY encoding.
[INFO] Col:SALES_OFFICE,num_of_unq:38, applying BINARY encoding.
[INFO] Col:OWNERSHIP,num_of_unq:5, applying BINARY encoding.
[INFO] Col:CURRENCY,num_of_unq:5, applying BINARY encoding.
[INFO] Col:OWNERSHIP_NO_INFO_AS_NA,num_of_unq:4, applying 1-HOT encoding.
[INFO] Col:OWNERSHIP_NA_AS_NO_INFO,num_of_unq:4, applying 1-HOT encoding.
[INFO] Col:TECH_REDUCED_1,num_of_unq:4, applying 1-HOT encoding.
[INFO] Col:OWNERSHIP_NA_AS_NO_INFO_REDUCED,num_of_unq:2, applying 1-HOT encoding.
[INFO] Col:OWNERSHIP_NO_INFO_AS_NA_REDUCED,num_of_unq:3, applying 1-HOT encoding.
[INFO] Col:OWNERSHIP_REDUCED,num_of_unq:3, applying 1-HOT encoding.


<IPython.core.display.Javascript object>

In [361]:
type_separator(df)

type: float columns:
> ADDITIONAL_COST
> CREATION_YEAR
> CREATION_YEAR_LOG
> MATERIAL_COST
> MATERIAL_COST_LOG
> OFFER_PRICE
> OFFER_PRICE_LOG
> REV_CURRENT_YEAR.1
> REV_CURRENT_YEAR.2
> REV_CURRENT_YEAR_LOG.1
> REV_CURRENT_YEAR_LOG.2
> REV_PERCENTAGE_INCREASE
> REV_PERCENTAGE_INCREASE_NO_OUTLIER
> SERVICE_COST
> SERVICE_COST_LOG
> SERVICE_LIST_PRICE
> SERVICE_LIST_PRICE_LOG
> SINCE_CREATION_YEAR
> SINCE_CREATION_YEAR_LOG
> SO_CREATED_DATE_SCALED
> TOTAL_COST
> TOTAL_COSTS_PRODUCT
> TOTAL_COSTS_PRODUCT_LOG
--------------------------------
type: cat columns:
> BUSINESS_TYPE
> CURRENCY
> OFFER_TYPE
> OWNERSHIP
> OWNERSHIP_NA_AS_NO_INFO
> OWNERSHIP_NA_AS_NO_INFO_REDUCED
> OWNERSHIP_NO_INFO_AS_NA
> OWNERSHIP_NO_INFO_AS_NA_REDUCED
> OWNERSHIP_REDUCED
> PRICE_LIST
> SALES_LOCATION
> SALES_OFFICE
> TECH
> TECH_REDUCED_1
--------------------------------
type: int columns:
> BUSINESS_TYPE_BINENC_0
> BUSINESS_TYPE_BINENC_1
> BUSINESS_TYPE_BINENC_2
> BUSINESS_TYPE_BINENC_3
> CURRENCY_BINENC_0
> C

<IPython.core.display.Javascript object>

In [475]:
########### EXPERIMENT - START

<IPython.core.display.Javascript object>

In [476]:
numeric_cols = type_separator(df, False)["float"] + type_separator(df, False)["int"]

<IPython.core.display.Javascript object>

In [477]:
df1 = df[list(set(df.columns) - set(numeric_cols)) + ["OFFER_STATUS"]]

<IPython.core.display.Javascript object>

In [491]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


for i in range(10, len(colls)):
    print({*colls[:-i], "OFFER_STATUS"})
    # Feature, target arrays
    X, y = df1[list({*colls[:-i], "OFFER_STATUS"})], df1["OFFER_STATUS"]

    # Train/test set generation
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=1121218
    )

    # Scale train and test sets with StandardScaler
    # X_train_std = StandardScaler().fit_transform(X_train)
    # X_test_std = StandardScaler().fit_transform(X_test)

    # Fix the dimensions of the target array
    y_train = y_train.values.reshape(-1, 1)
    y_test = y_test.values.reshape(-1, 1)

    # Init, fit, test Lasso Regressor
    forest = RandomForestRegressor()
    _ = forest.fit(X_train_std, y_train.ravel())
    print(forest.score(X_test_std, y_test))

{'OFFER_STATUS', 'SALES_LOCATION', 'TECH_REDUCED_1', 'OWNERSHIP_NA_AS_NO_INFO_REDUCED', 'OFFER_TYPE'}
0.22176923450666608
{'SALES_LOCATION', 'TECH_REDUCED_1', 'OWNERSHIP_NA_AS_NO_INFO_REDUCED', 'OFFER_STATUS'}
0.22449106233103766
{'SALES_LOCATION', 'TECH_REDUCED_1', 'OFFER_STATUS'}
0.2235208618390584
{'SALES_LOCATION', 'OFFER_STATUS'}
0.2217585592766056


<IPython.core.display.Javascript object>

In [488]:
for i in range(1, len(colls) - 2):
    print({*colls[:-i], "OFFER_STATUS"})
    # Feature, target arrays
    X, y = df1[list({*colls[:-i], "OFFER_STATUS"})], df1["OFFER_STATUS"]
    break


{'OWNERSHIP_NO_INFO_AS_NA_REDUCED', 'OFFER_STATUS', 'OWNERSHIP_NA_AS_NO_INFO', 'OWNERSHIP_NO_INFO_AS_NA', 'TECH_REDUCED_1', 'OWNERSHIP', 'CURRENCY', 'PRICE_LIST', 'SALES_OFFICE', 'OWNERSHIP_REDUCED', 'SALES_LOCATION', 'TECH', 'OWNERSHIP_NA_AS_NO_INFO_REDUCED', 'OFFER_TYPE'}


<IPython.core.display.Javascript object>

In [527]:
aa = (
    type_separator(df, print_results=False)["float"]
    + type_separator(df, print_results=False)["int"]
)

<IPython.core.display.Javascript object>

In [533]:
sorted(list(set(aa) - {"OFFER_STATUS"}))

['ADDITIONAL_COST',
 'BUSINESS_TYPE_BINENC_0',
 'BUSINESS_TYPE_BINENC_1',
 'BUSINESS_TYPE_BINENC_2',
 'BUSINESS_TYPE_BINENC_3',
 'CREATION_YEAR',
 'CREATION_YEAR_LOG',
 'CURRENCY_BINENC_0',
 'CURRENCY_BINENC_1',
 'CURRENCY_BINENC_2',
 'HAS_COSTS_PRODUCT_A',
 'HAS_COSTS_PRODUCT_B',
 'HAS_COSTS_PRODUCT_C',
 'HAS_COSTS_PRODUCT_D',
 'HAS_COSTS_PRODUCT_E',
 'HAS_END_CUSTOMER',
 'HAS_ISIC',
 'IS_COUNTRY_CODE_CH',
 'IS_NA_CURRENCY',
 'IS_NA_OWNERSHIP_NO_INFO_AS_NA',
 'IS_NA_REV_PERCENTAGE_INCREASE',
 'IS_NA_SALES_LOCATION',
 'IS_NA_SALES_OFFICE',
 'MATERIAL_COST',
 'MATERIAL_COST_LOG',
 'OFFER_PRICE',
 'OFFER_PRICE_LOG',
 'OFFER_TYPE_BINENC_0',
 'OFFER_TYPE_BINENC_1',
 'OFFER_TYPE_BINENC_2',
 'OFFER_TYPE_BINENC_3',
 'OFFER_TYPE_BINENC_4',
 'OWNERSHIP_BINENC_0',
 'OWNERSHIP_BINENC_1',
 'OWNERSHIP_BINENC_2',
 'OWNERSHIP_NA_AS_NO_INFO_1HOTENC_Governmental',
 'OWNERSHIP_NA_AS_NO_INFO_1HOTENC_Individual Person',
 'OWNERSHIP_NA_AS_NO_INFO_1HOTENC_No information',
 'OWNERSHIP_NA_AS_NO_INFO_1HOTENC_P

<IPython.core.display.Javascript object>

In [587]:
X = df.drop("OFFER_STATUS", axis=1)

# for col in type_separator(X, print_results=False)["cat"]:
#     X[col] = X[col].astype("category")
X = X.drop(type_separator(X, print_results=False)["cat"], axis=1)
y = df["OFFER_STATUS"]

<IPython.core.display.Javascript object>

In [570]:
X["PRICE_LIST"]

0         SFT Standard
1        CMT Installer
2         SFT Standard
3         SFT Standard
4         SFT Standard
             ...      
26146     Tarif public
26147     Tarif public
26148     Tarif public
26149     Tarif public
26150     Tarif public
Name: PRICE_LIST, Length: 23575, dtype: category
Categories (4, object): ['CMT End Customer', 'CMT Installer', 'SFT Standard', 'Tarif public']

<IPython.core.display.Javascript object>

In [591]:
model.get_models_with_weights()

[(0.18,
  SimpleClassificationPipeline({'balancing:strategy': 'weighting', 'classifier:__choice__': 'gradient_boosting', 'data_preprocessor:__choice__': 'feature_type', 'feature_preprocessor:__choice__': 'extra_trees_preproc_for_classification', 'classifier:gradient_boosting:early_stop': 'valid', 'classifier:gradient_boosting:l2_regularization': 0.0036524366603721643, 'classifier:gradient_boosting:learning_rate': 0.06580081773518776, 'classifier:gradient_boosting:loss': 'auto', 'classifier:gradient_boosting:max_bins': 255, 'classifier:gradient_boosting:max_depth': 'None', 'classifier:gradient_boosting:max_leaf_nodes': 52, 'classifier:gradient_boosting:min_samples_leaf': 158, 'classifier:gradient_boosting:scoring': 'loss', 'classifier:gradient_boosting:tol': 1e-07, 'data_preprocessor:feature_type:categorical_transformer:categorical_encoding:__choice__': 'encoding', 'data_preprocessor:feature_type:categorical_transformer:category_coalescence:__choice__': 'minority_coalescer', 'data_prepr

<IPython.core.display.Javascript object>

In [592]:
model.leaderboard()


Unnamed: 0_level_0,rank,ensemble_weight,type,cost,duration
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
741,1,0.18,gradient_boosting,0.282629,6.968305
529,2,0.02,gradient_boosting,0.285619,12.788782
671,3,0.02,gradient_boosting,0.285847,10.191306
728,4,0.06,gradient_boosting,0.286191,8.423153
1809,5,0.06,gradient_boosting,0.28631,8.843601
2116,6,0.02,gradient_boosting,0.287132,11.429441
1586,7,0.04,gradient_boosting,0.287402,3.894267
835,8,0.02,gradient_boosting,0.287641,6.830402
1660,9,0.08,gradient_boosting,0.287701,2.903776
1633,10,0.02,gradient_boosting,0.288067,15.005833


<IPython.core.display.Javascript object>

In [590]:
# example of auto-sklearn for a classification dataset
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from autosklearn.classification import AutoSklearnClassifier
import autosklearn

# define dataset
# X, y = make_classification(n_samples=100, n_features=10, n_informative=5, n_redundant=5, random_state=1)
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1
)
# define search
model = AutoSklearnClassifier(
    time_left_for_this_task=6 * 60 * 60,
    per_run_time_limit=300,
    n_jobs=8,
    metric=autosklearn.metrics.balanced_accuracy,
)
# perform the search
model.fit(X_train, y_train)
# summarize
print(model.sprint_statistics())
# evaluate best model
y_hat = model.predict(X_test)
acc = accuracy_score(y_test, y_hat)
print("Accuracy: %.3f" % acc)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 37939 instead
  self.metafeatures = self.metafeatures.append(metafeatures)
  self.algorithm_runs[metric].append(runs)
init_dgesdd failed init
init_dgesdd failed init
init_dgesdd failed init
init_dgesdd failed init
init_dgesdd failed init
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainabilit

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


auto-sklearn results:
  Dataset name: 60eaedc7-7e49-11ec-aac5-1159e6807194
  Metric: balanced_accuracy
  Best validation score: 0.717371
  Number of target algorithm runs: 2158
  Number of successful target algorithm runs: 1956
  Number of crashed target algorithm runs: 49
  Number of target algorithms that exceeded the time limit: 53
  Number of target algorithms that exceeded the memory limit: 100





Accuracy: 0.770




<IPython.core.display.Javascript object>

In [598]:
automl = model
print(automl.automl_.runhistory_)
print(len(automl.automl_.runhistory_.data))

np.argmax(automl.cv_results_["mean_test_score"])
automl.cv_results_["params"]

<smac.runhistory.runhistory.RunHistory object at 0x7f691abf18e0>
2159


[{'balancing:strategy': 'none',
  'classifier:__choice__': 'random_forest',
  'data_preprocessor:__choice__': 'feature_type',
  'feature_preprocessor:__choice__': 'no_preprocessing',
  'classifier:random_forest:bootstrap': 'True',
  'classifier:random_forest:criterion': 'gini',
  'classifier:random_forest:max_depth': 'None',
  'classifier:random_forest:max_features': 0.5,
  'classifier:random_forest:max_leaf_nodes': 'None',
  'classifier:random_forest:min_impurity_decrease': 0.0,
  'classifier:random_forest:min_samples_leaf': 1,
  'classifier:random_forest:min_samples_split': 2,
  'classifier:random_forest:min_weight_fraction_leaf': 0.0,
  'data_preprocessor:feature_type:categorical_transformer:categorical_encoding:__choice__': 'one_hot_encoding',
  'data_preprocessor:feature_type:categorical_transformer:category_coalescence:__choice__': 'minority_coalescer',
  'data_preprocessor:feature_type:numerical_transformer:imputation:strategy': 'mean',
  'data_preprocessor:feature_type:numerica

<IPython.core.display.Javascript object>

In [548]:
# X = df.drop("OFFER_STATUS", axis=1)
X = df[list(set(first_ten) - set("OFFER_STATUS"))]
# X = df[list(set(first_ten) - {"OFFER_STATUS"})]
y = df["OFFER_STATUS"]
# Train/test set generation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1121218
)

# Scale train and test sets with StandardScaler
# X_train = StandardScaler().fit_transform(X_train)
# X_test = StandardScaler().fit_transform(X_test)


# Fix the dimensions of the target array
y_train = y_train.values.reshape(-1, 1)
y_test = y_test.values.reshape(-1, 1)

# Init, fit, test Lasso Regressor
forest = RandomForestRegressor()
_ = forest.fit(X_train, y_train.ravel())
forest.score(X_test, y_test)

0.06742862646027248

<IPython.core.display.Javascript object>

In [544]:
first_ten = [item[1] for item in xt[:5]]

<IPython.core.display.Javascript object>

In [546]:
xt = sorted(list(zip(abs(forest.feature_importances_), X_train.columns)))[::-1]

xt

[(0.22380966606572658, 'SO_CREATED_DATE_SCALED'),
 (0.21723226268268017, 'SERVICE_COST'),
 (0.20476697992678963, 'TOTAL_COST'),
 (0.20110306247676482, 'ADDITIONAL_COST'),
 (0.15308802884803888, 'REV_PERCENTAGE_INCREASE_NO_OUTLIER')]

<IPython.core.display.Javascript object>

In [439]:
df1.columns

Index(['OWNERSHIP_NO_INFO_AS_NA_REDUCED', 'OWNERSHIP_NA_AS_NO_INFO',
       'CURRENCY', 'TECH_REDUCED_1', 'OWNERSHIP', 'OWNERSHIP_NO_INFO_AS_NA',
       'PRICE_LIST', 'SALES_OFFICE', 'OWNERSHIP_REDUCED', 'BUSINESS_TYPE',
       'SALES_LOCATION', 'TECH', 'OWNERSHIP_NA_AS_NO_INFO_REDUCED',
       'OFFER_TYPE', 'OFFER_STATUS'],
      dtype='object')

<IPython.core.display.Javascript object>

[(0.12290675855464189, 'REV_CURRENT_YEAR.1'),
 (0.07016688472090941, 'HAS_COSTS_PRODUCT_D'),
 (0.05463101909178812, 'HAS_COSTS_PRODUCT_C'),
 (0.046174648579620683, 'HAS_COSTS_PRODUCT_E'),
 (0.04443267306364587, 'SERVICE_COST'),
 (0.043981434925098616, 'CREATION_YEAR'),
 (0.03966812089639349, 'HAS_ISIC'),
 (0.030438835867286403, 'IS_COUNTRY_CODE_CH'),
 (0.030201485269018134, 'OWNERSHIP'),
 (0.029812686124779175, 'OFFER_PRICE'),
 (0.027682007848185513, 'TECH'),
 (0.02707608288719452, 'PRICE_LIST'),
 (0.026895889496058704, 'ADDITIONAL_COST'),
 (0.026008611502715443, 'CURRENCY'),
 (0.025532797949825076, 'SERVICE_LIST_PRICE'),
 (0.024292608453405137, 'HAS_END_CUSTOMER'),
 (0.024196899068896518, 'MATERIAL_COST'),
 (0.023306262777168678, 'REV_PERCENTAGE_INCREASE_NO_OUTLIER'),
 (0.016440680484550708, 'PRICE_LIST_1HOTENC_CMT Installer'),
 (0.012444697685431903, 'BUSINESS_TYPE'),
 (0.012240779163827962, 'REV_CURRENT_YEAR.2'),
 (0.012094128104882154, 'IS_NA_SALES_LOCATION'),
 (0.01113592295266805

<IPython.core.display.Javascript object>

In [438]:
weights.sort_values("weight", ascending=False)

Unnamed: 0,feature,weight
13,SALES_LOCATION,0.122044
12,TECH_REDUCED_1,0.044816
11,OWNERSHIP_NA_AS_NO_INFO_REDUCED,0.043821
10,OFFER_TYPE,0.029803
9,OWNERSHIP_NO_INFO_AS_NA_REDUCED,0.029777
8,OWNERSHIP,0.028291
7,OWNERSHIP_NO_INFO_AS_NA,0.027324
6,OWNERSHIP_NA_AS_NO_INFO,0.026689
5,CURRENCY,0.024743
4,SALES_OFFICE,0.012637


<IPython.core.display.Javascript object>

In [493]:
from sklearn.feature_selection import RFE

# Init the transformer
rfe = RFE(estimator=RandomForestRegressor(), n_features_to_select=10)

# Fit to the training data
_ = rfe.fit(X_train, y_train
           )


ValueError: could not convert string to float: 'St. Gallen East'

<IPython.core.display.Javascript object>

In [433]:
X_train.loc[:, rfe.support_]

AttributeError: 'RFE' object has no attribute 'support_'

<IPython.core.display.Javascript object>

In [420]:
# Import your necessary dependencies
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Feature extraction
model = LogisticRegression()
rfe = RFE(model, 3)
fit = rfe.fit(X, Y)
print("Num Features: %s" % (fit.n_features_))
print("Selected Features: %s" % (fit.support_))
print("Feature Ranking: %s" % (fit.ranking_))

# First things first
from sklearn.linear_model import Ridge

ridge = Ridge(alpha=1.0)
ridge.fit(X, Y)

Ridge(
    alpha=1.0,
    copy_X=True,
    fit_intercept=True,
    max_iter=None,
    normalize=False,
    random_state=None,
    solver="auto",
    tol=0.001,
)

TypeError: __init__() takes 2 positional arguments but 3 were given

<IPython.core.display.Javascript object>

In [419]:
from sklearn.feature_selection import RFE

# Init the transformer
rfe = RFE(estimator=RandomForestRegressor(), n_features_to_select=10)

# Fit to the training data
_ = rfe.fit(X_train_std, y_train.ravel())

  estimator.fit(X[:, features], y, **fit_params)
  estimator.fit(X[:, features], y, **fit_params)
  estimator.fit(X[:, features], y, **fit_params)
  estimator.fit(X[:, features], y, **fit_params)


KeyboardInterrupt: 

<IPython.core.display.Javascript object>

In [364]:
########### EXPERIMENT - STOP

<IPython.core.display.Javascript object>

In [362]:
# Import your necessary dependencies
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

<IPython.core.display.Javascript object>

In [363]:
# Feature extraction
model = LogisticRegression()
rfe = RFE(model, 3)
fit = rfe.fit(X, Y)
print("Num Features: %s" % (fit.n_features_))
print("Selected Features: %s" % (fit.support_))
print("Feature Ranking: %s" % (fit.ranking_))

TypeError: __init__() takes 2 positional arguments but 3 were given

<IPython.core.display.Javascript object>

In [231]:
df = df_backup.copy()
mlb = MultiLabelBinarizer()

dfx = df.join(pd.DataFrame(mlb.fit_transform(df.pop('OFFER_TYPE_REDUCED_1')),
                          columns=mlb.classes_,
                          index=df.index))


<IPython.core.display.Javascript object>

In [265]:
df = df_backup.copy()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [279]:
len(df_binary.columns)

5

<IPython.core.display.Javascript object>

In [268]:
col = "OFFER_TYPE_REDUCED_1"
trimmed_col = col.strip().replace(" ", "_")
f


Unnamed: 0,OFFER_PRICE,SERVICE_LIST_PRICE,MATERIAL_COST,SERVICE_COST,PRICE_LIST,TECH,OFFER_TYPE,BUSINESS_TYPE,OFFER_STATUS,SALES_LOCATION,...,IS_NA_SALES_OFFICE,IS_NA_CURRENCY,IS_NA_OWNERSHIP_NO_INFO_AS_NA,ADDITIONAL_COST,TOTAL_COST,OFFER_TYPE_REDUCED_1_BINENC_0,OFFER_TYPE_REDUCED_1_BINENC_1,OFFER_TYPE_REDUCED_1_BINENC_2,OFFER_TYPE_REDUCED_1_BINENC_3,OFFER_TYPE_REDUCED_1_BINENC_4
0,1711.00,1395.00,1107.0,186.30,SFT Standard,S,IN,E,0.0,Luzern Central,...,0,0,0,417.70,1293.30,0,0,0,0,1
1,26687.60,14651.00,9282.0,7768.34,CMT Installer,C,D,N,0.0,Zürich East,...,0,0,0,9637.26,17050.34,0,0,0,1,0
2,6264.70,2296.00,1722.0,2168.56,SFT Standard,F,FIR,E,1.0,Luzern Central,...,0,1,1,2374.14,3890.56,0,0,0,1,1
3,4300.20,310.00,246.0,2775.92,SFT Standard,F,FIR,M,1.0,Basel Central,...,0,0,0,1278.28,3021.92,0,0,0,1,1
4,13693.00,5815.00,4674.0,4179.38,SFT Standard,F,FIR,E,1.0,Geneva West,...,0,0,0,4839.62,8853.38,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26146,14200.00,5532.00,3541.0,8855.55,Tarif public,F,FDI,Mig,1.0,Metz Grand Est,...,0,0,0,1803.45,12396.55,1,0,0,0,0
26147,5140.51,3626.15,3074.8,828.14,Tarif public,F,FDD,New,1.0,Rouen Nord FR,...,0,0,0,1237.57,3902.94,0,1,1,1,0
26148,2672.00,2672.00,1731.2,0.00,Tarif public,F,FDI,Exp,1.0,Grenoble Centre-Est,...,0,0,0,940.80,1731.20,1,0,0,0,0
26149,16961.99,12345.00,8925.2,1878.25,Tarif public,F,FDD,Mig,0.0,Dijon Centre-Est,...,0,0,0,6158.54,10803.45,0,1,1,1,0


<IPython.core.display.Javascript object>

In [250]:
x.shape

(23575,)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [197]:
df_one

Unnamed: 0,asa_asaa_Sold_BP,asa_asaa_Sold_C,asa_asaa_Sold_E,asa_asaa_Sold_EPS,asa_asaa_Sold_F,asa_asaa_Sold_FP,asa_asaa_Sold_S
0,0,0,0,0,0,0,1
1,0,1,0,0,0,0,0
2,0,0,0,0,1,0,0
3,0,0,0,0,1,0,0
4,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...
26146,0,0,0,0,1,0,0
26147,0,0,0,0,1,0,0
26148,0,0,0,0,1,0,0
26149,0,0,0,0,1,0,0


<IPython.core.display.Javascript object>

In [599]:
### SAFE ZONE

<IPython.core.display.Javascript object>

In [601]:
model.leaderboard()

Unnamed: 0_level_0,rank,ensemble_weight,type,cost,duration
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
741,1,0.18,gradient_boosting,0.282629,6.968305
529,2,0.02,gradient_boosting,0.285619,12.788782
671,3,0.02,gradient_boosting,0.285847,10.191306
728,4,0.06,gradient_boosting,0.286191,8.423153
1809,5,0.06,gradient_boosting,0.28631,8.843601
2116,6,0.02,gradient_boosting,0.287132,11.429441
1586,7,0.04,gradient_boosting,0.287402,3.894267
835,8,0.02,gradient_boosting,0.287641,6.830402
1660,9,0.08,gradient_boosting,0.287701,2.903776
1633,10,0.02,gradient_boosting,0.288067,15.005833


<IPython.core.display.Javascript object>

In [602]:
model.get_models_with_weights()[0]

(0.18,
 SimpleClassificationPipeline({'balancing:strategy': 'weighting', 'classifier:__choice__': 'gradient_boosting', 'data_preprocessor:__choice__': 'feature_type', 'feature_preprocessor:__choice__': 'extra_trees_preproc_for_classification', 'classifier:gradient_boosting:early_stop': 'valid', 'classifier:gradient_boosting:l2_regularization': 0.0036524366603721643, 'classifier:gradient_boosting:learning_rate': 0.06580081773518776, 'classifier:gradient_boosting:loss': 'auto', 'classifier:gradient_boosting:max_bins': 255, 'classifier:gradient_boosting:max_depth': 'None', 'classifier:gradient_boosting:max_leaf_nodes': 52, 'classifier:gradient_boosting:min_samples_leaf': 158, 'classifier:gradient_boosting:scoring': 'loss', 'classifier:gradient_boosting:tol': 1e-07, 'data_preprocessor:feature_type:categorical_transformer:categorical_encoding:__choice__': 'encoding', 'data_preprocessor:feature_type:categorical_transformer:category_coalescence:__choice__': 'minority_coalescer', 'data_preproc

<IPython.core.display.Javascript object>

In [612]:
model.show_models()

{741: {'model_id': 741,
  'rank': 1,
  'cost': 0.2826285085053043,
  'ensemble_weight': 0.18,
  'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice at 0x7f67f3406520>,
  'balancing': Balancing(random_state=1, strategy='weighting'),
  'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice at 0x7f67f80f0100>,
  'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice at 0x7f67f13fceb0>,
  'sklearn_classifier': HistGradientBoostingClassifier(early_stopping=True,
                                 l2_regularization=0.0036524366603721643,
                                 learning_rate=0.06580081773518776, max_iter=128,
                                 max_leaf_nodes=52, min_samples_leaf=158,
                                 random_state=1,
                                 validation_fraction=0.09336099626821685,
                                 warm_start=True)},
 529: {'model_i

<IPython.core.display.Javascript object>

## Elapsed Time: 121.07330274581909, Final BAC Score: 0.717

In [15]:
# Read Data
df = pd.read_csv("./interim_data/df_completed_1_2_3.csv")

id_columns = ["CUSTOMER", "TEST_SET_ID", "IDX_CUSTOMER"]
unnecessary_reduced_cols = [
    "OFFER_TYPE_REDUCED_1",
    "OFFER_TYPE_REDUCED_2",
    "SALES_OFFICE_REDUCED",
]

to_be_dropped_cols = id_columns + unnecessary_reduced_cols
df = df.drop(to_be_dropped_cols, axis=1)

df = df[~np.isnan(df["OFFER_STATUS"])]
df["OFFER_STATUS"] = df["OFFER_STATUS"].astype(np.dtype("int"))
# new columns
df["ADDITIONAL_COST"] = df["OFFER_PRICE"] - df["MATERIAL_COST"] - df["SERVICE_COST"]
df["TOTAL_COST"] = df["MATERIAL_COST"] + df["SERVICE_COST"]


###################33

type_to_col = type_separator(df, print_results=False)
for col in type_to_col["cat"]:
    num_unq = len(df[col].unique())
    trimmed_col = col.strip().replace(" ", "_")
    if num_unq < 5:
        print(f"[INFO] Col:{col},num_of_unq:{num_unq}, applying 1-HOT encoding.")
        onehot_df = pd.get_dummies(df[col])
        onehot_df = onehot_df.add_prefix(trimmed_col + "_1HOTENC_")
        df = pd.concat((df, onehot_df), axis=1)
    elif num_unq >= 5:
        print(f"[INFO] Col:{col},num_of_unq:{num_unq}, applying BINARY encoding.")
        encoder = ce.BinaryEncoder(cols=[col])
        binenc_df = encoder.fit_transform(df[[col]])
        binenc_df.columns = [
            f"{trimmed_col}_BINENC_{i}" for i in range(len(binenc_df.columns))
        ]
        df = pd.concat((df, binenc_df), axis=1)
        
for col in type_separator(df, print_results=False)["cat"]:
    df[col] = pd.Categorical(df[col])

##########################3


# ensemble_size 1


# example of auto-sklearn for a classification dataset
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from autosklearn.classification import AutoSklearnClassifier


# Col Selection & Conversion
X, y = df.drop("OFFER_STATUS", axis=1), df["OFFER_STATUS"]


# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25)


################## APPLY SMOTE -START
X_train = X_train[type_separator(X_train,False)["numerical"]]
# Col Selection & Conversion
#sm = SMOTE(sampling_strategy='auto', k_neighbors=7, random_state=100)
#X_train, y_train = sm.fit_resample(X_train, y_train)
################## APPLY SMOTE - STOP

# define search
model2 = AutoSklearnClassifier(
    time_left_for_this_task=2 * 60,
    seed=1,
    per_run_time_limit=30,
    n_jobs=8,
    metric=autosklearn.metrics.balanced_accuracy,
    resampling_strategy_arguments = {'cv': {'folds': 10},"shuffle":True}
)

start = time.time()
model2.fit(X_train, y_train)
end = time.time()
y_hat = model2.predict(X_test)

print(model2.sprint_statistics())
print(model2.get_models_with_weights())
model2.leaderboard()

# Evaluate
val_acc = balanced_accuracy_score(y_true=y_test, y_pred=model2.predict(X_test))
print("Elapsed Time:", end - start)
print("Final BAC Score: %.3f" % val_acc)

# Pickle
file_name = f"model_pickles/model_{int(val_acc*10**3)}"
with open(file_name, "wb") as f:
    pickle.dump(model2, f)
with open(file_name, "rb") as f:
    loaded_model = pickle.load(f)

# Confirm Pickle
balanced_accuracy_score(y_true=y_test, y_pred=loaded_model.predict(X_test))

[INFO] Col:PRICE_LIST,num_of_unq:4, applying 1-HOT encoding.
[INFO] Col:TECH,num_of_unq:7, applying BINARY encoding.
[INFO] Col:OFFER_TYPE,num_of_unq:30, applying BINARY encoding.
[INFO] Col:BUSINESS_TYPE,num_of_unq:11, applying BINARY encoding.
[INFO] Col:SALES_LOCATION,num_of_unq:44, applying BINARY encoding.
[INFO] Col:SALES_OFFICE,num_of_unq:38, applying BINARY encoding.
[INFO] Col:OWNERSHIP,num_of_unq:5, applying BINARY encoding.
[INFO] Col:CURRENCY,num_of_unq:5, applying BINARY encoding.
[INFO] Col:OWNERSHIP_NO_INFO_AS_NA,num_of_unq:4, applying 1-HOT encoding.
[INFO] Col:OWNERSHIP_NA_AS_NO_INFO,num_of_unq:4, applying 1-HOT encoding.
[INFO] Col:TECH_REDUCED_1,num_of_unq:4, applying 1-HOT encoding.
[INFO] Col:OWNERSHIP_NA_AS_NO_INFO_REDUCED,num_of_unq:2, applying 1-HOT encoding.
[INFO] Col:OWNERSHIP_NO_INFO_AS_NA_REDUCED,num_of_unq:3, applying 1-HOT encoding.
[INFO] Col:OWNERSHIP_REDUCED,num_of_unq:3, applying 1-HOT encoding.


## Predictt

In [45]:
# Read Data
df = pd.read_csv("./interim_data/df_completed_1_2_3.csv")

id_columns = ["CUSTOMER", 
              #"TEST_SET_ID", 
              "IDX_CUSTOMER"]
unnecessary_reduced_cols = [
    "OFFER_TYPE_REDUCED_1",
    "OFFER_TYPE_REDUCED_2",
    "SALES_OFFICE_REDUCED",
]

to_be_dropped_cols = id_columns + unnecessary_reduced_cols
df = df.drop(to_be_dropped_cols, axis=1)

# df = df[~np.isnan(df["OFFER_STATUS"])]
# df["OFFER_STATUS"] = df["OFFER_STATUS"].astype(np.dtype("int"))
# new columns
df["ADDITIONAL_COST"] = df["OFFER_PRICE"] - df["MATERIAL_COST"] - df["SERVICE_COST"]
df["TOTAL_COST"] = df["MATERIAL_COST"] + df["SERVICE_COST"]


<IPython.core.display.Javascript object>

In [46]:
###################33

type_to_col = type_separator(df, print_results=False)
for col in type_to_col["cat"]:
    num_unq = len(df[col].unique())
    trimmed_col = col.strip().replace(" ", "_")
    if num_unq < 5:
        print(f"[INFO] Col:{col},num_of_unq:{num_unq}, applying 1-HOT encoding.")
        onehot_df = pd.get_dummies(df[col])
        onehot_df = onehot_df.add_prefix(trimmed_col + "_1HOTENC_")
        df = pd.concat((df, onehot_df), axis=1)
    elif num_unq >= 5:
        print(f"[INFO] Col:{col},num_of_unq:{num_unq}, applying BINARY encoding.")
        encoder = ce.BinaryEncoder(cols=[col])
        binenc_df = encoder.fit_transform(df[[col]])
        binenc_df.columns = [
            f"{trimmed_col}_BINENC_{i}" for i in range(len(binenc_df.columns))
        ]
        df = pd.concat((df, binenc_df), axis=1)
        
for col in type_separator(df, print_results=False)["cat"]:
    df[col] = pd.Categorical(df[col])

##########################3



[INFO] Col:PRICE_LIST,num_of_unq:4, applying 1-HOT encoding.
[INFO] Col:TECH,num_of_unq:7, applying BINARY encoding.
[INFO] Col:OFFER_TYPE,num_of_unq:30, applying BINARY encoding.
[INFO] Col:BUSINESS_TYPE,num_of_unq:11, applying BINARY encoding.
[INFO] Col:SALES_LOCATION,num_of_unq:45, applying BINARY encoding.
[INFO] Col:SALES_OFFICE,num_of_unq:38, applying BINARY encoding.
[INFO] Col:OWNERSHIP,num_of_unq:5, applying BINARY encoding.
[INFO] Col:CURRENCY,num_of_unq:5, applying BINARY encoding.
[INFO] Col:OWNERSHIP_NO_INFO_AS_NA,num_of_unq:4, applying 1-HOT encoding.
[INFO] Col:OWNERSHIP_NA_AS_NO_INFO,num_of_unq:4, applying 1-HOT encoding.
[INFO] Col:TECH_REDUCED_1,num_of_unq:4, applying 1-HOT encoding.
[INFO] Col:OWNERSHIP_NA_AS_NO_INFO_REDUCED,num_of_unq:2, applying 1-HOT encoding.
[INFO] Col:OWNERSHIP_NO_INFO_AS_NA_REDUCED,num_of_unq:3, applying 1-HOT encoding.
[INFO] Col:OWNERSHIP_REDUCED,num_of_unq:3, applying 1-HOT encoding.


<IPython.core.display.Javascript object>

In [54]:
test_df = df[np.isnan(df["OFFER_STATUS"])]

with open("model_pickles/model_717", "rb") as f:
    loaded_model = pickle.load(f)

test_df["prediction"] = loaded_model.predict(test_df)

test_df = test_df[["TEST_SET_ID", "prediction"]]
test_df = test_df.rename(columns={"TEST_SET_ID": "id"})
test_df["id"] = test_df["id"].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column] = X[column].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df["prediction"] = loaded_model.predict(test_df)


<IPython.core.display.Javascript object>

In [56]:
test_df.to_csv("pred_with_0717.csv", header=True, index=False)

<IPython.core.display.Javascript object>

In [None]:

# ensemble_size 1


# example of auto-sklearn for a classification dataset
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from autosklearn.classification import AutoSklearnClassifier


# Col Selection & Conversion
X, y = df.drop("OFFER_STATUS", axis=1), df["OFFER_STATUS"]


# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25)


################## APPLY SMOTE -START
X_train = X_train[type_separator(X_train,False)["numerical"]]
# Col Selection & Conversion
#sm = SMOTE(sampling_strategy='auto', k_neighbors=7, random_state=100)
#X_train, y_train = sm.fit_resample(X_train, y_train)
################## APPLY SMOTE - STOP

# define search
model2 = AutoSklearnClassifier(
    time_left_for_this_task=2 * 60,
    seed=1,
    per_run_time_limit=30,
    n_jobs=8,
    metric=autosklearn.metrics.balanced_accuracy,
    resampling_strategy_arguments = {'cv': {'folds': 10},"shuffle":True}
)

start = time.time()
model2.fit(X_train, y_train)
end = time.time()
y_hat = model2.predict(X_test)

print(model2.sprint_statistics())
print(model2.get_models_with_weights())
model2.leaderboard()

# Evaluate
val_acc = balanced_accuracy_score(y_true=y_test, y_pred=model2.predict(X_test))
print("Elapsed Time:", end - start)
print("Final BAC Score: %.3f" % val_acc)

# Pickle
file_name = f"model_pickles/model_{int(val_acc*10**3)}"
with open(file_name, "wb") as f:
    pickle.dump(model2, f)
with open(file_name, "rb") as f:
    loaded_model = pickle.load(f)

# Confirm Pickle
balanced_accuracy_score(y_true=y_test, y_pred=loaded_model.predict(X_test))