# DATA PULL directly from Kaggle with API Key


In [None]:
import kaggle

def setup_kaggle_credentials():
    """Point the Kaggle client at ``~/.kaggle`` and authenticate.

    Sets ``KAGGLE_CONFIG_DIR`` to the directory holding ``kaggle.json`` and, on
    non-Windows systems, restricts the key file to owner read/write.

    Returns:
        The authenticated ``KaggleApi`` instance (the original discarded it).

    Raises:
        FileNotFoundError: if ``~/.kaggle/kaggle.json`` does not exist and the
            ``KAGGLE_USERNAME`` / ``KAGGLE_KEY`` environment variables are not
            both set.
    """
    import os
    from pathlib import Path

    # Path to the kaggle.json (download the API key JSON from your Kaggle account)
    kaggle_api_path = Path.home() / '.kaggle' / 'kaggle.json'
    os.environ['KAGGLE_CONFIG_DIR'] = str(kaggle_api_path.parent)

    if kaggle_api_path.is_file():
        if os.name != 'nt':  # Not Windows
            kaggle_api_path.chmod(0o600)
    elif not (os.environ.get('KAGGLE_USERNAME') and os.environ.get('KAGGLE_KEY')):
        # Fail with an actionable message instead of a bare chmod error.
        raise FileNotFoundError(
            f"Kaggle API key not found at {kaggle_api_path}; download kaggle.json "
            "from your Kaggle account or set KAGGLE_USERNAME and KAGGLE_KEY."
        )

    # Imported only after the credential check so a missing key is reported first.
    # NOTE(review): the module-level `import kaggle` may already authenticate at
    # import time — confirm against the installed kaggle package.
    from kaggle.api.kaggle_api_extended import KaggleApi

    api = KaggleApi()
    api.authenticate()
    return api

def download_dataset(dataset_path, path='./', unzip=True):
    """Download all files of a Kaggle dataset via the module-level ``kaggle.api``.

    Args:
        dataset_path: Dataset identifier passed straight to
            ``dataset_download_files`` (e.g. ``'dataset-owner/dataset-name'``).
        path: Destination directory; defaults to the current directory as before.
        unzip: Whether to unzip the download; defaults to ``True`` as before.
    """
    kaggle.api.dataset_download_files(dataset_path, path=path, unzip=unzip)

# Authenticate against Kaggle before any download is attempted.
setup_kaggle_credentials()

# Dataset identifier handed to download_dataset
# (looks like a placeholder value — replace before running).
dataset_path = 'dataset-owner/dataset-name'
download_dataset(dataset_path=dataset_path)


# Extract ZIP Files

In [52]:
import zipfile
import pandas as pd

# Load the CSV stored inside the downloaded archive into `data`.
with zipfile.ZipFile("/content/UCI_Credit_Card.csv.zip", "r") as z:
    csv_names = [name for name in z.namelist() if name.endswith(".csv")]
    if not csv_names:
        # Fail here with a clear message rather than a NameError on `data` later.
        raise FileNotFoundError(
            "No .csv member found in /content/UCI_Credit_Card.csv.zip"
        )
    # The original loop parsed every CSV and kept only the last one;
    # read just that member instead of parsing and discarding the others.
    with z.open(csv_names[-1]) as csv_file:
        data = pd.read_csv(csv_file)



# AutoML Classification

# Feature Engineering for Classification


*   Pearson correlation among the features (one feature is kept from each highly correlated group)
*   Mutual information gain between each remaining feature and the target



In [53]:
# CELL
import pandas as pd
import numpy as np

# CELL
# corr_series = data.corr().loc['default.payment.next.month',:][:-1].sort_values(key=lambda x: abs(x),ascending = False,)
target_column = "default.payment.next.month"
# Feature pairs whose Pearson correlation reaches this value are treated as
# redundant. The filter previously hard-coded 0.8 while this variable was set
# to 0.9 and never read; the value that was actually applied (0.8) is kept.
correlation_drop_threshold = 0.8

df_corr = data.drop(columns=[target_column, "ID"]).corr()
# Strict upper triangle (k=1): each pair appears once and the diagonal is masked
# out, so perfectly correlated (duplicate) columns are no longer thrown away by
# a `!= 1` filter.
df_corr = df_corr.where(np.triu(np.ones(df_corr.shape), k=1).astype(bool))
df_corr = df_corr.reset_index().melt(
    id_vars="index", var_name="feature", value_name="correlation_score"
)
# NOTE(review): only positive correlation is considered, as before; strongly
# negative pairs are kept — confirm whether abs() is wanted here.
df_corr = df_corr[df_corr["correlation_score"] >= correlation_drop_threshold]

# Group correlated features into chains (connected components). Chains are
# ordered lists, so which feature survives (the first one seen) is deterministic,
# and a pair that bridges two existing chains merges them.
chain_lst = []
for feature_1, feature_2 in zip(df_corr["index"], df_corr["feature"]):
    touching = [lst for lst in chain_lst if feature_1 in lst or feature_2 in lst]
    untouched = [lst for lst in chain_lst if feature_1 not in lst and feature_2 not in lst]
    # dict.fromkeys de-duplicates while preserving first-seen order.
    merged = list(
        dict.fromkeys(
            [feature for lst in touching for feature in lst] + [feature_1, feature_2]
        )
    )
    chain_lst = untouched + [merged]

# Keep the first feature of every chain, drop the rest (plus the ID column).
feature_drop_lst = ["ID"]
for lst in chain_lst:
    feature_drop_lst.extend(lst[1:])
data = data.drop(columns=feature_drop_lst)

X = data.drop(columns=[target_column])
y = data[target_column]

from sklearn.feature_selection import mutual_info_classif
# Seeded so the selected feature set is reproducible between runs.
importances = mutual_info_classif(X, y.astype(int), random_state=42)
feat_importances = pd.Series(importances, index=X.columns)

# `result` was previously assigned into before being defined in this cell.
result = pd.DataFrame({"Information_gain": feat_importances})

info_gain_threshold = 0.005
drop_feature_lst = result[result["Information_gain"] < info_gain_threshold].index
data = data.drop(columns=drop_feature_lst)

# Train and Upload to S3

In [57]:
# CELL
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
X = data.drop(columns = [target_column])


# CELL
from sklearn.model_selection import train_test_split
y = data[target_column]
X_train, X_test, y_train, y_test = train_test_split(X, y.astype(int), test_size=0.3, random_state=42)

try:
    from flaml import AutoML
except:
    !pip install flaml
    !pip install catboost
    from flaml import AutoML

automl = AutoML()
models = ['rf', 'catboost', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'lrl1']
# settings = {
#     "time_budget": time_budget,  # total running time in seconds
#     "metric": metric,
#     "estimator_list": ['lgbm','rf', 'catboost', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'lrl1'],
#                         # check the documentation for options of metrics (https://microsoft.github.io/FLAML/docs/Use-Cases/Task-Oriented-AutoML#optimization-metric)
#     "task": 'classification',  # task type
#     "log_file_name": 'airlines_experiment.log',  # flaml log file
#     "seed": 42,    # random seed
# }


# automl.fit(X_train=X_train, y_train=y_train, metric = score,task = 'classification',estimator_list = models.split(','),time_budget = time,verbose = 0,seed = 42)
automl.fit(X_train=X_train, y_train=y_train, metric = "f1",task = 'classification',estimator_list = models,time_budget = 300,verbose = 0)

result1 = pd.DataFrame(1 - pd.Series(automl.best_loss_per_estimator),columns = ['metric']).reset_index(inplace = False).rename(columns = {'index':'models'})
result_target = pd.DataFrame()
result_target['Id'] = [val for val in range(len(y_test))]
result_target[target_column] = y_test.to_list()
result_target[f'Predicted_{target_column}'] = automl.predict(X_test)
result = pd.concat([result1,result_target],axis = 1)



INFO:flaml.tune.searcher.blendsearch:No low-cost partial config given to the search algorithm. For cost-frugal search, consider providing low-cost values for cost-related hps via 'low_cost_partial_config'. More info can be found at https://microsoft.github.io/FLAML/docs/FAQ#about-low_cost_partial_config-in-tune


In [None]:
# CELL
import os
import pickle
import hashlib

#!pip install boto3
import boto3
import pandas as pd
import io
from sklearn.model_selection import train_test_split


# CELL
# S3 destination settings (placeholder values — fill in before running).
# The stray leading space before BUCKET_PATH was an IndentationError.
BUCKET_PATH = "Enter Bucket Name Here"
s3_dir_name = "Enter Directory Name Here"
client_name = "Enter Folder Name Here"
# Local directory where pickles are staged before upload.
STAGING_PATH = "/tmp/staging/common/"

# CELL
def dump_object(python_object, path):
    """Pickle ``python_object`` to ``path``, creating parent directories as needed.

    Args:
        python_object: Any picklable object.
        path: Destination file path; may be a bare filename.
    """
    directory_path = os.path.dirname(path)
    # dirname is "" for a bare filename and os.makedirs("") raises, so skip it.
    if directory_path:
        # exist_ok avoids the race between an existence check and the creation.
        os.makedirs(directory_path, exist_ok=True)
    with open(path, "wb") as output_file:
        pickle.dump(python_object, output_file)

# CELL
def upload(bucket, path, s3_path, metadata=None, config=None,):
    """Upload the local file ``path`` to ``s3://bucket/s3_path``.

    Args:
        bucket: Target bucket name.
        path: Local file to upload.
        s3_path: Object key inside the bucket.
        metadata: Optional dict sent as the object's ``Metadata``; ``None``
            means no metadata (replaces the shared mutable ``{}`` default).
        config: Optional transfer config forwarded as ``Config``.
    """
    if metadata is None:
        metadata = {}
    # Create a client
    client = boto3.client("s3")
    # Upload a file
    client.upload_file(
        path, bucket, s3_path, ExtraArgs={"Metadata": metadata}, Config=config
    )

# CELL
def delete_file(path):
    """Remove ``path`` on a best-effort basis; any ``OSError`` is ignored."""
    try:
        os.remove(path)
    except OSError:
        # Deliberately silent: a missing or undeletable file is not an error here.
        return None
    return None

# CELL
def upload_object(
    python_object,
    bucket,
    s3_path,
    metadata=None,
    config=None,
):
    """Pickle ``python_object`` to a staging file and upload it to S3.

    Args:
        python_object: Picklable object to store.
        bucket: Target bucket name.
        s3_path: Object key inside the bucket; also seeds the staging filename.
        metadata: Optional metadata dict; ``None`` means no metadata
            (replaces the shared mutable ``{}`` default).
        config: Optional transfer config forwarded to ``upload``.
    """
    # Define a temporary path
    temporary_path = os.path.join(
        STAGING_PATH,
        generate_temporary_filename(path=s3_path, extension="pickle"),
    )
    try:
        # Dump Python object to temporary pickle file
        dump_object(python_object=python_object, path=temporary_path)
        # Upload pickle file
        upload(
            bucket=bucket,
            path=temporary_path,
            s3_path=s3_path,
            metadata={} if metadata is None else metadata,
            config=config,
        )
    finally:
        # Clean up the staging file even when dumping or uploading fails.
        delete_file(path=temporary_path)

# CELL
def generate_temporary_filename(path, extension):
    """Return ``<sha256 hex digest of path>.<extension>``.

    ``path`` is UTF-8 encoded before hashing.
    """
    digest = hashlib.sha256(path.encode("utf-8")).hexdigest()
    return f"{digest}.{extension}"

# CELL
def generate_temporary_directory(path):
    """Return the SHA-256 hex digest of ``path`` (UTF-8 encoded)."""
    return hashlib.sha256(path.encode("utf-8")).hexdigest()

# CELL
# Object key under which the pickled model is stored.
s3_path = f"{s3_dir_name}/{client_name}/automl.pkl"


# CELL
# Upload the `model` attribute of the fitted AutoML object (not the AutoML wrapper).
upload_object(python_object=automl.model, bucket=BUCKET_PATH, s3_path=s3_path, metadata={}, config=None)



# Pull the model from Amazon S3 and Predict

In [None]:
import os
import pickle
import hashlib
#!pip install boto3
import boto3
import botocore
import pandas as pd
import io
import numpy as np
import random
from datetime import datetime, timedelta

# CELL
# S3 source settings (placeholder values — fill in before running).
# The stray leading space before BUCKET_PATH was an IndentationError.
BUCKET_PATH = "Enter Bucket Name Here"
s3_dir_name = "Enter Directory Name Here"
client_name = "Enter Folder Name Here"
# Local directory where downloaded pickles are staged.
STAGING_PATH = "/tmp/staging/common/"

class NoItemFoundException(Exception):
    """Raised by ``download`` when S3 answers the request with a client error.

    The name was raised but never defined in the visible notebook, so a failed
    download surfaced as a NameError instead.
    """


def download(
    bucket,
    path,
    s3_path,
    config=None,

):
    """Download ``s3://bucket/s3_path`` to the local file ``path``.

    Args:
        bucket: Source bucket name.
        path: Local destination file; parent directories are created.
        s3_path: Object key inside the bucket.
        config: Optional transfer config forwarded as ``Config``.

    Raises:
        NoItemFoundException: if boto3 reports a ``ClientError``; the original
            error is chained as ``__cause__``.
    """
    # Create a client
    client = boto3.client("s3")

    # Obtain directory path from input path; "" (bare filename) needs no mkdir
    directory_path = os.path.dirname(path)
    if directory_path:
        os.makedirs(directory_path, exist_ok=True)

    # Download a file
    try:
        client.download_file(bucket, s3_path, path, Config=config)
    except botocore.exceptions.ClientError as err:
        raise NoItemFoundException(
            f"Could not download s3://{bucket}/{s3_path}"
        ) from err


def download_object(
    bucket,
    s3_path,
    config=None,
):
    """Download a pickled object from S3 and return the unpickled value.

    Args:
        bucket: Source bucket name.
        s3_path: Object key inside the bucket; also seeds the staging filename.
        config: Optional transfer config forwarded to ``download``.

    Returns:
        The Python object stored in the pickle.
    """
    # Define a temporary path
    temporary_path = os.path.join(
        STAGING_PATH,
        generate_temporary_filename(path=s3_path, extension="pickle"),
    )
    try:
        # Download pickle file into temporary location
        download(
            bucket=bucket,
            path=temporary_path,
            s3_path=s3_path,
            config=config,
        )
        # SECURITY: unpickling runs arbitrary code from the file; only use
        # this with buckets whose contents you trust.
        python_object = load_object(path=temporary_path)
    finally:
        # Clean up the staging file even when download or unpickling fails.
        delete_file(path=temporary_path)

    return python_object

def delete_file(path):
    """Remove ``path`` on a best-effort basis; any ``OSError`` is ignored."""
    try:
        os.remove(path)
    except OSError:
        # Deliberately silent: a missing or undeletable file is not an error here.
        return None
    return None

def load_object(path):
    """Unpickle and return the object stored in the file at ``path``."""
    with open(path, "rb") as pickle_stream:
        loaded = pickle.load(pickle_stream)
    return loaded

def generate_temporary_filename(path, extension):
    """Return ``<sha256 hex digest of path>.<extension>``.

    ``path`` is UTF-8 encoded before hashing.
    """
    digest = hashlib.sha256(path.encode("utf-8")).hexdigest()
    return f"{digest}.{extension}"


def generate_temporary_directory(path):
    """Return the SHA-256 hex digest of ``path`` (UTF-8 encoded)."""
    return hashlib.sha256(path.encode("utf-8")).hexdigest()


# Object key of the pickled model to fetch.
s3_path = f"{s3_dir_name}/{client_name}/automl.pkl"

# CELL
automl = download_object(bucket=BUCKET_PATH, s3_path=s3_path)
from flaml import AutoML


# NOTE(review): `data` is indexed with the key 'input' here — confirm what
# `data` holds at this point in the notebook.
X = data['input']
# X = X.drop(columns = [target_column])

# Output table: 1-based row id plus the model's prediction for each row of X.
result = pd.DataFrame()
result['Id'] = list(range(1, len(X) + 1))
result[f'Predicted_{target_column}'] = automl.predict(X)



# Regression

In [None]:
# model_name = "model_1"
# overwrite = "False"
# model_description = ""
# target_column = "math score"
# feature_columns = "gender,race/ethnicity,parental level of education,lunch,test preparation course"#.split(",")
# model_list = "lgbm,rf,catboost,xgboost,extra_tree,xgb_limitdepth"#.split(",")
# # model_list = "lgbm,rf,xgboost,extra_tree,xgb_limitdepth"#.split(",")


# time_budget = 100
# test_size = 0.25
# metric = "r2"  # one of "r2", "rmse", "mse", "mae" — the code below reads this setting as `metric`




if (overwrite).lower()=="true":
    # !pip install catboost
    # !pip install "flaml[automl]"

    import pandas as pd
    import catboost

    from flaml import AutoML
    from sklearn.model_selection import train_test_split
    from datetime import datetime, timedelta

    # data = pd.read_csv(path, low_memory = False)

    automl = AutoML()

    automl_settings = {
        "time_budget": time_budget,  # in seconds
        "metric": metric ,
        "estimator_list":model_list.split(","),
        "task": 'regression',
        "log_file_name": "flaml.log",
    }

    feature_lst = feature_columns.split(",")
    X_train, X_test, y_train, y_test = train_test_split(
        data[feature_lst], data[target_column], test_size=test_size
    )
    automl.fit(X_train=X_train, y_train=y_train,
           **automl_settings, verbose = 0)

    # For "r2" the logged values are converted as 1 - loss; every other metric
    # is logged as the raw loss. Only these two values differed between the
    # former if/else branches.
    if metric == "r2":
        best_score = 1 - automl.best_loss
        per_estimator = (1 - pd.Series(automl.best_loss_per_estimator)).to_dict()
    else:
        best_score = automl.best_loss
        per_estimator = pd.Series(automl.best_loss_per_estimator).to_dict()

    # One timestamp for both fields so date and time cannot straddle midnight.
    trained_at = datetime.now()

    # Single-row training log.
    model_log = pd.DataFrame()
    model_log["train_date"] = [str(trained_at.date())]
    model_log["train_time"] = [str(trained_at.time())]
    model_log["model_name"] = [model_name]
    model_log["model_description"] = [model_description]
    model_log["best_model"] = [automl.best_estimator]
    model_log["model_metrics"] = [f"{metric} : {round(best_score,2)}"]
    model_log["model_hyperparameters"] = [automl.best_config]
    model_log["estimator_list"] = [model_list]
    model_log["best_loss_per_estimator"] = [per_estimator]
    model_log["time_budget"] = [time_budget]
    model_log["test_size"] = [test_size]

    # Feature importances of the fitted model, highest first.
    df_feature_imp = pd.DataFrame()
    df_feature_imp['Features'] = list(automl.feature_names_in_)
    df_feature_imp['Importance Score'] = list(automl.feature_importances_)
    df_feature_imp = df_feature_imp.sort_values(by = ['Importance Score'], ascending = False)

    # Predictions for every row of `data` (training rows included).
    df_pred = data.copy()
    df_pred[f"pred_{target_column}"] = automl.predict(data[feature_lst])
    result = {"log":model_log,
      "feature_importance":df_feature_imp,
      "preds":df_pred
    }

    import os
    import pickle
    import hashlib

    #!pip install boto3
    import boto3
    import io


    # CELL
    # Local directory where pickles are staged before upload.
    STAGING_PATH = "/tmp/staging/common/"
    # S3 destination settings (placeholder values — fill in before running).
    BUCKET_PATH = "Enter Bucket Name Here"
    s3_dir_name = "Enter Directory Name Here"
    client_name = "Enter Folder Name Here"

    # CELL
    def dump_object(python_object, path):
        """Pickle ``python_object`` to ``path``, creating parent directories as needed.

        Args:
            python_object: Any picklable object.
            path: Destination file path; may be a bare filename.
        """
        directory_path = os.path.dirname(path)
        # dirname is "" for a bare filename and os.makedirs("") raises, so skip it.
        if directory_path:
            # exist_ok avoids the race between an existence check and the creation.
            os.makedirs(directory_path, exist_ok=True)
        with open(path, "wb") as output_file:
            pickle.dump(python_object, output_file)

    # CELL
    def upload(bucket, path, s3_path, metadata=None, config=None,):
        """Upload the local file ``path`` to ``s3://bucket/s3_path``.

        Args:
            bucket: Target bucket name.
            path: Local file to upload.
            s3_path: Object key inside the bucket.
            metadata: Optional dict sent as the object's ``Metadata``; ``None``
                means no metadata (replaces the shared mutable ``{}`` default).
            config: Optional transfer config forwarded as ``Config``.
        """
        if metadata is None:
            metadata = {}
        # Create a client
        client = boto3.client("s3")
        # Upload a file
        client.upload_file(
            path, bucket, s3_path, ExtraArgs={"Metadata": metadata}, Config=config
        )

    # CELL
    def delete_file(path):
        """Remove ``path`` on a best-effort basis; any ``OSError`` is ignored."""
        try:
            os.remove(path)
        except OSError:
            # Deliberately silent: a missing or undeletable file is not an error here.
            return None
        return None

    # CELL
    def upload_object(
        python_object,
        bucket,
        s3_path,
        metadata=None,
        config=None,
    ):
        """Pickle ``python_object`` to a staging file and upload it to S3.

        Args:
            python_object: Picklable object to store.
            bucket: Target bucket name.
            s3_path: Object key inside the bucket; also seeds the staging filename.
            metadata: Optional metadata dict; ``None`` means no metadata
                (replaces the shared mutable ``{}`` default).
            config: Optional transfer config forwarded to ``upload``.
        """
        # Define a temporary path
        temporary_path = os.path.join(
            STAGING_PATH,
            generate_temporary_filename(path=s3_path, extension="pickle"),
        )
        try:
            # Dump Python object to temporary pickle file
            dump_object(python_object=python_object, path=temporary_path)
            # Upload pickle file
            upload(
                bucket=bucket,
                path=temporary_path,
                s3_path=s3_path,
                metadata={} if metadata is None else metadata,
                config=config,
            )
        finally:
            # Clean up the staging file even when dumping or uploading fails.
            delete_file(path=temporary_path)

    # CELL
    def generate_temporary_filename(path, extension):
        """Return ``<sha256 hex digest of path>.<extension>``.

        ``path`` is UTF-8 encoded before hashing.
        """
        digest = hashlib.sha256(path.encode("utf-8")).hexdigest()
        return f"{digest}.{extension}"

    # CELL
    def generate_temporary_directory(path):
        """Return the SHA-256 hex digest of ``path`` (UTF-8 encoded)."""
        return hashlib.sha256(path.encode("utf-8")).hexdigest()

    # CELL
    # Object key under which the fitted AutoML object is stored, named after the model.
    s3_path = f"{s3_dir_name}/{client_name}/{model_name}.pkl"


    # CELL
    # Upload the whole AutoML object so it can be reloaded for prediction later.
    upload_object(python_object=automl, bucket=BUCKET_PATH, s3_path=s3_path, metadata={}, config=None)

else:
    import os
    import pickle
    import hashlib
    #!pip install boto3
    import boto3
    import botocore
    import pandas as pd
    import io
    import numpy as np
    import random
    from datetime import datetime, timedelta

    # CELL
    # Local directory where downloaded pickles are staged.
    STAGING_PATH = "/tmp/staging/common/"
    # S3 source settings (placeholder values — fill in before running).
    BUCKET_PATH = "Enter Bucket Name Here"
    s3_dir_name = "Enter Directory Name Here"
    client_name = "Enter Folder Name Here"

    class NoItemFoundException(Exception):
        """Raised by ``download`` when S3 answers the request with a client error.

        The name was raised but never defined in the visible notebook, so a
        failed download surfaced as a NameError instead.
        """

    def download(
        bucket,
        path,
        s3_path,
        config=None,

    ):
        """Download ``s3://bucket/s3_path`` to the local file ``path``.

        Args:
            bucket: Source bucket name.
            path: Local destination file; parent directories are created.
            s3_path: Object key inside the bucket.
            config: Optional transfer config forwarded as ``Config``.

        Raises:
            NoItemFoundException: if boto3 reports a ``ClientError``; the
                original error is chained as ``__cause__``.
        """
        # Create a client
        client = boto3.client("s3")

        # Obtain directory path from input path; "" (bare filename) needs no mkdir
        directory_path = os.path.dirname(path)
        if directory_path:
            os.makedirs(directory_path, exist_ok=True)

        # Download a file
        try:
            client.download_file(bucket, s3_path, path, Config=config)
        except botocore.exceptions.ClientError as err:
            raise NoItemFoundException(
                f"Could not download s3://{bucket}/{s3_path}"
            ) from err


    def download_object(
        bucket,
        s3_path,
        config=None,
    ):
        """Download a pickled object from S3 and return the unpickled value.

        Args:
            bucket: Source bucket name.
            s3_path: Object key inside the bucket; also seeds the staging filename.
            config: Optional transfer config forwarded to ``download``.

        Returns:
            The Python object stored in the pickle.
        """
        # Define a temporary path
        temporary_path = os.path.join(
            STAGING_PATH,
            generate_temporary_filename(path=s3_path, extension="pickle"),
        )
        try:
            # Download pickle file into temporary location
            download(
                bucket=bucket,
                path=temporary_path,
                s3_path=s3_path,
                config=config,
            )
            # SECURITY: unpickling runs arbitrary code from the file; only use
            # this with buckets whose contents you trust.
            python_object = load_object(path=temporary_path)
        finally:
            # Clean up the staging file even when download or unpickling fails.
            delete_file(path=temporary_path)

        return python_object

    def delete_file(path):
        """Remove ``path`` on a best-effort basis; any ``OSError`` is ignored."""
        try:
            os.remove(path)
        except OSError:
            # Deliberately silent: a missing or undeletable file is not an error here.
            return None
        return None

    def load_object(path):
        """Unpickle and return the object stored in the file at ``path``."""
        with open(path, "rb") as pickle_stream:
            loaded = pickle.load(pickle_stream)
        return loaded

    def generate_temporary_filename(path, extension):
        """Return ``<sha256 hex digest of path>.<extension>``.

        ``path`` is UTF-8 encoded before hashing.
        """
        digest = hashlib.sha256(path.encode("utf-8")).hexdigest()
        return f"{digest}.{extension}"


    def generate_temporary_directory(path):
        """Return the SHA-256 hex digest of ``path`` (UTF-8 encoded)."""
        return hashlib.sha256(path.encode("utf-8")).hexdigest()


    # Object key of the previously uploaded model, named after model_name.
    s3_path = f"{s3_dir_name}/{client_name}/{model_name}.pkl"

    # CELL
    automl = download_object(bucket=BUCKET_PATH, s3_path=s3_path)
    from flaml import AutoML


    # Feature columns, given as one comma-separated string.
    X = data[feature_columns.split(',')]

    # Copy of the input data with the model's predictions appended as a new column.
    result = data.copy()
    result[f"pred_{target_column}"] = automl.predict(X)