In [11]:
import pandas as pd
from sklearn.model_selection import KFold, StratifiedKFold, GridSearchCV

In [5]:
from sklearn.base import TransformerMixin
import copy

# Adaptation of the custom scikit-learn transformer (from `transformers`) that converts categorical features to numerical ones, so that it can be used in this notebook
class CatNumTransformer(TransformerMixin):
    """Encode categorical columns by the maximum age observed per category.

    For every column in ``cols``, ``fit`` learns a mapping from each category
    to the largest value of ``age_col`` among the rows having that category;
    ``transform`` replaces the categories by those values.

    Parameters
    ----------
    cols : list of str
        Names of the categorical columns to encode.
    max_classes : any
        Stored on the instance; not used by any method of this class.
    age_col : str, default "age_in_months"
        Name of the column whose per-category maximum is used as the encoding.

    Attributes
    ----------
    cat_mapping : dict
        ``{column name: {category: max age}}``; filled by ``fit``.
    """

    def __init__(self, cols, max_classes, age_col="age_in_months"):
        self.cols = cols
        self.cat_mapping = {}
        self.max_classes = max_classes
        self.age_col = age_col

    def fit(self, X, y=None, *args, **kwargs):
        """Learn the category -> maximum ``age_col`` mapping for every column.

        ``y`` and any extra arguments are accepted for API compatibility and
        ignored. Returns ``self``.
        """
        # Build a fresh dict so that refitting never keeps entries of an earlier fit.
        self.cat_mapping = {
            col: X.groupby(col)[self.age_col].max().to_dict()
            for col in self.cols
        }
        return self

    def transform(self, X, y=None, *args, **kwargs):
        """Return a copy of ``X`` with each column in ``cols`` replaced by its mapped value.

        The input frame is left untouched.

        Raises
        ------
        KeyError
            If the transformer has not been fitted for a column, or if a
            column holds a value (including a missing value) for which no
            mapping was learned during ``fit``.
        """
        # Work on a copy: overwriting the caller's frame would make re-running
        # a cell (or transforming the same frame twice) fail or give stale data.
        X_out = X.copy()
        for col in self.cols:
            mapping = self.cat_mapping[col]
            # Series.map would silently yield NaN for unknown categories, so
            # check explicitly and fail loudly instead.
            unknown = ~X_out[col].isin(list(mapping))
            if unknown.any():
                examples = X_out.loc[unknown, col].unique().tolist()
                raise KeyError(
                    "column %r contains categories without a fitted mapping: %r"
                    % (col, examples)
                )
            X_out[col] = X_out[col].map(mapping)
        return X_out

    def run(self, X):
        """Fit on ``X`` and return the transformed copy of ``X``."""
        return self.fit(X).transform(X)



In [6]:
# define a function to prepare the data for training and testing. The result is a dictionary containing:
    # 1: The training and testing data, without targets
    # 2: The training and testing targets
    # 3: The implementation set data
    # 4: The implementation set IMO numbers
    # 5: The cross-validation folds
def data_prep(df, K, predictors, response_col, fix_random_state=None, min_age_months=84):
    """Split the data into development / implementation sets and build CV folds.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``age_in_months``, ``implem_set``,
        ``imo_pseudo``, ``response_col`` and every column in ``predictors``.
    K : int
        Number of stratified cross-validation folds.
    predictors : list of str
        Columns to keep as model inputs.
    response_col : str
        Name of the target column.
    fix_random_state : int or None, default None
        Passed as ``random_state`` to ``StratifiedKFold`` (shuffling is on).
    min_age_months : int, default 84
        Rows with ``age_in_months`` below this value are dropped
        (84 months = 7 years).

    Returns
    -------
    dict with the keys
        ``'X'``          development predictors (index reset to 0..n-1),
        ``'Y'``          development targets (index reset to 0..n-1),
        ``'implem_set'`` implementation-set predictors (original index),
        ``'implem_ref'`` ``imo_pseudo`` values of the implementation set,
        ``'folds'``      list of ``[train_indices, test_indices]`` pairs.
    """
    # filter out ships that are younger than `min_age_months` months
    df = df[df["age_in_months"] >= min_age_months]
    # Split into data for model development and data for model implementation
    # TODO - nothing currently in implementation set?? What to do???
    data_in_imp = df[df["implem_set"] == 1]
    data_in_dev = df[df["implem_set"] == 0]
    # extract the IMO numbers from the implementation set
    implem_ref = data_in_imp["imo_pseudo"]
    # Select targets and predictors from the development data; the index is
    # reset so that row labels coincide with the positions used by the folds
    targets = data_in_dev[response_col].reset_index(drop=True)
    data_in_dev = data_in_dev[predictors].reset_index(drop=True)
    data_in_imp = data_in_imp[predictors]
    # create K-fold cross validation folds, stratified on the target
    splitter = StratifiedKFold(n_splits=K, shuffle=True, random_state=fix_random_state)
    folds = [
        [train_idx, test_idx]
        for train_idx, test_idx in splitter.split(data_in_dev, targets)
    ]
    # create result dictionary
    return {
        'X': data_in_dev,
        'Y': targets,
        'implem_set': data_in_imp,
        'implem_ref': implem_ref,
        'folds': folds,
    }

In [7]:
# Categorical predictors that are label-encoded. One-hot encoding, e.g.
#   pd.get_dummies(sloopschepen, columns=cols_to_encode).head()
# is avoided for now because of the number of new columns it would create.
cols_to_encode = ["GSS_Type", "GSS_Propulsion"]

# Categorical predictors that are replaced by the age of the oldest ship
# sharing the same engine model / designer / builder.
cols_to_numeric = [
    "GSS_Main.engines..Model",
    "GSS_Main.engines..Designer",
    "GSS_Main.engines..Builder.code",
]

# Full list of predictors to use: the categorical ones first, followed by the
# predictors that are already numeric.
predictors = cols_to_encode + cols_to_numeric + [
    "GSS_Gross.tonnage",
    "GSS_Deadweight",
    "GSS_TEU",
    "GSS_Insulated.capacity",
    "GSS_Length.overall",
    "GSS_Length.between.perpendiculars",
    "GSS_Service.speed",
    "GSS_Main.engines..Number.of.main.engines",
    "GSS_Main.engines..Max..power",
    "age_in_months",
]

In [8]:
# Read in the data
sloopschepen = pd.read_csv("sloopschepen_2016_2019.csv")

# Cast every column listed in cols_to_encode to the 'category' dtype. The
# mapping is built from the list itself so it cannot get out of sync with the
# loop below when columns are added to or removed from cols_to_encode.
sloopschepen = sloopschepen.astype({column: 'category' for column in cols_to_encode})

# Store the integer category codes of those columns in "<column>_numeric"
# (pandas uses the code -1 for missing values)
for column_convert in cols_to_encode:
    sloopschepen[column_convert+"_numeric"] = sloopschepen[column_convert].cat.codes

# Replace the categories of the cols_to_numeric columns by the maximum age
# observed for each category
sloopschepen = CatNumTransformer(cols=cols_to_numeric, max_classes=2).run(sloopschepen)

In [9]:
# Predictors after conversions, grouped by how each column was obtained
predictors_final = (
    # integer category codes of the label-encoded columns
    ["GSS_Type_numeric", "GSS_Propulsion_numeric"]
    # engine attributes replaced by the maximum age per category
    + [
        "GSS_Main.engines..Model",
        "GSS_Main.engines..Designer",
        "GSS_Main.engines..Builder.code",
    ]
    # columns used as they are
    + [
        "GSS_Gross.tonnage",
        "GSS_Deadweight",
        "GSS_TEU",
        "GSS_Insulated.capacity",
        "GSS_Length.overall",
        "GSS_Length.between.perpendiculars",
        "GSS_Service.speed",
        "GSS_Main.engines..Number.of.main.engines",
        "GSS_Main.engines..Max..power",
        "age_in_months",
    ]
)

In [14]:
import pickle
# set parameters
K = 5       # number of cross-validation folds
state = 0   # random state used when shuffling the folds

# prepare the data; note that `df` is the dictionary returned by data_prep,
# not a DataFrame
df = data_prep(df=sloopschepen,
               K=K,
               predictors=predictors_final,
               response_col="dismantled",
               fix_random_state=state)

# store the prepared data; the `with` block closes the file even when
# pickling fails
with open('data_folds_7_11', 'wb') as file:
    pickle.dump(df, file)

# len(df['folds'][0][0])
# df['X'].head()

In [15]:
# load the pickled data back from the file it was stored in; the `with` block
# closes the file even when unpickling fails
# NOTE: pickle.load can execute arbitrary code - only load files you created yourself
with open('data_folds_7_11', 'rb') as file:
    df = pickle.load(file)

In [16]:
# pick the fold to work with and split the development data accordingly
current_fold = 0
train_idxes, test_idxes = df['folds'][current_fold]

# The fold indices are row positions produced by the cross-validation
# splitter, so select positionally with .iloc; this does not depend on the
# index labels of X and Y happening to equal the row positions.
train_dataset_X = df['X'].iloc[train_idxes]
train_dataset_Y = df['Y'].iloc[train_idxes]

test_dataset_X = df['X'].iloc[test_idxes]
test_dataset_Y = df['Y'].iloc[test_idxes]
# train_dataset_X[]