In [195]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.cm import ScalarMappable

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV


In [196]:
def features_heat_scatter(clf, param_grid, n_iter, X_train, y_train, X_test):
    # Create a random search object
    rand_search = RandomizedSearchCV(
        estimator = clf,
        param_distributions = param_grid,
        n_iter = n_iter,
        scoring='roc_auc', 
        n_jobs=4, 
        cv = 5, 
        refit=True, 
        return_train_score = True
)

    # Fit to the training data
    rand_search.fit(X_train, y_train).predict(X_test)

    # Create dataframe of random grid search results
    params = list(param_grid.keys())
    heatdf = pd.DataFrame()
    for p in params:
        heatdf[p] = rand_search.cv_results_['param_' + p].tolist()

    heatdf['score'] = rand_search.cv_results_['mean_test_score'].tolist()
    heatdf.dropna(inplace=True)

    variables = list(range(0, len(params)))

    if variables > 2:
        dependent = []
        independent = []

        for v in variables:
            idx = 0
            while idx <= max(variables):
                if v < idx:
                    dependent.append(params[v])
                    independent.append(params[idx])
                    idx += 1
                else:
                    idx += 1

        plot_inputs = pd.DataFrame({"input":dependent,"output":independent})

            # Subplots are organized in a Rows x Cols Grid
        Tot = plot_inputs.shape[0]
        Cols = np.ceil(Tot/4).astype(int)

        # Compute Rows required
        Rows = Tot // Cols 
        Rows += Tot % Cols

        # Create a Position index
        Position = range(1,Tot + 1)

        cmap = plt.get_cmap("Spectral")
        norm = plt.Normalize(heatdf['score'].min(), heatdf['score'].max())

        plt.rc('font', size=12)
        fig, axs = plt.subplots(Rows, Cols, figsize=(14, 10))

        for i, r in plot_inputs.iterrows():
            # add every single subplot to the figure with a for loop
            # ax = fig.add_subplot(Rows,Cols,Position[i])
            axs.ravel()[i].set(xlabel=r['input'], ylabel=r['output'])
            axs.ravel()[i].scatter(
                x = heatdf[r['input']], 
                y = heatdf[r['output']],
                c = heatdf['score'],
                cmap = 'gist_rainbow'
            )      
            axs.ravel()[i].set(xlabel=r['input'], ylabel=r['output'])

        sm =  ScalarMappable(norm=norm, cmap=cmap)
        sm.set_array([])
        cbar = fig.colorbar(sm, ax=axs)
        cbar.ax.set_title("scale")
        plt.show()
    else:
        plt.rc('font', size=16)
        plt.figure(figsize = (12,6))
        plt.scatter(
            x = heatdf[r['input']], 
            y = heatdf[r['output']],
            c = heatdf['score'],
            cmap = 'gist_rainbow'
        )
        plt.xlabel(r['input'])
        plt.ylabel(r['output'])
        plt.colorbar()
        plt.show()

In [197]:
# Load the training data
train = pd.read_csv('data/train.csv')

# Break up stacked features
train[['Group','GroupNumber']]=train.PassengerId.str.split('_',expand=True)
train.drop('PassengerId', axis=1, inplace=True)

# train[['Deck','CabinNumber', 'Side']]=train.Cabin.str.split('/',expand=True)
train.drop('Cabin', axis=1, inplace=True)

# # Convert boolean to numbers
train.CryoSleep = train.CryoSleep*1
train.VIP = train.VIP*1
train.Transported = train.Transported*1

# Drop useless features
train.drop('Name', axis=1, inplace=True)
train.drop('Destination', axis = 1, inplace = True)
train.drop('HomePlanet', axis = 1, inplace = True)
# train.drop('CryoSleep', axis = 1, inplace = True)
# train.drop('VIP', axis = 1, inplace = True)

# Fill NA money columns with 0
train[['RoomService','FoodCourt','ShoppingMall','Spa', 'VRDeck', 'CryoSleep', 'VIP']] = \
    train[['RoomService','FoodCourt','ShoppingMall','Spa', 'VRDeck', 'CryoSleep', 'VIP']].fillna(value=0)

# Makeup some new features
train['TotalSpent'] = train['RoomService'] + train['FoodCourt'] + \
    train['ShoppingMall'] + train['Spa'] + train['VRDeck']

# Sneak Peak!
train.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Group,GroupNumber,TotalSpent
0,0,39.0,0,0.0,0.0,0.0,0.0,0.0,0,1,1,0.0
1,0,24.0,0,109.0,9.0,25.0,549.0,44.0,1,2,1,736.0
2,0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,3,1,10383.0
3,0,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,3,2,5176.0
4,0,16.0,0,303.0,70.0,151.0,565.0,2.0,1,4,1,1091.0


In [198]:
# What data is missing?
train.isna().sum()

CryoSleep         0
Age             179
VIP               0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Transported       0
Group             0
GroupNumber       0
TotalSpent        0
dtype: int64

In [199]:
# Dump rows of missing data interpolation or filling doesn't make sense
train.dropna(inplace=True)

In [200]:
# Correct data types
train = train.astype({
    "Group": int, 
    "GroupNumber": int,
    "CryoSleep":int,
    "VIP":int,
    "Transported":int
})

# Create some dummies
train = pd.get_dummies(train, drop_first = True)

train.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Group,GroupNumber,TotalSpent
0,0,39.0,0,0.0,0.0,0.0,0.0,0.0,0,1,1,0.0
1,0,24.0,0,109.0,9.0,25.0,549.0,44.0,1,2,1,736.0
2,0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,3,1,10383.0
3,0,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,3,2,5176.0
4,0,16.0,0,303.0,70.0,151.0,565.0,2.0,1,4,1,1091.0


In [201]:
# Create features and predictor
X = train.drop('Transported', axis=1)
y = train['Transported']

# Traing, test, and split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=123)

# Create classifiers
lr = LogisticRegression(max_iter = 500)
gnb = GaussianNB()
ada = AdaBoostClassifier()
grb = GradientBoostingClassifier()
rfc = RandomForestClassifier()
xgbc = xgb.XGBClassifier(objective='binary:logistic', use_label_encoder=False, eval_metric = 'logloss')

clf_list = [
    (lr, "Logistic"),
    (gnb, "NaiveBayes"),
    (grb, 'GradientBoost'),
    (ada, 'AdaBoost'),
    (rfc, "RandomForest"),
    (xgbc, 'XGBoost')
]

s = []
m = []
for i, (clf, name) in enumerate(clf_list):
    p = clf.fit(X_train, y_train).predict_proba(X_test)[:,1]
    s.append(accuracy_score(y_test, p.round()))
    m.append(name)

modelSelection = pd.DataFrame(m, columns=['Model'])
modelSelection['Score'] = s
modelSelection.sort_values("Score", inplace=True, ascending=False)

modelSelection.head()

Unnamed: 0,Model,Score
0,Logistic,0.812683
2,GradientBoost,0.804463
3,AdaBoost,0.801527
5,XGBoost,0.792132
4,RandomForest,0.773341


In [202]:
# Create a Random Forest Classifier with specified criterion
clf = LogisticRegression(solver = 'saga', penalty='elasticnet')

# Create the parameter grid
param_grid = {
    'C': np.unique(np.linspace(0, 2, 21).round(1)), 
    'max_iter': list(range(500,1500)),
    'l1_ratio': np.unique(np.linspace(0, 1, 11).round(1))
} 

features_heat_scatter(clf, param_grid, 1000, X_train, y_train, X_test)


190 fits failed out of a total of 5000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
190 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\andre\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\andre\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\linear_model\_logistic.py", line 1589, in fit
    fold_coefs_ = Parallel(
  File "c:\Users\andre\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "c:\Users\andre\AppData\Local\Programs\Python\Py

TypeError: '>' not supported between instances of 'list' and 'int'

In [203]:
features_heat_scatter(clf, param_grid, 1000, X_train, y_train, X_test)



KeyboardInterrupt: 

In [None]:
# Create a Random Forest Classifier with specified criterion
clf = RandomForestClassifier(criterion='entropy')

# Create the parameter grid
param_grid = {
    'max_depth': list(range(20,100)), 
    'min_samples_leaf': list(range(1,20)),
    'n_estimators':list(range(100, 500))
} 

features_heat_scatter(clf, param_grid, 1000, X_train, y_train, X_test)

In [None]:
test = pd.read_csv('data/test.csv')