In [150]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from numpy import mean
from numpy import std
from numpy import absolute
from sklearn.svm import SVC

In [151]:
# Load the training table. The path is relative, so train.csv has to be in the
# notebook's working directory.
df = pd.read_csv('train.csv')

In [152]:
# Split the frame into predictors and label. 'Id' is removed together with the
# target so that it is never handed to the model as a feature.
X = df.drop(columns=['Class', 'Id'])
y = df['Class']

# The feature groups are derived from the column dtypes rather than from a
# hand-maintained list of names, so a column can end up in at most one group.
# Columns whose dtype is in neither list belong to no group.
numerical_ix = X.select_dtypes(include=['int64', 'float64']).columns
categorical_ix = X.select_dtypes(include=['object', 'bool']).columns

In [153]:
# Seed reused wherever the notebook passes `random_state`, so repeated runs
# produce the same results.
RANDOM_STATE = 42

In [154]:
# for c in categorical_features:
#     if df[c].isnull().any():
#         print(c)

In [155]:
# Categorical branch: one-hot encode the columns. handle_unknown="ignore" makes
# the encoder tolerate categories at transform time that were absent during fit
# instead of raising.
# Optional follow-up step, currently disabled:
#     ("selector", SelectPercentile(chi2, percentile=50))
cat_pipe = Pipeline([("encoder", OneHotEncoder(handle_unknown="ignore"))])

In [156]:
# Numerical branch: replace missing values with the constant 0, then
# standardise each column.
num_pipe = Pipeline(
    steps=[
        ("num_imputer", SimpleImputer(strategy="constant", fill_value=0)),
        ("standard_scale", StandardScaler()),
    ]
)

In [157]:
# Route each column group through its own sub-pipeline and concatenate the
# results. No `remainder` is given, so the library default applies to columns
# that are in neither group.
preprocessor = ColumnTransformer([
    ("num", num_pipe, numerical_ix),
    ("cat", cat_pipe, categorical_ix),
])

In [158]:
# imputed_res = num_pipe.fit_transform(numerical_features)

# scores = cross_val_score(pipeline, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)

In [159]:
# Classifier under evaluation: a seeded 1000-tree random forest that trains on
# all available cores.
model = RandomForestClassifier(
    n_estimators=1000,
    n_jobs=-1,
    random_state=RANDOM_STATE,
)
# Alternatives; uncomment one of them (instead of the forest) to compare:
# model = LogisticRegression(class_weight="balanced", n_jobs=-1)
# model = SVC(class_weight='balanced', random_state=RANDOM_STATE)

In [160]:
# End-to-end estimator: preprocessing followed by the classifier. Because the
# preprocessing lives inside the pipeline, it is fitted together with the model
# on whatever data the pipeline is fitted on.
estimator = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", model),
    ]
)

In [161]:
# Shuffled 10-fold split, seeded so the folds are identical on every run.
cv = KFold(n_splits=10, shuffle=True, random_state=RANDOM_STATE)

# Score every fold in parallel. scikit-learn reports this metric negated
# (so that larger is better); take the absolute value to get the MAE back.
# NOTE(review): the estimator is a classifier. Presumably 'Class' is a 0/1
# label, in which case this MAE is simply the misclassification rate - confirm,
# and consider a classification metric with a stratified splitter instead.
scores = np.abs(
    cross_val_score(estimator, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
)

# Report the mean and the standard deviation of the per-fold MAE.
print('MAE: %.3f (%.3f)' % (scores.mean(), scores.std()))

MAE: 0.076 (0.038)


In [None]:
# RandomForestClassifier(n_estimators=1000, random_state=RANDOM_STATE, n_jobs=-1)
# MAE: 0.076 (0.037)

# model = RandomForestClassifier(n_estimators=1000, class_weight="balanced", random_state=RANDOM_STATE, n_jobs=-1)
# MAE: 0.096 (0.035)

# model = SVC()
# MAE: 0.117 (0.028)

# model = SVC(class_weight='balanced')
# MAE: 0.109 (0.040)


In [128]:
# Display the per-fold scores.
# NOTE(review): the output recorded under this cell belongs to execution
# count 128, which predates the cell that currently assigns `scores` (161), and
# its values do not agree with the mean MAE printed there - re-run to refresh.
scores

array([0.20967742, 0.27419355, 0.22580645, 0.20967742, 0.17741935,
       0.16129032, 0.29032258, 0.14754098, 0.2295082 , 0.3442623 ])

In [124]:
# List the names of all parameters the pipeline exposes. Nested parameters use
# the `<step>__<param>` form (see the recorded output), which is the spelling a
# parameter grid for this estimator has to use.
estimator.get_params().keys()


dict_keys(['memory', 'steps', 'verbose', 'preprocessor', 'model', 'preprocessor__n_jobs', 'preprocessor__remainder', 'preprocessor__sparse_threshold', 'preprocessor__transformer_weights', 'preprocessor__transformers', 'preprocessor__verbose', 'preprocessor__verbose_feature_names_out', 'preprocessor__num', 'preprocessor__cat', 'preprocessor__num__memory', 'preprocessor__num__steps', 'preprocessor__num__verbose', 'preprocessor__num__num_imputer', 'preprocessor__num__standard_scale', 'preprocessor__num__num_imputer__add_indicator', 'preprocessor__num__num_imputer__copy', 'preprocessor__num__num_imputer__fill_value', 'preprocessor__num__num_imputer__keep_empty_features', 'preprocessor__num__num_imputer__missing_values', 'preprocessor__num__num_imputer__strategy', 'preprocessor__num__num_imputer__verbose', 'preprocessor__num__standard_scale__copy', 'preprocessor__num__standard_scale__with_mean', 'preprocessor__num__standard_scale__with_std', 'preprocessor__cat__memory', 'preprocessor__cat__

In [17]:
# from sklearn.model_selection import GridSearchCV

# param_grid = {
#   'model__n_estimators':[800,900,1000,1100],
#   'model__min_samples_split': [2,3,4]
#   }

# grid = GridSearchCV(estimator, param_grid, n_jobs=-1, cv=5)
# grid.fit(X, y)

In [18]:
# grid.best_params_

In [19]:
# import Grid Search class
# from sklearn.model_selection import GridSearchCV
# # make lists of different parameters to check
# parameters = {
#   'n_estimators':[1,10,100,1000],
#   'min_samples_split': [2,3,4,5]
#   }
# #initialize
# grid_pipeline = GridSearchCV(estimator, parameters)
# # fit
# grid_pipeline.fit(X,y)
# grid_pipeline.best_params_

In [20]:
from sklearn.model_selection import cross_validate
def cross_validation(model, _X, _y, _cv=5):
    """Cross-validate ``model`` and return its mean validation metrics.

    Parameters
    ----------
    model : estimator
        The machine learning algorithm to evaluate; forwarded to
        ``cross_validate`` as ``estimator``.
    _X : array-like
        The matrix of features.
    _y : array-like
        The target variable.
    _cv : int, default=5
        Number of folds; forwarded unchanged as the ``cv`` argument of
        ``cross_validate``.

    Returns
    -------
    list
        ``[accuracy, precision, recall, f1]`` in that order, each one the mean
        of the corresponding validation ("test") score over all folds.
    """
    metric_names = ['accuracy', 'precision', 'recall', 'f1']
    # Training scores are not requested: only the validation means are
    # returned, and scoring the training folds as well would be wasted work.
    # Pass return_train_score=True again if the 'train_*' entries are needed.
    results = cross_validate(estimator=model,
                             X=_X,
                             y=_y,
                             cv=_cv,
                             scoring=metric_names)
    # cross_validate stores the validation scores under 'test_<metric>'.
    return [results[f'test_{name}'].mean() for name in metric_names]

     #  return {"Training Accuracy scores": results['train_accuracy'],
     #          "Mean Training Accuracy": results['train_accuracy'].mean()*100,
     #          "Training Precision scores": results['train_precision'],
     #          "Mean Training Precision": results['train_precision'].mean(),
     #          "Training Recall scores": results['train_recall'],
     #          "Mean Training Recall": results['train_recall'].mean(),
     #          "Training F1 scores": results['train_f1'],
     #          "Mean Training F1 Score": results['train_f1'].mean(),
     #          "Validation Accuracy scores": results['test_accuracy'],
     #          "Mean Validation Accuracy": results['test_accuracy'].mean()*100,
     #          "Validation Precision scores": results['test_precision'],
     #          "Mean Validation Precision": results['test_precision'].mean(),
     #          "Validation Recall scores": results['test_recall'],
     #          "Mean Validation Recall": results['test_recall'].mean(),
     #          "Validation F1 scores": results['test_f1'],
     #          "Mean Validation F1 Score": results['test_f1'].mean()
     #          }

In [21]:
# cross_validation(estimator, X, y)
# estimator.score()

NameError: name 'X' is not defined

In [175]:
# balanced random forest
# {'Training Accuracy scores': array([1., 1., 1., 1., 1.]),
#  'Mean Training Accuracy': 100.0,
#  'Training Precision scores': array([1., 1., 1., 1., 1.]),
#  'Mean Training Precision': 1.0,
#  'Training Recall scores': array([1., 1., 1., 1., 1.]),
#  'Mean Training Recall': 1.0,
#  'Training F1 scores': array([1., 1., 1., 1., 1.]),
#  'Mean Training F1 Score': 1.0,
#  'Validation Accuracy scores': array([0.90322581, 0.87903226, 0.89430894, 0.86178862, 0.93495935]),
#  'Mean Validation Accuracy': 89.46629950170471,
#  'Validation Precision scores': array([0.91666667, 0.76923077, 0.90909091, 0.7       , 0.93333333]),
#  'Mean Validation Precision': 0.8456643356643356,
#  'Validation Recall scores': array([0.5       , 0.45454545, 0.45454545, 0.33333333, 0.66666667]),
#  'Mean Validation Recall': 0.4818181818181818,
#  'Validation F1 scores': array([0.64705882, 0.57142857, 0.60606061, 0.4516129 , 0.77777778]),
#  'Mean Validation F1 Score': 0.6107877364044347}

In [176]:
# random forest
# {'Training Accuracy scores': array([1., 1., 1., 1., 1.]),
#  'Mean Training Accuracy': 100.0,
#  'Training Precision scores': array([1., 1., 1., 1., 1.]),
#  'Mean Training Precision': 1.0,
#  'Training Recall scores': array([1., 1., 1., 1., 1.]),
#  'Mean Training Recall': 1.0,
#  'Training F1 scores': array([1., 1., 1., 1., 1.]),
#  'Mean Training F1 Score': 1.0,
#  'Validation Accuracy scores': array([0.90322581, 0.89516129, 0.91869919, 0.89430894, 0.95121951]),
#  'Mean Validation Accuracy': 91.25229478101232,
#  'Validation Precision scores': array([1.        , 0.69565217, 0.83333333, 0.78571429, 0.89473684]),
#  'Mean Validation Precision': 0.841887327013185,
#  'Validation Recall scores': array([0.45454545, 0.72727273, 0.68181818, 0.52380952, 0.80952381]),
#  'Mean Validation Recall': 0.6393939393939394,
#  'Validation F1 scores': array([0.625     , 0.71111111, 0.75      , 0.62857143, 0.85      ]),
#  'Mean Validation F1 Score': 0.7129365079365079}

In [177]:
# logistic regression
# {'Training Accuracy scores': array([0.94726166, 0.94117647, 0.93927126, 0.95951417, 0.93927126]),
#  'Mean Training Accuracy': 94.52989628072366,
#  'Training Precision scores': array([0.89473684, 0.87012987, 0.88888889, 0.93506494, 0.89041096]),
#  'Mean Training Precision': 0.8958462990186133,
#  'Training Recall scores': array([0.79069767, 0.77906977, 0.74418605, 0.82758621, 0.74712644]),
#  'Mean Training Recall': 0.7777332264100508,
#  'Training F1 scores': array([0.83950617, 0.82208589, 0.81012658, 0.87804878, 0.8125    ]),
#  'Mean Training F1 Score': 0.8324534850352687,
#  'Validation Accuracy scores': array([0.85483871, 0.91129032, 0.88617886, 0.85365854, 0.89430894]),
#  'Mean Validation Accuracy': 88.00550747442959,
#  'Validation Precision scores': array([0.625     , 0.82352941, 0.72222222, 0.58823529, 0.7       ]),
#  'Mean Validation Precision': 0.691797385620915,
#  'Validation Recall scores': array([0.45454545, 0.63636364, 0.59090909, 0.47619048, 0.66666667]),
#  'Mean Validation Recall': 0.5649350649350648,
#  'Validation F1 scores': array([0.52631579, 0.71794872, 0.65      , 0.52631579, 0.68292683]),
#  'Mean Validation F1 Score': 0.6207014252328757}

In [None]:
# balanced logistic regression
# {'Training Accuracy scores': array([0.9168357 , 0.92697769, 0.93319838, 0.93927126, 0.92510121]),
#  'Mean Training Accuracy': 92.82768475252729,
#  'Training Precision scores': array([0.69230769, 0.72321429, 0.73873874, 0.76146789, 0.71186441]),
#  'Mean Training Precision': 0.7255186026897269,
#  'Training Recall scores': array([0.94186047, 0.94186047, 0.95348837, 0.95402299, 0.96551724]),
#  'Mean Training Recall': 0.9513499064421278,
#  'Training F1 scores': array([0.79802956, 0.81818182, 0.83248731, 0.84693878, 0.8195122 ]),
#  'Mean Training F1 Score': 0.8230299310217779,
#  'Validation Accuracy scores': array([0.85483871, 0.87903226, 0.84552846, 0.86178862, 0.8699187 ]),
#  'Mean Validation Accuracy': 86.22213480199319,
#  'Validation Precision scores': array([0.57142857, 0.62962963, 0.55172414, 0.57142857, 0.58064516]),
#  'Mean Validation Precision': 0.5809712143416259,
#  'Validation Recall scores': array([0.72727273, 0.77272727, 0.72727273, 0.76190476, 0.85714286]),
#  'Mean Validation Recall': 0.7692640692640693,
#  'Validation F1 scores': array([0.64      , 0.69387755, 0.62745098, 0.65306122, 0.69230769]),
#  'Mean Validation F1 Score': 0.6613394896420106}