# Solving a Kaggle Competition
- Here train and test files are given separately
- Test data doesn't have target columns, so performance of trained model cannot be verified using test data
- For model verification purpose we need to split the training data into two parts (70% & 30%)
- There are two target variables
- Names of target columns -> [ 'premium', 'target']

# Plan of action
- Data preprocessing
- Normalization
- Outlier Handling
- Feature Selection
- Apply classifier models
- Fine tune the models using Grid-Search
- Check the results using evaluation parameters
- Model debugging
- Predict on user input

In [None]:
import pandas as pd

In [None]:
# Labelled competition data: includes the 'premium' and 'target' columns.
train = pd.read_csv('train_jRxnrHD.csv')

In [None]:
# Unlabelled competition data: has no 'premium'/'target' columns, so it cannot be used for scoring.
test = pd.read_csv('test_QaJU1Mh.csv')

In [None]:
train.shape, test.shape

((79853, 13), (34224, 11))

In [None]:
train.columns

Index(['id', 'perc_premium_paid_by_cash_credit', 'age_in_days', 'Income',
       'Count_3-6_months_late', 'Count_6-12_months_late',
       'Count_more_than_12_months_late', 'application_underwriting_score',
       'no_of_premiums_paid', 'sourcing_channel', 'residence_area_type',
       'premium', 'target'],
      dtype='object')

In [None]:
test.columns

Index(['id', 'perc_premium_paid_by_cash_credit', 'age_in_days', 'Income',
       'Count_3-6_months_late', 'Count_6-12_months_late',
       'Count_more_than_12_months_late', 'application_underwriting_score',
       'no_of_premiums_paid', 'sourcing_channel', 'residence_area_type'],
      dtype='object')

In [None]:
train.isna().sum()

id                                     0
perc_premium_paid_by_cash_credit       0
age_in_days                            0
Income                                 0
Count_3-6_months_late                 97
Count_6-12_months_late                97
Count_more_than_12_months_late        97
application_underwriting_score      2974
no_of_premiums_paid                    0
sourcing_channel                       0
residence_area_type                    0
premium                                0
target                                 0
dtype: int64

In [None]:
test.isna().sum()

id                                     0
perc_premium_paid_by_cash_credit       0
age_in_days                            0
Income                                 0
Count_3-6_months_late                 31
Count_6-12_months_late                31
Count_more_than_12_months_late        31
application_underwriting_score      1323
no_of_premiums_paid                    0
sourcing_channel                       0
residence_area_type                    0
dtype: int64

In [None]:
train.shape, test.shape

((79853, 13), (34224, 11))

In [None]:
# test data is missing two target columns so we used only train data

# X & Y Split
- Two target columns
- premium -> regression target
- target -> classification target

In [None]:
# 'premium' is the regression target and 'target' the classification label.
# Neither may be used as a feature; only 'target' is modelled in this notebook.
target_cols = ['premium', 'target']
Y = train['target']
X = train.drop(columns=target_cols)

In [None]:
X.shape , Y.shape

((79853, 11), (79853,))

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the labelled data for validation. stratify=Y keeps the class
# ratio of 'target' the same in both parts; random_state makes the split reproducible.
X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size=0.3,random_state=7,stratify=Y)

In [None]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((55897, 11), (23956, 11), (55897,), (23956,))

In [None]:
X_train.describe()

Unnamed: 0,id,perc_premium_paid_by_cash_credit,age_in_days,Income,Count_3-6_months_late,Count_6-12_months_late,Count_more_than_12_months_late,application_underwriting_score,no_of_premiums_paid
count,55897.0,55897.0,55897.0,55897.0,55828.0,55828.0,55828.0,53805.0,55897.0
mean,57221.042865,0.314292,18828.199742,208246.4,0.248065,0.07713,0.060973,99.066065,10.860028
std,32922.409773,0.334794,5212.774049,477540.0,0.690367,0.423694,0.316107,0.737298,5.170526
min,2.0,0.0,7670.0,24030.0,0.0,0.0,0.0,91.9,2.0
25%,28703.0,0.034,14974.0,107640.0,0.0,0.0,0.0,98.81,7.0
50%,57396.0,0.167,18624.0,165660.0,0.0,0.0,0.0,99.21,10.0
75%,85635.0,0.54,22636.0,252060.0,0.0,0.0,0.0,99.54,14.0
max,114075.0,1.0,37602.0,90262600.0,13.0,17.0,11.0,99.89,60.0


In [None]:
# Report how many distinct (non-null) values each training column holds.
print('count of unique values per column')
for col_name, n_unique in X_train.nunique().items():
  print(col_name,'->',n_unique)

count of unique values per column
id -> 55897
perc_premium_paid_by_cash_credit -> 1001
age_in_days -> 823
Income -> 19701
Count_3-6_months_late -> 14
Count_6-12_months_late -> 14
Count_more_than_12_months_late -> 10
application_underwriting_score -> 619
no_of_premiums_paid -> 55
sourcing_channel -> 5
residence_area_type -> 2


In [None]:
X_train.dtypes

id                                    int64
perc_premium_paid_by_cash_credit    float64
age_in_days                           int64
Income                                int64
Count_3-6_months_late               float64
Count_6-12_months_late              float64
Count_more_than_12_months_late      float64
application_underwriting_score      float64
no_of_premiums_paid                   int64
sourcing_channel                     object
residence_area_type                  object
dtype: object

In [None]:
# Split column names by dtype: object columns are categorical, the rest numeric.
# At this point num_cols still contains 'id' (int64); it is recomputed once 'id' is dropped.
cat_cols = X_train.select_dtypes(include='object').columns
num_cols = X_train.select_dtypes(exclude='object').columns

In [None]:
print(cat_cols)

Index(['sourcing_channel', 'residence_area_type'], dtype='object')


# Try to check if we can find ordinal or nominal columns
- after understanding unique values -> both are nominal

In [None]:
X_train['sourcing_channel'].unique()

array(['D', 'A', 'C', 'B', 'E'], dtype=object)

In [None]:
X_train['residence_area_type'].unique()

array(['Rural', 'Urban'], dtype=object)

In [None]:
X_train.isna().sum()

id                                     0
perc_premium_paid_by_cash_credit       0
age_in_days                            0
Income                                 0
Count_3-6_months_late                 69
Count_6-12_months_late                69
Count_more_than_12_months_late        69
application_underwriting_score      2092
no_of_premiums_paid                    0
sourcing_channel                       0
residence_area_type                    0
dtype: int64

# Remove columns / rows which have missing values more than threshold
- Assume a threshold of 90%
so
- First find % of missing values in every column
- If % of missing values > 90% then drop the column

In [None]:
row_count = X_train.shape[0]

In [None]:
# Missing-value share of each column, in percent.
# Every column is far below the threshold, so nothing needs to be dropped.
X_train.isna().sum().div(row_count).mul(100)

id                                  0.000000
perc_premium_paid_by_cash_credit    0.000000
age_in_days                         0.000000
Income                              0.000000
Count_3-6_months_late               0.123441
Count_6-12_months_late              0.123441
Count_more_than_12_months_late      0.123441
application_underwriting_score      3.742598
no_of_premiums_paid                 0.000000
sourcing_channel                    0.000000
residence_area_type                 0.000000
dtype: float64

# Remove columns which have single value (all values same)
also drop the id column

In [None]:
# 'id' has one distinct value per row, so it is an identifier and not a feature.
# Reassign instead of dropping in place: X_train is a slice produced by
# train_test_split, and errors='ignore' keeps this cell safe to re-run.
X_train = X_train.drop(columns='id', errors='ignore')

In [None]:
X_train.nunique()
# no column will drop
# every column has more than 1 unique values

perc_premium_paid_by_cash_credit     1001
age_in_days                           823
Income                              19701
Count_3-6_months_late                  14
Count_6-12_months_late                 14
Count_more_than_12_months_late         10
application_underwriting_score        619
no_of_premiums_paid                    55
sourcing_channel                        5
residence_area_type                     2
dtype: int64

# Store list of valid columns

In [None]:
# Recompute the numeric column list now that 'id' is gone, so the identifier
# is not part of the columns later used for imputation and scaling.
num_cols=X_train.select_dtypes(exclude='object').columns

In [None]:
# Feature columns kept after cleaning; reused later to select the same columns from X_test.
valid_columns = X_train.columns

# Remove repeated rows

In [None]:
# Remove repeated feature rows (keeping the first occurrence) together with
# their labels. Dropping from X_train alone would leave y_train with extra
# rows and misalign features and labels at fit time.
duplicate_rows = X_train.duplicated()
X_train = X_train.loc[~duplicate_rows]
y_train = y_train.loc[~duplicate_rows]

In [None]:
X_train.shape
# no duplicate rows bcz size is same after dropping duplicates

(55897, 10)

# Handle (encode) the string columns and convert to numbers

In [None]:
# One-hot encode the nominal columns; every level gets its own indicator column
# (no level is dropped), which is why 10 columns become 15.
X_train_ohe = pd.get_dummies(X_train,columns=cat_cols)

In [None]:
X_train.shape , X_train_ohe.shape

((55897, 10), (55897, 15))

In [None]:
X_train_ohe.columns

Index(['perc_premium_paid_by_cash_credit', 'age_in_days', 'Income',
       'Count_3-6_months_late', 'Count_6-12_months_late',
       'Count_more_than_12_months_late', 'application_underwriting_score',
       'no_of_premiums_paid', 'sourcing_channel_A', 'sourcing_channel_B',
       'sourcing_channel_C', 'sourcing_channel_D', 'sourcing_channel_E',
       'residence_area_type_Rural', 'residence_area_type_Urban'],
      dtype='object')

# Fill missing values in all columns ( median )

In [None]:
# Per-column medians of the numeric features, computed on the training split only
# (raw, unscaled units). The same values are reused when imputing X_test.
fill_value = X_train_ohe.loc[:,num_cols].median()

In [None]:
fill_value

perc_premium_paid_by_cash_credit         0.167
age_in_days                          18624.000
Income                              165660.000
Count_3-6_months_late                    0.000
Count_6-12_months_late                   0.000
Count_more_than_12_months_late           0.000
application_underwriting_score          99.210
no_of_premiums_paid                     10.000
dtype: float64

In [None]:
# Impute NaNs with the training medians. fillna with a Series matches on column
# name, so only the numeric columns present in fill_value are affected.
X_train_clean = X_train_ohe.fillna(fill_value)

In [None]:
X_train_clean.isna().sum()

perc_premium_paid_by_cash_credit    0
age_in_days                         0
Income                              0
Count_3-6_months_late               0
Count_6-12_months_late              0
Count_more_than_12_months_late      0
application_underwriting_score      0
no_of_premiums_paid                 0
sourcing_channel_A                  0
sourcing_channel_B                  0
sourcing_channel_C                  0
sourcing_channel_D                  0
sourcing_channel_E                  0
residence_area_type_Rural           0
residence_area_type_Urban           0
dtype: int64

# Normalize / Standardize the data
- Use StandardScaler

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
sc = StandardScaler()

In [None]:
# Learn mean/std from the numeric training columns only; the one-hot columns are not scaled.
sc.fit(X_train_clean.loc[:,num_cols])

In [None]:
# Replace the numeric columns with their standardized values. Whole-column
# assignment (rather than `.loc[:, cols] = ...`) swaps the columns out, so the
# int64 columns (age_in_days, Income, no_of_premiums_paid) become float64
# without relying on an in-place upcast, which newer pandas deprecates.
X_train_clean[num_cols] = sc.transform(X_train_clean[num_cols])

In [None]:
X_train_clean.describe()

Unnamed: 0,perc_premium_paid_by_cash_credit,age_in_days,Income,Count_3-6_months_late,Count_6-12_months_late,Count_more_than_12_months_late,application_underwriting_score,no_of_premiums_paid,sourcing_channel_A,sourcing_channel_B,sourcing_channel_C,sourcing_channel_D,sourcing_channel_E,residence_area_type_Rural,residence_area_type_Urban
count,55897.0,55897.0,55897.0,55897.0,55897.0,55897.0,55897.0,55897.0,55897.0,55897.0,55897.0,55897.0,55897.0,55897.0,55897.0
mean,1.738953e-16,-1.016931e-16,-3.101641e-17,-2.949101e-17,-1.652514e-17,5.084657e-19,-1.37512e-14,2.4152120000000002e-17,0.541782,0.205521,0.150903,0.093905,0.00789,0.395012,0.604988
std,1.000009,1.000009,1.000009,1.000009,1.000009,1.000009,1.000009,1.000009,0.498256,0.404085,0.357957,0.291699,0.088473,0.488858,0.488858
min,-0.9387686,-2.140568,-0.3857647,-0.3590767,-0.1819266,-0.1927652,-9.906984,-1.713579,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.8372128,-0.7393826,-0.2106783,-0.3590767,-0.1819266,-0.1927652,-0.3335534,-0.7465512,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,-0.4399504,-0.0391733,-0.08917958,-0.3590767,-0.1819266,-0.1927652,0.1913966,-0.1663343,1.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,0.6741763,0.7304814,0.09174928,-0.3590767,-0.1819266,-0.1927652,0.6196453,0.6072882,1.0,0.0,0.0,0.0,0.0,1.0,1.0
max,2.048166,3.601531,188.5814,18.48178,39.96568,34.62655,1.130781,9.503947,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Format the target / Transform the target ( if required )
- y_train only contains 0 or 1 values so no need of formatting the target columns

In [None]:
y_train.unique()

array([1, 0])

# Outlier handling
- we don't check outlier on test data

In [None]:
def outlier_treament_zscore(df , cont_columns, threshold=3):
    """
    Cap outliers in columns that are ALREADY standardized (z-scored).

    The function does not compute z-scores itself. It expects every column in
    ``cont_columns`` to already hold Z = (x - mu) / sigma (for example the
    output of a standard scaler) and then caps the values:
    anything below ``-threshold`` becomes ``-threshold`` and anything above
    ``threshold`` becomes ``threshold``. NaN values are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the standardized columns.
    cont_columns : iterable of column labels
        Continuous columns to cap; all other columns are returned unchanged.
    threshold : int or float, default 3
        Non-negative cap applied symmetrically around zero.

    Returns
    -------
    pandas.DataFrame
        A capped copy of ``df``. The input frame is not modified, so the
        caller's un-capped data stays available and the call is re-runnable.

    Raises
    ------
    ValueError
        If ``threshold`` is negative.
    """
    if threshold < 0:
        raise ValueError("threshold must be non-negative")

    capped = df.copy()
    cols = list(cont_columns)
    if not cols:
        # Nothing to cap; still return a copy for a consistent contract.
        return capped

    # Vectorized equivalent of replacing values < -t with -t and > t with t.
    capped[cols] = capped[cols].clip(lower=-threshold, upper=threshold)
    return capped

In [None]:
# Cap the standardized numeric columns at +/-3; the one-hot columns are not passed and stay 0/1.
X_train_o = outlier_treament_zscore(X_train_clean,num_cols)

In [None]:
X_train_o.describe()

Unnamed: 0,perc_premium_paid_by_cash_credit,age_in_days,Income,Count_3-6_months_late,Count_6-12_months_late,Count_more_than_12_months_late,application_underwriting_score,no_of_premiums_paid,sourcing_channel_A,sourcing_channel_B,sourcing_channel_C,sourcing_channel_D,sourcing_channel_E,residence_area_type_Rural,residence_area_type_Urban
count,55897.0,55897.0,55897.0,55897.0,55897.0,55897.0,55897.0,55897.0,55897.0,55897.0,55897.0,55897.0,55897.0,55897.0,55897.0
mean,1.738953e-16,-8.5e-05,-0.012877,-0.040107,-0.053002,-0.043462,0.028323,-0.010069,0.541782,0.205521,0.150903,0.093905,0.00789,0.395012,0.604988
std,1.000009,0.999739,0.33088,0.788794,0.573051,0.671664,0.85966,0.958847,0.498256,0.404085,0.357957,0.291699,0.088473,0.488858,0.488858
min,-0.9387686,-2.140568,-0.385765,-0.359077,-0.181927,-0.192765,-3.0,-1.713579,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.8372128,-0.739383,-0.210678,-0.359077,-0.181927,-0.192765,-0.333553,-0.746551,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,-0.4399504,-0.039173,-0.08918,-0.359077,-0.181927,-0.192765,0.191397,-0.166334,1.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,0.6741763,0.730481,0.091749,-0.359077,-0.181927,-0.192765,0.619645,0.607288,1.0,0.0,0.0,0.0,0.0,1.0,1.0
max,2.048166,3.0,3.0,3.0,3.0,3.0,1.130781,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Apply all above steps on test data


# clean the test data as per the training data cleaning steps
- use valid columns
- one hot encode the columns
- standard scaler object
- fill value

In [None]:
valid_columns

Index(['perc_premium_paid_by_cash_credit', 'age_in_days', 'Income',
       'Count_3-6_months_late', 'Count_6-12_months_late',
       'Count_more_than_12_months_late', 'application_underwriting_score',
       'no_of_premiums_paid', 'sourcing_channel', 'residence_area_type'],
      dtype='object')

In [None]:
# Keep exactly the feature columns retained for training (this removes 'id').
X_test = X_test.loc[:,valid_columns]

In [None]:
X_test.shape

(23956, 10)

In [None]:
X_test.dtypes

perc_premium_paid_by_cash_credit    float64
age_in_days                           int64
Income                                int64
Count_3-6_months_late               float64
Count_6-12_months_late              float64
Count_more_than_12_months_late      float64
application_underwriting_score      float64
no_of_premiums_paid                   int64
sourcing_channel                     object
residence_area_type                  object
dtype: object

In [None]:
# Encode the validation split and align it to the training dummy columns:
# a level absent from this split becomes an all-zero column, a level unseen in
# training is dropped, and the column order matches the training frame that
# the scaler, PCA and feature selector are fitted on.
X_test_ohe = pd.get_dummies(X_test,columns = cat_cols).reindex(columns=X_train_ohe.columns, fill_value=0)

In [None]:
X_test_ohe.shape , X_train_ohe.shape
# shape of ohe should be same

((23956, 15), (55897, 15))

In [None]:
# Standardize with the scaler fitted on the training split (no refit here).
# Whole-column assignment (rather than `.loc[:, cols] = ...`) lets the int64
# columns become float64 without an in-place upcast, which newer pandas deprecates.
# NaNs are still present at this point and pass through transform unchanged.
X_test_ohe[num_cols] = sc.transform(X_test_ohe[num_cols])

In [None]:
X_test_ohe.describe()

Unnamed: 0,perc_premium_paid_by_cash_credit,age_in_days,Income,Count_3-6_months_late,Count_6-12_months_late,Count_more_than_12_months_late,application_underwriting_score,no_of_premiums_paid,sourcing_channel_A,sourcing_channel_B,sourcing_channel_C,sourcing_channel_D,sourcing_channel_E,residence_area_type_Rural,residence_area_type_Urban
count,23956.0,23956.0,23956.0,23928.0,23928.0,23928.0,23074.0,23956.0,23956.0,23956.0,23956.0,23956.0,23956.0,23956.0,23956.0
mean,-3.8e-05,0.011828,0.004193,0.003369,0.008559,-0.009943,-0.001799,0.002488,0.5364,0.209718,0.150442,0.096427,0.007013,0.400317,0.599683
std,1.001226,0.997385,1.127476,1.005873,1.098231,0.956812,1.030016,1.000131,0.498684,0.407116,0.357512,0.295182,0.08345,0.489973,0.489973
min,-0.938769,-2.140568,-0.385765,-0.359077,-0.181927,-0.192765,-9.824097,-1.713579,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.837213,-0.738999,-0.209757,-0.359077,-0.181927,-0.192765,-0.347368,-0.746551,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,-0.43995,-0.038406,-0.084416,-0.359077,-0.181927,-0.192765,0.205211,-0.166334,1.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,0.653268,0.730673,0.092786,-0.359077,-0.181927,-0.192765,0.647274,0.607288,1.0,0.0,0.0,0.0,0.0,1.0,1.0
max,2.048166,3.532086,112.271499,12.684593,35.242436,18.79959,1.130781,9.117136,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
X_test_clean = X_test_ohe.copy()

In [None]:
# X_test_ohe is already standardized, so its NaNs must be filled with the
# training medians expressed in the same scaled units. Filling with the raw
# medians (e.g. 99.21 for application_underwriting_score) would plant extreme
# outliers in the scaled data. Scaling is monotonic, so the scaled median
# equals the median of the scaled training column.
fill_value_scaled = pd.Series(sc.transform(fill_value.to_frame().T)[0], index=fill_value.index)
X_test_clean[num_cols] = X_test_ohe[num_cols].fillna(fill_value_scaled)

In [None]:
X_test_clean.isna().sum()

perc_premium_paid_by_cash_credit    0
age_in_days                         0
Income                              0
Count_3-6_months_late               0
Count_6-12_months_late              0
Count_more_than_12_months_late      0
application_underwriting_score      0
no_of_premiums_paid                 0
sourcing_channel_A                  0
sourcing_channel_B                  0
sourcing_channel_C                  0
sourcing_channel_D                  0
sourcing_channel_E                  0
residence_area_type_Rural           0
residence_area_type_Urban           0
dtype: int64

In [None]:
X_test_clean.describe()

Unnamed: 0,perc_premium_paid_by_cash_credit,age_in_days,Income,Count_3-6_months_late,Count_6-12_months_late,Count_more_than_12_months_late,application_underwriting_score,no_of_premiums_paid,sourcing_channel_A,sourcing_channel_B,sourcing_channel_C,sourcing_channel_D,sourcing_channel_E,residence_area_type_Rural,residence_area_type_Urban
count,23956.0,23956.0,23956.0,23956.0,23956.0,23956.0,23956.0,23956.0,23956.0,23956.0,23956.0,23956.0,23956.0,23956.0,23956.0
mean,-3.8e-05,0.011828,0.004193,0.003365,0.008549,-0.009931,3.650931,0.002488,0.5364,0.209718,0.150442,0.096427,0.007013,0.400317,0.599683
std,1.001226,0.997385,1.127476,1.005285,1.097589,0.956252,18.710637,1.000131,0.498684,0.407116,0.357512,0.295182,0.08345,0.489973,0.489973
min,-0.938769,-2.140568,-0.385765,-0.359077,-0.181927,-0.192765,-9.824097,-1.713579,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.837213,-0.738999,-0.209757,-0.359077,-0.181927,-0.192765,-0.319739,-0.746551,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,-0.43995,-0.038406,-0.084416,-0.359077,-0.181927,-0.192765,0.23284,-0.166334,1.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,0.653268,0.730673,0.092786,-0.359077,-0.181927,-0.192765,0.702532,0.607288,1.0,0.0,0.0,0.0,0.0,1.0,1.0
max,2.048166,3.532086,112.271499,12.684593,35.242436,18.79959,99.21,9.117136,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Training and testing data cleaning completed

# Feature selection of Training data


# which method for feature selection is best ?
- PCA
- RFE -> weak method (not preferred in industry)
- SelectFromModel (uses) -> Decision Tree

cannot say until we know the data,
there can be multiple good candidates, so apply at least 2 and check which one gives the best performance

In [None]:
# we will be using PCA & SelectFromModel with DecisionTreeClassifier (because this is a classification task)

In [None]:
from sklearn.decomposition import PCA

In [None]:
# A fractional n_components asks PCA to keep as many components as are needed
# to explain 99% of the variance, rather than a fixed count.
pca = PCA(n_components=0.99,random_state=7)

In [None]:
pca.fit(X_train_o)

In [None]:
X_train_pca = pca.transform(X_train_o)

In [None]:
X_train_pca.shape

(55897, 12)

# SelectFromModel using DecisionTreeClassifier

In [None]:
from sklearn.feature_selection import SelectFromModel

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Select features by decision-tree importance: threshold='mean' keeps features whose
# importance is at least the mean importance. class_weight='balanced' is used
# because the two classes of 'target' are very unevenly represented.
sfm = SelectFromModel(DecisionTreeClassifier(random_state=7,
                                             class_weight='balanced'),threshold='mean')

In [None]:
sfm.fit(X_train_o,y_train)

In [None]:
sfm.get_feature_names_out()

array(['perc_premium_paid_by_cash_credit', 'age_in_days', 'Income',
       'application_underwriting_score', 'no_of_premiums_paid'],
      dtype=object)

In [None]:
X_train_sfm = sfm.transform(X_train_o)

In [None]:
X_train_sfm.shape

(55897, 5)

# Train the classifier model

check the performance of all
SVM,RF , XGboost, CatBoost

In [None]:
X_test_pca = pca.transform(X_test_clean)

In [None]:
X_train_pca.shape, X_test_pca.shape

((55897, 12), (23956, 12))

In [None]:
X_test_sfm = sfm.transform(X_test_clean)

In [None]:
X_train_sfm.shape, X_test_sfm.shape

((55897, 5), (23956, 5))

In [None]:
from sklearn.svm import SVC

In [None]:
svc_pca = SVC(random_state=7)

In [None]:
svc_pca.fit(X_train_pca,y_train)

In [None]:
# Same default SVC as svc_pca, trained on the SelectFromModel feature subset instead.
# NOTE(review): svc_sfm is not evaluated in the cells visible here - confirm whether
# a classification report for it was intended.
svc_sfm = SVC(random_state=7)
svc_sfm.fit(X_train_sfm,y_train)

# Test data feature selection

In [None]:
X_test_pca = pca.transform(X_test_clean)

In [None]:
X_train_pca.shape, X_test_pca.shape

((55897, 12), (23956, 12))

In [None]:
X_test_sfm = sfm.transform(X_test_clean)

In [None]:
X_train_sfm.shape, X_test_sfm.shape

((55897, 5), (23956, 5))

# Evaluation of Model

In [None]:
y_pred = svc_pca.predict(X_test_pca)

In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.48      0.05      0.08      1500
           1       0.94      1.00      0.97     22456

    accuracy                           0.94     23956
   macro avg       0.71      0.52      0.53     23956
weighted avg       0.91      0.94      0.91     23956



# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# class_weight='balanced' reweights the classes to counter the imbalance in 'target'.
# NOTE(review): oob_score=True is requested but the OOB score is not read in the
# visible cells - confirm whether it is needed.
rf = RandomForestClassifier(n_estimators=200,random_state=7,oob_score=True,class_weight='balanced')

In [None]:
rf.fit(X_train_pca,y_train)

In [None]:
y_pred = rf.predict(X_test_pca)

In [None]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.39      0.02      0.04      1500
           1       0.94      1.00      0.97     22456

    accuracy                           0.94     23956
   macro avg       0.66      0.51      0.50     23956
weighted avg       0.90      0.94      0.91     23956



# Parameter tuning to fine tune the model
- fine tune RF's parameters

In [None]:
# There can be multiple parameters which we can change and check the models performance

# The manual way can be tedious and may involve too much programming

# Best solution -> GridSearchCV

# Using GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Search grid for the random forest.
# ccp_alpha is compared against impurity decreases, and binary Gini impurity is
# at most 0.5, so values such as 0.5 or 1 prune every tree back to its root and
# make max_depth / n_estimators irrelevant (all candidates tie). Search small
# values instead; 0.0 means no pruning.
pg = { 'n_estimators' : [20,50,100],
      'max_depth' : [3,5,10],
       'ccp_alpha' : [0.0,0.001,0.01]}


In [None]:
# Grid search over the random-forest parameters in pg.
# Accuracy (the default scorer) is about 94% for a model that always predicts
# class 1 on this data, so candidates are ranked by balanced accuracy instead.
# oob_score is dropped from the estimator: it is never read during the search
# and presumably is what triggers the repeated warnings on the 20-tree fits.
gsv = GridSearchCV(RandomForestClassifier(random_state=7,
                                    class_weight = 'balanced'),
             param_grid=pg,cv=2,verbose=2,scoring='balanced_accuracy')

In [None]:
gsv.fit(X_train_pca,y_train)

Fitting 2 folds for each of 27 candidates, totalling 54 fits


  warn(


[CV] END ........ccp_alpha=0.2, max_depth=3, n_estimators=20; total time=   0.7s


  warn(


[CV] END ........ccp_alpha=0.2, max_depth=3, n_estimators=20; total time=   0.7s
[CV] END ........ccp_alpha=0.2, max_depth=3, n_estimators=50; total time=   1.8s
[CV] END ........ccp_alpha=0.2, max_depth=3, n_estimators=50; total time=   1.8s
[CV] END .......ccp_alpha=0.2, max_depth=3, n_estimators=100; total time=   4.3s
[CV] END .......ccp_alpha=0.2, max_depth=3, n_estimators=100; total time=   3.5s


  warn(


[CV] END ........ccp_alpha=0.2, max_depth=5, n_estimators=20; total time=   1.1s


  warn(


[CV] END ........ccp_alpha=0.2, max_depth=5, n_estimators=20; total time=   1.1s
[CV] END ........ccp_alpha=0.2, max_depth=5, n_estimators=50; total time=   2.6s
[CV] END ........ccp_alpha=0.2, max_depth=5, n_estimators=50; total time=   3.2s
[CV] END .......ccp_alpha=0.2, max_depth=5, n_estimators=100; total time=   5.3s
[CV] END .......ccp_alpha=0.2, max_depth=5, n_estimators=100; total time=   5.1s


  warn(


[CV] END .......ccp_alpha=0.2, max_depth=10, n_estimators=20; total time=   2.4s


  warn(


[CV] END .......ccp_alpha=0.2, max_depth=10, n_estimators=20; total time=   1.9s
[CV] END .......ccp_alpha=0.2, max_depth=10, n_estimators=50; total time=   5.3s
[CV] END .......ccp_alpha=0.2, max_depth=10, n_estimators=50; total time=   4.9s
[CV] END ......ccp_alpha=0.2, max_depth=10, n_estimators=100; total time=   9.0s
[CV] END ......ccp_alpha=0.2, max_depth=10, n_estimators=100; total time=   9.5s


  warn(


[CV] END ........ccp_alpha=0.5, max_depth=3, n_estimators=20; total time=   0.7s


  warn(


[CV] END ........ccp_alpha=0.5, max_depth=3, n_estimators=20; total time=   0.7s
[CV] END ........ccp_alpha=0.5, max_depth=3, n_estimators=50; total time=   1.8s
[CV] END ........ccp_alpha=0.5, max_depth=3, n_estimators=50; total time=   1.7s
[CV] END .......ccp_alpha=0.5, max_depth=3, n_estimators=100; total time=   4.2s
[CV] END .......ccp_alpha=0.5, max_depth=3, n_estimators=100; total time=   3.5s


  warn(


[CV] END ........ccp_alpha=0.5, max_depth=5, n_estimators=20; total time=   1.1s


  warn(


[CV] END ........ccp_alpha=0.5, max_depth=5, n_estimators=20; total time=   1.0s
[CV] END ........ccp_alpha=0.5, max_depth=5, n_estimators=50; total time=   2.6s
[CV] END ........ccp_alpha=0.5, max_depth=5, n_estimators=50; total time=   3.3s
[CV] END .......ccp_alpha=0.5, max_depth=5, n_estimators=100; total time=   5.1s
[CV] END .......ccp_alpha=0.5, max_depth=5, n_estimators=100; total time=   5.3s


  warn(


[CV] END .......ccp_alpha=0.5, max_depth=10, n_estimators=20; total time=   4.6s


  warn(


[CV] END .......ccp_alpha=0.5, max_depth=10, n_estimators=20; total time=   2.3s
[CV] END .......ccp_alpha=0.5, max_depth=10, n_estimators=50; total time=   4.3s
[CV] END .......ccp_alpha=0.5, max_depth=10, n_estimators=50; total time=   5.1s
[CV] END ......ccp_alpha=0.5, max_depth=10, n_estimators=100; total time=   8.8s
[CV] END ......ccp_alpha=0.5, max_depth=10, n_estimators=100; total time=   9.6s


  warn(


[CV] END ..........ccp_alpha=1, max_depth=3, n_estimators=20; total time=   0.7s


  warn(


[CV] END ..........ccp_alpha=1, max_depth=3, n_estimators=20; total time=   0.7s
[CV] END ..........ccp_alpha=1, max_depth=3, n_estimators=50; total time=   1.8s
[CV] END ..........ccp_alpha=1, max_depth=3, n_estimators=50; total time=   2.5s
[CV] END .........ccp_alpha=1, max_depth=3, n_estimators=100; total time=   4.4s
[CV] END .........ccp_alpha=1, max_depth=3, n_estimators=100; total time=   4.4s


  warn(


[CV] END ..........ccp_alpha=1, max_depth=5, n_estimators=20; total time=   1.0s


  warn(


[CV] END ..........ccp_alpha=1, max_depth=5, n_estimators=20; total time=   1.3s
[CV] END ..........ccp_alpha=1, max_depth=5, n_estimators=50; total time=   3.1s
[CV] END ..........ccp_alpha=1, max_depth=5, n_estimators=50; total time=   2.6s
[CV] END .........ccp_alpha=1, max_depth=5, n_estimators=100; total time=   5.1s
[CV] END .........ccp_alpha=1, max_depth=5, n_estimators=100; total time=   5.9s


  warn(


[CV] END .........ccp_alpha=1, max_depth=10, n_estimators=20; total time=   1.8s


  warn(


[CV] END .........ccp_alpha=1, max_depth=10, n_estimators=20; total time=   1.8s
[CV] END .........ccp_alpha=1, max_depth=10, n_estimators=50; total time=   4.3s
[CV] END .........ccp_alpha=1, max_depth=10, n_estimators=50; total time=   5.1s
[CV] END ........ccp_alpha=1, max_depth=10, n_estimators=100; total time=   9.0s
[CV] END ........ccp_alpha=1, max_depth=10, n_estimators=100; total time=   9.1s


  warn(


In [None]:
gsv.best_params_

{'ccp_alpha': 0.2, 'max_depth': 3, 'n_estimators': 20}