AJ Notebook: Tanzania water problem
Your goal is to predict the operating condition of a waterpoint for each record in the dataset. You are provided the following set of information about the waterpoints


In [16]:
#imports
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import plot_confusion_matrix, recall_score,\
    accuracy_score, precision_score, f1_score

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline

In [17]:
# Training-set labels (the y values): one status_group per waterpoint id
labels = pd.read_csv('Pump_it_Up_Data_Mining_the_Water_Table_-_Training_set_labels.csv')
# Training-set values (the features) for the same waterpoint ids
df = pd.read_csv('Pump_it_Up_Data_Mining_the_Water_Table_-_Training_set_values.csv')


In [18]:
df.shape #comparing shapes to see how it was divided.

(59400, 40)

In [19]:
labels.shape #looking at the y

(59400, 2)

The training values and their labels both have 59,400 rows. Together with the separately held-out test set (14,850 rows), the data was originally 74,250 rows and was divided into: 20% testing data, 80% training.

In [20]:
labels.head()

Unnamed: 0,id,status_group
0,69572,functional
1,8776,functional
2,34310,functional
3,67743,non functional
4,19728,functional


In [21]:
df.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [22]:
df.loc[df['id'] != labels['id']] #always equal

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group


In [23]:
# Join features to labels on the waterpoint id. Naming the key (rather than
# relying on "all shared columns") and validating a one-to-one match makes a
# duplicated id raise instead of silently multiplying rows.
full_df = df.merge(labels, on='id', validate='one_to_one')
full_df.shape

(59400, 41)

In [24]:
full_df.loc[full_df['id'] == 69572] #checking accurate merge with a few things

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional


In [25]:
full_df['status_group'].value_counts() #looking at the classes of the y

functional                 32259
non functional             22824
functional needs repair     4317
Name: status_group, dtype: int64

Changing the y classes to two classes instead of three: functional and non functional. "Functional needs repair" is now grouped with functional.

In [26]:
# Collapse the three-way target into two classes: rows labelled
# "functional needs repair" are counted as functional.
needs_repair = full_df['status_group'] == "functional needs repair"
full_df['status'] = np.where(needs_repair, 'functional', full_df['status_group'])

In [27]:
full_df['status'].value_counts() #checking the column grouped correctly
#no class imbalance currently

functional        36576
non functional    22824
Name: status, dtype: int64

In [28]:
# Encode the target as an integer flag: 1 = functional, 0 = non functional
is_functional = full_df['status'] == 'functional'
full_df['binary_status'] = np.where(is_functional, 1, 0)

In [29]:
full_df['binary_status'].value_counts()

1    36576
0    22824
Name: binary_status, dtype: int64

In [30]:
full_df.isna().sum() #noting lots of NAs in some of these columns

id                           0
amount_tsh                   0
date_recorded                0
funder                    3635
gps_height                   0
installer                 3655
longitude                    0
latitude                     0
wpt_name                     0
num_private                  0
basin                        0
subvillage                 371
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
public_meeting            3334
recorded_by                  0
scheme_management         3877
scheme_name              28166
permit                    3056
construction_year            0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_

In [31]:
full_df.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group,status,binary_status
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional,functional,1
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional,functional,1
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,functional,functional,1
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional,non functional,0
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional,functional,1


In [32]:
full_df['scheme_management'].value_counts() #going to drop this since there are lots of other groupings, e.g. "company" or "private operator"

VWC                 36793
WUG                  5206
Water authority      3153
WUA                  2883
Water Board          2748
Parastatal           1680
Private operator     1063
Company              1061
Other                 766
SWC                    97
Trust                  72
None                    1
Name: scheme_management, dtype: int64

In [33]:

# Columns removed before modelling: the text versions of the target plus
# the features that are not carried forward into clean_df.
cols_to_drop = [
    'status', 'status_group', 'waterpoint_type', 'source', 'payment_type',
    'date_recorded', 'water_quality', 'quantity', 'num_private',
    'management_group', 'extraction_type_group',
    'extraction_type', 'recorded_by', 'latitude',
    'longitude', 'lga', 'ward', 'scheme_name',
    'region', 'subvillage', 'basin', 'wpt_name', 'public_meeting',
    'gps_height', 'scheme_management',
]
clean_df = full_df.drop(columns=cols_to_drop)

In [34]:
clean_df.head(-10)

Unnamed: 0,id,amount_tsh,funder,installer,region_code,district_code,population,permit,construction_year,extraction_type_class,management,payment,quality_group,quantity_group,source_type,source_class,waterpoint_type_group,binary_status
0,69572,6000.0,Roman,Roman,11,5,109,False,1999,gravity,vwc,pay annually,good,enough,spring,groundwater,communal standpipe,1
1,8776,0.0,Grumeti,GRUMETI,20,2,280,True,2010,gravity,wug,never pay,good,insufficient,rainwater harvesting,surface,communal standpipe,1
2,34310,25.0,Lottery Club,World vision,21,4,250,True,2009,gravity,vwc,pay per bucket,good,enough,dam,surface,communal standpipe,1
3,67743,0.0,Unicef,UNICEF,90,63,58,True,1986,submersible,vwc,never pay,good,dry,borehole,groundwater,communal standpipe,0
4,19728,0.0,Action In A,Artisan,18,1,0,True,0,gravity,other,never pay,good,seasonal,rainwater harvesting,surface,communal standpipe,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59385,34473,500.0,Jaica,JAICA CO,20,4,200,True,2011,handpump,wug,pay monthly,salty,enough,borehole,groundwater,hand pump,1
59386,34952,0.0,Adb,DWE,15,2,1000,False,2009,handpump,vwc,never pay,good,enough,borehole,groundwater,hand pump,1
59387,26640,100.0,0,0,7,2,100,False,2000,submersible,wua,pay per bucket,good,enough,borehole,groundwater,communal standpipe,1
59388,72559,0.0,Kidep,DWE,16,1,500,True,1995,gravity,vwc,unknown,good,insufficient,spring,groundwater,improved spring,1


In [35]:
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59400 entries, 0 to 59399
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     59400 non-null  int64  
 1   amount_tsh             59400 non-null  float64
 2   funder                 55765 non-null  object 
 3   installer              55745 non-null  object 
 4   region_code            59400 non-null  int64  
 5   district_code          59400 non-null  int64  
 6   population             59400 non-null  int64  
 7   permit                 56344 non-null  object 
 8   construction_year      59400 non-null  int64  
 9   extraction_type_class  59400 non-null  object 
 10  management             59400 non-null  object 
 11  payment                59400 non-null  object 
 12  quality_group          59400 non-null  object 
 13  quantity_group         59400 non-null  object 
 14  source_type            59400 non-null  object 
 15  so

In [36]:
# Codes and years are stored as strings so they are treated as labels.
for code_col in ('region_code', 'district_code', 'construction_year'):
    clean_df[code_col] = clean_df[code_col].astype(str)
clean_df['amount_tsh'] = clean_df['amount_tsh'].astype(int)
# Recode permit True/False as 1/0; any other value (e.g. missing) is left as-is.
for flag, code in ((True, 1), (False, 0)):
    clean_df['permit'] = np.where(clean_df['permit'] == flag, code, clean_df['permit'])
clean_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 59400 entries, 0 to 59399
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id                     59400 non-null  int64 
 1   amount_tsh             59400 non-null  int64 
 2   funder                 55765 non-null  object
 3   installer              55745 non-null  object
 4   region_code            59400 non-null  object
 5   district_code          59400 non-null  object
 6   population             59400 non-null  int64 
 7   permit                 56344 non-null  object
 8   construction_year      59400 non-null  object
 9   extraction_type_class  59400 non-null  object
 10  management             59400 non-null  object
 11  payment                59400 non-null  object
 12  quality_group          59400 non-null  object
 13  quantity_group         59400 non-null  object
 14  source_type            59400 non-null  object
 15  source_class       

In [37]:
clean_df['permit'].value_counts()

1    38852
0    17492
Name: permit, dtype: int64

In [38]:
clean_df.info() #going to drop rows without waterpoint permit information

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59400 entries, 0 to 59399
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id                     59400 non-null  int64 
 1   amount_tsh             59400 non-null  int64 
 2   funder                 55765 non-null  object
 3   installer              55745 non-null  object
 4   region_code            59400 non-null  object
 5   district_code          59400 non-null  object
 6   population             59400 non-null  int64 
 7   permit                 56344 non-null  object
 8   construction_year      59400 non-null  object
 9   extraction_type_class  59400 non-null  object
 10  management             59400 non-null  object
 11  payment                59400 non-null  object
 12  quality_group          59400 non-null  object
 13  quantity_group         59400 non-null  object
 14  source_type            59400 non-null  object
 15  source_class       

In [39]:
clean_df.loc[((clean_df['permit'] != 0) &
             (clean_df['permit'] != 1))]


Unnamed: 0,id,amount_tsh,funder,installer,region_code,district_code,population,permit,construction_year,extraction_type_class,management,payment,quality_group,quantity_group,source_type,source_class,waterpoint_type_group,binary_status
43,19282,0,,,13,2,1,,1980,motorpump,vwc,unknown,unknown,dry,borehole,groundwater,communal standpipe,0
47,13620,0,,,12,4,0,,0,gravity,vwc,never pay,good,enough,spring,groundwater,communal standpipe,1
65,51072,0,,,13,2,1,,1970,motorpump,vwc,unknown,unknown,dry,borehole,groundwater,communal standpipe,0
109,7116,0,,,12,4,0,,0,other,vwc,never pay,good,enough,river/lake,surface,communal standpipe,1
118,10837,0,Hesawa,Hesawa,20,2,600,,1997,gravity,other,never pay,good,insufficient,rainwater harvesting,surface,communal standpipe,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59339,1209,0,,,12,4,0,,0,gravity,vwc,never pay,good,insufficient,spring,groundwater,communal standpipe,1
59344,6450,0,,,2,7,230,,0,gravity,unknown,unknown,unknown,unknown,spring,groundwater,communal standpipe,1
59357,46563,0,,,13,2,1,,1980,handpump,vwc,unknown,unknown,dry,shallow well,groundwater,hand pump,0
59366,55232,0,,,13,2,200,,2000,handpump,vwc,never pay,good,insufficient,shallow well,groundwater,hand pump,1


In [40]:
# Rows whose permit is neither 0 nor 1 carry no permit information: drop them,
# after which the column can be stored as a plain integer.
permit_not_zero = clean_df['permit'] != 0
permit_not_one = clean_df['permit'] != 1
clean_df.drop(clean_df.index[permit_not_zero & permit_not_one], inplace=True)
clean_df['permit'] = clean_df['permit'].astype(int)

In [41]:
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 56344 entries, 0 to 59399
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id                     56344 non-null  int64 
 1   amount_tsh             56344 non-null  int64 
 2   funder                 55127 non-null  object
 3   installer              55102 non-null  object
 4   region_code            56344 non-null  object
 5   district_code          56344 non-null  object
 6   population             56344 non-null  int64 
 7   permit                 56344 non-null  int64 
 8   construction_year      56344 non-null  object
 9   extraction_type_class  56344 non-null  object
 10  management             56344 non-null  object
 11  payment                56344 non-null  object
 12  quality_group          56344 non-null  object
 13  quantity_group         56344 non-null  object
 14  source_type            56344 non-null  object
 15  source_class       

In [42]:
clean_df['funder'].value_counts()

Government Of Tanzania    9043
Danida                    3112
Hesawa                    2027
Rwssp                     1372
World Bank                1345
                          ... 
Tasf                         1
Padri K                      1
Kdc                          1
Kurrp Ki                     1
Dagida                       1
Name: funder, Length: 1835, dtype: int64

In [43]:
clean_df['installer'].value_counts()

DWE                   17361
Government             1788
RWE                    1203
Commu                  1060
DANIDA                 1049
                      ...  
Yakwetu Contractor        1
Diwani                    1
Natio                     1
WASHIMA                   1
JUINE CO                  1
Name: installer, Length: 2056, dtype: int64

In [44]:
clean_df.loc[clean_df['installer'] == '-'] #noticing construction year 0

Unnamed: 0,id,amount_tsh,funder,installer,region_code,district_code,population,permit,construction_year,extraction_type_class,management,payment,quality_group,quantity_group,source_type,source_class,waterpoint_type_group,binary_status
10217,42616,0,Kalebejo Parish,-,19,5,0,1,0,handpump,private operator,never pay,salty,insufficient,shallow well,groundwater,hand pump,1
20968,10873,0,Government Of Tanzania,-,19,5,0,1,0,other,vwc,pay per bucket,good,enough,river/lake,surface,communal standpipe,0
25769,21336,0,Government Of Tanzania,-,19,5,0,1,0,other,vwc,pay per bucket,good,enough,river/lake,surface,communal standpipe,1


In [45]:
clean_df['construction_year'].value_counts() #dropping this column since most years are blank! 

0       19580
2008     2576
2009     2491
2010     2430
2000     1566
2007     1559
2006     1447
2003     1276
2011     1211
2004     1109
2002     1065
1978     1027
2012     1026
2005      985
1995      979
1999      954
1985      943
1998      924
1984      779
1996      768
1982      741
1972      705
1994      703
1974      676
1990      667
1980      647
1992      632
1997      613
1993      595
2001      533
1988      521
1983      487
1975      437
1986      433
1976      411
1991      323
1989      316
1970      310
1987      301
1981      238
1977      199
1979      192
1973      183
2013      173
1971      145
1967       86
1963       85
1968       68
1969       59
1960       45
1964       40
1962       29
1961       20
1965       19
1966       17
Name: construction_year, dtype: int64

In [46]:
clean_df.drop(['construction_year'], axis = 1, inplace = True)

In [47]:
clean_df['source_type'].value_counts() #keeping this. Pretty clean! 

shallow well            16253
spring                  15981
borehole                11162
river/lake              10013
rainwater harvesting     2039
dam                       630
other                     266
Name: source_type, dtype: int64

In [48]:
clean_df['extraction_type_class'].value_counts() #keeping this. Pretty clean! 

gravity         25234
handpump        16048
other            6050
submersible      5854
motorpump        2704
rope pump         349
wind-powered      105
Name: extraction_type_class, dtype: int64

In [49]:
clean_df['permit'].value_counts() #1 means has permit now

1    38852
0    17492
Name: permit, dtype: int64

In [50]:
clean_df['management'].value_counts()

vwc                 38296
wug                  6340
water board          2830
wua                  2468
private operator     1893
parastatal           1595
water authority       825
other                 744
company               658
unknown               519
other - school         99
trust                  77
Name: management, dtype: int64

In [51]:
clean_df['payment'].value_counts() #what to do with unknown? bucket here? do we care about this? 

never pay                23776
pay per bucket            8700
pay monthly               8221
unknown                   7290
pay when scheme fails     3777
pay annually              3585
other                      995
Name: payment, dtype: int64

In [52]:
clean_df.drop(['payment'], axis=1, inplace = True)

In [53]:
clean_df['amount_tsh'].value_counts() #what does 0 mean here? drop? 

0         39015
500        3075
50         2340
1000       1445
20         1430
          ...  
138000        1
900           1
8500          1
60000         1
9             1
Name: amount_tsh, Length: 93, dtype: int64

In [54]:
clean_df.info() #need to find nulls for funder and installer

<class 'pandas.core.frame.DataFrame'>
Int64Index: 56344 entries, 0 to 59399
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id                     56344 non-null  int64 
 1   amount_tsh             56344 non-null  int64 
 2   funder                 55127 non-null  object
 3   installer              55102 non-null  object
 4   region_code            56344 non-null  object
 5   district_code          56344 non-null  object
 6   population             56344 non-null  int64 
 7   permit                 56344 non-null  int64 
 8   extraction_type_class  56344 non-null  object
 9   management             56344 non-null  object
 10  quality_group          56344 non-null  object
 11  quantity_group         56344 non-null  object
 12  source_type            56344 non-null  object
 13  source_class           56344 non-null  object
 14  waterpoint_type_group  56344 non-null  object
 15  binary_status      

In [55]:
clean_df['quality_group'].value_counts() #what to do with unknown

good        48416
salty        5035
unknown      1399
milky         801
colored       490
fluoride      203
Name: quality_group, dtype: int64

In [56]:
clean_df['quantity_group'].value_counts()

enough          31979
insufficient    13934
dry              5836
seasonal         3901
unknown           694
Name: quantity_group, dtype: int64

Going to drop rows with unknown quantity or quality, since I believe they would be important indicators of the health of a well. Maybe we can rank quantity and quality in order to scale them, i.e. assign them numbers?

In [71]:
# Remove every row whose quantity or quality is recorded as 'unknown'
unknown_quantity = clean_df['quantity_group'] == 'unknown'
unknown_quality = clean_df['quality_group'] == 'unknown'
clean_df.drop(clean_df.index[unknown_quantity | unknown_quality], inplace=True)
clean_df.shape


(54744, 16)

In [72]:
clean_df['quantity_group'].value_counts()

enough          31851
insufficient    13830
dry              5202
seasonal         3861
Name: quantity_group, dtype: int64

In [73]:
clean_df['quality_group'].value_counts()

good        48246
salty        5007
milky         799
colored       489
fluoride      203
Name: quality_group, dtype: int64

In [None]:
clean_df.describe()

In [None]:
# Correlate only the numeric columns. clean_df is mostly string columns, and
# recent pandas versions raise on them instead of silently skipping them.
clean_df.select_dtypes(include='number').corr()

Creating train test split on data

In [None]:
# `id` is only a row identifier. Left in X it is not listed in num_cols/cat_cols,
# so remainder='passthrough' would hand it to the model as an unscaled feature.
X = clean_df.drop(['binary_status', 'id'], axis=1)
y = clean_df['binary_status']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
#splitting 75/25 (train_test_split's default test size)

In [None]:
y_train.value_counts() #not a bad divide so no smote being used

In [None]:
#creating a pipeline

In [None]:
def grab_numeric(df):
    """Return only the numeric columns of ``df``.

    Args:
        df: DataFrame to filter.

    Returns:
        A DataFrame holding the columns of ``df`` with a numeric dtype, in
        their original order.
    """
    # 'number' covers every integer and float width (int16, float32, unsigned, ...),
    # whereas ['float', 'int'] only matched the default-width types.
    return df.select_dtypes(include='number')

In [None]:
GrabNumeric = FunctionTransformer(grab_numeric)

In [None]:
pipe = Pipeline(steps=[('num', GrabNumeric),
                       ('ss', StandardScaler())])

In [None]:
pipe.fit(X_train)

In [None]:
pipe.transform(X_train)

In [None]:
X.head()

In [None]:
X.dtypes

In [None]:
# Numeric branch: fill missing values (SimpleImputer defaults), then standardise.
subpipe_num = Pipeline(steps=[('num_impute', SimpleImputer()),
                           ('ss', StandardScaler())])
# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode. handle_unknown='ignore' stops categories unseen during fit
# from raising at transform time.
# NOTE(review): sparse=False requests a dense array; funder and installer each
# have roughly two thousand distinct values, so confirm the memory cost is acceptable.
subpipe_cat = Pipeline(steps=[('cat_impute', SimpleImputer(strategy='most_frequent')),
                             ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))])

In [None]:
# Columns routed to the numeric sub-pipeline (impute + scale)
num_cols = ['population', 'amount_tsh']
# Columns routed to the categorical sub-pipeline (impute + one-hot).
# Only columns still present in clean_df are listed, each exactly once:
# construction_year, payment and scheme_management were dropped earlier, and
# naming a missing column makes ColumnTransformer.fit raise.
cat_cols = ['permit', 'funder', 'installer', 'region_code',
            'district_code', 'extraction_type_class', 'management',
            'quality_group', 'quantity_group', 'source_type',
            'source_class', 'waterpoint_type_group']

In [None]:
# Route each column group through its own preprocessing sub-pipeline;
# columns named in neither list are passed through unchanged.
column_steps = [
    ('subpipe_num', subpipe_num, num_cols),
    ('subpipe_cat', subpipe_cat, cat_cols),
]
CT = ColumnTransformer(transformers=column_steps, remainder='passthrough')

In [None]:
# Preprocessing is handled by the `ColumnTransformer`, so the estimator
# is simply appended as the final step of the pipeline.
logreg_steps = [
    ('ct', CT),
    ('logreg', LogisticRegression(random_state=42)),
]
logreg_model_pipe = Pipeline(steps=logreg_steps)
    

In [None]:
logreg_model_pipe.fit(X_train, y_train)

In [None]:
logreg_model_pipe.score(X_train, y_train) #very bad score

In [None]:
class ModelWithCV():
    '''Structure to save the model and more easily see its crossvalidation'''
    
    def __init__(self, model, model_name, X, y, cv_now=True):
        '''
        Store the model and its data, optionally cross-validating right away.
        
        Args:
          model:
            Estimator (or pipeline) handed to `cross_val_score`
          model_name:
            Label used in the printed summary and the plot title
          X:
            Default feature data used for cross-validation
          y:
            Default target data used for cross-validation
          cv_now:
            Optional; run `cross_validate()` during construction (default True)
        '''
        self.model = model
        self.name = model_name
        self.X = X
        self.y = y
        # For CV results (filled in by cross_validate)
        self.cv_results = None
        self.cv_mean = None
        self.cv_median = None
        self.cv_std = None
        #
        if cv_now:
            self.cross_validate()
        
    def cross_validate(self, X=None, y=None, kfolds=10):
        '''
        Perform cross-validation and store the results on the object
        (`cv_results`, `cv_mean`, `cv_median`, `cv_std`).
        
        Args: 
          X:
            Optional; Training data to perform CV on. Otherwise use X from object
          y:
            Optional; Training data to perform CV on. Otherwise use y from object
          kfolds:
            Optional; Number of folds for CV (default is 10)  
        '''
        
        # Compare against None explicitly: the truth value of a DataFrame or
        # array is ambiguous, so `X if X else ...` raises as soon as data is passed.
        cv_X = X if X is not None else self.X
        cv_y = y if y is not None else self.y

        self.cv_results = cross_val_score(self.model, cv_X, cv_y, cv=kfolds)
        self.cv_mean = np.mean(self.cv_results)
        self.cv_median = np.median(self.cv_results)
        self.cv_std = np.std(self.cv_results)

        
    def print_cv_summary(self):
        '''Print the mean and standard deviation of the stored CV scores.'''
        cv_summary = (
        f'''CV Results for `{self.name}` model:
            {self.cv_mean:.5f} ± {self.cv_std:.5f} accuracy
        ''')
        print(cv_summary)

        
    def plot_cv(self, ax):
        '''
        Plot the cross-validation values using the array of results and given 
        Axis for plotting.
        
        Args:
          ax:
            Matplotlib Axes to draw on
        
        Returns the same Axes so the caller can keep customising it.
        '''
        ax.set_title(f'CV Results for `{self.name}` Model')
        # Thinner violinplot with higher bw
        sns.violinplot(y=self.cv_results, ax=ax, bw=.4)
        # Overlay the individual fold scores on top of the violin
        sns.swarmplot(
                y=self.cv_results,
                color='orange',
                size=10,
                alpha= 0.8,
                ax=ax
        )

        return ax

In [None]:
log_pipe = ModelWithCV(logreg_model_pipe, model_name='log_pipe', X=X_train, y=y_train)

In [None]:
fig, ax = plt.subplots()

log_pipe.plot_cv(ax=ax);

In [None]:
log_pipe.print_cv_summary()