# Análisis de Datos de Fraude en Seguros de Auto

## 2. Procesamiento de los Datos

### 2.1 Carga de Librerías

In [25]:
# Cargamos las librerías a utilizar
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# Para mostrar todas las columnas
pd.set_option('display.max_columns', None)

# Para mostar las cifras en formato float
pd.options.display.float_format = '{:.2f}'.format

### 2.2 Carga de Datos

In [26]:
# Asignamos los datos a un dataframe
car_fraud = pd.read_csv('.\..\data\Dataset.csv')

# Mostramos el encabezado de los datos
car_fraud.head()

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,Age,Fault,PolicyType,VehicleCategory,VehiclePrice,FraudFound_P,PolicyNumber,RepNumber,Deductible,DriverRating,Days_Policy_Accident,Days_Policy_Claim,PastNumberOfClaims,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy,ClaimSize
0,Dec,5,Wednesday,Honda,Urban,Tuesday,Jan,1,Female,Single,21.0,Policy Holder,Sport - Liability,Sport,more than 69000,0,1,12,300,1.0,more than 30,more than 30,none,3 years,26 to 30,No,No,External,none,1 year,3 to 4,1994,Liability,55526.07
1,Jan,3,Wednesday,Honda,Urban,Monday,Jan,4,Male,Single,34.0,Policy Holder,Sport - Collision,Sport,more than 69000,0,2,15,400,4.0,more than 30,more than 30,none,6 years,31 to 35,Yes,No,External,none,no change,1 vehicle,1994,Collision,59294.46
2,Oct,5,Friday,Honda,Urban,Thursday,Nov,2,Male,Married,47.0,Policy Holder,Sport - Collision,Sport,more than 69000,0,3,7,400,3.0,more than 30,more than 30,1,7 years,41 to 50,No,No,External,none,no change,1 vehicle,1994,Collision,71756.94
3,Jun,2,Saturday,Toyota,Rural,Friday,Jul,1,Male,Married,65.0,Third Party,Sedan - Liability,Sport,20000 to 29000,0,4,4,400,2.0,more than 30,more than 30,1,more than 7,51 to 65,Yes,No,External,more than 5,no change,1 vehicle,1994,Liability,7584.15
4,Feb,1,Saturday,Honda,Urban,Monday,Feb,3,Male,Married,36.0,Third Party,Sport - Collision,Sport,more than 69000,0,7,14,400,1.0,more than 30,more than 30,1,7 years,36 to 40,No,No,External,1 to 2,no change,1 vehicle,1994,Collision,98545.37


### 2.3 Limpieza de Datos

In [27]:
# Mostramos la suma de valores nulos

car_fraud.isnull().sum()

Month                   0
WeekOfMonth             0
DayOfWeek               0
Make                    0
AccidentArea            0
DayOfWeekClaimed        0
MonthClaimed            0
WeekOfMonthClaimed      0
Sex                     0
MaritalStatus           0
Age                     5
Fault                   0
PolicyType              0
VehicleCategory         0
VehiclePrice            0
FraudFound_P            0
PolicyNumber            0
RepNumber               0
Deductible              0
DriverRating            6
Days_Policy_Accident    0
Days_Policy_Claim       0
PastNumberOfClaims      0
AgeOfVehicle            0
AgeOfPolicyHolder       0
PoliceReportFiled       0
WitnessPresent          0
AgentType               0
NumberOfSuppliments     0
AddressChange_Claim     0
NumberOfCars            0
Year                    0
BasePolicy              0
ClaimSize               0
dtype: int64

In [28]:
# Imputamos los valores nulos
# Esto también se puede realizar con pipeline SimpleImputer o KNNImputer

median_age = car_fraud['Age'].median()
car_fraud['Age'] = car_fraud['Age'].fillna(median_age)
median_DR = car_fraud['DriverRating'].median()
car_fraud['DriverRating'] = car_fraud['DriverRating'].fillna(median_DR)
car_fraud.isnull().any()

Month                   False
WeekOfMonth             False
DayOfWeek               False
Make                    False
AccidentArea            False
DayOfWeekClaimed        False
MonthClaimed            False
WeekOfMonthClaimed      False
Sex                     False
MaritalStatus           False
Age                     False
Fault                   False
PolicyType              False
VehicleCategory         False
VehiclePrice            False
FraudFound_P            False
PolicyNumber            False
RepNumber               False
Deductible              False
DriverRating            False
Days_Policy_Accident    False
Days_Policy_Claim       False
PastNumberOfClaims      False
AgeOfVehicle            False
AgeOfPolicyHolder       False
PoliceReportFiled       False
WitnessPresent          False
AgentType               False
NumberOfSuppliments     False
AddressChange_Claim     False
NumberOfCars            False
Year                    False
BasePolicy              False
ClaimSize 

In [29]:
# Reemplazamos los valores iguales a cero

car_fraud['DayOfWeekClaimed'] = car_fraud['DayOfWeekClaimed'].replace('0', 'Monday')
car_fraud['MonthClaimed'] = car_fraud['MonthClaimed'].replace('0', 'Jan')

car_fraud['Age'] = car_fraud['Age'].replace(0, median_age)

mean_ClaimSize = car_fraud['ClaimSize'].mean()
car_fraud['ClaimSize'] = car_fraud['ClaimSize'].replace(0, mean_ClaimSize)

print(car_fraud.groupby('DayOfWeekClaimed').size())
print(car_fraud.groupby('MonthClaimed').size())
print(car_fraud.groupby('Age').size())
print(car_fraud.groupby('ClaimSize').size())

DayOfWeekClaimed
Friday       1842
Monday       2812
Saturday       94
Sunday         40
Thursday     2000
Tuesday      2545
Wednesday    2232
dtype: int64
MonthClaimed
Apr     970
Aug     846
Dec     879
Feb     955
Jan    1085
Jul     908
Jun     967
Mar    1005
May    1038
Nov     973
Oct     995
Sep     944
dtype: int64
Age
16.00     8
17.00     5
18.00    34
19.00    24
20.00    15
         ..
76.00    32
77.00    20
78.00    26
79.00    12
80.00    25
Length: 65, dtype: int64
ClaimSize
504.58       4
512.67       1
532.51       1
538.14       5
542.63       2
            ..
100701.44    1
103375.05    1
103420.79    1
104194.70    1
141394.16    1
Length: 3332, dtype: int64


In [30]:
# Validamos el número de filas y columnas

car_fraud.shape

(11565, 34)

### 2.4 División de los Datos

In [31]:
# Dividimos los datos en X y y
X = car_fraud.drop(['FraudFound_P', 'PolicyNumber'], axis = 1)
y = car_fraud['FraudFound_P']

print(f"X: {X.shape}")
print(f"y: {y.shape}")

X: (11565, 32)
y: (11565,)


### 2.5 Codificación de Datos Categóricos

In [32]:
# Ordenamos las variables categóricas ordinales y nominales

Month = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
DayOfWeek = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
Make = ['Accura', 'BMW', 'Chevrolet', 'Dodge', 'Ferrari', 'Ford', 'Honda', 'Jaguar', 'Lexus', 'Mazda', 'Mecedes', 'Mercury', 'Nisson', 'Pontiac', 'Porche', 'Saab', 'Saturn', 'Toyota', 'VW']
AccidentArea = ['Rural', 'Urban']
DayOfWeekClaimed = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
MonthClaimed = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
Sex = ['Male', 'Female']
MaritalStatus = ['Single', 'Married', 'Widow', 'Divorced']
Fault = ['Policy Holder', 'Third Party']
PolicyType = ['Sport - Liability', 'Sport - Collision', 'Sedan - Liability', 'Sedan - All Perils', 'Sedan - Collision', 'Utility - Collision', 'Utility - Liability', 'Utility - All Perils', 'Sport - All Perils']
VehicleCategory = ['Sport', 'Sedan', 'Utility']
VehiclePrice = ['less than 20000', '20000 to 29000', '30000 to 39000', '40000 to 59000', '60000 to 69000', 'more than 69000']
Days_Policy_Accident = ['none', '1 to 7', '8 to 15', '15 to 30', 'more than 30']
Days_Policy_Claim = ['none', '8 to 15', '15 to 30', 'more than 30']
PastNumberOfClaims = ['none', '1', '2 to 4', 'more than 4']
AgeOfVehicle = ['new', '2 years', '3 years', '4 years', '5 years', '6 years', '7 years', 'more than 7']
AgeOfPolicyHolder = ['16 to 17', '18 to 20', '21 to 25', '26 to 30', '31 to 35', '36 to 40', '41 to 50', '51 to 65', 'over 65']
PoliceReportFiled = ['No', 'Yes']
WitnessPresent = ['No', 'Yes']
AgentType = ['External', 'Internal']
NumberOfSuppliments = ['none', '1 to 2', '3 to 5', 'more than 5']
AddressChange_Claim = ['no change', 'under 6 months', '1 year', '2 to 3 years', '4 to 8 years']
NumberOfCars = ['1 vehicle', '2 vehicles', '3 to 4', '5 to 8', 'more than 9']
BasePolicy = ['Liability', 'Collision', 'All Perils']

In [33]:
# Aplicamos el codificador OrdinalEncoder a las variables categóricas ordinales y nominales

from sklearn.preprocessing import OrdinalEncoder

oe = OrdinalEncoder(categories = [Month,
                                  DayOfWeek,
                                  Make,
                                  AccidentArea,
                                  DayOfWeekClaimed,
                                  MonthClaimed,
                                  Sex,
                                  MaritalStatus,
                                  Fault,
                                  PolicyType,
                                  VehicleCategory,
                                  VehiclePrice,
                                  Days_Policy_Accident,
                                  Days_Policy_Claim,
                                  PastNumberOfClaims,
                                  AgeOfVehicle,
                                  AgeOfPolicyHolder,
                                  PoliceReportFiled,
                                  WitnessPresent,
                                  AgentType,
                                  NumberOfSuppliments,
                                  AddressChange_Claim,
                                  NumberOfCars,
                                  BasePolicy],
                    handle_unknown='use_encoded_value', unknown_value=-1)

### 2.6 Escalamiento de Variables

In [34]:
# Estandarizamos los datos númericos utilizando StandardScaler

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

### 2.7 Sesgo / Asimetría

In [35]:
# Calculamos el sesgo o asimietría de las variables numericas NO categóricas
# Inclinación positiva (derecha) o negativa (izquierda)
# Los valores más cercanos a cero tienen menos sesgo

car_fraud[['Age', 'ClaimSize']].skew()

Age         0.73
ClaimSize   1.19
dtype: float64

In [36]:
# Aplicamos la transformación de Box-Cox o Yeo-Johnson a los atributos con sesgo
# box-cox (valores positivos)
# yeo-johnson (valores negativos)

from sklearn.preprocessing import PowerTransformer

pt = PowerTransformer(method='box-cox', standardize=True)

### 2.8 Transformación de Columnas

In [37]:
# Aplicamos el transformador make_column_transformer

from sklearn.compose import make_column_transformer

ct = make_column_transformer(
    (oe, ['Month',
          'DayOfWeek',
          'Make',
          'AccidentArea',
          'DayOfWeekClaimed',
          'MonthClaimed',
          'Sex',
          'MaritalStatus',
          'Fault',
          'PolicyType',
          'VehicleCategory',
          'VehiclePrice',
          'Days_Policy_Accident',
          'Days_Policy_Claim',
          'PastNumberOfClaims',
          'AgeOfVehicle',
          'AgeOfPolicyHolder',
          'PoliceReportFiled',
          'WitnessPresent',
          'AgentType',
          'NumberOfSuppliments',
          'AddressChange_Claim',
          'NumberOfCars',
          'BasePolicy']),
    (scaler, ['WeekOfMonth',
              'WeekOfMonthClaimed',
              'RepNumber',
              'Deductible',
              'DriverRating',
              'Year',]),
    (pt, ['Age',
          'ClaimSize']),
     remainder = 'passthrough')

In [38]:
# Ajustamos y transformamos todo el conjunto de datos de X

X = pd.DataFrame(ct.fit_transform(X),
                       columns = ['Month',
                                  'DayOfWeek',
                                  'Make',
                                  'AccidentArea',
                                  'DayOfWeekClaimed',
                                  'MonthClaimed',
                                  'Sex',
                                  'MaritalStatus',
                                  'Fault',
                                  'PolicyType',
                                  'VehicleCategory',
                                  'VehiclePrice',
                                  'Days_Policy_Accident',
                                  'Days_Policy_Claim',
                                  'PastNumberOfClaims',
                                  'AgeOfVehicle',
                                  'AgeOfPolicyHolder',
                                  'PoliceReportFiled',
                                  'WitnessPresent',
                                  'AgentType',
                                  'NumberOfSuppliments',
                                  'AddressChange_Claim',
                                  'NumberOfCars',
                                  'BasePolicy',
                                  'WeekOfMonth',
                                  'WeekOfMonthClaimed',
                                  'RepNumber',
                                  'Deductible',
                                  'DriverRating',
                                  'Year',
                                  'Age',
                                  'ClaimSize'])
X.shape

(11565, 32)

In [39]:
# Mostramos un encabezado de los datos

X.head().round(2)

Unnamed: 0,Month,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,Sex,MaritalStatus,Fault,PolicyType,VehicleCategory,VehiclePrice,Days_Policy_Accident,Days_Policy_Claim,PastNumberOfClaims,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,BasePolicy,WeekOfMonth,WeekOfMonthClaimed,RepNumber,Deductible,DriverRating,Year,Age,ClaimSize
0,11.0,2.0,6.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,5.0,4.0,3.0,0.0,2.0,3.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,1.72,-1.35,0.76,-2.48,-1.34,-1.08,-2.19,1.26
1,0.0,2.0,6.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,5.0,4.0,3.0,0.0,5.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.17,1.03,1.41,-0.18,1.35,-1.08,-0.45,1.3
2,9.0,4.0,6.0,1.0,3.0,10.0,0.0,1.0,0.0,1.0,0.0,5.0,4.0,3.0,1.0,6.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.72,-0.56,-0.32,-0.18,0.45,-1.08,0.65,1.43
3,5.0,5.0,17.0,0.0,4.0,6.0,0.0,1.0,1.0,2.0,0.0,1.0,4.0,3.0,1.0,7.0,7.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,-0.61,-1.35,-0.97,-0.18,-0.44,-1.08,1.7,-0.22
4,1.0,5.0,6.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,5.0,4.0,3.0,1.0,6.0,5.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,-1.39,0.24,1.2,-0.18,-1.34,-1.08,-0.25,1.65


### 2.9 Sobre muestreo de los datos

In [40]:
# Aplicamos sobre muestreo para balancear la clase usando SMOTE

#pip install imbalanced-learn

from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=7)
X, y = smote.fit_resample(X, y)

# Validamos el número de filas y columnas
print(f"X: {X.shape}")
print(f"y: {y.shape}")

X: (21760, 32)
y: (21760,)


### 3.3 Reducción de la Dimensionalidad

In [41]:
# Reducimos dimensiones aplicando Análisis de Componentes Principales (PCA)

from sklearn.decomposition import PCA

pca = PCA(n_components=15)
X_pc = pca.fit_transform(X)

print(f"Explained Variance: {pca.explained_variance_ratio_}")
print(f"Sum Explained Variance: {pca.explained_variance_ratio_.sum()}")
#print(f"Components: {pca.components_}")

X_pc = pd.DataFrame(data=X_pc,
                 columns=['PC1',
                          'PC2',
                          'PC3',
                          'PC4',
                          'PC5',
                          'PC6',
                          'PC7',
                          'PC8',
                          'PC9',
                          'PC10',
                          'PC11',
                          'PC12',
                          'PC13',
                          'PC14',
                          'PC15'])
X_pc.shape

Explained Variance: [0.38734662 0.26710252 0.05531522 0.04957243 0.03089106 0.02943655
 0.02478591 0.0204299  0.01679259 0.01441101 0.01377159 0.01181576
 0.01152447 0.01122732 0.01087215]
Sum Explained Variance: 0.955295100360319


(21760, 15)

In [42]:
X_pc.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15
0,4.24,0.47,5.16,2.26,2.79,7.22,1.35,-2.08,-2.55,-0.54,-1.24,-1.59,1.78,0.44,0.51
1,4.48,-7.28,1.67,1.31,2.14,0.17,-1.89,-0.19,-2.14,0.73,-0.02,-2.47,-0.69,0.37,-0.11
2,3.94,6.13,0.06,-0.99,3.09,-1.48,0.95,-0.35,-2.92,0.36,-0.7,-1.36,0.14,-1.12,0.01
3,-7.37,0.2,-2.31,-2.6,1.03,-1.55,1.84,-1.83,-0.55,-1.86,-0.28,-0.63,0.38,-1.17,0.5
4,4.36,-5.87,0.95,-2.01,3.06,-0.05,-1.75,-1.09,-1.6,-1.11,0.15,-1.08,1.26,0.94,-0.19


## 3. Tratamiento de los Datos

### 3.1 Selección de modelos

In [54]:
# Calculamos el Accuracy utilizando Cross Validation KFold (regresión) de todo el conjunto de datos X y y

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression

kf = KFold(n_splits=5, shuffle=False)
lgr = LogisticRegression(max_iter=1000) #,class_weight='balanced'
scores = cross_val_score(lgr, X_pc, y, cv=kf, scoring='recall')
print(f"Accuracy: {scores.mean()*100.0:,.2f}%")

Accuracy: 62.89%


In [55]:
# Calculamos el Accuracy utilizando Cross Validation StratifiedKFold (clasificación) de todo el conjunto de datos X y y

from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5, shuffle=False)
lgr = LogisticRegression(max_iter=1000) #,class_weight='balanced'
scores = cross_val_score(lgr, X_pc, y, cv=skf, scoring='recall')
print(f"Accuracy: {scores.mean()*100.0:,.2f}%")

Accuracy: 68.64%


## 4. Modelado

### 4.1 Lineales

In [61]:
# Calculamos el Accuracy utilizando Cross Validation KFold (regresión) de todo el conjunto de datos X y y

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression

kf = KFold(n_splits=10, shuffle=False)
lgr = LogisticRegression(max_iter=1000) #, class_weight='balanced'
scores = cross_val_score(lgr, X_pc, y, cv=kf, scoring='accuracy')
print(f"Accuracy: {scores.mean()*100.0:,.2f}%")

Accuracy: 48.24%


### 4.2 No Lineales

In [60]:
# K vecinos más cercanos (KNN)

from sklearn.neighbors import KNeighborsClassifier

kf = KFold(n_splits=10, shuffle=False)
knc = KNeighborsClassifier(n_neighbors=5, algorithm='kd_tree')
scores = cross_val_score(knc, X_pc, y, cv=kf, scoring='accuracy')
print(f"Accuracy: {scores.mean()*100.0:,.2f}%")

Accuracy: 82.61%


In [62]:
# Naive Bayes

from sklearn.naive_bayes import GaussianNB

kf = KFold(n_splits=10, shuffle=False)
gnb = GaussianNB()
scores = cross_val_score(gnb, X_pc, y, cv=kf, scoring='accuracy')
print(f"Accuracy: {scores.mean()*100.0:,.2f}%")

Accuracy: 59.40%


In [91]:
# Árboles de Decisión (CART)

from sklearn.tree import DecisionTreeClassifier

kf = KFold(n_splits=10, shuffle=False)
dtc = DecisionTreeClassifier(criterion='entropy')
scores = cross_val_score(dtc, X_pc, y, cv=kf, scoring='accuracy')
print(f"Accuracy: {scores.mean()*100.0:,.2f}%")

Accuracy: 83.13%


In [54]:
# Máquina de Soporte Vectorial (SVM)

from sklearn.svm import SVC

kf = KFold(n_splits=10, shuffle=False)
svc = SVC()
scores = cross_val_score(svc, X_pc, y, cv=kf, scoring='accuracy')
print(f"Accuracy: {scores.mean()*100.0:,.2f}%")

Accuracy: 94.08%


### 4.3 Evaluación de modelos

In [56]:
# Obtenemos la media y la desviación estandar del Accuracy por cada uno de los modelos

models = []
models.append(('LoR', LogisticRegression(max_iter=1000)))
models.append(('k-NN', KNeighborsClassifier(n_neighbors=5, algorithm='kd_tree')))
models.append(('NB', GaussianNB()))
models.append(('CART', DecisionTreeClassifier(criterion='entropy')))
models.append(('SVM', SVC()))

scores = []
names = []
for name, model in models:
    kf = KFold(n_splits=10, shuffle=False)
    cvs = cross_val_score(model, X_pc, y, cv=kf, scoring='accuracy')
    scores.append(name)
    print(f"{name}: {cvs.mean()*100.0:,.2f}% ({cvs.std()*100.0:,.2f}%)")

LoR: 94.08% (0.81%)
k-NN: 93.71% (0.95%)
NB: 91.83% (1.47%)
CART: 88.58% (1.40%)
SVM: 94.08% (0.81%)


### 4.4 Ensamble Bagging

In [90]:
# Bagged Decision Trees

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

seed = 7
trees = 10
kf = KFold(n_splits=5, shuffle=False)
dtc = DecisionTreeClassifier(criterion='entropy')
bc = BaggingClassifier(base_estimator=dtc, n_estimators=trees, random_state=seed)
scores = cross_val_score(bc, X_pc, y, cv=kf, scoring='accuracy')
print(f"Accuracy: {scores.mean()*100.0:,.2f}% ({scores.std()*100.0:,.2f}%)")

Accuracy: 77.04% (12.78%)


In [92]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier

seed = 7
trees = 10
features = 3
kf = KFold(n_splits=5, shuffle=False)
rfc = RandomForestClassifier(n_estimators=trees, max_features=features, random_state=seed)
scores = cross_val_score(rfc, X_pc, y, cv=kf, scoring='accuracy')
print(f"Accuracy: {scores.mean()*100.0:,.2f}% ({scores.std()*100.0:,.2f}%)")

Accuracy: 78.89% (9.19%)


In [93]:
# Extra Trees

from sklearn.ensemble import ExtraTreesClassifier

seed = 7
trees = 10
features = 3
kf = KFold(n_splits=5, shuffle=False)
et = ExtraTreesClassifier(n_estimators=trees, max_features=features, random_state=seed)
scores = cross_val_score(et, X_pc, y, cv=kf, scoring='accuracy')
print(f"Accuracy: {scores.mean()*100.0:,.2f}% ({scores.std()*100.0:,.2f}%)")

Accuracy: 84.11% (7.63%)


### 4.5 Ensamble Boosting

In [96]:
# AdaBoost

from sklearn.ensemble import AdaBoostClassifier

seed = 7
trees = 10
kf = KFold(n_splits=5, shuffle=False)
ab = AdaBoostClassifier(n_estimators=trees, random_state=seed)
scores = cross_val_score(ab, X_pc, y, cv=kf, scoring='accuracy')
print(f"Accuracy: {scores.mean()*100.0:,.2f}% ({scores.std()*100.0:,.2f}%)")

Accuracy: 51.11% (9.91%)


In [97]:
# Gradient Boosting Machine (GBM)

from sklearn.ensemble import GradientBoostingClassifier

seed = 7
trees = 10
kf = KFold(n_splits=5, shuffle=False)
gbc = GradientBoostingClassifier(n_estimators=trees, random_state=seed)
scores = cross_val_score(gbc, X_pc, y, cv=kf, scoring='accuracy')
print(f"Accuracy: {scores.mean()*100.0:,.2f}% ({scores.std()*100.0:,.2f}%)")

Accuracy: 26.56% (21.24%)


In [101]:
# Voting

from sklearn.ensemble import VotingClassifier

kf = KFold(n_splits=5, shuffle=False)

lg = LogisticRegression(max_iter=1000)
knc = KNeighborsClassifier(n_neighbors=5, algorithm='kd_tree')
gnb = GaussianNB()

vc = VotingClassifier([('clf1', lg), ('clf2', knc), ('clf3', gnb)], voting='soft')


scores = cross_val_score(vc, X_pc, y, cv=kf, scoring='accuracy')
print(f"Accuracy: {scores.mean()*100.0:,.2f}% ({scores.std()*100.0:,.2f}%)")

Accuracy: 67.31% (23.31%)
