In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import seaborn as sns
from pprint import pprint
import random
# import time

from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score

# 資料前處理

1. 將 label 撈出來
2. 刪除 'policy_id' 和 'is_claim' 欄位
3. 對資料集連續變數的欄位進行離散化
4. 對資料集類別變數的部分進行 label encoding
5. 將訓練資料隨機排序後以 8:2 的比例拆分成 train_x, test_x, train_y, test_y

In [2]:
def preprocessing(train_df):
    label = train_df["is_claim"]
    train_df = train_df.drop(["policy_id", "is_claim"], axis=1)
    # numerical feature -> categorical feature (discretize)
    for column in train_df.columns:
        if train_df[column].dtypes != 'O':
            train_df[column] = pd.cut(train_df[column], 22, labels=False, duplicates='drop')

    le = LabelEncoder()
    for column in train_df.columns:
        if train_df[column].dtypes == 'O':
            train_df[column] = le.fit_transform(train_df[column])


    # train.csv -> train, validation, test
    # split test using index suffle or index sample
    # suffle first
    # index = list(range(len(train_df)))
    index = np.arange(len(train_df))
    random.seed(1)
    random.shuffle(index) # inplace
    # split by proportion 8:2
    split = np.ceil(len(train_df)*0.8).astype(int)
    train_x = train_df.loc[index[:split],]
    test_x = train_df.loc[index[split:]]
    train_y = label.loc[index[:split],]
    test_y = label.loc[index[split:]]
    # split validation

    return train_x, test_x, train_y, test_y, train_df

train_df = pd.read_csv("./data/train.csv")
train_x, test_x, train_y, test_y, train_df = preprocessing(train_df)

# Classification Task

# Naive Bayes Classifier

- method1 的 Naive Bayes Classifier 需要跑 48 分鐘，且 predictions 有點奇怪
- method2 參考自 vamc-stash 的 Naive-Bayes
    > https://github.com/vamc-stash/Naive-Bayes/blob/master/src/naive_bayes.py

In [None]:
# ## Naive Bayes Classifier (method 1)
# class NaiveBayesClassifier:
#     '''
#     P(y|X) = P(X|y) * P(y) / P(X)
#     '''
#     def __init__(self) -> None:
#         pass

#     def calc_prior(self, features, target):
#         '''
#         prior = P(class) = count(y) / n
#         '''
#         self.prior = (features.groupby(target).apply(lambda x: len(x)) / features.shape[0]).to_numpy()
#         return self.prior

#     def calc_likelihood(self, x):
#         '''
#         P(X_1|y)*...*P(X_n|y)
#         P(X_i|y) = count(X_i, y) / count(y)
#         '''
#         features = list(x.columns)
#         for feature in features:
#             for outcome in np.unique(self.y):
#                 n_outcome = sum(self.y == outcome)
#                 # feat_likelihood = 


#         # ---
#         tmp_train_x = pd.concat([self.X, self.y], axis=1)
#         length_of_each_class = self.X.groupby(self.y, group_keys=False).apply(lambda x: len(x))
#         self.likelihood = [] # len(self.likelihood)=n_classes
#         for i in range(self.n_classes):
#             count = 1
#             for j in range(len(features)):
#                 count *= self.X.iloc[:, j].groupby(self.y, group_keys=False).get_group(self.classes[i]).isin([features[j]]).sum()
#                 # count *= tmp_train_x.loc[pd.Series(tmp_train_x.iloc[:, j]==features[j]) & pd.Series(tmp_train_x.iloc[:,-1]==self.classes[i])].shape[0]
#             conditional = count / length_of_each_class[self.classes[i]]**len(features)
#             self.likelihood.append(conditional)
#         return self.likelihood

#     def calc_posterior(self, x):
#         '''
#         posterior = P(class|features)
#         likelihood = P(features|class)
#         '''
#         posteriors = [] # len(posteriors)=n_classes

#         # calculate posterior probability for each class
#         posteriors = np.multiply(self.prior, self.calc_likelihood(x))
#         return self.classes[np.argmax(posteriors)]

#     def fit(self, features, target):
#         self.X = features
#         self.y = target
#         self.classes = np.sort(target.unique())
#         self.n_classes = target.nunique()

#         self.calc_prior(features, target)

#     def predict(self, X):
#         preds = [self.calc_posterior(x) for x in X.to_numpy()]
#         return preds
        
#     def accuracy(self, y_test, y_pred):
#         accuracy = np.sum(y_test == y_pred) / len(y_test)
#         return accuracy


In [53]:
## Naive Bayes Classifier (method 2)
class NaiveBayesClassifier:
	def __init__(self):

			"""
				Attributes:
					likelihoods: Likelihood of each feature per class
					class_priors: Prior probabilities of classes 
					pred_priors: Prior probabilities of features 
					features: All features of dataset
			"""
			self.features = list
			self.likelihoods = {}
			self.class_priors = {}
			self.pred_priors = {}

			self.X_train = np.array
			self.y_train = np.array
			self.train_size = int
			self.num_feats = int

	def fit(self, X, y):

		self.features = list(X.columns)
		self.X_train = X
		self.y_train = y
		self.train_size = X.shape[0]
		self.num_feats = X.shape[1]

		for feature in self.features:
			self.likelihoods[feature] = {}
			self.pred_priors[feature] = {}

			for feat_val in np.unique(self.X_train[feature]):
				self.pred_priors[feature].update({feat_val: 0})

				for outcome in np.unique(self.y_train):
					self.likelihoods[feature].update({str(feat_val)+'_'+str(outcome):0})
					self.class_priors.update({outcome: 0})


		self._calc_class_prior()
		self._calc_likelihoods()
		self._calc_predictor_prior()

		# print(self.likelihoods)
		# print(self.class_priors)
		# print(self.pred_priors)

	def _calc_class_prior(self):

		""" P(c) - Prior Class Probability """

		for outcome in np.unique(self.y_train):
			outcome_count = sum(self.y_train == outcome)
			self.class_priors[outcome] = outcome_count / self.train_size

	def _calc_likelihoods(self):

		""" P(x|c) - Likelihood """

		for feature in self.features:

			for outcome in np.unique(self.y_train):
				outcome_count = sum(self.y_train == outcome)
				feat_likelihood = self.X_train[feature][self.y_train[self.y_train == outcome].index.values.tolist()].value_counts().to_dict()

				for feat_val, count in feat_likelihood.items():
					self.likelihoods[feature][str(feat_val) + '_' + str(outcome)] = count/outcome_count


	def _calc_predictor_prior(self):

		""" P(x) - Evidence """

		for feature in self.features:
			feat_vals = self.X_train[feature].value_counts().to_dict()

			for feat_val, count in feat_vals.items():
				self.pred_priors[feature][feat_val] = count/self.train_size


	def predict(self, X):

		""" Calculates Posterior probability P(c|x) """

		results = []
		X = np.array(X)

		for query in X:
			probs_outcome = {}
			for outcome in np.unique(self.y_train):
				prior = self.class_priors[outcome]
				likelihood = 1
				evidence = 1

				for feat, feat_val in zip(self.features, query):
					likelihood *= self.likelihoods[feat][str(feat_val) + '_' + str(outcome)]
					evidence *= self.pred_priors[feat][feat_val]

				posterior = (likelihood * prior) / (evidence)

				probs_outcome[outcome] = posterior

			result = max(probs_outcome, key = lambda x: probs_outcome[x])
			results.append(result)

		return np.array(results)

	def calculate_accuracy(self, y_true, y_pred):
		accuracy = (y_true == y_pred).mean()
		return accuracy

In [129]:
clf1 = NaiveBayesClassifier()
clf1.fit(train_x, train_y)

In [130]:
preds = clf1.predict(test_x)
# 準確率
clf1.calculate_accuracy(test_y, preds)

0.9348011606076122

In [131]:
# 混淆矩陣
confusion_matrix(test_y, preds)

array([[10954,     3],
       [  761,     0]], dtype=int64)

### Cross-Validation

### k=3

In [71]:
x, y = train_df.iloc[:, :-1], train_df.iloc[:, -1]

model = NaiveBayesClassifier()
kf = KFold(n_splits=3)#, random_state=1, shuffle=True)
for train_index, test_index in kf.split(train_x):
    x_train, x_test = x.iloc[train_index,], x.iloc[test_index,]
    y_train, y_test = y.iloc[train_index,], y.iloc[test_index,]

    model.fit(x_train, y_train)
    preds = model.predict(x_test)
    # accuracy
    print(f"accuracy = {model.calculate_accuracy(y_test, preds)}")


accuracy = 0.999552
accuracy = 0.999232


KeyError: '18_0'

### k=5

In [72]:
x, y = train_df.iloc[:, :-1], train_df.iloc[:, -1]

model = NaiveBayesClassifier()
kf = KFold(n_splits=5)#, random_state=1, shuffle=True)
for train_index, test_index in kf.split(train_x):
    x_train, x_test = x.iloc[train_index,], x.iloc[test_index,]
    y_train, y_test = y.iloc[train_index,], y.iloc[test_index,]

    model.fit(x_train, y_train)
    preds = model.predict(x_test)
    # accuracy
    print(f"accuracy = {model.calculate_accuracy(y_test, preds)}")

accuracy = 0.99936
accuracy = 0.9998933333333333
accuracy = 0.9991466666666666


KeyError: '10_0'

### k=10

In [73]:
x, y = train_df.iloc[:, :-1], train_df.iloc[:, -1]

model = NaiveBayesClassifier()
kf = KFold(n_splits=10)#, random_state=1, shuffle=True)
for train_index, test_index in kf.split(train_x):
    x_train, x_test = x.iloc[train_index,], x.iloc[test_index,]
    y_train, y_test = y.iloc[train_index,], y.iloc[test_index,]

    model.fit(x_train, y_train)
    preds = model.predict(x_test)
    # accuracy
    print(f"accuracy = {model.calculate_accuracy(y_test, preds)}")

accuracy = 0.9997866894197952
accuracy = 0.9997866894197952
accuracy = 0.9997866894197952
accuracy = 1.0
accuracy = 0.9991465756347344
accuracy = 0.9995732878173672
accuracy = 0.9995732878173672


KeyError: '18_0'

# Random Forest Classifier
- 參考自 SebastianMantey 的 Random-Forest-from-Scratch
    > https://github.com/SebastianMantey/Random-Forest-from-Scratch

In [18]:
## Random Forest Classifier
class RandomForestClassifier_scratch:
    def __init__(self) -> None:
        # self.max_depth
        # self.n_trees
        pass

    # bagging in #data
    # #tree, #max-depth of each tree, etc. could be predefined
# ---------------------------------------------------------------------------------------------
    # 1. Train-Test-Split
    def train_test_split(self, df, test_size):
        
        if isinstance(test_size, float):
            test_size = round(test_size * len(df))

        indices = df.index.tolist()
        test_indices = random.sample(population=indices, k=test_size)

        test_df = df.loc[test_indices]
        train_df = df.drop(test_indices)
        
        return train_df, test_df


    # 2. Distinguish categorical and continuous features
    def determine_type_of_feature(self, df):
        
        feature_types = []
        n_unique_values_treshold = 15
        for feature in df.columns:
            if feature != "is_claim": # "label"
                unique_values = df[feature].unique()
                example_value = unique_values[0]

                if (isinstance(example_value, str)) or (len(unique_values) <= n_unique_values_treshold):
                    feature_types.append("categorical")
                else:
                    feature_types.append("continuous")
        
        return feature_types


    # 3. Accuracy
    def calculate_accuracy(self, predictions, labels):
        predictions_correct = predictions == labels
        accuracy = predictions_correct.mean()
        
        return accuracy
    # ---------------------------------------------------------------------------------------------
    # 1. Decision Tree helper functions 

    # 1.1 Data pure?
    def check_purity(self, data):
        
        label_column = data[:, -1]
        unique_classes = np.unique(label_column)

        if len(unique_classes) == 1:
            return True
        else:
            return False

        
    # 1.2 Classify
    def classify_data(self, data):
        
        label_column = data[:, -1]
        unique_classes, counts_unique_classes = np.unique(label_column, return_counts=True)

        index = counts_unique_classes.argmax()
        classification = unique_classes[index]
        
        return classification


    # 1.3 Potential splits?
    def get_potential_splits(self, data, random_subspace):
        
        potential_splits = {}
        _, n_columns = data.shape
        column_indices = list(range(n_columns - 1))    # excluding the last column which is the label
        
        if random_subspace and random_subspace <= len(column_indices):
            column_indices = random.sample(population=column_indices, k=random_subspace)
        
        for column_index in column_indices:          
            values = data[:, column_index]
            unique_values = np.unique(values)
            
            potential_splits[column_index] = unique_values
        
        return potential_splits


    # 1.4 Lowest Overall Entropy?
    def calculate_entropy(self, data):
        
        label_column = data[:, -1]
        _, counts = np.unique(label_column, return_counts=True)

        probabilities = counts / counts.sum()
        entropy = sum(probabilities * -np.log2(probabilities))
        
        return entropy


    def calculate_overall_entropy(self, data_below, data_above):
        
        n = len(data_below) + len(data_above)
        p_data_below = len(data_below) / n
        p_data_above = len(data_above) / n

        overall_entropy =  (p_data_below * self.calculate_entropy(data_below) 
                        + p_data_above * self.calculate_entropy(data_above))
        
        return overall_entropy


    def determine_best_split(self, data, potential_splits):
        
        overall_entropy = 9999
        for column_index in potential_splits:
            for value in potential_splits[column_index]:
                data_below, data_above = self.split_data(data, split_column=column_index, split_value=value)
                current_overall_entropy = self.calculate_overall_entropy(data_below, data_above)
                
                if current_overall_entropy <= overall_entropy:
                    overall_entropy = current_overall_entropy
                    best_split_column = column_index
                    best_split_value = value
        
        return best_split_column, best_split_value


    # 1.5 Split data
    def split_data(self, data, split_column, split_value):
        
        split_column_values = data[:, split_column]

        type_of_feature = FEATURE_TYPES[split_column]
        if type_of_feature == "continuous":
            data_below = data[split_column_values <= split_value]
            data_above = data[split_column_values >  split_value]
        
        # feature is categorical   
        else:
            data_below = data[split_column_values == split_value]
            data_above = data[split_column_values != split_value]
        
        return data_below, data_above


    # 2. Decision Tree Algorithm
    def decision_tree_algorithm(self, df, counter=0, min_samples=2, max_depth=5, random_subspace=None):
        
        # data preparations
        if counter == 0:
            global COLUMN_HEADERS, FEATURE_TYPES
            COLUMN_HEADERS = df.columns
            FEATURE_TYPES = self.determine_type_of_feature(df)
            data = df.values
        else:
            data = df           
        
        
        # base cases
        if (self.check_purity(data)) or (len(data) < min_samples) or (counter == max_depth):
            classification = self.classify_data(data)
            
            return classification

        
        # recursive part
        else:    
            counter += 1

            # helper functions 
            potential_splits = self.get_potential_splits(data, random_subspace)
            split_column, split_value = self.determine_best_split(data, potential_splits)
            data_below, data_above = self.split_data(data, split_column, split_value)
            
            # check for empty data
            if len(data_below) == 0 or len(data_above) == 0:
                classification = self.classify_data(data)
                return classification
            
            # determine question
            feature_name = COLUMN_HEADERS[split_column]
            type_of_feature = FEATURE_TYPES[split_column]
            if type_of_feature == "continuous":
                question = "{} <= {}".format(feature_name, split_value)
                
            # feature is categorical
            else:
                question = "{} = {}".format(feature_name, split_value)
            
            # instantiate sub-tree
            sub_tree = {question: []}
            
            # find answers (recursion)
            yes_answer = self.decision_tree_algorithm(data_below, counter, min_samples, max_depth, random_subspace)
            no_answer = self.decision_tree_algorithm(data_above, counter, min_samples, max_depth, random_subspace)
            
            # If the answers are the same, then there is no point in asking the qestion.
            # This could happen when the data is classified even though it is not pure
            # yet (min_samples or max_depth base case).
            if yes_answer == no_answer:
                sub_tree = yes_answer
            else:
                sub_tree[question].append(yes_answer)
                sub_tree[question].append(no_answer)
            
            return sub_tree


    # 3. Make predictions
    # 3.1 One example
    def predict_example(self, example, tree):
        # if not isinstance(tree, dict): # 樹長不出來???
        #     return tree
            
        question = list(tree.keys())[0]
        feature_name, comparison_operator, value = question.split(" ")

        # ask question
        if comparison_operator == "<=":
            if example[feature_name] <= float(value):
                answer = tree[question][0]
            else:
                answer = tree[question][1]
        
        # feature is categorical
        else:
            if str(example[feature_name]) == value:
                answer = tree[question][0]
            else:
                answer = tree[question][1]

        # base case
        if not isinstance(answer, dict):
            return answer
        
        # recursive part
        else:
            residual_tree = answer
            return self.predict_example(example, residual_tree)

        
    # 3.2 All examples of the test data
    def decision_tree_predictions(self, test_df, tree):
        predictions = test_df.apply(self.predict_example, args=(tree,), axis=1)
        return predictions
# ---------------------------------------------------------------------------------------------
    def bootstrapping(self, train_df, n_bootstrap):
        bootstrap_indices = np.random.randint(low=0, high=len(train_df), size=n_bootstrap)
        df_bootstrapped = train_df.iloc[bootstrap_indices]
        
        return df_bootstrapped

    def random_forest_algorithm(self, train_df, n_trees, n_bootstrap, n_features, dt_max_depth):
        forest = []
        for i in range(n_trees):
            df_bootstrapped = self.bootstrapping(train_df, n_bootstrap)
            tree = self.decision_tree_algorithm(df_bootstrapped, max_depth=dt_max_depth, random_subspace=n_features)
            forest.append(tree)

        return forest

    def random_forest_predictions(self, test_df, forest):
        df_predictions = {}
        for i in range(len(forest)):
            column_name = "tree_{}".format(i)
            predictions = self.decision_tree_predictions(test_df, tree=forest[i])
            df_predictions[column_name] = predictions

        df_predictions = pd.DataFrame(df_predictions)
        random_forest_predictions = df_predictions.mode(axis=1)[0]
        
        return random_forest_predictions

# ===================================================================================================
# forest = random_forest_algorithm(train_df, n_trees=4, n_bootstrap=800, n_features=2, dt_max_depth=4)
# predictions = random_forest_predictions(test_df, forest)
# accuracy = calculate_accuracy(predictions, test_df.label)

# print("Accuracy = {}".format(accuracy))

In [19]:
# ###################################### 舊的 (method1) Naive Bayes Classifier 部分

In [20]:
# clf1 = NaiveBayesClassifier()
# clf1.calc_prior(train_x, train_y)
# clf1.fit(train_x, train_y)

In [21]:
# start_time = time.time()
# preds = clf1.predict(test_x)
# print(f"--- {time.time() - start_time} sec ---")
# np.save("./output/predictions1.npy", preds)

In [372]:
# preds = np.load("./output/predictions1.npy")
# accuracy = clf1.accuracy(test_y, preds)
# print("Accuracy = {}".format(accuracy))

# confusion = confusion_matrix(test_y, preds)
# print("Confusion = ")
# print(confusion)

Accuracy = 0.9350571769926609
Confusion = 
[[10957     0]
 [  761     0]]


In [None]:
# ##########################################################################

In [2]:
# train_df = pd.concat([train_df, pd.concat([train_y, test_y])], axis=1)
# train_df.head()

In [7]:
# 整理一下放入隨機森林模型的 data 格式
train_x_y = pd.concat([train_x, train_y], axis=1)
test_x_y = pd.concat([test_x, test_y], axis=1)

In [22]:
clf2 = RandomForestClassifier_scratch()
# forest = clf2.random_forest_algorithm(train_df, n_trees=4, n_bootstrap=train_df.shape[0], n_features=8, dt_max_depth=8)
forest = clf2.random_forest_algorithm(train_x_y, n_trees=10, n_bootstrap=train_x_y.shape[0]//2, n_features=8, dt_max_depth=8)

In [13]:
# forest
pprint(forest)

[{'population_density = 10': [{'policy_tenure <= 9': [0,
                                                      {'engine_type = 7': [0,
                                                                           {'is_day_night_rear_view_mirror = 1': [0,
                                                                                                                  {'segment = 0': [0,
                                                                                                                                   {'fuel_type = 2': [{'age_of_policyholder <= 5': [0,
                                                                                                                                                                                    {'area_cluster <= 3': [1,
                                                                                                                                                                                                           0]}]},
               

In [14]:
# one tree of forest
pprint(forest[0])

{'population_density = 10': [{'policy_tenure <= 9': [0,
                                                     {'engine_type = 7': [0,
                                                                          {'is_day_night_rear_view_mirror = 1': [0,
                                                                                                                 {'segment = 0': [0,
                                                                                                                                  {'fuel_type = 2': [{'age_of_policyholder <= 5': [0,
                                                                                                                                                                                   {'area_cluster <= 3': [1,
                                                                                                                                                                                                          0]}]},
                      

In [16]:
preds = clf2.random_forest_predictions(test_x_y, forest)
# accuracy = clf2.calculate_accuracy(preds, test_y)
accuracy = clf2.calculate_accuracy(preds, test_x_y.iloc[:, -1])
print("Accuracy = {}".format(accuracy))
# confusion = confusion_matrix(test_y, preds)
confusion = confusion_matrix(test_x_y.iloc[:, -1], preds)
print("Confusion = ")
print(confusion)

Accuracy = 0.9350571769926609
Confusion = 
[[10957     0]
 [  761     0]]


# Random Forest Classifier (scikit-learn)

In [24]:
# scikit-learn
# Random Forest Classifier
clf3 = RandomForestClassifier(criterion='entropy', random_state=1)
clf3.fit(train_x, train_y)

In [25]:
preds = clf3.predict(test_x)
# accuracy
accuracy = clf3.score(test_x, test_y)
print(f"accuracy = {accuracy}")

accuracy = 0.9202935654548557


In [29]:
# confusion matrix
confusion_matrix(test_y, preds)

array([[10771,   186],
       [  748,    13]], dtype=int64)

### Cross-Validation

### k=3

In [74]:
model = RandomForestClassifier(criterion='entropy', random_state=1)
kfold = KFold(n_splits=3, shuffle=True, random_state=1)
results = cross_val_score(model, train_x, train_y, cv=kfold)
print(results)

[0.919168   0.923392   0.92197901]


### k=5

In [75]:
model = RandomForestClassifier(criterion='entropy', random_state=1)
kfold = KFold(n_splits=5, shuffle=True, random_state=1)
results = cross_val_score(model, train_x, train_y, cv=kfold)
print(results)

[0.91968    0.92053333 0.92469333 0.92181333 0.92361852]


### k=10

In [76]:
model = RandomForestClassifier(criterion='entropy', random_state=1)
kfold = KFold(n_splits=10, shuffle=True, random_state=1)
results = cross_val_score(model, train_x, train_y, cv=kfold)
print(results)

[0.92150171 0.91702218 0.91467577 0.9253413  0.92404523 0.92425859
 0.92063153 0.92063153 0.92575208 0.92404523]


# XGBoost

In [29]:
import xgboost as xgb
from xgboost import XGBClassifier, DMatrix, cv

In [13]:
# create model instance
bst = XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1, objective='binary:logistic')
# fit model
bst.fit(train_x, train_y)
# make predictions
preds = bst.predict(test_x)

In [14]:
# accuracy
print(f"accuracy = {accuracy_score(test_y, preds)}")

accuracy = 0.9350571769926609


In [15]:
# confusion matrix
confusion_matrix(test_y, preds)

array([[10957,     0],
       [  761,     0]], dtype=int64)

## Cross-Validation

### k=3

In [42]:
bst = XGBClassifier()
kfold = KFold(n_splits=3, shuffle=True, random_state=1)
results = cross_val_score(bst, train_x, train_y, cv=kfold)
print(results)

[0.933056   0.938816   0.93554788]


### k=5

In [43]:
bst = XGBClassifier()
kfold = KFold(n_splits=5, shuffle=True, random_state=1)
results = cross_val_score(bst, train_x, train_y, cv=kfold)
print(results)

[0.93301333 0.93493333 0.93738667 0.93653333 0.93759334]


### k=10

In [44]:
bst = XGBClassifier()
kfold = KFold(n_splits=10, shuffle=True, random_state=1)
results = cross_val_score(bst, train_x, train_y, cv=kfold)
print(results)

[0.93302048 0.93280717 0.93174061 0.93856655 0.93770002 0.93705995
 0.93770002 0.93577982 0.93791338 0.93705995]


# Catboost

In [3]:
from catboost import CatBoostClassifier # python 3.10 後 Iterable 從 collections 拿掉了，因此有 error
                                        # https://stackoverflow.com/questions/72032032/importerror-cannot-import-name-iterable-from-collections-in-python

clf4 = CatBoostClassifier(random_seed=1,
                         loss_function='RMSE',
                         eval_metric='RMSE',
                         use_best_model=True)

clf4.fit(train_x, train_y)
preds = clf4.predict(test_x)

ImportError: cannot import name 'Iterable' from 'collections' (c:\Program Files\Python311\Lib\collections\__init__.py)

In [None]:
# accuracy
print(f"accuracy = {accuracy_score(test_y, preds)}")

In [None]:
# confusion matrix
confusion_matrix(test_y, preds)

# LightGBM

In [5]:
from lightgbm import LGBMClassifier

clf5 = LGBMClassifier(objective = 'binary', 
                            learning_rate = 0.05, 
                            n_estimators = 100, 
                            random_state=1)
clf5.fit(train_x, train_y)
preds = clf5.predict(test_x)

In [6]:
# accuracy
print(f"accuracy = {accuracy_score(test_y, preds)}")

accuracy = 0.9350571769926609


In [7]:
# confusion matrix
confusion_matrix(test_y, preds)

array([[10957,     0],
       [  761,     0]], dtype=int64)

### Cross-Validation

### k=3

In [45]:
model = LGBMClassifier()
kfold = KFold(n_splits=3, shuffle=True, random_state=1)
results = cross_val_score(model, train_x, train_y, cv=kfold)
print(results)

[0.93344    0.9392     0.93612391]


### k=5

In [46]:
model = LGBMClassifier()
kfold = KFold(n_splits=5, shuffle=True, random_state=1)
results = cross_val_score(model, train_x, train_y, cv=kfold)
print(results)

[0.93333333 0.93514667 0.93792    0.93728    0.93770002]


### k=10

In [47]:
model = LGBMClassifier()
kfold = KFold(n_splits=10, shuffle=True, random_state=1)
results = cross_val_score(model, train_x, train_y, cv=kfold)
print(results)

[0.93366041 0.93323379 0.93174061 0.93877986 0.93834009 0.93748667
 0.93834009 0.93599317 0.93812673 0.93727331]
