In [3]:
from __future__ import print_function, division
import tempfile
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from tensorflow import keras
from keras.preprocessing.sequence import pad_sequences

In [4]:
# read dataset
import os

# Directory holding the input CSV; the file path below is derived from it so
# the location only has to be changed in one place.
DATA_DIR = r'D:\data'
df = pd.read_csv(os.path.join(DATA_DIR, 'dataset.csv'))
pd.set_option('display.max_columns',None)
# Print arrays in full (no "..." truncation) in the cells below.
np.set_printoptions(threshold=np.inf)
# Column-oriented dict: train[column][row_index] -> value.
train = df.to_dict()
train1 = train['Sequence']  # {row index: peptide sequence}
train2 = train['Score']     # {row index: label}

In [5]:
import numpy as np
from keras.preprocessing.sequence import pad_sequences

# Integer code (1..20) for each standard residue, in alphabetical one-letter
# order; 0 is reserved by OE1 for any character not listed here.
char_to_num = {
    residue: code
    for code, residue in enumerate('ACDEFGHIKLMNPQRSTVWY', start=1)
}

def OE1(seq_temp1):
    """Ordinal-encode a sequence.

    Returns one single-element list per character of ``seq_temp1`` holding
    that character's code from ``char_to_num`` (0 for unknown characters).
    """
    encoded = []
    for residue in seq_temp1:
        encoded.append([char_to_num.get(residue, 0)])
    return encoded

# Ordinal encoding of every sequence, in the dict's iteration order.
train1_oe1 = list(map(OE1, train1.values()))

In [6]:
#Amino acid composition (AAC)
# Row r, column c holds the fraction of sequence r made up of the residue whose
# char_to_num code is c + 1. The denominator is the full sequence length, so a
# row containing unknown residues sums to less than 1.
handcraft_AAC_test = [[0] * 20 for _ in range(len(train1_oe1))]
for row, seq in enumerate(train1_oe1):
    for entry in seq:
        code = entry[0]
        if code == 0:
            # Code 0 marks a character missing from char_to_num. Without this
            # guard the column index would be -1 and the residue would be
            # counted in the last column.
            continue
        handcraft_AAC_test[row][code - 1] += 1/len(seq)
hc_AAC_test = np.array(handcraft_AAC_test)
print(hc_AAC_test.shape)
print(hc_AAC_test)

(306, 20)
[[0.         0.         0.         0.         0.         0.5
  0.         0.         0.         0.         0.         0.
  0.         0.         0.5        0.         0.         0.
  0.         0.        ]
 [0.         0.         0.         0.         0.         0.5
  0.5        0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]
 [0.         0.         0.         0.         0.         0.5
  0.         0.         0.         0.         0.         0.5
  0.         0.         0.         0.         0.         0.
  0.         0.        ]
 [0.         0.         0.         0.         0.         0.5
  0.         0.         0.         0.         0.         0.
  0.         0.5        0.         0.         0.         0.
  0.         0.        ]
 [0.         0.         0.         0.         0.5        0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.        

In [7]:
#Dipeptide composition (DPC)
from collections import Counter
import numpy as np

def compute_dpc_pairs(sequence, k):
    """Return every residue pair of ``sequence`` separated by ``k`` positions.

    ``k = 0`` yields adjacent pairs. The result is empty when the sequence has
    fewer than ``k + 2`` characters.
    """
    return [sequence[i] + sequence[i + k + 1] for i in range(len(sequence) - k - 1)]


def calculate_amino_acid_pairs_frequency(sequence, max_k):
    """Build the gapped dipeptide-composition vector of ``sequence``.

    For each gap ``k`` in ``0..max_k`` the relative frequency of each of the
    400 ordered residue pairs is appended, giving ``400 * (max_k + 1)`` values.
    A gap for which the sequence is too short to contain any pair contributes
    400 zeros instead of raising ``ZeroDivisionError``.
    """
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    all_pairs = [a + b for a in amino_acids for b in amino_acids]
    feature_vector = []

    for k in range(max_k + 1):
        dpc_pairs = compute_dpc_pairs(sequence, k)
        total_pairs = len(dpc_pairs)

        if total_pairs == 0:
            # No pair exists at this gap: every frequency is zero.
            feature_vector.extend([0.0] * len(all_pairs))
            continue

        pair_counter = Counter(dpc_pairs)
        feature_vector.extend(pair_counter[pair] / total_pairs for pair in all_pairs)

    return feature_vector

# Largest gap to include; 0 means adjacent residue pairs only.
max_k = 0
dpc_group_pairs = list(
    map(lambda seq: calculate_amino_acid_pairs_frequency(seq, max_k), train1.values())
)
DPC = np.array(dpc_group_pairs)

print(DPC.shape)
print("Length of feature vector:", len(dpc_group_pairs[0]))

(306, 400)
Length of feature vector: 400


In [8]:
#The One-Hot descriptor for sequences
import pandas as pd
import numpy as np

# Residue alphabet; a residue's position in this string is its one-hot slot.
amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
amino_index = dict(zip(amino_acids, range(len(amino_acids))))
# Longest sequence in the dataset; fixes the width of the one-hot encoding.
max_length = max(len(seq) for seq in df['Sequence'])

def sequence_to_one_hot(seq):
    """Flat positional one-hot encoding of ``seq``.

    The result has ``max_length * 20`` entries: position ``i`` of the sequence
    owns the slice ``[i * 20, (i + 1) * 20)`` and the slot of its residue (per
    ``amino_index``) is set to 1. Characters missing from ``amino_index`` leave
    their slice all zero, and shorter sequences are zero-padded on the right.
    Positions at or beyond ``max_length`` are ignored, so the cut-off always
    matches the allocated width instead of a separately hard-coded length.
    """
    one_hot = np.zeros(max_length*20)

    for i, aa in enumerate(seq):
        if i >= max_length:
            # Nothing further fits into the allocated vector.
            break
        if aa in amino_index:
            one_hot[amino_index[aa] + i * 20] = 1

    return one_hot

sequences = df['Sequence']

# One row per sequence, each of width max_length * 20.
one_hot_encoded = np.array(list(map(sequence_to_one_hot, sequences)))

print(one_hot_encoded.shape)

(306, 280)


In [9]:
#Hand-crafted features
def summarized_featuresPC(peptide_sequences):
    """Flag sequences whose character at index 0 is 'P' or 'C'.

    ``peptide_sequences`` is iterated by key and indexed with that key (e.g. a
    dict of index -> sequence). Each sequence yields ``[1, 0]`` for 'P',
    ``[0, 1]`` for 'C' and ``[0, 0]`` otherwise.

    NOTE(review): the code inspects index 0 of the stored string; whether that
    is the biologically "first" or "last" residue depends on how the sequences
    were written in the dataset. Confirm against the data source.
    """
    flags_by_residue = {'P': (1, 0), 'C': (0, 1)}
    rows = []
    for key in peptide_sequences:
        lead = peptide_sequences[key][0]
        # list(...) gives every row its own list object.
        rows.append(list(flags_by_residue.get(lead, (0, 0))))
    return rows
def summarized_featuresF(peptide_sequences):
    """Four flags for sequences whose character at index 0 is 'F'.

    Per sequence (looked up by key in ``peptide_sequences``):
      * not starting with 'F'            -> [0, 0, 0, 0]
      * ends with 'CRG'                  -> [1, 0, 1, 1]
      * ends with 'YRG'                  -> [0, 1, 1, 1]
      * otherwise final character is 'G' -> [0, 0, 1, 1]
      * anything else                    -> [0, 0, 0, 1]

    NOTE(review): "start"/"end" refer to string positions 0 and -1; confirm
    which terminus each corresponds to in the dataset.
    """
    rows = []
    for key in peptide_sequences:
        seq = peptide_sequences[key]
        if seq[0] != 'F':
            rows.append([0, 0, 0, 0])
            continue
        tail3 = seq[-3:]
        if tail3 == 'CRG':
            row = [1, 0, 1, 1]
        elif tail3 == 'YRG':
            row = [0, 1, 1, 1]
        elif seq[-1] == 'G':
            row = [0, 0, 1, 1]
        else:
            row = [0, 0, 0, 1]
        rows.append(row)
    return rows
def summarized_featuresM(peptide_sequences):
    """Three flags for sequences whose character at index 0 is 'M'.

    Per sequence (looked up by key in ``peptide_sequences``):
      * not starting with 'M'            -> [0, 0, 0]
      * ends with 'PNSFEG'               -> [1, 1, 1]
      * otherwise final character is 'G' -> [1, 0, 1]
      * anything else                    -> [0, 0, 1]

    NOTE(review): "start"/"end" refer to string positions 0 and -1; confirm
    which terminus each corresponds to in the dataset.
    """
    rows = []
    for key in peptide_sequences:
        seq = peptide_sequences[key]
        if seq[0] != 'M':
            rows.append([0, 0, 0])
            continue
        if seq[-6:] == 'PNSFEG':
            row = [1, 1, 1]
        elif seq[-1] == 'G':
            row = [1, 0, 1]
        else:
            row = [0, 0, 1]
        rows.append(row)
    return rows
def summarized_featuresT(peptide_sequences):
    """Six flags for sequences whose character at index 0 is 'T'.

    Per sequence (looked up by key in ``peptide_sequences``):
      * not starting with 'T'                  -> [0, 0, 0, 0, 0, 0]
      * starts with 'TD' and ends with 'GG'    -> [1, 1, 0, 0, 1, 1]
      * starts with 'TD' and ends with 'G'     -> [1, 0, 0, 0, 1, 1]
      * starts with 'TD', any other ending     -> [0, 0, 0, 0, 1, 1]
      * ends with 'DG'                         -> [0, 0, 1, 0, 0, 1]
      * ends with 'FG'                         -> [0, 0, 0, 1, 0, 1]
      * ends with 'G'                          -> [1, 0, 0, 0, 0, 1]
      * anything else                          -> [0, 0, 0, 0, 0, 1]
    The 'TD' prefix is checked before the 'DG'/'FG' endings, so e.g. 'TDG'
    falls in the 'TD' + 'G' case.

    NOTE(review): "start"/"end" refer to string positions 0 and -1; confirm
    which terminus each corresponds to in the dataset.
    """
    rows = []
    for key in peptide_sequences:
        seq = peptide_sequences[key]
        if seq[0] != 'T':
            rows.append([0, 0, 0, 0, 0, 0])
            continue

        tail2 = seq[-2:]
        ends_with_g = seq[-1] == 'G'
        if seq[0:2] == 'TD':
            if tail2 == 'GG':
                row = [1, 1, 0, 0, 1, 1]
            elif ends_with_g:
                row = [1, 0, 0, 0, 1, 1]
            else:
                row = [0, 0, 0, 0, 1, 1]
        elif tail2 == 'DG':
            row = [0, 0, 1, 0, 0, 1]
        elif tail2 == 'FG':
            row = [0, 0, 0, 1, 0, 1]
        elif ends_with_g:
            row = [1, 0, 0, 0, 0, 1]
        else:
            row = [0, 0, 0, 0, 0, 1]
        rows.append(row)
    return rows
def summarized_featuresG(peptide_sequences):
    """Return 1 for each sequence whose character at index 0 is 'G', else 0.

    ``peptide_sequences`` is iterated by key and indexed with that key; the
    result is a flat list with one integer per sequence.
    """
    return [
        1 if peptide_sequences[key][0] == 'G' else 0
        for key in peptide_sequences
    ]


# Run every hand-crafted extractor over the sequence dict.
featuresPC, featuresF, featuresM, featuresT, featuresG = (
    extractor(train1)
    for extractor in (
        summarized_featuresPC,
        summarized_featuresF,
        summarized_featuresM,
        summarized_featuresT,
        summarized_featuresG,
    )
)

print("Generated featuresPC:", featuresPC)
print("Generated featuresF:", featuresF)
print("Generated featuresM:", featuresM)
print("Generated featuresT:", featuresT)
print("Generated featuresG:", featuresG)

# Column-wise concatenation: 2 + 4 + 3 + 6 + 1 flag columns per sequence.
Generated_features = np.c_[featuresPC, featuresF, featuresM, featuresT, featuresG]
print(Generated_features.shape)

Generated featuresPC: [[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [1, 0], [0, 0], [0, 0], [0, 1], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 1], [0, 0], [1, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 1], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [1, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [

In [10]:
#Concatenation
from sklearn.preprocessing import MinMaxScaler
import joblib

# Column layout: hand-crafted flags | AAC | DPC | positional one-hot.
X_train = np.c_[Generated_features, hc_AAC_test, DPC, one_hot_encoded]

# NOTE(review): the scaler and the all-zero-column filter below are fitted on
# every sample, before the train/test split performed later in the notebook,
# so hold-out statistics influence the preprocessing. Consider fitting both on
# the training portion only.
scaler = MinMaxScaler(feature_range=(0, 1))
# fit_transform fits and transforms in one pass; no separate fit() is needed.
X_scaled = scaler.fit_transform(X_train)

# Drop columns that are zero for every sample after scaling.
df2 = pd.DataFrame(X_scaled)
df2_non_zero = df2.loc[:, (df2 != 0).any(axis=0)]
removed_indices = df2.columns[~df2.columns.isin(df2_non_zero.columns)].tolist()
print("Remove:", removed_indices)
X_filtered=np.array(df2_non_zero)

# Only the scaler is persisted; TODO(review): removed_indices would also be
# needed to reproduce X_filtered's column layout on new data.
joblib.dump(scaler, "scaler307.pkl")

Remove: [12, 39, 42, 45, 47, 48, 50, 51, 52, 54, 57, 58, 59, 60, 62, 63, 67, 68, 69, 71, 72, 73, 79, 83, 84, 86, 87, 89, 90, 91, 93, 94, 97, 99, 103, 104, 105, 107, 109, 113, 114, 115, 118, 122, 126, 127, 131, 133, 135, 137, 139, 147, 151, 153, 154, 156, 159, 162, 163, 165, 166, 167, 168, 169, 171, 174, 176, 179, 182, 183, 184, 186, 187, 190, 191, 193, 194, 195, 202, 203, 205, 206, 207, 208, 209, 211, 215, 216, 217, 219, 220, 222, 223, 224, 225, 226, 227, 229, 231, 232, 233, 235, 237, 238, 239, 240, 242, 246, 249, 252, 253, 254, 256, 257, 258, 259, 262, 265, 267, 268, 269, 270, 272, 273, 274, 276, 277, 279, 284, 285, 289, 290, 294, 295, 297, 298, 299, 300, 302, 304, 305, 307, 309, 310, 311, 312, 316, 319, 320, 323, 325, 326, 327, 329, 330, 332, 334, 335, 336, 337, 338, 342, 343, 344, 345, 349, 350, 351, 354, 359, 366, 367, 369, 371, 372, 373, 374, 376, 377, 382, 384, 385, 387, 388, 390, 391, 392, 393, 395, 397, 398, 399, 400, 402, 404, 405, 406, 409, 411, 412, 413, 414, 418, 419, 423, 

['scaler307.pkl']

In [11]:
# Label vector as floats, plus the class balance, computed column-wise instead
# of one .loc lookup per row.
score_column = df['Score']
y_train1 = np.asarray(score_column, dtype = float)
count1 = int((score_column == 1).sum())   # rows labelled exactly 1
count0 = len(score_column) - count1       # every other row
print(y_train1)
print(count1)
print(count0)

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 1.
 1. 1. 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0.]
174
132


In [12]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Single stratified 70/30 hold-out split with a fixed seed.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
train_index, test_index = next(sss.split(X_filtered, y_train1))
x_train, x_test = X_filtered[train_index], X_filtered[test_index]
y_train, y_test = y_train1[train_index], y_train1[test_index]
print(train_index)
print(test_index)

# Class balance of the training part. The counts use their own names because
# `train1` is the sequence dict created when the dataset is loaded; reusing
# that name here would break any re-run of the feature-extraction cells.
n_train_pos = int(np.sum(y_train == 1))
n_train_neg = len(y_train) - n_train_pos
print(n_train_pos)
print(n_train_neg)

[243  39  84   1 134 217 300 273 276 185  73 262 122   5  46 174  36 158
  52 152 210 133 101 108   7  83 291 129 274 189 303  82 279  59  29  34
 164 252  43 254 192 136  51 200 183 227  20  15 261  23  25 105 205 302
 176 194 266 203  21 209 301 223 201 110  30 155 159 116 123 222 127 187
   8  67 139  41 290  79  85  44  93  71 268 154  10  90 204 295  69 141
  50 214 229 118 168 135  48 115 153  72 126 172  57 149 144   9  98  13
 207 211 246  74 213 267 245 240 146 182  64 226  53 173 143 293  33  75
   6 286 230 269  99  11  63  91 255  22 191  68  76  96 177  94 128  95
 239 150 138 218 145 250 297 117  16 280 188 119 169 249 131 281  24 219
  47 196 206 113 231 179  28  80 142 260 271 170 285 277 140  65  62 130
  60 107 292  78   3 208 258  17 157 106 298 299 181  42 114  32 263  27
  49 264 175 242 287 237  45 221 294 278 289 100  26  35 224 120]
[ 12  66 256 184 235   0 163 272 166 186 199 193 156 296  31 284  87 241
 167 102   4 215 178 234 305 282 304 202 165 228 162  97 2

In [13]:
#GaussianNB
from sklearn.naive_bayes import GaussianNB

# Fit on the hold-out training part; fit() returns the estimator itself.
nb_classifier = GaussianNB().fit(x_train, y_train)

# Predicted labels, then the true labels, one array per line.
y_test_NB = nb_classifier.predict(x_test)
print(y_test_NB, y_test, sep="\n")

accuracy_NB = accuracy_score(y_true=y_test, y_pred=y_test_NB)
print("Accuracy：", accuracy_NB)
print(classification_report(y_test, y_test_NB))

[0. 0. 1. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1.
 1. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 1. 1. 0.
 0. 0. 1. 1. 0. 1. 1. 0. 0. 1. 1. 0. 0. 1. 1. 1. 0. 0. 1. 1. 0. 1. 1. 0.
 1. 0. 1. 0. 1. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0.]
[1. 0. 1. 1. 1. 1. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 1. 1. 0. 1. 1.
 0. 1. 1. 1. 0. 1. 0. 1. 1. 1. 0. 0. 1. 0. 0. 0. 1. 0. 1. 1. 1. 1. 1. 0.
 1. 0. 1. 1. 0. 1. 0. 0. 1. 1. 0. 1. 0. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 0.
 1. 0. 1. 0. 0. 0. 1. 1. 1. 0. 0. 1. 0. 0. 1. 1. 0. 0. 1. 0.]
Accuracy： 0.6304347826086957
              precision    recall  f1-score   support

         0.0       0.56      0.70      0.62        40
         1.0       0.71      0.58      0.64        52

    accuracy                           0.63        92
   macro avg       0.64      0.64      0.63        92
weighted avg       0.65      0.63      0.63        92



In [14]:
#KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier

# Majority vote over the 60 nearest training samples.
knn_classifier = KNeighborsClassifier(n_neighbors=60).fit(x_train, y_train)

# Predicted labels, then the true labels, one array per line.
y_test_KNN = knn_classifier.predict(x_test)
print(y_test_KNN, y_test, sep="\n")

accuracy_KNN = accuracy_score(y_true=y_test, y_pred=y_test_KNN)
print("Accuracy：", accuracy_KNN)
print(classification_report(y_test, y_test_KNN))

[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0.]
[1. 0. 1. 1. 1. 1. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 1. 1. 0. 1. 1.
 0. 1. 1. 1. 0. 1. 0. 1. 1. 1. 0. 0. 1. 0. 0. 0. 1. 0. 1. 1. 1. 1. 1. 0.
 1. 0. 1. 1. 0. 1. 0. 0. 1. 1. 0. 1. 0. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 0.
 1. 0. 1. 0. 0. 0. 1. 1. 1. 0. 0. 1. 0. 0. 1. 1. 0. 0. 1. 0.]
Accuracy： 0.5
              precision    recall  f1-score   support

         0.0       0.46      0.95      0.62        40
         1.0       0.80      0.15      0.26        52

    accuracy                           0.50        92
   macro avg       0.63      0.55      0.44        92
weighted avg       0.65      0.50      0.42        92



In [15]:
#GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier

# 100 boosted depth-1 trees (stumps) with an unshrunk learning rate of 1.0.
gb_params = dict(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=42)
modelGB = GradientBoostingClassifier(**gb_params).fit(x_train, y_train)

# Predicted labels, then the true labels, one array per line.
y_test_GB = modelGB.predict(x_test)
print(y_test_GB, y_test, sep="\n")

accuracy_GB = accuracy_score(y_true=y_test, y_pred=y_test_GB)
print("Accuracy:", accuracy_GB)

print(classification_report(y_test, y_test_GB))

[1. 0. 1. 1. 1. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 0. 0. 1. 1. 0. 1. 1. 0. 0. 1. 1. 1. 1. 0.
 1. 1. 1. 1. 1. 0. 0. 1. 0. 1. 1. 0. 0. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 0.
 1. 0. 1. 1. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0. 1. 1. 0.]
[1. 0. 1. 1. 1. 1. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 1. 1. 0. 1. 1.
 0. 1. 1. 1. 0. 1. 0. 1. 1. 1. 0. 0. 1. 0. 0. 0. 1. 0. 1. 1. 1. 1. 1. 0.
 1. 0. 1. 1. 0. 1. 0. 0. 1. 1. 0. 1. 0. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 0.
 1. 0. 1. 0. 0. 0. 1. 1. 1. 0. 0. 1. 0. 0. 1. 1. 0. 0. 1. 0.]
Accuracy: 0.6847826086956522
              precision    recall  f1-score   support

         0.0       0.65      0.60      0.62        40
         1.0       0.71      0.75      0.73        52

    accuracy                           0.68        92
   macro avg       0.68      0.68      0.68        92
weighted avg       0.68      0.68      0.68        92



In [16]:
#RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

# Forest of 50 trees with a fixed seed, fitted on the hold-out training part.
modelRF = RandomForestClassifier(n_estimators=50, random_state=42).fit(x_train, y_train)

# Predicted labels, then the true labels, one array per line.
y_test_RF = modelRF.predict(x_test)
print(y_test_RF, y_test, sep="\n")

accuracy_RF = accuracy_score(y_true=y_test, y_pred=y_test_RF)
print("Accuracy:", accuracy_RF)

print(classification_report(y_test, y_test_RF))

[0. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 1.
 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 0.
 1. 1. 1. 1. 0. 0. 0. 1. 0. 1. 1. 0. 0. 1. 1. 1. 1. 1. 0. 1. 0. 1. 1. 0.
 1. 1. 1. 1. 0. 0. 1. 1. 0. 1. 0. 1. 0. 0. 1. 1. 0. 0. 1. 0.]
[1. 0. 1. 1. 1. 1. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 1. 1. 0. 1. 1.
 0. 1. 1. 1. 0. 1. 0. 1. 1. 1. 0. 0. 1. 0. 0. 0. 1. 0. 1. 1. 1. 1. 1. 0.
 1. 0. 1. 1. 0. 1. 0. 0. 1. 1. 0. 1. 0. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 0.
 1. 0. 1. 0. 0. 0. 1. 1. 1. 0. 0. 1. 0. 0. 1. 1. 0. 0. 1. 0.]
Accuracy: 0.6847826086956522
              precision    recall  f1-score   support

         0.0       0.66      0.57      0.61        40
         1.0       0.70      0.77      0.73        52

    accuracy                           0.68        92
   macro avg       0.68      0.67      0.67        92
weighted avg       0.68      0.68      0.68        92



In [17]:
#SVC
from sklearn.svm import SVC

# Support-vector classifier with a linear kernel.
modelSVC = SVC(kernel='linear').fit(x_train, y_train)

# Predicted labels, then the true labels, one array per line.
y_test_SVC = modelSVC.predict(x_test)
print(y_test_SVC, y_test, sep="\n")

accuracy_SVC = accuracy_score(y_true=y_test, y_pred=y_test_SVC)
print("Accuracy:", accuracy_SVC)

print(classification_report(y_test, y_test_SVC))

[0. 1. 1. 0. 0. 0. 0. 1. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1.
 1. 1. 1. 1. 0. 1. 0. 1. 0. 1. 1. 0. 0. 1. 0. 1. 0. 1. 0. 1. 1. 1. 1. 0.
 1. 1. 1. 1. 1. 1. 0. 1. 0. 1. 1. 0. 0. 1. 1. 1. 1. 1. 0. 1. 0. 1. 1. 0.
 1. 1. 1. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0.]
[1. 0. 1. 1. 1. 1. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 1. 1. 0. 1. 1.
 0. 1. 1. 1. 0. 1. 0. 1. 1. 1. 0. 0. 1. 0. 0. 0. 1. 0. 1. 1. 1. 1. 1. 0.
 1. 0. 1. 1. 0. 1. 0. 0. 1. 1. 0. 1. 0. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 0.
 1. 0. 1. 0. 0. 0. 1. 1. 1. 0. 0. 1. 0. 0. 1. 1. 0. 0. 1. 0.]
Accuracy: 0.6195652173913043
              precision    recall  f1-score   support

         0.0       0.56      0.60      0.58        40
         1.0       0.67      0.63      0.65        52

    accuracy                           0.62        92
   macro avg       0.62      0.62      0.62        92
weighted avg       0.62      0.62      0.62        92



In [18]:
#LogisticRegression
from sklearn.linear_model import LogisticRegression

# Logistic regression with the library's default settings.
modelLR = LogisticRegression().fit(x_train, y_train)

# Predicted labels, then the true labels, one array per line.
y_test_LR = modelLR.predict(x_test)
print(y_test_LR, y_test, sep="\n")

accuracy_LR = accuracy_score(y_true=y_test, y_pred=y_test_LR)
print("Accuracy:", accuracy_LR)

print(classification_report(y_test, y_test_LR))

[0. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 1.
 1. 1. 1. 1. 0. 1. 0. 1. 0. 1. 0. 0. 1. 1. 0. 1. 0. 1. 0. 1. 1. 1. 1. 0.
 1. 0. 1. 1. 0. 1. 0. 1. 0. 1. 1. 1. 0. 0. 1. 1. 1. 1. 0. 1. 0. 1. 1. 0.
 1. 1. 1. 0. 1. 0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 1. 0. 0. 1. 0.]
[1. 0. 1. 1. 1. 1. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 1. 1. 0. 1. 1.
 0. 1. 1. 1. 0. 1. 0. 1. 1. 1. 0. 0. 1. 0. 0. 0. 1. 0. 1. 1. 1. 1. 1. 0.
 1. 0. 1. 1. 0. 1. 0. 0. 1. 1. 0. 1. 0. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 0.
 1. 0. 1. 0. 0. 0. 1. 1. 1. 0. 0. 1. 0. 0. 1. 1. 0. 0. 1. 0.]
Accuracy: 0.6956521739130435
              precision    recall  f1-score   support

         0.0       0.66      0.62      0.64        40
         1.0       0.72      0.75      0.74        52

    accuracy                           0.70        92
   macro avg       0.69      0.69      0.69        92
weighted avg       0.69      0.70      0.69        92



In [21]:
#XGB
import xgboost as xgb

# Gradient-boosted trees with a binary logistic objective and a fixed seed.
xgb_params = dict(objective='binary:logistic', n_estimators=100, random_state=42)
xgb_clf = xgb.XGBClassifier(**xgb_params)
xgb_clf.fit(x_train, y_train)

y_test_XGB = xgb_clf.predict(x_test)
accuracy_XGB = accuracy_score(y_true=y_test, y_pred=y_test_XGB)

# Accuracy first, then the per-class report, then the raw predictions.
print("Accuracy:", accuracy_XGB)

print(classification_report(y_test, y_test_XGB))
print(y_test_XGB)

Accuracy: 0.6413043478260869
              precision    recall  f1-score   support

         0.0       0.60      0.53      0.56        40
         1.0       0.67      0.73      0.70        52

    accuracy                           0.64        92
   macro avg       0.63      0.63      0.63        92
weighted avg       0.64      0.64      0.64        92

[1. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1.
 1. 0. 1. 1. 1. 1. 1. 0. 1. 1. 0. 0. 0. 1. 1. 1. 1. 1. 0. 1. 1. 1. 0. 0.
 1. 1. 1. 1. 1. 0. 0. 1. 0. 1. 1. 1. 0. 0. 1. 1. 1. 1. 0. 1. 0. 1. 1. 0.
 1. 1. 1. 0. 0. 0. 1. 1. 1. 1. 0. 1. 0. 0. 0. 1. 0. 0. 1. 1.]




In [22]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Weak learner: a depth-1 decision tree (stump).
base_model = DecisionTreeClassifier(max_depth=1, random_state=42)

# The weak learner is passed positionally on purpose: scikit-learn renamed the
# keyword from `base_estimator` to `estimator` (deprecated in 1.2, removed in
# 1.4), while the first positional slot is the weak learner in both old and
# new releases.
adaboost_clf = AdaBoostClassifier(base_model, n_estimators=100, random_state=42)

adaboost_clf.fit(x_train, y_train)

y_test_ABC = adaboost_clf.predict(x_test)

accuracy_ABC = accuracy_score(y_test, y_test_ABC)
print("Accuracy:", accuracy_ABC)

print(classification_report(y_test, y_test_ABC))
print(y_test_ABC)

Accuracy: 0.6413043478260869
              precision    recall  f1-score   support

         0.0       0.59      0.57      0.58        40
         1.0       0.68      0.69      0.69        52

    accuracy                           0.64        92
   macro avg       0.63      0.63      0.63        92
weighted avg       0.64      0.64      0.64        92

[1. 0. 1. 0. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 1.
 0. 0. 1. 1. 1. 0. 1. 0. 0. 1. 0. 0. 1. 1. 0. 0. 1. 1. 0. 0. 1. 1. 1. 1.
 1. 1. 1. 1. 0. 0. 0. 1. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 0. 1. 0. 1. 1. 0.
 1. 1. 1. 1. 1. 0. 1. 0. 0. 1. 0. 1. 0. 0. 1. 1. 0. 0. 1. 0.]


In [23]:

from sklearn.ensemble import VotingClassifier

# Members of the first soft-voting ensemble.
clf1 = LogisticRegression(random_state=42)
clf2 = RandomForestClassifier(n_estimators=100, random_state=42)
clf3 = SVC(random_state=42, probability=True)  # probability=True so the SVC can take part in soft voting
clf4 = xgb.XGBClassifier(objective='binary:logistic', n_estimators=100, random_state=42)

# NOTE(review): the label 'dt' names a RandomForestClassifier, not a decision tree.
members_vc1 = [('lr', clf1), ('dt', clf2), ('svc', clf3), ('xgb', clf4)]
voting_clf1 = VotingClassifier(estimators=members_vc1, voting='soft').fit(x_train, y_train)

y_test_VC1 = voting_clf1.predict(x_test)
accuracy_VC1 = accuracy_score(y_true=y_test, y_pred=y_test_VC1)

print("Accuracy:", accuracy_VC1)

print(classification_report(y_test, y_test_VC1))
print(y_test_VC1)

Accuracy: 0.717391304347826
              precision    recall  f1-score   support

         0.0       0.72      0.57      0.64        40
         1.0       0.72      0.83      0.77        52

    accuracy                           0.72        92
   macro avg       0.72      0.70      0.70        92
weighted avg       0.72      0.72      0.71        92

[0. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1.
 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 0.
 1. 1. 1. 1. 1. 0. 0. 1. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 0. 1. 0. 1. 1. 0.
 1. 1. 1. 0. 0. 0. 1. 1. 1. 1. 0. 1. 0. 0. 1. 1. 0. 0. 1. 0.]




In [24]:
# Second soft-voting ensemble: random forest + linear-kernel SVC.
clf1 = RandomForestClassifier(n_estimators=100, random_state=42)
clf2 = SVC(kernel='linear', C=1.0, random_state=42, probability=True)

members_vc2 = [('rf', clf1), ('svm', clf2)]
voting_clf2 = VotingClassifier(estimators=members_vc2, voting='soft').fit(x_train, y_train)

# Predicted labels, then the true labels, one array per line.
y_test_VC2 = voting_clf2.predict(x_test)
print(y_test_VC2, y_test, sep="\n")

accuracy_VC2 = accuracy_score(y_true=y_test, y_pred=y_test_VC2)
print("Accuracy:", accuracy_VC2)

print(classification_report(y_test, y_test_VC2))

[0. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 1.
 1. 1. 1. 1. 1. 1. 0. 1. 0. 1. 0. 0. 1. 1. 0. 1. 1. 1. 0. 1. 1. 1. 1. 0.
 1. 1. 1. 1. 1. 0. 0. 1. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 0. 1. 0. 1. 1. 0.
 1. 1. 1. 0. 0. 0. 1. 1. 0. 1. 0. 1. 0. 0. 1. 1. 0. 0. 1. 0.]
[1. 0. 1. 1. 1. 1. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 1. 1. 0. 1. 1.
 0. 1. 1. 1. 0. 1. 0. 1. 1. 1. 0. 0. 1. 0. 0. 0. 1. 0. 1. 1. 1. 1. 1. 0.
 1. 0. 1. 1. 0. 1. 0. 0. 1. 1. 0. 1. 0. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 0.
 1. 0. 1. 0. 0. 0. 1. 1. 1. 0. 0. 1. 0. 0. 1. 1. 0. 0. 1. 0.]
Accuracy: 0.7065217391304348
              precision    recall  f1-score   support

         0.0       0.69      0.60      0.64        40
         1.0       0.72      0.79      0.75        52

    accuracy                           0.71        92
   macro avg       0.70      0.69      0.70        92
weighted avg       0.70      0.71      0.70        92



In [25]:
# Third soft-voting ensemble: logistic regression + random forest + SVC.
clf1 = LogisticRegression(random_state=42)
clf2 = RandomForestClassifier(n_estimators=100, random_state=42)
clf3 = SVC(random_state=42, probability=True)

# NOTE(review): the label 'dt' names a RandomForestClassifier, not a decision tree.
members_vc3 = [('lr', clf1), ('dt', clf2), ('svc', clf3)]
voting_clf3 = VotingClassifier(estimators=members_vc3, voting='soft').fit(x_train, y_train)

y_test_VC3 = voting_clf3.predict(x_test)
accuracy_VC3 = accuracy_score(y_true=y_test, y_pred=y_test_VC3)

print("Accuracy:", accuracy_VC3)

print(classification_report(y_test, y_test_VC3))
print(y_test_VC3)

Accuracy: 0.6956521739130435
              precision    recall  f1-score   support

         0.0       0.69      0.55      0.61        40
         1.0       0.70      0.81      0.75        52

    accuracy                           0.70        92
   macro avg       0.69      0.68      0.68        92
weighted avg       0.69      0.70      0.69        92

[0. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 1. 1. 0. 0. 1. 1.
 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 0. 0. 1. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 0.
 1. 1. 1. 1. 0. 0. 0. 1. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 0. 1. 0. 1. 1. 0.
 1. 1. 1. 0. 1. 0. 1. 1. 0. 1. 0. 1. 0. 0. 1. 1. 0. 0. 1. 0.]


In [26]:
from sklearn.ensemble import StackingClassifier

# Base learners of the stacked model; the final estimator is left at the
# library default.
stack1 = LogisticRegression(random_state=42)
stack2 = RandomForestClassifier(n_estimators=100, random_state=42)
stack3 = SVC(random_state=42, probability=True)

base_learners = [('lr', stack1), ('RF', stack2), ('SVC', stack3)]
stack = StackingClassifier(estimators=base_learners).fit(x_train, y_train)

y_test_stack = stack.predict(x_test)
accuracy_stack = accuracy_score(y_true=y_test, y_pred=y_test_stack)

print("Accuracy:", accuracy_stack)

print(classification_report(y_test, y_test_stack))

Accuracy: 0.6847826086956522
              precision    recall  f1-score   support

         0.0       0.68      0.53      0.59        40
         1.0       0.69      0.81      0.74        52

    accuracy                           0.68        92
   macro avg       0.68      0.67      0.67        92
weighted avg       0.68      0.68      0.68        92



In [27]:
from sklearn.model_selection import KFold

# Every estimator configured in the cells above.
models = [nb_classifier, knn_classifier, modelGB, modelRF, modelSVC, modelLR, xgb_clf, adaboost_clf, voting_clf1, voting_clf2, voting_clf3,stack]

# 5-fold cross-validation, shuffled with a fixed seed so that every model is
# scored on the same folds.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

for model in models:
    print(f"Evaluating {model.__class__.__name__}...")

    # cross_val_score fits a fresh clone of the estimator on each fold, so the
    # models fitted on the hold-out split stay untouched and the hold-out
    # arrays x_train/x_test/y_train/y_test are not overwritten by fold data.
    # error_score="raise" keeps a failing fit loud instead of scoring it NaN.
    accuracies = cross_val_score(
        model, X_filtered, y_train1, cv=kfold, scoring="accuracy", error_score="raise"
    )

    mean_accuracy = np.mean(accuracies)
    print(f"Average accuracy: {mean_accuracy:.2f}")

    print("\n")

Evaluating GaussianNB...
Average accuracy: 0.69


Evaluating KNeighborsClassifier...
Average accuracy: 0.51


Evaluating GradientBoostingClassifier...
Average accuracy: 0.68


Evaluating RandomForestClassifier...
Average accuracy: 0.68


Evaluating SVC...
Average accuracy: 0.68


Evaluating LogisticRegression...
Average accuracy: 0.68


Evaluating XGBClassifier...








Average accuracy: 0.64


Evaluating AdaBoostClassifier...
Average accuracy: 0.67


Evaluating VotingClassifier...




















Average accuracy: 0.68


Evaluating VotingClassifier...
Average accuracy: 0.69


Evaluating VotingClassifier...
Average accuracy: 0.70


Evaluating StackingClassifier...
Average accuracy: 0.57




In [34]:
def feature_selection(X, y, max_iterations=70):
    """Greedy forward feature selection scored by cross-validated accuracy.

    In each round every not-yet-selected column of ``X`` is tentatively added
    to the current subset, the module-level ``voting_clf3`` is scored on that
    subset with shuffled 5-fold cross-validation, and the column giving the
    highest mean score is kept.

    Parameters:
        X: 2-D NumPy array of shape (n_samples, n_features).
        y: labels aligned with the rows of ``X``.
        max_iterations: maximum number of features to select (default 70).
            Selection also stops once every column has been chosen, so no
            column is ever selected twice.

    Returns:
        (all_r2, remain_list): the per-column scores of the final round (-10
        marks columns that were already selected) and the selected column
        indices in the order they were chosen.
    """
    n_features = X.shape[1]
    all_list = list(range(n_features))  # Indices of all features
    remain_list = []                    # Selected feature indices, in order
    already_selected = set()            # Same content, for O(1) membership tests
    all_r2 = []

    # Loop-invariant: the same folds and the same estimator for every candidate.
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    model = voting_clf3

    for _ in range(min(max_iterations, n_features)):
        all_r2 = []
        for position, each in enumerate(all_list):
            print(f"{position}/{len(all_list)}", end='\r')  # Display current progress

            if each in already_selected:
                all_r2.append(-10)  # Sentinel below any real accuracy
                continue

            X_new = X[:, remain_list + [each]]  # Current subset plus the candidate
            acc = cross_val_score(model, X_new, y, cv=kfold, n_jobs=-1).mean()
            all_r2.append(acc)

        max_id = np.argmax(all_r2)  # Candidate with the best mean accuracy
        best_feature = all_list[max_id]
        remain_list.append(best_feature)
        already_selected.add(best_feature)

        print(np.max(all_r2), remain_list)  # Best accuracy so far and the selected features

    return all_r2, remain_list

# Run the greedy selection on the filtered feature matrix. The second returned
# value is feature_selection's ordered list of selected column indices, stored
# here under the name y_in_removed_lists.
all_r2, y_in_removed_lists = feature_selection(X_filtered, y_train1)

0.601427815970386 [20]
0.6308831306187204 [20, 93]
0.66351136964569 [20, 93, 28]
0.6863564251718668 [20, 93, 28, 26]
0.706081438392385 [20, 93, 28, 26, 0]
0.725753569539926 [20, 93, 28, 26, 0, 325]
0.7355896351136965 [20, 93, 28, 26, 0, 325, 21]
0.7518773135906927 [20, 93, 28, 26, 0, 325, 21, 161]
0.755156002115283 [20, 93, 28, 26, 0, 325, 21, 161, 298]
0.7615547329455314 [20, 93, 28, 26, 0, 325, 21, 161, 298, 36]
0.7713379164463248 [20, 93, 28, 26, 0, 325, 21, 161, 298, 36, 367]
0.7777895293495506 [20, 93, 28, 26, 0, 325, 21, 161, 298, 36, 367, 1]
0.7844526705446854 [20, 93, 28, 26, 0, 325, 21, 161, 298, 36, 367, 1, 149]
0.8007403490216817 [20, 93, 28, 26, 0, 325, 21, 161, 298, 36, 367, 1, 149, 318]
0.8040719196192491 [20, 93, 28, 26, 0, 325, 21, 161, 298, 36, 367, 1, 149, 318, 82]
0.8106292966684293 [20, 93, 28, 26, 0, 325, 21, 161, 298, 36, 367, 1, 149, 318, 82, 100]
0.8139079851930197 [20, 93, 28, 26, 0, 325, 21, 161, 298, 36, 367, 1, 149, 318, 82, 100, 62]
0.8204653622421999 [20, 

In [28]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Column indices into X_filtered. NOTE(review): the list begins with the same
# indices printed by the greedy feature-selection run and was presumably
# copied from its output; confirm where the list was cut off and why.
selected_features = [20, 93, 28, 26, 0, 325, 21, 161, 298, 36, 367, 1, 149, 318, 82, 100, 62, 6, 12, 35, 208, 172, 44, 71, 101, 188, 225, 321, 374, 405, 104, 123, 198, 113, 68, 222, 116, 59, 119, 143, 7, 168, 175, 55, 194, 42, 3, 380, 138, 351, 65, 344, 8, 52, 58, 197, 18, 56]
X_new = X_filtered[:, selected_features]

# Same stratified 70/30 split and seed as used for the full feature set.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
train_index, test_index = next(sss.split(X_new, y_train1))
x_train, x_test = X_new[train_index], X_new[test_index]
y_train, y_test = y_train1[train_index], y_train1[test_index]
print(train_index)
print(test_index)

# Class balance of the training part. The counts use their own names because
# `train1` is the sequence dict created when the dataset is loaded; reusing
# that name here would break any re-run of the feature-extraction cells.
n_train_pos = int(np.sum(y_train == 1))
n_train_neg = len(y_train) - n_train_pos
print(n_train_pos)
print(n_train_neg)

[243  39  84   1 134 217 300 273 276 185  73 262 122   5  46 174  36 158
  52 152 210 133 101 108   7  83 291 129 274 189 303  82 279  59  29  34
 164 252  43 254 192 136  51 200 183 227  20  15 261  23  25 105 205 302
 176 194 266 203  21 209 301 223 201 110  30 155 159 116 123 222 127 187
   8  67 139  41 290  79  85  44  93  71 268 154  10  90 204 295  69 141
  50 214 229 118 168 135  48 115 153  72 126 172  57 149 144   9  98  13
 207 211 246  74 213 267 245 240 146 182  64 226  53 173 143 293  33  75
   6 286 230 269  99  11  63  91 255  22 191  68  76  96 177  94 128  95
 239 150 138 218 145 250 297 117  16 280 188 119 169 249 131 281  24 219
  47 196 206 113 231 179  28  80 142 260 271 170 285 277 140  65  62 130
  60 107 292  78   3 208 258  17 157 106 298 299 181  42 114  32 263  27
  49 264 175 242 287 237  45 221 294 278 289 100  26  35 224 120]
[ 12  66 256 184 235   0 163 272 166 186 199 193 156 296  31 284  87 241
 167 102   4 215 178 234 305 282 304 202 165 228 162  97 2

In [29]:
# Baseline: random forest evaluated on the hold-out split
from sklearn.ensemble import RandomForestClassifier

# 50 trees with a fixed seed. fit() returns the estimator itself, so training
# and hold-out prediction are chained in one expression.
modelRF = RandomForestClassifier(random_state=42, n_estimators=50)
y_test_RF = modelRF.fit(x_train, y_train).predict(x_test)

# Predicted labels on the first line(s), true labels right below them.
print(y_test_RF, y_test, sep="\n")

accuracy_RF = accuracy_score(y_true=y_test, y_pred=y_test_RF)
print(f"Accuracy: {accuracy_RF}")

# Per-class precision / recall / F1 on the hold-out set.
print(classification_report(y_test, y_test_RF))

[0. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 0. 0. 1. 0. 1. 1. 1. 0. 0. 1. 1.
 1. 0. 1. 1. 0. 1. 1. 0. 1. 1. 0. 0. 1. 0. 1. 1. 1. 1. 0. 1. 1. 1. 1. 0.
 0. 0. 1. 1. 0. 0. 0. 1. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 0.
 1. 0. 1. 1. 1. 1. 1. 1. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0.]
[1. 0. 1. 1. 1. 1. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 1. 1. 0. 1. 1.
 0. 1. 1. 1. 0. 1. 0. 1. 1. 1. 0. 0. 1. 0. 0. 0. 1. 0. 1. 1. 1. 1. 1. 0.
 1. 0. 1. 1. 0. 1. 0. 0. 1. 1. 0. 1. 0. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 0.
 1. 0. 1. 0. 0. 0. 1. 1. 1. 0. 0. 1. 0. 0. 1. 1. 0. 0. 1. 0.]
Accuracy: 0.6956521739130435
              precision    recall  f1-score   support

         0.0       0.68      0.57      0.62        40
         1.0       0.71      0.79      0.75        52

    accuracy                           0.70        92
   macro avg       0.69      0.68      0.68        92
weighted avg       0.69      0.70      0.69        92



In [30]:
# Baseline: linear-kernel support vector classifier on the hold-out split
from sklearn.svm import SVC

# fit() returns the estimator itself, so training and hold-out prediction are
# chained in one expression.
modelSVC = SVC(kernel='linear')
y_test_SVC = modelSVC.fit(x_train, y_train).predict(x_test)

# Predicted labels on the first line(s), true labels right below them.
print(y_test_SVC, y_test, sep="\n")

accuracy_SVC = accuracy_score(y_true=y_test, y_pred=y_test_SVC)
print(f"Accuracy: {accuracy_SVC}")

# Per-class precision / recall / F1 on the hold-out set.
print(classification_report(y_test, y_test_SVC))

[0. 0. 1. 0. 1. 0. 0. 0. 1. 1. 1. 0. 0. 1. 0. 1. 0. 0. 0. 1. 1. 0. 1. 1.
 1. 1. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 1. 1. 0. 1. 1. 1. 1. 1.
 0. 1. 1. 1. 1. 0. 0. 0. 0. 1. 1. 0. 0. 1. 1. 1. 0. 1. 0. 1. 0. 1. 1. 0.
 1. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 1. 0. 1. 0. 1. 1. 1. 1. 1.]
[1. 0. 1. 1. 1. 1. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 1. 1. 0. 1. 1.
 0. 1. 1. 1. 0. 1. 0. 1. 1. 1. 0. 0. 1. 0. 0. 0. 1. 0. 1. 1. 1. 1. 1. 0.
 1. 0. 1. 1. 0. 1. 0. 0. 1. 1. 0. 1. 0. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 0.
 1. 0. 1. 0. 0. 0. 1. 1. 1. 0. 0. 1. 0. 0. 1. 1. 0. 0. 1. 0.]
Accuracy: 0.6195652173913043
              precision    recall  f1-score   support

         0.0       0.56      0.60      0.58        40
         1.0       0.67      0.63      0.65        52

    accuracy                           0.62        92
   macro avg       0.62      0.62      0.62        92
weighted avg       0.62      0.62      0.62        92



In [31]:
# Baseline: logistic regression (library defaults) on the hold-out split
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so training and hold-out prediction are
# chained in one expression.
modelLR = LogisticRegression()
y_test_LR = modelLR.fit(x_train, y_train).predict(x_test)

# Predicted labels on the first line(s), true labels right below them.
print(y_test_LR, y_test, sep="\n")

accuracy_LR = accuracy_score(y_true=y_test, y_pred=y_test_LR)
print(f"Accuracy: {accuracy_LR}")

# Per-class precision / recall / F1 on the hold-out set.
print(classification_report(y_test, y_test_LR))

[0. 1. 1. 1. 1. 0. 0. 0. 1. 1. 1. 0. 1. 1. 0. 1. 0. 1. 1. 1. 1. 0. 1. 1.
 1. 1. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 1. 1. 0. 1. 1. 1. 1. 1.
 0. 1. 1. 1. 1. 0. 0. 0. 0. 1. 1. 1. 0. 1. 1. 1. 0. 1. 0. 1. 0. 1. 1. 0.
 1. 0. 1. 1. 0. 1. 1. 1. 0. 0. 1. 1. 0. 1. 0. 1. 1. 1. 1. 1.]
[1. 0. 1. 1. 1. 1. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 1. 1. 0. 1. 1.
 0. 1. 1. 1. 0. 1. 0. 1. 1. 1. 0. 0. 1. 0. 0. 0. 1. 0. 1. 1. 1. 1. 1. 0.
 1. 0. 1. 1. 0. 1. 0. 0. 1. 1. 0. 1. 0. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 0.
 1. 0. 1. 0. 0. 0. 1. 1. 1. 0. 0. 1. 0. 0. 1. 1. 0. 0. 1. 0.]
Accuracy: 0.6521739130434783
              precision    recall  f1-score   support

         0.0       0.62      0.53      0.57        40
         1.0       0.67      0.75      0.71        52

    accuracy                           0.65        92
   macro avg       0.65      0.64      0.64        92
weighted avg       0.65      0.65      0.65        92



In [32]:
# Soft-voting ensemble of three seeded base learners.
clf1 = LogisticRegression(random_state=42)
clf2 = RandomForestClassifier(random_state=42, n_estimators=100)
# probability=True is set so the SVC exposes predict_proba, which soft voting
# averages across the base learners.
clf3 = SVC(probability=True, random_state=42)

# Note: the 'dt' label is attached to the random forest (clf2); the label is
# kept as-is because it becomes a key of the fitted ensemble.
voting_clf3 = VotingClassifier(
    voting='soft',
    estimators=[
        ('lr', clf1),
        ('dt', clf2),
        ('svc', clf3),
    ],
)

# fit() returns the estimator itself, so training and hold-out prediction are
# chained in one expression.
y_test_VC3 = voting_clf3.fit(x_train, y_train).predict(x_test)

accuracy_VC3 = accuracy_score(y_true=y_test, y_pred=y_test_VC3)
print(f"Accuracy: {accuracy_VC3}")

# Per-class report first, then the raw predicted labels.
print(classification_report(y_test, y_test_VC3), y_test_VC3, sep="\n")

Accuracy: 0.7391304347826086
              precision    recall  f1-score   support

         0.0       0.72      0.65      0.68        40
         1.0       0.75      0.81      0.78        52

    accuracy                           0.74        92
   macro avg       0.74      0.73      0.73        92
weighted avg       0.74      0.74      0.74        92

[0. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 0. 1. 1. 1. 0. 0. 1. 1.
 1. 1. 1. 1. 0. 1. 0. 0. 1. 1. 0. 0. 1. 1. 0. 0. 1. 1. 0. 1. 1. 1. 1. 0.
 0. 0. 1. 1. 0. 0. 0. 1. 0. 1. 1. 1. 0. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 0.
 1. 0. 1. 1. 1. 1. 1. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0.]


In [33]:
# Persist the fitted soft-voting ensemble to the working directory.
# voting_clf3 was fitted on x_train, i.e. on the column subset selected into
# X_new, so anything loaded from this file must be fed the same columns in the
# same order. `load` is imported for later use but not called in this cell.
# joblib files are pickle-based: only load them from trusted sources.
from joblib import dump, load
dump(voting_clf3, 'model_307.pkl')

['model_307.pkl']