In [1]:
import os
import random
import time

import numpy as np
import pandas as pd
import torch
from catboost import CatBoostClassifier
from datasets import load_dataset
from lightgbm import LGBMClassifier
from PIL import Image
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from tqdm import tqdm
from transformers import ConvNextModel, ConvNextImageProcessor
from xgboost import XGBClassifier

from config import CFG

2024-01-13 22:06:35.311899: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Label table, plus the CSV and image folders of the training set and the
# leaderboard-A test set. All roots come from the project configuration (CFG).
label_csv_path = os.path.join(CFG.train_data_dir, '文件标签汇总数据.csv')
label_data = pd.read_csv(label_csv_path)

train_csv_folder, test_A_csv_folder = (
    os.path.join(data_root, 'csv文件')
    for data_root in (CFG.train_data_dir, CFG.test_A_data_dir)
)
train_image_folder, test_A_image_folder = (
    os.path.join(CFG.project_name, relative_dir)
    for relative_dir in ('project/image/训练集数据', 'project/image/A榜测试集数据')
)

# Defect-type letters <-> integer class ids (A=0 ... E=4); the second map is the
# exact inverse of the first.
label2id = {letter: index for index, letter in enumerate("ABCDE")}
id2label = {index: letter for letter, index in label2id.items()}

In [3]:
# Load the ConvNeXt image preprocessor and backbone from the checkpoint named in
# CFG.convNext_model. The later cells only run forward passes through `model`
# and keep its pooled output as a feature vector; the backbone is never trained here.
processor = ConvNextImageProcessor.from_pretrained(CFG.convNext_model)
model = ConvNextModel.from_pretrained(CFG.convNext_model)

In [4]:
# Build the training set: one pooled ConvNeXt feature vector per training image
# (X_train) and the matching integer class id (y_train).
X_train = []
y_train = []
# sorted() gives a reproducible sample order; os.listdir order is arbitrary.
for img in tqdm(sorted(os.listdir(train_image_folder))):
    img_path = os.path.join(train_image_folder, img)

    # Key used to find the label row: the image name without its extension and
    # without its first two characters (presumably a prefix that the label
    # table's fileName does not carry -- TODO confirm against the data).
    file_key = os.path.splitext(img)[0][2:]
    # regex=False: the key is a literal file name, not a pattern.
    # na=False: rows with a missing fileName simply do not match.
    matches = label_data.loc[
        label_data['fileName'].str.contains(file_key, regex=False, na=False),
        'defectType',
    ]
    if matches.empty:
        raise IndexError(f"no row in label_data matches training image {img!r}")
    label = matches.values[0]

    # The image is single-channel and must be converted to three channels.
    # np.array() reads the pixels, so the file handle can be closed right after.
    with Image.open(img_path) as pil_image:
        gray = np.array(pil_image)
    image = np.stack([gray, gray, gray], axis=-1)

    inputs = processor(image, return_tensors="pt")
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        outputs = model(**inputs)
    X_train.append(outputs.pooler_output.squeeze().numpy())
    y_train.append(label2id[label])

 30%|███       | 35/115 [00:06<00:14,  5.55it/s]

100%|██████████| 115/115 [00:21<00:00,  5.33it/s]


In [5]:
# Build the test set: one pooled ConvNeXt feature vector per test image (X_test)
# and, at the same position, the name of the CSV file it belongs to (name_test).
X_test = []
name_test = []
# Sorted so that "first matching name" and the submission row order are reproducible.
test_name_list = sorted(os.listdir(test_A_csv_folder))
for img in tqdm(sorted(os.listdir(test_A_image_folder))):
    img_path = os.path.join(test_A_image_folder, img)

    # Resolve the CSV name before doing any work: the first CSV file whose name
    # contains the image name (without extension). Failing here keeps X_test and
    # name_test aligned instead of silently dropping the name for this image.
    stem = os.path.splitext(img)[0]
    matched_name = next((name for name in test_name_list if stem in name), None)
    if matched_name is None:
        raise ValueError(f"no file in {test_A_csv_folder!r} matches test image {img!r}")

    # The image is single-channel and must be converted to three channels.
    # np.array() reads the pixels, so the file handle can be closed right after.
    with Image.open(img_path) as pil_image:
        gray = np.array(pil_image)
    image = np.stack([gray, gray, gray], axis=-1)

    inputs = processor(image, return_tensors="pt")
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        outputs = model(**inputs)
    X_test.append(outputs.pooler_output.squeeze().numpy())
    name_test.append(matched_name)

    

100%|██████████| 115/115 [00:21<00:00,  5.37it/s]


In [6]:
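# NOTE: disabled custom metric, kept for reference only; nothing in this notebook calls it.
# Re-enabling it also requires `from sklearn.metrics import f1_score`, which is not imported above.
# def weighted_f1(y_true, y_pred):
#     y_pred = np.argmax(y_pred.reshape(5, -1), axis=0)
#     f1 = f1_score(y_true, y_pred, average=None)
#     weights = np.array([0.1, 0.3, 0.2, 0.1, 0.3])
#     weighted_f1 = np.sum(f1 * weights)
#     return 'weighted_f1', weighted_f1, True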

In [7]:
# Soft-voting ensemble of four classifiers trained on the ConvNeXt features.

# Explicit seed for the stochastic learners (SGD shuffling, LightGBM column
# sampling, CatBoost row sampling) so repeated runs are comparable.
SEED = 42

# Gaussian naive Bayes (registered below under the historical name 'mnb').
clf = GaussianNB()
# modified_huber is an SGD loss that provides predict_proba, which soft voting needs.
sgd_model = SGDClassifier(max_iter=8000, tol=1e-4, loss="modified_huber", random_state=SEED)
p6 = {
    'n_iter': 5000,  # LightGBM alias of num_iterations
    'verbose': -1,
    'objective': 'multiclass',
    'num_class': 5,
    'learning_rate': 0.01,
    'colsample_bytree': 0.78,
    'colsample_bynode': 0.8,
    'lambda_l1': 5,
    'lambda_l2': 3,
    # A split must leave at least min_data_in_leaf rows in each child. The
    # previous fixed value of 115 equals the training-set size of the recorded
    # run (115 images), which made every split impossible and reduced LightGBM
    # to a constant predictor. Keep 115 as the cap for large datasets, but
    # scale it down to about 10% of the rows on small ones.
    'min_data_in_leaf': max(1, min(115, len(X_train) // 10)),
    'max_depth': 30,
    'max_bin': 1000,
}
lgb = LGBMClassifier(random_state=SEED, **p6)
cat = CatBoostClassifier(iterations=3000,
                         verbose=0,
                         l2_leaf_reg=6.7,
                         learning_rate=0.005,
                         subsample=0.4,
                         bootstrap_type='Bernoulli',
                         allow_const_label=True,
                         loss_function='MultiClass',
                         random_seed=SEED)

# Equal weight for every member's predicted class probabilities.
weights = [0.25, 0.25, 0.25, 0.25]
print('Training...')
# perf_counter is monotonic, so the elapsed time cannot be skewed by clock adjustments.
now = time.perf_counter()
ensemble = VotingClassifier(estimators=[('mnb', clf),
                                        ('sgd', sgd_model),
                                        ('lgb', lgb),
                                        ('cat', cat)
                                        ],
                            weights=weights,
                            voting='soft',
                            n_jobs=-1)
ensemble.fit(X_train, y_train)
print('Training time: ', time.perf_counter() - now)
print('Training finished')

Training...




Training time:  179.50280809402466
Training finished


In [8]:
# Predict a class id for every test feature vector with the fitted ensemble
# and report how long the call took.
print('Predicting...')
now = time.time()
y_pred = ensemble.predict(X_test)
elapsed_predict = time.time() - now
print('Predicting time: ', elapsed_predict)
print('Predicting finished')

Predicting...
Predicting time:  0.2310347557067871
Predicting finished


In [9]:
# Assemble the submission table (column order: defectType, fileName), mapping
# each predicted class id back to its letter, and write it without the index.
predicted_letters = [id2label[class_id] for class_id in y_pred]
sub = pd.DataFrame({'defectType': predicted_letters, 'fileName': name_test})
sub.to_csv('submission.csv', index=False)