In [1]:
from sklearn.cluster import KMeans
import numpy as np
from tensorflow.keras.applications import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
import cv2

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Input, Dropout

In [3]:
#path
train = "C:/Users/ASUS/OneDrive/Documents/coolyeah/bryant/Data Science/hologyub/Penyisihan Hology Data Mining/train"
test = "C:/Users/ASUS/OneDrive/Documents/coolyeah/bryant/Data Science/hologyub/Penyisihan Hology Data Mining/test"

In [4]:
#load data 
train_csv = pd.read_csv("train.csv")
train_csv

Unnamed: 0,id,jenis,warna
0,1,1,1
1,2,0,2
2,3,1,3
3,4,1,1
4,5,0,4
...,...,...,...
772,773,0,1
773,774,0,1
774,775,1,3
775,776,0,0


In [5]:
import os
import cv2
import numpy as np

def load_images(image_folder, image_ids, img_size=(128,128)):
    images = []
    for img_id in image_ids:
        # Construct paths for both jpg and png formats
        img_path_jpg = os.path.join(image_folder, f"{img_id}.jpg")
        img_path_png = os.path.join(image_folder, f"{img_id}.png")
        
        # Check if the file exists in either format
        if os.path.exists(img_path_jpg):
            img = cv2.imread(img_path_jpg)
        elif os.path.exists(img_path_png):
            img = cv2.imread(img_path_png)
        else:
            print(f"Image {img_id} not found.")
            continue
        
        if img is not None:
            # Resize the image to the given size
            img = cv2.resize(img, img_size)
            images.append(img)
        else:
            print(f"Failed to load image {img_id}.")
    
    return np.array(images)


In [6]:
X_train = load_images(train,train_csv['id'],img_size=(128,128))

In [7]:
# Check for missing values
print("Missing values in each column:")
print(train_csv.isnull().sum())

# Print original shape
print("Original shape:", train_csv.shape)

# Check unique values and their counts
print("Unique values in 'jenis':")
print(train_csv['jenis'].value_counts())
print("Unique values in 'warna':")
print(train_csv['warna'].value_counts())

# Convert to categorical
y_jenis = to_categorical(train_csv['jenis'].values, num_classes=2)
y_warna = to_categorical(train_csv['warna'].values, num_classes=5)

# Print the shapes of the resulting arrays
print("y_jenis shape:", y_jenis.shape)  # Expected to be (number_of_samples, 2)
print("y_warna shape:", y_warna.shape)  # Expected to be (number_of_samples, 5)

Missing values in each column:
id       0
jenis    0
warna    0
dtype: int64
Original shape: (777, 3)
Unique values in 'jenis':
jenis
0    476
1    301
Name: count, dtype: int64
Unique values in 'warna':
warna
3    234
2    162
4    140
1    125
0    116
Name: count, dtype: int64
y_jenis shape: (777, 2)
y_warna shape: (777, 5)


In [8]:
# Example of how you might split your data
from sklearn.model_selection import train_test_split

X_train, X_val, y_train_jenis, y_val_jenis, y_train_warna, y_val_warna = train_test_split(
    X_train, y_jenis, y_warna, test_size=0.2, random_state=42
)


In [9]:
# Normalize image data
X_train = X_train / 255.0
X_val = X_val / 255.0

In [10]:
from keras.layers import BatchNormalization
from keras.layers import Conv2D

In [18]:
import tensorflow as tf
from tensorflow.keras.applications import DenseNet121
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout, Input
from tensorflow.keras.models import Model

# Step 1: Load the pre-trained DenseNet121 model without the top classification layer
def create_transfer_model(input_shape=(128, 128, 3), num_classes_jenis=2, num_classes_warna=5):
    # Make sure to include input layer properly
    inputs = Input(shape=input_shape)
    
    # Load DenseNet121, exclude top layer, keep weights from ImageNet
    base_model = DenseNet121(weights='imagenet', include_top=False, input_tensor=inputs)
    
    # Ensure the base model is correctly loaded
    if base_model is None:
        raise ValueError("Base model not loaded correctly. Check model structure.")
    
    # Freeze the layers of the base model so they don't get updated during training
    base_model.trainable = False
    
    # Add new layers on top of the pre-trained base
    x = base_model.output
    x = GlobalAveragePooling2D()(x)
    x = Dense(1024, activation='relu')(x)
    x = Dropout(0.5)(x)  # Prevent overfitting
    
    # Output layers for 'jenis' (2 classes) and 'warna' (5 classes)
    output_jenis = Dense(num_classes_jenis, activation='softmax', name='jenis_output')(x)
    output_warna = Dense(num_classes_warna, activation='softmax', name='warna_output')(x)
    
    # Combine into a new model (make sure the inputs and outputs are connected properly)
    model = Model(inputs=inputs, outputs=[output_jenis, output_warna])
    
    return model

# Step 2: Compile the model
model = create_transfer_model()
model.compile(optimizer='adam', 
              loss={'jenis_output': 'categorical_crossentropy', 'warna_output': 'categorical_crossentropy'},
              metrics={'jenis_output': 'accuracy', 'warna_output': 'accuracy'})

# Print model summary to check structure
model.summary()


In [19]:
model = create_transfer_model()

In [21]:
# Step 3: Compile the model before training
model.compile(optimizer='adam', 
              loss={'jenis_output': 'categorical_crossentropy', 'warna_output': 'categorical_crossentropy'},
              metrics={'jenis_output': 'accuracy', 'warna_output': 'accuracy'})

In [49]:
# Train the model
history = model.fit(X_train, {'jenis_output': y_train_jenis, 'warna_output': y_train_warna}, 
                    validation_data=(X_val, {'jenis_output': y_val_jenis, 'warna_output': y_val_warna}),
                    epochs=50, batch_size=64)

Epoch 1/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 3s/step - jenis_output_accuracy: 1.0000 - jenis_output_loss: 2.4298e-04 - loss: 0.0014 - warna_output_accuracy: 1.0000 - warna_output_loss: 0.0011 - val_jenis_output_accuracy: 0.9423 - val_jenis_output_loss: 0.2669 - val_loss: 0.7135 - val_warna_output_accuracy: 0.8846 - val_warna_output_loss: 0.4616
Epoch 2/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 3s/step - jenis_output_accuracy: 1.0000 - jenis_output_loss: 0.0011 - loss: 0.0016 - warna_output_accuracy: 1.0000 - warna_output_loss: 5.5378e-04 - val_jenis_output_accuracy: 0.9423 - val_jenis_output_loss: 0.2733 - val_loss: 0.7280 - val_warna_output_accuracy: 0.8846 - val_warna_output_loss: 0.4650
Epoch 3/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 3s/step - jenis_output_accuracy: 1.0000 - jenis_output_loss: 4.0403e-04 - loss: 0.0011 - warna_output_accuracy: 1.0000 - warna_output_loss: 6.4672e-04 - val_jenis_outpu

In [50]:
# Load test images
test_image_ids = [i for i in range(778, 1112)]  # Test image IDs range from 778 to 1111
X_test_images = load_images(test, test_image_ids)

# Normalize test images
X_test_images = X_test_images / 255.0

# Predict on test data
predictions = model.predict(X_test_images)

# Extract predicted labels
pred_jenis = np.argmax(predictions[0], axis=1)
pred_warna = np.argmax(predictions[1], axis=1)

[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 1s/step


In [25]:
submission_csv = "C:/Users/ASUS/OneDrive/Documents/coolyeah/bryant/Data Science/hologyub/Penyisihan Hology Data Mining/sample_submission.csv"

In [51]:
# Load the sample submission CSV
submission_df = pd.read_csv(submission_csv)

# Fill in the predictions
submission_df['jenis'] = pred_jenis
submission_df['warna'] = pred_warna

# Save submission file
submission_df.to_csv('semogamenangpart4.csv', index=False)

In [106]:
def exact_match_ratio(y_true, y_pred):
    correct = np.sum(np.all(y_true == y_pred, axis=1))
    return correct / len(y_true)

In [107]:
# Assuming y_test_jenis and y_test_warna are the true labels for test set
# You can calculate Exact Match Ratio as:
y_test_jenis = pred_jenis
y_test_warna = pred_warna

y_true_combined = np.column_stack((y_test_jenis, y_test_warna))  # True labels
y_pred_combined = np.column_stack((pred_jenis, pred_warna))  # Predicted labels

emr = exact_match_ratio(y_true_combined, y_pred_combined)
print(f'Exact Match Ratio: {emr}')

Exact Match Ratio: 1.0


In [55]:

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def compare_csv_files(ml_predictions_path, manual_check_path):
    # Load both CSV files
    ml_df = pd.read_csv(ml_predictions_path)
    manual_df = pd.read_csv(manual_check_path,delimiter = ';')
    
    # Memastikan kedua DataFrame memiliki index yang sama
    ml_df = ml_df.sort_values('id').reset_index(drop=True)
    manual_df = manual_df.sort_values('id').reset_index(drop=True)
    
    # Hitung akurasi untuk setiap kolom
    for column in ['jenis', 'warna']:
        accuracy = accuracy_score(manual_df[column], ml_df[column])
        print(f"\nAkurasi untuk {column}: {accuracy*100:.2f}%")
        
        # Tampilkan classification report
        print(f"\nClassification Report untuk {column}:")
        print(classification_report(manual_df[column], ml_df[column]))
        
        # Tampilkan confusion matrix
        print(f"\nConfusion Matrix untuk {column}:")
        print(confusion_matrix(manual_df[column], ml_df[column]))
        
        # Tampilkan prediksi yang salah
        incorrect_predictions = ml_df[ml_df[column] != manual_df[column]]
        if len(incorrect_predictions) > 0:
            print(f"\nPrediksi yang salah untuk {column}:")
            for idx, row in incorrect_predictions.iterrows():
                print(f"ID: {row['id']}, Prediksi ML: {row[column]}, Manual Check: {manual_df.loc[idx, column]}")
    
    # Hitung jumlah total prediksi yang benar dan salah
    total_correct = sum((ml_df[column] == manual_df[column]).all() for column in ['jenis', 'warna'])
    total_predictions = len(ml_df) * 2  # karena ada 2 kolom
    
    print(f"\nTotal akurasi: {(total_correct/total_predictions)*100:.2f}%")
    
    # Detail perbedaan per baris
    differences = pd.DataFrame()
    differences['id'] = ml_df['id']
    for column in ['jenis', 'warna']:
        differences[f'{column}_ml'] = ml_df[column]
        differences[f'{column}_manual'] = manual_df[column]
        differences[f'{column}_match'] = ml_df[column] == manual_df[column]
    
    # Tampilkan hanya baris yang memiliki perbedaan
    differences_only = differences[~(differences['jenis_match'] & differences['warna_match'])]
    
    if len(differences_only) > 0:
        print("\nDetail perbedaan per baris:")
        print(differences_only)
    else:
        print("\nTidak ada perbedaan antara prediksi ML dan manual check!")
    
    return differences

# Gunakan fungsi
ml_predictions_path = 'C:/Users/ASUS/OneDrive/Documents/coolyeah/bryant/Data Science/hologyub/Penyisihan Hology Data Mining/semogamenangpart1.csv'  # Ganti dengan path file hasil ML
manual_check_path = 'C:/Users/ASUS/OneDrive/Documents/coolyeah/bryant/Data Science/hologyub/Penyisihan Hology Data Mining/submissions_100persen.csv'      # Ganti dengan path file manual check

differences = compare_csv_files(ml_predictions_path, manual_check_path)


Akurasi untuk jenis: 91.62%

Classification Report untuk jenis:
              precision    recall  f1-score   support

           0       0.94      0.93      0.93       206
           1       0.88      0.90      0.89       128

    accuracy                           0.92       334
   macro avg       0.91      0.91      0.91       334
weighted avg       0.92      0.92      0.92       334


Confusion Matrix untuk jenis:
[[191  15]
 [ 13 115]]

Prediksi yang salah untuk jenis:
ID: 834, Prediksi ML: 1, Manual Check: 0
ID: 864, Prediksi ML: 1, Manual Check: 0
ID: 871, Prediksi ML: 0, Manual Check: 1
ID: 876, Prediksi ML: 1, Manual Check: 0
ID: 879, Prediksi ML: 0, Manual Check: 1
ID: 904, Prediksi ML: 0, Manual Check: 1
ID: 908, Prediksi ML: 1, Manual Check: 0
ID: 937, Prediksi ML: 0, Manual Check: 1
ID: 944, Prediksi ML: 1, Manual Check: 0
ID: 949, Prediksi ML: 0, Manual Check: 1
ID: 953, Prediksi ML: 1, Manual Check: 0
ID: 967, Prediksi ML: 1, Manual Check: 0
ID: 980, Prediksi ML: 1, Man