In [1]:
#data augmentation for -1 and 1 labels

from collections import Counter
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE

# Adjust the path to the file location in your Google Drive
data_path = 'chess_images/prepared_data/hog_svm_data.npy'

# Load the prepared data
data = np.load(data_path, allow_pickle=True).item()


# Extract features and labels
X = data["features"]
y = data["labels"]


def augment_features(X, y, target_label, num_samples):
    augmented_X, augmented_y = [], []
    for i, (features, label) in enumerate(zip(X, y)):
        if label == target_label:
            for _ in range(num_samples):
                # Add small noise to features
                noisy_features = features + np.random.normal(0, 0.01, size=features.shape)
                augmented_X.append(noisy_features)
                augmented_y.append(label)
    return np.vstack([X, np.array(augmented_X)]), np.hstack([y, np.array(augmented_y)])

# Augment samples for label -1 and 1
X_augmented, y_augmented = augment_features(X, y, target_label=-1, num_samples=2)
X_augmented, y_augmented = augment_features(X_augmented, y_augmented, target_label=1, num_samples=2)
X_augmented, y_augmented = augment_features(X_augmented, y_augmented, target_label=0, num_samples=2)

# Check the new label distribution
print(Counter(y_augmented))
print(X_augmented.shape)



Counter({0: 105600, -1: 52800, 1: 52800})
(211200, 1296)


In [2]:
import numpy as np
import joblib
# from sklearn.svm import SVC
# from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
# from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# from catboost import CatBoostClassifier
# from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold

# Load dataset
# data = np.load("chess_images/prepared_data/hog_svm_data.npy", allow_pickle=True).item()
# X, y = data["features"], data["labels"]

# Define models
models = {
    # "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="mlogloss"),
    "LightGBM": LGBMClassifier(),
    # "CatBoost": CatBoostClassifier(verbose=0),
    # "Extra Trees": ExtraTreesClassifier(n_estimators=100),
    # "KNN": KNeighborsClassifier(n_neighbors=5),
    # "Gradient Boosting": GradientBoostingClassifier(n_estimators=100)
}

# Use Stratified K-Fold (better for classification tasks)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation for each model and save the best one
best_model = None
best_score = 0

for name, model in models.items():
    scores = cross_val_score(model, X_augmented, y_augmented, cv=kfold, scoring="accuracy")
    mean_score = scores.mean()
    print(f"{name}: Mean Accuracy = {mean_score:.4f}, Std Dev = {scores.std():.4f}")

    # Train model on the full dataset before saving
    model.fit(X_augmented, y_augmented)
    model_path = f"chess_images/prepared_data/{name.lower().replace(' ', '_')}_model.pkl"
    joblib.dump(model, model_path)
    print(f"Saved {name} model to {model_path}")

    # Track the best model
    # if mean_score > best_score:
    #     best_score = mean_score
    #     best_model = model

# Save the best model separately
# best_model_path = "chess_images/prepared_data/best_model.pkl"
# joblib.dump(best_model, best_model_path)
# print(f"Best model saved to {best_model_path}")


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.865139 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 330480
[LightGBM] [Info] Number of data points in the train set: 168960, number of used features: 1296
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -1.386294




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.921471 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 330480
[LightGBM] [Info] Number of data points in the train set: 168960, number of used features: 1296
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -1.386294




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.942140 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 330480
[LightGBM] [Info] Number of data points in the train set: 168960, number of used features: 1296
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -1.386294




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.872748 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 330480
[LightGBM] [Info] Number of data points in the train set: 168960, number of used features: 1296
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -1.386294




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.876293 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 330480
[LightGBM] [Info] Number of data points in the train set: 168960, number of used features: 1296
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -1.386294




LightGBM: Mean Accuracy = 0.9994, Std Dev = 0.0001
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.404004 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 330480
[LightGBM] [Info] Number of data points in the train set: 211200, number of used features: 1296
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -1.386294
Saved LightGBM model to chess_images/prepared_data/lightgbm_model.pkl


In [3]:
#loading the model and testing input as 480*480 image and output as bw_fen file 2222222222222222
import pandas as pd
import joblib
import cv2
import numpy as np
from skimage.feature import hog

# Load the trained model
model = joblib.load('chess_images/prepared_data/lightgbm_model.pkl')

def extract_hog_features(image):
    image = cv2.resize(image, (60, 60))
    fd, _ = hog(image, orientations=9, pixels_per_cell=(8, 8),
                cells_per_block=(2, 2), visualize=True)
    
    # Convert features to a DataFrame with column names matching training data
    feature_names = [f'feature_{i}' for i in range(len(fd))]  # Create feature names
    return pd.DataFrame([fd], columns=feature_names)  # Wrap in DataFrame

def split_and_predict_fen(image_path):
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    image = cv2.resize(image, (480, 480))
    step = 60
    label_mapping_inverse = {1: "w", -1: "b", 0: "1"}  # FEN compatible labels
    predictions = []

    for row in range(8):
        row_data = []
        for col in range(8):
            square = image[row * step:(row + 1) * step, col * step:(col + 1) * step]
            features_df = extract_hog_features(square)  # Get features as DataFrame
            prediction = model.predict(features_df)[0]  # Pass DataFrame instead of list
            row_data.append(label_mapping_inverse[prediction])
        # Convert row data to FEN row format
        fen_row = ''.join(row_data)
        # Consolidate empty squares into numbers
        compact_fen_row = ''
        count = 0
        for char in fen_row:
            if char == '1':
                count += 1
            else:
                if count > 0:
                    compact_fen_row += str(count)
                    count = 0
                compact_fen_row += char
        if count > 0:
            compact_fen_row += str(count)
        predictions.append(compact_fen_row)

    # Join rows with '/' for the final FEN
    fen_result = '/'.join(predictions)
    return fen_result

# Example usage:
image_path = 'chess_images/test1.jpg'
fen_output = split_and_predict_fen(image_path)
print("Generated FEN:", fen_output)

# Optionally, save the FEN to a file
with open("generated_bw_fen.fen", "w") as file:
    file.write(fen_output)


Generated FEN: 2bwww2/ww4ww/wwwwww1w/3bbb2/3bbw2/w6b/bbbbbbbb/8


In [5]:
import pandas as pd
import joblib
import cv2
import numpy as np
from skimage.feature import hog

# Load the trained model
model = joblib.load('chess_images/prepared_data/lightgbm_model.pkl')

def extract_hog_features(image):
    image = cv2.resize(image, (60, 60))
    fd, _ = hog(image, orientations=9, pixels_per_cell=(8, 8),
                cells_per_block=(2, 2), visualize=True)
    feature_names = [f'feature_{i}' for i in range(len(fd))]
    return pd.DataFrame([fd], columns=feature_names)

def split_and_predict_fen_with_overlay(image_path, output_path='chess_images/overlay_result.jpg'):
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    color_image = cv2.imread(image_path)  # For overlay
    image = cv2.resize(image, (480, 480))
    color_image = cv2.resize(color_image, (480, 480))

    step = 60
    label_mapping_inverse = {1: "w", -1: "b", 0: "1"}

    predictions = []

    for row in range(8):
        row_data = []
        for col in range(8):
            square = image[row * step:(row + 1) * step, col * step:(col + 1) * step]
            features_df = extract_hog_features(square)
            prediction = model.predict(features_df)[0]
            label = label_mapping_inverse[prediction]

            row_data.append(label)

            if label in ['w', 'b']:
                text_color = (255, 255, 255) if label == 'w' else (0, 0, 0)
                cv2.putText(color_image, label, (col * step + 20, row * step + 40),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, text_color, 2, cv2.LINE_AA)

        fen_row = ''.join(row_data)
        compact_fen_row = ''
        count = 0
        for char in fen_row:
            if char == '1':
                count += 1
            else:
                if count > 0:
                    compact_fen_row += str(count)
                    count = 0
                compact_fen_row += char
        if count > 0:
            compact_fen_row += str(count)

        predictions.append(compact_fen_row)

    fen_result = '/'.join(predictions)
    print("Generated FEN:", fen_result)

    # Save FEN to file
    with open("generated_bw_fen.fen", "w") as file:
        file.write(fen_result)

    # Save overlay image
    cv2.imwrite(output_path, color_image)

# Example usage
image_path = 'chess_images/test2.jpg'
split_and_predict_fen_with_overlay(image_path)


Generated FEN: 2wwbwwb/wwb1bww1/wwwbww1b/2w5/1b1w1b2/w2ww3/wb1wbb2/6b1
