In [None]:
import os
import cv2
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


In [None]:
# Treated as a basic classification task for now; regression will be tried as well in case it performs better.
def load_data(file_path):
    """Locate the image folder and read the head-count labels of a dataset.

    Args:
        file_path: Dataset root directory; it must contain ``train.csv``.

    Returns:
        Tuple ``(images_folder, labels)`` where ``images_folder`` is the path
        ``<file_path>/image_data`` (only built, not checked for existence) and
        ``labels`` is the ``HeadCount`` column of ``train.csv`` as an array.
    """
    csv_path = os.path.join(file_path, 'train.csv')
    head_counts = pd.read_csv(csv_path)['HeadCount'].values
    print(f'[LOADING] Loaded {len(head_counts)} labels from {csv_path}')
    return os.path.join(file_path, 'image_data'), head_counts

def embed_train_pictures(images_folder, labels, model, offset=0, preprocess=None):
    """Build the training feature matrix, fitting the optional model once.

    Images are taken from ``images_folder`` in sorted filename order, starting
    at ``offset``, one per entry of ``labels``. Surplus files or surplus labels
    are ignored, as with the earlier ``zip``-based loop.

    Args:
        images_folder: Directory containing the image files.
        labels: Sequence whose length decides how many images are embedded;
            the label values themselves are not used here.
        model: Optional transformer exposing ``fit_transform``. ``None``
            returns the raw preprocessed vectors.
        offset: Index (into the sorted listing) of the first image to use.
        preprocess: Callable mapping an image path to a feature vector.
            Defaults to ``preprocess_image``.

    Returns:
        2-D array with one row per embedded image, or an empty array when no
        image was selected.

    Raises:
        ValueError: If ``offset`` is negative.
    """
    if offset < 0:
        raise ValueError(f"offset must be non-negative, got {offset}")
    if preprocess is None:
        preprocess = preprocess_image

    # os.listdir gives no ordering guarantee; sorting makes the image/label
    # pairing reproducible across runs and platforms.
    # NOTE(review): this still assumes the CSV rows follow sorted filename
    # order -- confirm against train.csv.
    img_names = sorted(os.listdir(images_folder))[offset:offset + len(labels)]
    img_vectors = [
        np.asarray(preprocess(os.path.join(images_folder, img_name))).reshape(1, -1)
        for img_name in img_names
    ]

    features = np.vstack(img_vectors) if img_vectors else np.array([])
    if model is not None and img_vectors:
        # Fit on the whole training matrix in one go. Fitting per image would
        # refit the model on a single sample every iteration, leaving it in
        # the state produced by the last image only.
        features = np.asarray(model.fit_transform(features))

    print(f'[EMBEDDING] Embedded {len(img_vectors)} training images.')
    return features

def embed_test_pictures(images_folder, labels, model, offset=0, preprocess=None):
    """Build the test feature matrix using an already fitted model.

    Images are taken from ``images_folder`` in sorted filename order, starting
    at ``offset``, one per entry of ``labels``. With the default ``offset=0``
    the images come from the head of the listing; pass the size of the
    training split as ``offset`` to embed the images that follow it.

    Args:
        images_folder: Directory containing the image files.
        labels: Sequence whose length decides how many images are embedded;
            the label values themselves are not used here.
        model: Optional fitted transformer exposing ``transform``. ``None``
            returns the raw preprocessed vectors.
        offset: Index (into the sorted listing) of the first image to use.
        preprocess: Callable mapping an image path to a feature vector.
            Defaults to ``preprocess_image``.

    Returns:
        2-D array with one row per embedded image, or an empty array when no
        image was selected.

    Raises:
        ValueError: If ``offset`` is negative.
    """
    if offset < 0:
        raise ValueError(f"offset must be non-negative, got {offset}")
    if preprocess is None:
        preprocess = preprocess_image

    # os.listdir gives no ordering guarantee; sorting makes the image/label
    # pairing reproducible across runs and platforms.
    # NOTE(review): this still assumes the CSV rows follow sorted filename
    # order -- confirm against train.csv.
    img_names = sorted(os.listdir(images_folder))[offset:offset + len(labels)]
    img_vectors = [
        np.asarray(preprocess(os.path.join(images_folder, img_name))).reshape(1, -1)
        for img_name in img_names
    ]

    features = np.vstack(img_vectors) if img_vectors else np.array([])
    if model is not None and img_vectors:
        # One batched call instead of one transform() call per image.
        features = np.asarray(model.transform(features))

    print(f'[EMBEDDING] Embedded {len(img_vectors)} testing images.')
    return features

# Basic preprocessing; more steps still need to be added here (e.g. contrast adjustment, noise filtering, etc.)
def preprocess_image(path, size=(128,128)):
    """Load an image and turn it into a flat grayscale feature vector.

    Args:
        path: Location of the image file on disk.
        size: Target size handed to ``cv2.resize``.

    Returns:
        1-D array holding the pixels of the resized grayscale image.

    Raises:
        ValueError: If ``cv2.imread`` returns ``None`` for ``path``.
    """
    bgr = cv2.imread(path)
    # A None result is treated as an unreadable image.
    if bgr is None:
        raise ValueError(f"Could not read image: {path}")
    # Convert to a single channel first, then shrink, then flatten.
    return cv2.resize(cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY), size).flatten()

#--------------------------------------------------------------------------------
# params
#--------------------------------------------------------------------------------

# Dataset root. The original machine-specific location stays the default, but
# it can be overridden through the HEADCOUNT_DATA_DIR environment variable so
# the notebook also runs elsewhere.
data_folder = os.environ.get(
    'HEADCOUNT_DATA_DIR',
    'C:\\Users\\csaka\\egyetem\\msc\\digikep2\\projekt_data\\train',
)

model = RandomForestClassifier(n_estimators=100, random_state=42)
vec_model = None  # placeholder in case we want to use larger embedding models here

#--------------------------------------------------------------------------------

images_folder, labels = load_data(data_folder)

train_idx = int(0.8 * len(labels))
# X holds the image features, y the labels
y_train, y_test = labels[:train_idx], labels[train_idx:]

# Embed every labelled image exactly once (raw features, no vec_model) and cut
# features and labels at the same index. Calling the train and test helpers
# separately made both read from the start of the folder listing, so the
# "test" features were the first training images scored against the labels of
# the last 20% of rows.
X_all = embed_train_pictures(images_folder, labels, None)
if len(X_all) != len(labels):
    raise ValueError(
        f'Embedded {len(X_all)} images but loaded {len(labels)} labels; '
        f'check the contents of {images_folder}'
    )
X_train, X_test = X_all[:train_idx], X_all[train_idx:]

# Fit the optional embedding model on the training rows only, then apply the
# fitted model to the held-out rows.
if vec_model is not None:
    X_train = vec_model.fit_transform(X_train)
    X_test = vec_model.transform(X_test)
print(f'[SPLITTING] Train/test split done!')

# scaling helps
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
print('[SCALING] Feature scaling done!')

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# Derive the constant baseline from the training labels only, so nothing from
# the held-out labels leaks into it.
baseline = int(y_train.mean())

# Previously recorded results, measured before the test-split fix above
# (re-run the cell to refresh them):
#baseline: 18% accuracy
#randomforest: 16% accuracy
print(f'Accuracy: {accuracy:.4f}')
print(f'Baseline (predicting mean {baseline}): {np.mean(y_test == baseline):.4f}')

[LOADING] Loaded 5733 labels from C:\Users\csaka\egyetem\msc\digikep2\projekt_data\train\train.csv
[EMBEDDING] Embedded 4586 training images.
[EMBEDDING] Embedded 1147 testing images.
[SPLITTING] Train/test split done!
[SCALING] Feature scaling done!
Accuracy: 0.1674
Baseline (predicting mean 4): 0.1813
