In [1]:
import cv2
import os
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
from PIL import Image
from pathlib import Path
import json

In [2]:
# Downscale every PNG in `folder_path` to a fixed width (keeping the aspect
# ratio), reduce it to a 64-colour palette and store the result under the same
# file name in `output_path`, which the image-loading cell further down reads.
folder_path = Path("../Bat_Orientation_Calls")
output_path = Path("./compressed_pictures")
# Saving into a missing folder raises FileNotFoundError, so create it up front.
output_path.mkdir(parents=True, exist_ok=True)

base_width = 128  # target width in pixels; the height follows from the aspect ratio

image_paths = [os.path.join(folder_path, file_name) for file_name in os.listdir(folder_path) if file_name.endswith(".png")]

for image_path in image_paths:
    # The context manager closes the underlying file handle once the resized
    # copy has been created.
    with Image.open(image_path) as image:
        aspect_ratio = image.size[0] / image.size[1]  # Width / Height

        new_width = base_width
        # int() truncates, so clamp to 1 to never request a zero-height image.
        new_height = max(1, int(new_width / aspect_ratio))

        resized_image = image.resize((new_width, new_height)).quantize(colors=64)

    image_name = Path(image_path).name
    resized_image.save(output_path / image_name)

In [54]:
def classes_csv_to_df(file_paths: list, delimiter=";", min_samples: int = 60) -> pd.DataFrame:
    """Load the label CSV files and return one cleaned label table.

    Processing steps, in order:
      1. read every CSV and stack them into one frame,
      2. drop fully identical rows,
      3. drop the "Filename" column,
      4. remove unusable rows via ``remove_unwanted_datapoints``,
      5. normalise the label "&Mausohr " to "Mausohr",
      6. drop every species with fewer than ``min_samples`` rows via
         ``remove_less_sample_classes``.

    Args:
        file_paths: Paths of the CSV files to merge.
        delimiter: Column separator used by the CSV files.
        min_samples: Minimum number of rows a species needs to be kept.

    Returns:
        The cleaned label table.

    Raises:
        ValueError: If ``file_paths`` is empty.
    """
    file_paths = list(file_paths)
    if not file_paths:
        raise ValueError("file_paths must contain at least one CSV file.")

    # Read everything first and concatenate once: growing a frame with
    # pd.concat inside the loop copies all previous rows on every iteration.
    frames = [pd.read_csv(file_path, delimiter=delimiter) for file_path in file_paths]
    df_all = pd.concat(frames, ignore_index=True)

    df_all = df_all.drop_duplicates()
    df_all = df_all.drop("Filename", axis=1)
    df_all = remove_unwanted_datapoints(df_all)
    df_all = df_all.replace("&Mausohr ", "Mausohr")
    df_all = remove_less_sample_classes(df_all, min_samples)
    return df_all

def categorical_classes(df: pd.DataFrame, column_name: str):
    """Return a copy of ``df`` in which ``column_name`` has the pandas ``category`` dtype.

    The frame passed in is not modified; the conversion is applied to a copy.
    """
    as_category = df[column_name].astype('category')
    converted = df.copy()
    converted[column_name] = as_category
    return converted

def numerical_classes(df: pd.DataFrame, column_name: str) -> tuple[pd.DataFrame, dict[int, str]]:
    """Replace the labels in ``column_name`` by integer class codes.

    Args:
        df: Label table; it is not modified.
        column_name: Name of the column holding the class labels.

    Returns:
        A tuple ``(encoded, class_mapping)``: ``encoded`` is a copy of ``df``
        whose ``column_name`` column holds the integer codes, and
        ``class_mapping`` maps every code back to its original label.
        Missing labels receive the pandas category code -1, which has no entry
        in ``class_mapping``.
    """
    df_copy = df.copy()
    # Codes are assigned by position in the categories index, so enumerating
    # that index yields exactly the code -> label mapping.
    categories = df_copy[column_name].astype('category')
    class_mapping = dict(enumerate(categories.cat.categories))
    df_copy[column_name] = categories.cat.codes
    return df_copy, class_mapping

def encode_classes(df: pd.DataFrame, column_name: str):
    """Append one indicator column per distinct value of ``column_name``.

    The original column is kept; the indicator columns produced by
    ``pd.get_dummies`` are joined onto the frame by index and a new frame is
    returned.
    """
    indicator_columns = pd.get_dummies(df[column_name])
    return df.join(indicator_columns)

def remove_unwanted_datapoints(df: pd.DataFrame) -> pd.DataFrame:
    """Drop rows that cannot be used for training.

    Removed are:
      * every row whose "ID" occurs more than once (all occurrences are
        dropped; the original author noted there were only 9 of them), and
      * every row labelled "Fledermaus nicht bestimmbar" or "Schwarzbild",
        because these carry no information for the training process.

    Args:
        df: Label table with at least the columns "ID" and "Species".

    Returns:
        The filtered table with a fresh 0..n-1 index.
    """
    # keep=False flags every occurrence of a repeated ID, not only the repeats.
    has_duplicated_id = df['ID'].duplicated(keep=False)
    has_unusable_label = df['Species'].isin(['Fledermaus nicht bestimmbar', 'Schwarzbild'])
    return df[~(has_duplicated_id | has_unusable_label)].reset_index(drop=True)

def remove_less_sample_classes(df: pd.DataFrame, min_samples: int) -> pd.DataFrame:
    """Keep only the rows whose "Species" occurs at least ``min_samples`` times.

    The index of the surviving rows is left as it was.
    """
    samples_per_class = df['Species'].value_counts()
    frequent_enough = samples_per_class >= min_samples
    kept_classes = samples_per_class.index[frequent_enough]
    keep_row = df['Species'].isin(kept_classes)
    return df[keep_row]

def class_mapping_to_csv(class_mapping_dict: dict) -> None:
    """Write the mapping to ``data/class_mapping.csv``, one ``key,value`` row per entry.

    The ``data`` directory is created if it does not exist yet; an existing
    file is overwritten.

    Args:
        class_mapping_dict: Mapping to persist (e.g. class code -> class name).
    """
    os.makedirs('data', exist_ok=True)
    # newline='' is required by the csv module; without it the writer's own
    # line endings get translated again and blank rows appear on Windows.
    with open('data/class_mapping.csv', 'w', newline='') as class_mapping_csv:
        writer = csv.writer(class_mapping_csv)
        writer.writerows(class_mapping_dict.items())

def plot_class_distribution(df: pd.DataFrame):
    """Save a bar chart of the number of rows per "Species" value.

    The chart is written to ``data/class_distribution`` and the figure is
    closed afterwards, so nothing is displayed or returned.
    """
    figure = plt.figure(figsize=(6,6))
    axes = figure.add_subplot(111)
    species_counts = df['Species'].value_counts()
    species_counts.plot.bar(ax=axes)
    axes.set_title('Count Distribution')
    figure.savefig("data/class_distribution")
    plt.close(figure)

# Build the cleaned label table from the three label CSV files.
df = classes_csv_to_df(["../Auswertung_20220524.csv","../LMU_20180326_class.csv", "../LMU_20180505_classified.csv"])
# RAM is cheaper than salary ;)
# Alternative label representations (categorical dtype / indicator columns) are
# currently disabled; only the integer-coded variant is used below.
#df_categorical = categorical_classes(df, "Species")
# Replace the species names by integer codes and keep the code -> name mapping.
df_numerical, class_mapping = numerical_classes(df, "Species")
#df_encoded = encode_classes(df, "Species")
# Persist the code -> name mapping so the codes can be translated back later.
class_mapping_to_csv(class_mapping)
# NOTE(review): df_numerical holds integer codes in "Species", so the saved bar
# chart is labelled with class codes rather than species names.
plot_class_distribution(df_numerical)

def get_classes_from_id(id: int, df: pd.DataFrame) -> pd.Series:
    for row in df.iterrows():
        if id == row[1]["ID"]:
            return row[1].drop("ID")

def calc_black_frame(img):
    """Compute the bounding box of the non-black content of a BGR image.

    Pixels whose grey value is above 1 count as content. The bounding box of
    the largest external contour of that content is returned.

    Args:
        img: Image as loaded by ``cv2.imread`` (BGR channel order).

    Returns:
        ``(y_start, y_end, x_start, x_end)``, directly usable as slice bounds.

    Raises:
        ValueError: If the image contains no content at all (e.g. it is
            completely black), so no frame can be derived.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    _, thresh = cv2.threshold(gray, 1, 255, cv2.THRESH_BINARY)
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if len(contours) == 0:
        raise ValueError("No non-black region found in image; cannot compute the black frame.")
    # The order of the returned contours is not tied to their size, so pick
    # the largest one explicitly instead of whichever happens to come first.
    largest_contour = max(contours, key=cv2.contourArea)
    x, y, w, h = cv2.boundingRect(largest_contour)
    return (y, y + h, x, x + w)

def remove_black_frame(img, frame):
    """Crop ``img`` to ``frame``, given as ``(row_start, row_end, col_start, col_end)``."""
    row_span = slice(frame[0], frame[1])
    col_span = slice(frame[2], frame[3])
    return img[row_span, col_span]

def load_images_from_folder(folder_path: str, df_categorical=pd.DataFrame(), df_numerical=pd.DataFrame(), df_encoded=pd.DataFrame()) -> dict[str, pd.DataFrame]:
    """Load all images of a folder and pair them with their class labels.

    Every readable image is cropped to a fixed frame, flattened and combined
    with the label row whose "ID" equals the integer file name (without
    extension). Images without a label row in a given label table are left out
    of that table's result. The shape of the cropped images is printed once
    and stored in ``./data/meta.json``.

    Args:
        folder_path: Folder containing the image files.
        df_categorical: Label table with categorical labels (optional).
        df_numerical: Label table with integer-coded labels (optional).
        df_encoded: Label table with indicator columns (optional).
            The default empty frames are only read, never modified.

    Returns:
        A dict with one entry per non-empty label table, keyed
        "df_categorical", "df_numerical" and "df_encoded". Each value has a
        "data" column holding the flattened image followed by the label
        columns; for "df_encoded" the "Species" column is dropped.

    Raises:
        ValueError: If all three label tables are empty.
    """
    label_tables = {
        "df_categorical": df_categorical,
        "df_numerical": df_numerical,
        "df_encoded": df_encoded,
    }
    label_tables = {name: labels for name, labels in label_tables.items() if not labels.empty}
    # The previous check was inverted: it raised when all three tables were
    # supplied and silently accepted a call without any label table.
    if not label_tables:
        raise ValueError("You have to define at least one Dataframe.")

    # Hardcoded for now, because the frame computed by calc_black_frame depends
    # on the loaded image. Defined before the loop so it exists even when the
    # first directory entry is not a readable image.
    frame = (10, 75, 16, 116)
    rows = {name: list() for name in label_tables}
    meta_written = False

    # sorted(): os.listdir returns entries in arbitrary order; sorting makes
    # the row order of the result reproducible.
    for filename in sorted(os.listdir(folder_path)):
        img_path = os.path.join(folder_path, filename)
        if not os.path.isfile(img_path):
            continue
        img = cv2.imread(img_path)
        # some images are broken
        if img is None:
            continue
        img = remove_black_frame(img, frame)
        if not meta_written:
            print("The final image shape/size[hwc] is:",img.shape)
            # store image shape in file
            with open("./data/meta.json","w") as file:
                file.write(json.dumps({"h":img.shape[0],"w":img.shape[1],"c":img.shape[2]}))
            meta_written = True
        try:
            image_id = int(Path(filename).stem)
        except ValueError:
            # A file name that is not an integer cannot match any label ID.
            continue
        for name, labels in label_tables.items():
            image_classes = get_classes_from_id(image_id, labels)
            # None means the image has no label row in this table (e.g. it was
            # one of the removed, unwanted datapoints); checked per table so a
            # miss in one table cannot crash the others.
            if image_classes is not None:
                rows[name].append([img.flatten(), *image_classes.values])

    return_dfs = dict()
    for name, labels in label_tables.items():
        # get_classes_from_id drops "ID" by name, so the remaining columns in
        # their original order are exactly the appended label values.
        columns = ["data"] + [column for column in labels.columns if column != "ID"]
        if rows[name]:
            images_df = pd.DataFrame(np.array(rows[name], dtype=object), columns=columns)
        else:
            # np.array([]) is one-dimensional and cannot be matched to the
            # columns; build the empty frame directly instead.
            images_df = pd.DataFrame(columns=columns)
        if name == "df_encoded":
            images_df = images_df.drop("Species", axis=1)
        return_dfs[name] = images_df

    return return_dfs

# Load the compressed images together with their integer class labels and
# pickle every returned frame to its own file.
dfs = load_images_from_folder("./compressed_pictures/", df_numerical=df_numerical)
pickle_targets = {
    "df_categorical": "./data/images_df_categorical.pkl",
    "df_numerical": "./data/images_df_numerical.pkl",
    "df_encoded": "./data/images_df_encoded.pkl",
}
# Note: the loop variable rebinds the module-level name `df`.
for df_name, df in dfs.items():
    if df_name in pickle_targets:
        df.to_pickle(pickle_targets[df_name])

The final image shape/size[hwc] is: (65, 100, 3)
0: ./compressed_pictures/1524994153.png -> (85, 128, 3) to (65, 100, 3)
1: ./compressed_pictures/1521963463.png -> (85, 128, 3) to (65, 100, 3)
2: ./compressed_pictures/1525038936.png -> (85, 128, 3) to (65, 100, 3)
3: ./compressed_pictures/1525084171.png -> (85, 128, 3) to (65, 100, 3)
4: ./compressed_pictures/1521800852.png -> (85, 128, 3) to (65, 100, 3)
5: ./compressed_pictures/1507315168.png -> (85, 128, 3) to (65, 100, 3)
6: ./compressed_pictures/1524843179.png -> (85, 128, 3) to (65, 100, 3)
7: ./compressed_pictures/1521921122.png -> (85, 128, 3) to (65, 100, 3)
8: ./compressed_pictures/1521927235.png -> (85, 128, 3) to (65, 100, 3)
9: ./compressed_pictures/1507516148.png -> (85, 128, 3) to (65, 100, 3)
10: ./compressed_pictures/1524527738.png -> (85, 128, 3) to (65, 100, 3)
11: ./compressed_pictures/1525037387.png -> (85, 128, 3) to (65, 100, 3)
12: ./compressed_pictures/1521934507.png -> (85, 128, 3) to (65, 100, 3)
13: ./compre

In [None]:
# 1. 1524994153.png -> (10, 75, 15, 116) to (65, 101, 3)
# 2. 1521963463.png -> (10, 75, 16, 116) to (65, 100, 3)