In [1]:
import numpy as np
import pandas as pd

import os
import warnings
import datetime
from collections import namedtuple

from IPython.display import IFrame

%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib.cm import ScalarMappable
import seaborn as sns


from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, FunctionTransformer, RobustScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.utils import parallel_backend


import cv2 

from scipy.stats import kstest, norm, mannwhitneyu, f_oneway

In [2]:
# Fixed seed value (presumably for reproducible splits/models; it is not used in the cells visible here).
RANDOM_SEED = 1337

# Project folders, all located one level above the notebook's working directory:
#   DATA_PATH    -> input data
#   INFO_PATH    -> problem info
#   RESULTS_PATH -> results files
DATA_PATH, INFO_PATH, RESULTS_PATH = (
    os.path.join(os.pardir, folder_name)
    for folder_name in ("data", "info", "results")
)

In [3]:
def load_images_into_df(path, dict_df, df, columns, class_map=None):
    """Load every ``.jpg`` found in the class sub-folders of ``path`` into ``dict_df[str(df)]``.

    Each immediate sub-directory of ``path`` is treated as one class. For every
    ``.jpg`` file inside it, one row is produced with the decoded image stored
    under ``columns[0]`` and the class label stored under ``columns[1]``.
    A summary line with the number of images read is printed per sub-directory.

    Parameters
    ----------
    path : str
        Directory whose sub-directories each hold the images of one class.
    dict_df : dict
        Mapping of names to DataFrames; the entry ``dict_df[str(df)]`` is
        replaced by a frame that also contains the newly read rows.
    df : str
        Key (converted with ``str``) of the DataFrame inside ``dict_df``.
    columns : sequence of str
        Column names; ``columns[0]`` receives the image, ``columns[1]`` the label.
    class_map : dict, optional
        Maps sub-directory name -> label. When omitted, the notebook-level
        ``classes`` dict is used. Folder names missing from the mapping get
        the label ``None`` (``dict.get`` semantics).

    Returns
    -------
    None
        The result is written into ``dict_df`` in place.
    """
    if class_map is None:
        # Backward-compatible fallback to the mapping defined at notebook level.
        class_map = classes

    key = str(df)
    records = []

    # Sorted so the resulting row order does not depend on the filesystem's listing order.
    for class_dir in sorted(os.listdir(path)):
        class_path = os.path.join(path, class_dir)
        if not os.path.isdir(class_path):
            # Stray files next to the class folders (e.g. .DS_Store) would make
            # os.listdir() raise NotADirectoryError, so skip them.
            continue

        label = class_map.get(str(class_dir))
        jpg_names = [name for name in sorted(os.listdir(class_path)) if name.endswith(".jpg")]
        records.extend(
            {columns[0]: plt.imread(os.path.join(class_path, name)), columns[1]: label}
            for name in jpg_names
        )
        print(f"{len(jpg_names)} images read from {class_dir} in {os.path.basename(path)} folder")

    if not records:
        # Nothing was read: leave the stored frame untouched.
        return

    # Build the new rows in a single step instead of calling DataFrame.append once
    # per image (quadratic copying, and the method was removed in pandas 2.0).
    new_rows = pd.DataFrame(records, columns=list(columns))
    existing = dict_df[key]
    if existing.empty:
        # Skip concatenating with an empty frame; the new rows become the stored frame.
        dict_df[key] = new_rows
    else:
        dict_df[key] = pd.concat([existing, new_rows], ignore_index=True)

In [4]:
# Column layout shared by both splits: decoded image + integer class label.
columns = ["img", "classification"]

# Class folder name -> integer label.
classes = {
    'edificios': 0,
    'bosques': 1,
    'glaciares': 2,
    'montanas': 3,
    'mares': 4,
    'calles': 5,
}

# Root folders of the two splits inside DATA_PATH.
train = os.path.join(DATA_PATH, "train")
test = os.path.join(DATA_PATH, "test")

# One empty frame per split; load_images_into_df fills them in place.
dataframes = {split_name: pd.DataFrame(columns=columns) for split_name in ("train", "test")}

load_images_into_df(path=train, dict_df=dataframes, df="train", columns=columns)
load_images_into_df(path=test, dict_df=dataframes, df="test", columns=columns)

2271 images read from bosques in train folder
2382 images read from calles in train folder
2191 images read from edificios in train folder
2404 images read from glaciares in train folder
2274 images read from mares in train folder
2512 images read from montanas in train folder
474 images read from bosques in test folder
501 images read from calles in test folder
437 images read from edificios in test folder
553 images read from glaciares in test folder
510 images read from mares in test folder
525 images read from montanas in test folder


In [5]:
# Show the training rows whose label is 3 (the value `classes` assigns to 'montanas').
dataframes["train"].loc[lambda frame: frame["classification"] == 3]

Unnamed: 0,img,classification
11522,"[[[251, 190, 125], [251, 190, 125], [251, 190,...",3
11523,"[[[0, 102, 195], [0, 102, 195], [0, 102, 195],...",3
11524,"[[[48, 117, 192], [49, 118, 193], [48, 117, 19...",3
11525,"[[[113, 161, 223], [115, 163, 225], [116, 164,...",3
11526,"[[[255, 255, 255], [255, 255, 255], [255, 255,...",3
...,...,...
14029,"[[[156, 166, 191], [158, 168, 193], [162, 170,...",3
14030,"[[[175, 213, 252], [171, 209, 248], [179, 217,...",3
14031,"[[[196, 195, 209], [205, 204, 218], [202, 201,...",3
14032,"[[[255, 255, 255], [255, 255, 255], [255, 255,...",3
