# Machine Learning - playground

In [0]:
#@title Zmienne pomocnicze { run: "auto", display-mode: "form" }
experiment_dir = "/home/reflex/reflex/data/results_all_2405/" #@param {type:"string"}
images_dir = "images" #@param {type:"string"}
csv_dir = "csv" #@param {type:"string"}
vectors_dir = "vectors" #@param {type:"string"}


### Deklaracje

In [0]:
import pandas as pd
import numpy as np
import cv2 as cv
import os
import re

### Vectors -> Dataframe

In [246]:
def create_joint_vector(image, directory, statistics=None, all_statistics=False, sort=False, 
                        sort_function=lambda x: x, dfcolumns=['image', 'values'], vector_length=None, 
                        fill_value=None, splitted_columns=False, image_as_index=False, predefined_file_list=None, 
                        remove_suffix=False):

  """
  Tworzy dataframe z wybranego zdjecia w postaci <nazwa_zdjecia> <wektor_zlaczonych statystyk>
  
  Params:
  --------------
    image                 - nazwa pliku ze zdjeciem (z suffixem (.SSSxSSS.png))
    directory             - katalog z katalogami ze wszystkimi statystykami
    statistics            - lista statystyk branych pod uwage
    all_statistics        - jezeli True to poprzedni parametr jest ignorowany i pod uwage bierzemy
                            wszystkie statystyki we wskazanym folderze
    sort                  - jezeli True to nazwy statystyk sortowane sa wedlug podanej funkcji
                            sortujacej (domyslnie leksykograficznie)
    sort_function         - funkcja (key) sortujaca statystyki (domyslnie leksykograficznie)
    dfcolumns             - list nazw kolumn w zwracanym dataframe'ie
    vector_length         - parametr okreslajacy pozadana dlugosc wektora. W przypadku 
                            nadmiaru jest przycinany, w przeciwnym razie jest wypelniniany
                            kolejnym paremetrem. Niezdefiniowany (None) nie modyfikuje wektora.
    fill_value            - wartosc, ktora wypelniany bedzie wektor, jezeli będzie za krótki
                            w przypadku zdefiniowania dlugości
    splited_columns       - zwracany dataframe jest w postaci [<image> <stat1> <stat2> ... <statn>]
                            nadal jednak pierwszy element wektora dfcolumns definuje nazwe 1. kolumny
    image_as_index        - ustawie nazwe pliku jako index DataFrame'u
    remove_suffix         - usuwa suffix (.SSSxSSS.png) z nazwy pliku
    
                     
   Returns:
   --------------
    None                  - w przpadku bledu (brak zdef. statystyk, zly katalog, zla nazwa pliku)
    Dataframe             - kiedy wszystko poszlo zgodnie z zalozeniami
                    
                      
  """
  
  # Normalizacja sciezki do foldery
  if directory[-1] != '/':
    directory += '/'
    
  # Sprawdzamy czy mamy jakiekolwiek statystyki do złączenia
  statistics_list = list(os.listdir(directory)) if all_statistics else statistics 

  if statistics_list:   

    # Aby uniknąć niedeterminizumu (?) w przypadku listdir sortujemy sortujemy statystyki leksykograficznie
    if sort:
      statistics_list.sort(key=sort_function)

    if splitted_columns:
      dfcolumns = [dfcolumns[0]] + statistics_list

    values = []
    for i in range(len(dfcolumns) - 1):
      values.append([])

    # Laczenie wektora
    for idx, stat in enumerate(statistics_list):

      current_vector = cv.imread(f'{directory}{stat}/{image}', cv.IMREAD_GRAYSCALE).flatten().tolist()

      # Dostosowywanie dlugosci wektora
      if vector_length:
        current_vector = current_vector[:vector_length]
        current_vector.extend([fill_value] * max(0, vector_length - len(current_vector)))

      if splitted_columns:
        values[idx] = current_vector
      else:
        values[0].extend(current_vector)

    # Tworzenie dataframe
    
    if remove_suffix:
      image = image[:-12]
      
    
    df = pd.DataFrame([[image, *values], columns=dfcolumns)

    # Ustawianie nowego indexu
    if image_as_index:
      df = df.set_index(dfcolumns[0])

    return df
      
  return None

SyntaxError: ignored

### Examples

In [184]:
image = 'APC5899_c4_2_0001.512x512.png'
directory = experiment_dir + vectors_dir
create_joint_vector(image, directory, all_statistics=True, sort=True, vector_length=200, splitted_columns=True, image_as_index=True)

Unnamed: 0_level_0,5th_percentile,95th_percentile,max,mean,median,min,var
image,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
APC5899_c4_2_0001.512x512.png,"[242, 240, 238, 238, 234, 211, 192, 189, 188, ...","[242, 242, 241, 242, 241, 241, 242, 241, 241, ...","[242, 242, 242, 242, 242, 242, 242, 243, 242, ...","[242, 241, 240, 240, 239, 235, 233, 228, 226, ...","[242, 241, 240, 240, 240, 240, 240, 239, 240, ...","[242, 240, 238, 236, 232, 208, 188, 186, 185, ...","[0, 0, 1, 2, 7, 98, 233, 255, 255, 255, 255, 2..."


In [185]:
image = 'APC5899_c4_2_0001.512x512.png'
directory = experiment_dir + vectors_dir
create_joint_vector(image, directory, statistics=['max', 'mean', 'min', 'var'], sort=False, vector_length=200, splitted_columns=True, remove_suffix=True, image_as_index=False)

Unnamed: 0,image,max,mean,min,var
0,APC5899_c4_2_0001,"[242, 242, 242, 242, 242, 242, 242, 243, 242, ...","[242, 241, 240, 240, 239, 235, 233, 228, 226, ...","[242, 240, 238, 236, 232, 208, 188, 186, 185, ...","[0, 0, 1, 2, 7, 98, 233, 255, 255, 255, 255, 2..."


In [186]:
image = 'APC5899_c4_2_0001.512x512.png'
directory = experiment_dir + vectors_dir
create_joint_vector(image, directory, all_statistics=True, dfcolumns=['image', 'HELLOWORLD'], sort=False, vector_length=10, splitted_columns=False, image_as_index=True)

Unnamed: 0_level_0,HELLOWORLD
image,Unnamed: 1_level_1
APC5899_c4_2_0001.512x512.png,"[0, 0, 1, 2, 7, 98, 233, 255, 255, 255, 242, 2..."


### All files as dataframe

In [0]:
def vector_folder_to_df(directory, limit=None):
  
  files = list(os.listdir(directory + '/' + list(os.listdir(directory))[0]))
  all_dfs = []

  for idx, image in enumerate(files[:limit]): 
    all_dfs.append(create_joint_vector(image, directory, all_statistics=True, sort=True, vector_length=200, splitted_columns=True, remove_suffix=True, image_as_index=True))
    
  return pd.concat(all_dfs)

In [245]:
vdir = experiment_dir + vectors_dir
vector_folder_to_df(vdir, limit=50)

Unnamed: 0_level_0,5th_percentile,95th_percentile,max,mean,median,min,var
image,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
mfy3-1_7_101,"[0, 0, 0, 0, 0, 0, 0, 6, 50, 62, 72, 75, 78, 8...","[0, 201, 208, 205, 194, 174, 187, 189, 157, 16...","[0, 201, 212, 212, 208, 202, 208, 211, 211, 21...","[0, 50, 71, 66, 53, 65, 68, 83, 88, 93, 94, 94...","[0, 0, 19, 13, 17, 62, 69, 80, 82, 86, 89, 88,...","[0, 0, 0, 0, 0, 0, 0, 0, 19, 52, 60, 71, 75, 7...","[0, 255, 255, 255, 255, 255, 255, 255, 255, 25..."
121183_2_E2_001,"[251, 246, 245, 245, 242, 221, 190, 167, 168, ...","[251, 247, 249, 250, 249, 249, 249, 250, 248, ...","[251, 247, 250, 251, 251, 249, 251, 250, 251, ...","[251, 247, 247, 247, 246, 239, 234, 220, 216, ...","[251, 247, 247, 247, 247, 245, 246, 242, 230, ...","[251, 246, 245, 243, 239, 197, 172, 153, 162, ...","[0, 0, 1, 3, 7, 148, 255, 255, 255, 255, 255, ..."
auw1_16_1_001,"[194, 188, 0, 169, 167, 147, 126, 87, 73, 86, ...","[194, 218, 192, 194, 194, 194, 195, 195, 193, ...","[194, 218, 192, 202, 195, 195, 199, 195, 195, ...","[194, 197, 168, 186, 180, 178, 180, 173, 171, ...","[194, 190, 187, 187, 188, 186, 188, 188, 188, ...","[194, 188, 0, 165, 0, 142, 90, 0, 0, 0, 56, 10...","[0, 152, 255, 65, 255, 255, 255, 255, 255, 255..."
nsa3_16_1_001,"[0, 0, 0, 0, 0, 24, 13, 4, 60, 93, 111, 113, 1...","[0, 0, 209, 132, 130, 130, 130, 135, 137, 140,...","[0, 0, 209, 200, 209, 142, 139, 138, 148, 151,...","[0, 0, 79, 64, 81, 93, 98, 109, 116, 121, 131,...","[0, 0, 54, 68, 87, 104, 112, 125, 125, 129, 13...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 101, 108, 114, ...","[0, 0, 255, 255, 255, 255, 255, 255, 255, 255,..."
125749_3_001,"[251, 249, 248, 249, 247, 244, 231, 193, 121, ...","[251, 251, 252, 251, 251, 251, 251, 250, 249, ...","[251, 251, 252, 252, 252, 251, 251, 251, 253, ...","[251, 250, 250, 250, 249, 248, 244, 229, 206, ...","[251, 250, 250, 250, 250, 248, 247, 239, 222, ...","[251, 249, 248, 249, 247, 242, 225, 189, 109, ...","[0, 0, 1, 0, 2, 8, 53, 255, 255, 255, 255, 255..."
120358_1_E1_001,"[0, 240, 237, 237, 238, 238, 233, 224, 208, 17...","[0, 242, 245, 245, 247, 245, 246, 245, 244, 24...","[0, 242, 245, 247, 250, 250, 250, 251, 248, 24...","[0, 241, 241, 241, 242, 242, 241, 238, 233, 22...","[0, 241, 241, 241, 242, 241, 241, 240, 240, 23...","[0, 240, 237, 237, 237, 238, 231, 216, 186, 17...","[0, 1, 5, 6, 8, 8, 14, 48, 174, 255, 255, 255,..."
czx8-9,"[226, 221, 214, 213, 216, 215, 216, 211, 211, ...","[226, 239, 235, 238, 235, 232, 232, 234, 233, ...","[226, 239, 235, 239, 237, 233, 233, 237, 237, ...","[226, 230, 227, 226, 225, 223, 223, 224, 224, ...","[226, 230, 228, 226, 223, 223, 223, 224, 226, ...","[226, 221, 214, 212, 214, 214, 214, 207, 207, ...","[0, 40, 32, 48, 37, 30, 26, 55, 63, 70, 99, 11..."
27025_2_001,"[251, 249, 247, 249, 242, 223, 199, 162, 151, ...","[251, 251, 249, 251, 251, 251, 251, 250, 250, ...","[251, 251, 249, 252, 251, 252, 251, 252, 251, ...","[251, 250, 248, 249, 248, 242, 237, 221, 211, ...","[251, 250, 248, 249, 249, 248, 248, 240, 223, ...","[251, 249, 247, 248, 240, 212, 191, 156, 149, ...","[0, 0, 1, 1, 7, 108, 255, 255, 255, 255, 255, ..."
zxp4-9,"[243, 236, 216, 187, 161, 150, 149, 146, 128, ...","[243, 245, 240, 242, 241, 241, 241, 240, 242, ...","[243, 245, 241, 242, 241, 242, 241, 242, 242, ...","[243, 241, 234, 227, 217, 204, 201, 196, 195, ...","[243, 242, 238, 234, 236, 230, 212, 186, 178, ...","[243, 236, 216, 169, 156, 149, 147, 138, 112, ...","[0, 10, 63, 255, 255, 255, 255, 255, 255, 255,..."
iqo4-7_B,"[0, 238, 236, 238, 237, 237, 237, 236, 234, 22...","[0, 243, 243, 243, 243, 243, 243, 243, 243, 24...","[0, 243, 243, 243, 246, 243, 244, 246, 244, 24...","[0, 241, 241, 241, 240, 240, 240, 240, 239, 23...","[0, 241, 242, 241, 241, 241, 241, 240, 238, 23...","[0, 238, 236, 237, 234, 237, 234, 236, 232, 22...","[0, 4, 5, 3, 6, 3, 4, 6, 8, 22, 128, 145, 210,..."
