# Convert CSV files to ORC (big data) files with Python

## Install packages

In [1]:
import subprocess

########## TODO: move to a utils.py file ##########

# utility function that runs a console command, then prints its output
def run_and_print_consol_command(command):
    """Run a console command and print what it wrote to standard output.

    The captured bytes are decoded with the locale's preferred encoding
    (undecodable bytes are replaced). Going through ``str(bytes)`` and
    slicing the repr, as done previously, left tabs, backslashes and
    non-ASCII characters as escape sequences in the printed text.

    Args:
        command: Command handed unchanged to ``subprocess.run`` (a string
            or a sequence of arguments).
    """
    import locale  # local import: only this helper needs it

    byte_output = subprocess.run(command, stdout=subprocess.PIPE).stdout

    encoding = locale.getpreferredencoding(False)
    text_output = byte_output.decode(encoding, errors='replace')

    # Drop carriage returns so Windows line endings print as plain newlines.
    clean_output = text_output.replace('\r', '')

    print(clean_output)

def install_package(package_name: str, version: str = None):
    """Install a package with pip through the ``py`` launcher.

    Args:
        package_name: Requirement text placed after ``pip install``.
        version: Optional version; when truthy, ``==<version>`` is appended
            to ``package_name``.
    """
    requirement = package_name + "==" + version if version else package_name
    run_and_print_consol_command('py -m pip install ' + requirement)

In [2]:
## Tests

# run_and_print_consol_command('pip freeze')
# Install the pinned dependencies with one pip call: the whole
# space-separated requirement list is passed as `package_name`, so the
# `version` argument is left unset.
# NOTE(review): the PyPI `tk` distribution is presumably not the stdlib
# `tkinter` module imported further down — confirm it is actually needed.
install_package('pandas==1.4.0 tk==0.1.0 pyorc==0.5.0')





## Imports

In [1]:
import tkinter as tk
import tkinter.filedialog as fd

import pandas as pd

import pyorc

from os import listdir, mkdir
import shutil

from datetime import datetime

## Back end pyorc

In [4]:
class CsvToOrc():
    """Convert the ``;``-separated CSV files of a folder into ORC files.

    Every column is written as an ORC ``string``. The ORC files go into an
    ``orc`` folder created next to the CSV folder.
    """

    def __init__(self):
        pass

    def create_orc_folder(self, csv_folder):
        """Create an empty ``orc`` folder next to ``csv_folder``.

        An ``orc`` folder that already exists is deleted with its content
        and recreated.

        Args:
            csv_folder: Path of the folder holding the CSV files.

        Returns:
            The path of the ``orc`` folder, ending with ``/``.

        Raises:
            ValueError: If ``csv_folder`` is empty.
        """
        import os  # local import: the file only imports listdir/mkdir from os

        if not csv_folder:
            raise ValueError('csv_folder must be a non-empty path')

        # Compute the parent with os.path rather than str.replace: replace()
        # removed every "/<name>" occurrence (wrong parent for ".../csv/csv")
        # and mangled paths ending with a separator.
        separators = '/' + os.sep
        parent = os.path.dirname(csv_folder.rstrip(separators)) or '.'
        orc_folder = f"{parent.rstrip(separators)}/orc"

        try:
            mkdir(orc_folder)
        except FileExistsError:
            print('WARNING:     orc folder is already existing')
            print('WARNING:     it will be removed and recreated')
            shutil.rmtree(orc_folder)
            mkdir(orc_folder)

        return f'{orc_folder}/'

    def to_ocr(self, folder_path: str):
        """Write one ``<name>.csv.orc`` file per CSV file of ``folder_path``.

        The method name (``to_ocr``) is kept as-is for existing callers.
        A CSV file that cannot be read, or an ORC file that cannot be
        written, is reported on stdout and skipped; the remaining files are
        still processed.

        Args:
            folder_path: Folder containing the ``;``-separated CSV files.
        """
        orc_folder = self.create_orc_folder(folder_path)

        files = [f_name for f_name in listdir(folder_path) if f_name.endswith('.csv')]

        print(f"INFO:        Files to create: {' | '.join(files)}")

        for file in files:

            print(f"########## Creation of {file}.orc ##############")

            file_path = f"{folder_path}/{file}"
            print(f"PATH:        {file_path}")

            # Skip unreadable files: carrying on would either fail on an
            # undefined `df` or silently rewrite the previous file's data.
            try:
                df = pd.read_csv(file_path, delimiter=';')
            except UnicodeDecodeError:
                print('Error:   CSV file encoding is not utf8')
                continue
            except (OSError, ValueError) as err:
                print(f'Error:   could not read {file_path}: {err}')
                continue

            df = df.astype(str)

            col_schema = ','.join(f"{name}:string" for name in df.columns)
            schema = f"struct<{col_schema}>"
            print(f'SCHEMA:      {schema}')

            output_path = f"{orc_folder}{file}.orc"
            try:
                t1 = datetime.now()
                # Context managers close the writer and the file even when a
                # row fails to be written.
                with open(output_path, 'wb') as output:
                    with pyorc.Writer(output, schema) as writer:
                        for row in df.itertuples(index=False, name=None):
                            writer.write(row)
                t2 = datetime.now()
                print(f'INFO:       OK')
                print(f'INFO:       Time to write: {t2 - t1} s')
            except Exception as err:  # per-file boundary: report and go on
                print(f'ERROR:      KO, could not create {output_path}')
                print(f'ERROR:      {err}')

# Module-level converter instance used for the manual run below.
convertor = CsvToOrc()

# Convert the CSV files of this folder (hard-coded local Windows path).
convertor.to_ocr('C:/Users/Administrateur/Desktop/Formation python Thibault Briand/py_to_orc')

INFO:        Files to create: Prenoms.csv
########## Creation of Prenoms.csv.orc ##############
PATH:        C:/Users/Administrateur/Desktop/Formation python Thibault Briand/py_to_orc/Prenoms.csv
SCHEMA:      struct<01_prenom:string,02_genre:string,03_langage:string,04_frequence:string>
INFO:       OK
INFO:       Time to write: 0:00:01.057996 s


## Front end tkinter

In [5]:
class CsvToOrcIhm():
    """Small tkinter window to pick a folder of CSV files and convert them to ORC."""

    def __init__(self):
        self.__app = tk.Tk()
        self.app.title("Python for big data | csv to orc")
        # self.app.geometry("600x400")
        self.__selected_folder = ''
        self.__modules_ihm = self.modules_ihm()
        self.__convertor = CsvToOrc()
        self.pack()

    @property
    def app(self):
        """The root ``tk.Tk`` window."""
        return self.__app

    def modules_ihm(self):
        """Create the widgets and variables of the window.

        Returns:
            A dict mapping a widget/variable name to its tkinter object.
        """
        modules_ihm = {}

        modules_ihm['info_lb'] = tk.Label(self.app, text= 'Select the directory where CSV files were created by absinthe.')
        modules_ihm['output_label'] = tk.Label(self.app, text= 'output')
        modules_ihm['dir_select_bt'] = tk.Button(self.app, text = 'Select Directory', command=self.folder_select)
        modules_ihm['exe_bt'] = tk.Button(self.app, text = 'Convert', command=self.start_convertor)
        modules_ihm['file_list'] = tk.Listbox(self.app)
        modules_ihm['selected_folder'] = tk.StringVar()

        return modules_ihm

    def pack(self):
        """Lay the widgets out from top to bottom."""
        for name in ('info_lb', 'dir_select_bt', 'file_list', 'exe_bt', 'output_label'):
            self.__modules_ihm[name].pack(padx=5, pady=5)

    def update_list(self):
        """Show the CSV files of the selected folder, one listbox row per file."""
        files = sorted(
            f_name for f_name in listdir(self.__selected_folder)
            if f_name.endswith('.csv')
        )
        file_list = self.__modules_ihm['file_list']
        # Clear first so a new selection does not pile up on the previous
        # one, then unpack the names: insert(0, files) added the whole list
        # as a single row.
        file_list.delete(0, tk.END)
        file_list.insert(tk.END, *files)

    def folder_select(self):
        """Ask for a directory and refresh the file list.

        A cancelled dialog (empty result) keeps the previous selection.
        """
        folder = fd.askdirectory()
        if not folder:
            return
        self.__selected_folder = folder
        self.__modules_ihm['selected_folder'].set(folder)
        self.update_list()

    def start_convertor(self):
        """Convert the CSV files of the selected folder and report in the output label."""
        output_label = self.__modules_ihm['output_label']
        if not self.__selected_folder:
            # Nothing selected yet: converting an empty path would fail.
            output_label.config(text='No directory selected')
            return
        self.__convertor.to_ocr(self.__selected_folder)
        output_label.config(text='Conversion finished')

    def run(self):
        """Enter the tkinter main loop."""
        self.app.mainloop()


In [6]:
# Build the window and enter the tkinter main loop.
CsvToOrcIhm().run()

INFO:        Files to create: Prenoms.csv
########## Creation of Prenoms.csv.orc ##############
PATH:        C:/Users/Administrateur/Desktop/Formation python Thibault Briand/py_to_orc/Prenoms.csv
SCHEMA:      struct<01_prenom:string,02_genre:string,03_langage:string,04_frequence:string>
INFO:       OK
INFO:       Time to write: 0:00:01.056998 s
