In [12]:
import os

In [1]:
import json
import pandas as pd
import glob
import re


In [2]:
# Schema JSON file: forward slashes keep the path valid on every OS
# (a backslash-separated path only resolves on Windows).
schema_file_path = 'data/retail_db/schemas.json'

# JSON is UTF-8 by specification, so do not rely on the platform default encoding.
with open(schema_file_path, 'r', encoding='utf-8') as schema_file:
    schemas = json.load(schema_file)

In [3]:
# Function - Get Column Level Details
def get_column_name(schema, category, sort_key='column_position'):
    """Return the column names of one table, ordered by ``sort_key``.

    Args:
        schema: mapping of table name -> list of column dicts
            (the data loaded from the schemas JSON file).
        category: name of the table (e.g. orders, customers etc.).
        sort_key: key of each column dict used for ordering;
            ``column_position`` is the default.
    Returns:
        List of column names, or None when ``category`` is not in ``schema``
        or when its column entries cannot be sorted/read (a message is
        printed in the latter case).
    """
    try:
        if category not in schema:
            return None
        ordered_columns = sorted(schema[category], key=lambda col: col[sort_key])
        return [column['column_name'] for column in ordered_columns]
    except (KeyError, TypeError, AttributeError) as ex:
        # Best-effort contract: report the problem with context and signal
        # failure with an explicit None rather than an implicit fall-through.
        print(f'Could not read columns for {category!r}: {ex!r}')
        return None

In [5]:
# Every part file that sits exactly one directory below data/retail_db
src_file_pattern = 'data/retail_db/*/part-*'
src_file_names = glob.glob(src_file_pattern)

In [6]:
# Sample string mixing a forward slash with (escaped) backslashes,
# used below to try out different ways of splitting a path.
sample_text = 'b/f\\s\\n'

In [7]:
# Splitting on '/' alone leaves the backslash-separated parts joined together.
sample_text.split('/')

['b', 'f\\s\\n']

In [8]:
# Splitting on a backslash alone leaves the '/'-separated parts joined together.
sample_text.split('\\')

['b/f', 's', 'n']

In [9]:
# Split on either separator in one pass. The raw string yields the same
# regex as before but without the invalid '\]' escape sequence.
re.split(r'[/\\]', sample_text)

['b', 'f', 's', 'n']

In [10]:
# Show the components of every source path; the raw-string regex matches
# both '/' and backslash without relying on an invalid escape sequence.
for file in src_file_names:
    print(re.split(r'[/\\]', file))

['data', 'retail_db', 'categories', 'part-00000']
['data', 'retail_db', 'customers', 'part-00000']
['data', 'retail_db', 'departments', 'part-00000']
['data', 'retail_db', 'orders', 'part-00000']
['data', 'retail_db', 'order_items', 'part-00000']
['data', 'retail_db', 'products', 'part-00000']


In [11]:
# Load each part file with the column names from the schema and report its shape.
for file in src_file_names:
    # Raw-string regex: splits on '/' or backslash, no invalid escape sequence.
    file_splitted = re.split(r'[/\\]', file)
    # The second-to-last component is the directory holding the part file,
    # which is used as the table name for the schema lookup.
    file_name = file_splitted[-2]
    file_columns = get_column_name(schema=schemas, category=file_name)
    df = pd.read_csv(filepath_or_buffer=file, names=file_columns)
    print(f'Shape of {file_name} is {df.shape}')

Shape of categories is (58, 3)
Shape of customers is (12435, 9)
Shape of departments is (6, 2)
Shape of orders is (68883, 4)
Shape of order_items is (172198, 6)
Shape of products is (1345, 6)


In [41]:
def list_files(src_location, pattern=None):
    """Return the paths matched by a glob search.

    Args:
        src_location: base location (itself a valid glob expression).
        pattern: optional glob pattern appended to ``src_location`` with a '/'.
    Returns: List of matching paths (``**`` is expanded recursively).
    """
    search_expr = f'{src_location}/{pattern}' if pattern else src_location
    return glob.glob(search_expr, recursive=True)

In [39]:
def get_filename_from_path(file_path, pattern=r'[/\\]'):
    """Return the name of the directory that directly contains the file.

    The path is split into components and the second-to-last one is
    returned, e.g. ``data/retail_db/orders/part-00000`` gives ``orders``.

    Args:
        file_path: path string to split.
        pattern: regular expression matching the separators; the default
            matches both the forward slash and the backslash.
    Returns: The second-to-last path component.
    Raises:
        IndexError: if the path has fewer than two components.
    """
    # Honour the caller-supplied pattern (it used to be ignored).
    path_parts = re.split(pattern, file_path)
    return path_parts[-2]

In [26]:
def read_df(file_path, column_names):
    """Read a CSV source into a DataFrame using the supplied column names.

    Args:
        file_path: path or buffer handed to ``pd.read_csv``.
        column_names: list of names assigned to the columns.
    Returns: The loaded DataFrame.
    """
    return pd.read_csv(file_path, names=column_names)

In [22]:
def create_target_folder(folder_name=None):
    """Create the target folder under ``data/structured`` if it is missing.

    Args:
        folder_name: optional sub-folder to create below the base folder;
            when None only the base folder itself is created.
    Returns: None (the result of ``os.makedirs``).
    """
    target_base_path = 'data/structured'
    if folder_name is None:
        folder_path = target_base_path
    else:
        folder_path = os.path.join(target_base_path, folder_name)
    # Create the computed path; previously only the base folder was created
    # and ``folder_name`` had no effect.
    return os.makedirs(name=folder_path, exist_ok=True)

In [47]:
def generate_target_destination(tgt_base_dir, file_name, extension):
    """Build the destination path ``<tgt_base_dir>/<file_name>.<extension>``.

    Args:
        tgt_base_dir: directory part of the destination.
        file_name: file name without extension.
        extension: extension without the leading dot.
    Returns: The assembled path string.
    """
    return f'{tgt_base_dir}/{file_name}.{extension}'

In [29]:
def generate_json_doc(df, json_file_path):
    """Write a DataFrame as line-delimited JSON (one record per line).

    Args:
        df: DataFrame to serialise.
        json_file_path: destination file path.
    Returns: True when the file was written, False when writing failed
        (the reason is printed instead of being silently dropped).
    """
    try:
        df.to_json(
            json_file_path,
            orient='records',
            lines=True
        )
        return True
    except Exception as ex:
        # Keep the boolean best-effort contract, but tell the user why it failed.
        print(f'Could not write {json_file_path}: {ex}')
        return False

In [40]:
# Directory that receives the generated JSON files; create it up front,
# otherwise every write would fail.
target_base_dir = 'data/structured'
os.makedirs(target_base_dir, exist_ok=True)

for file_path in src_file_names:

    print(file_path)

    # FILE NAME (name of the directory holding the part file)
    file_name = get_filename_from_path(file_path)

    # COLUMNS
    column_names = get_column_name(schemas, file_name)

    # TO DF
    df = read_df(file_path, column_names)

    # TARGET FILE LOC - the helper takes (base dir, file name, extension)
    json_file_path = generate_target_destination(target_base_dir, file_name, 'json')

    # DF TO JSON
    status = generate_json_doc(df, json_file_path)

    if status:
        print(f'{file_name}.json created!')
    else:
        print(f'{file_name}.json could not be created')

data/retail_db\categories\part-00000
categories.json created!
data/retail_db\customers\part-00000
customers.json created!
data/retail_db\departments\part-00000
departments.json created!
data/retail_db\orders\part-00000
orders.json created!
data/retail_db\order_items\part-00000
order_items.json created!
data/retail_db\products\part-00000
products.json created!


In [48]:
def csv_to_json_converter(src_base_dir, tgt_bae_dir, ds_name):
    """Convert the part files of one dataset to line-delimited JSON.

    Args:
        src_base_dir: directory containing ``schemas.json`` and one
            sub-directory per dataset.
        tgt_bae_dir: directory that receives the JSON output; created when
            missing. (Parameter name kept as-is for existing keyword callers.)
        ds_name: dataset sub-directory name; used in a glob expression.
    Returns: None. Prints one line per file that was written.
    """
    # Context manager so the schema file handle is closed deterministically.
    with open(f'{src_base_dir}/schemas.json') as schema_file:
        schemas = json.load(schema_file)
    files = glob.glob(f'{src_base_dir}/{ds_name}/part-*')

    # Without the target directory every write would fail.
    os.makedirs(tgt_bae_dir, exist_ok=True)

    for file in files:
        # FILE NAME (name of the directory holding the part file)
        file_name = get_filename_from_path(file)

        # COLUMNS
        column_names = get_column_name(schemas, file_name)

        # TO DF - read the file of this iteration, not a leftover global path
        df = read_df(file, column_names)

        # TARGET FILE LOC
        # NOTE(review): the target name depends only on the directory name, so
        # several part files of one dataset would overwrite each other - confirm.
        json_file_path = generate_target_destination(tgt_bae_dir, file_name, 'json')

        # DF TO JSON
        status = generate_json_doc(df, json_file_path)

        if status:
            print(f'{file_name}.json created!')


In [45]:
# The converter takes (source base dir, target base dir, dataset name).
csv_to_json_converter('data/retail_db', 'data/structured', 'orders')

orders.json created!


In [46]:
# The converter takes (source base dir, target base dir, dataset name).
csv_to_json_converter('data/retail_db', 'data/structured', 'categories')

categories.json created!
