# 145. Get File Names to be processed using glob

In [1]:
import glob
from email.feedparser import headerRE

In [2]:
help(glob)

Help on module glob:

NAME
    glob - Filename globbing utility.

MODULE REFERENCE
    https://docs.python.org/3.12/library/glob.html

    The following documentation is automatically generated from the Python
    source files.  It may be incomplete, incorrect or include features that
    are considered implementation detail and may vary between Python
    implementations.  When in doubt, consult the module reference at the
    location listed above.

FUNCTIONS
    escape(pathname)
        Escape all special characters.

    glob(pathname, *, root_dir=None, dir_fd=None, recursive=False, include_hidden=False)
        Return a list of paths matching a pathname pattern.

        The pattern may contain simple shell-style wildcards a la
        fnmatch. Unlike fnmatch, filenames starting with a
        dot are special cases that are not matched by '*' and '?'
        patterns by default.

        If `include_hidden` is true, the patterns '*', '?', '**'  will match hidden
        directorie

In [3]:
help(glob.glob)

Help on function glob in module glob:

glob(pathname, *, root_dir=None, dir_fd=None, recursive=False, include_hidden=False)
    Return a list of paths matching a pathname pattern.

    The pattern may contain simple shell-style wildcards a la
    fnmatch. Unlike fnmatch, filenames starting with a
    dot are special cases that are not matched by '*' and '?'
    patterns by default.

    If `include_hidden` is true, the patterns '*', '?', '**'  will match hidden
    directories.

    If `recursive` is true, the pattern '**' will match any files and
    zero or more directories and subdirectories.



In [4]:
# With recursive=True, '**' matches files as well as zero or more directories,
# so each data set directory is listed alongside the part file inside it.
glob.glob('data/retail_db/*/**', recursive=True)

['data/retail_db\\categories\\',
 'data/retail_db\\categories\\part-00000',
 'data/retail_db\\customers\\',
 'data/retail_db\\customers\\part-00000',
 'data/retail_db\\departments\\',
 'data/retail_db\\departments\\part-00000',
 'data/retail_db\\orders\\',
 'data/retail_db\\orders\\part-00000',
 'data/retail_db\\order_items\\',
 'data/retail_db\\order_items\\part-00000',
 'data/retail_db\\products\\',
 'data/retail_db\\products\\part-00000']

In [5]:
# Same pattern without recursive=True: only the part files are returned,
# the data set directories themselves are not.
glob.glob('data/retail_db/*/**')

['data/retail_db\\categories\\part-00000',
 'data/retail_db\\customers\\part-00000',
 'data/retail_db\\departments\\part-00000',
 'data/retail_db\\orders\\part-00000',
 'data/retail_db\\order_items\\part-00000',
 'data/retail_db\\products\\part-00000']

In [6]:
src_file_names = glob.glob('data/retail_db/*/part-*')

In [7]:
src_file_names

['data/retail_db\\categories\\part-00000',
 'data/retail_db\\customers\\part-00000',
 'data/retail_db\\departments\\part-00000',
 'data/retail_db\\orders\\part-00000',
 'data/retail_db\\order_items\\part-00000',
 'data/retail_db\\products\\part-00000']

In [8]:
import pandas as pd

In [11]:
# Read every source file with header=None (no row is treated as column names)
# and print its (rows, columns) shape as a quick sanity check.
for file_name in src_file_names:
    df = pd.read_csv(file_name, header=None)
    print(f'Shape of {file_name} is {df.shape}')

Shape of data/retail_db\categories\part-00000 is (58, 3)
Shape of data/retail_db\customers\part-00000 is (12435, 9)
Shape of data/retail_db\departments\part-00000 is (6, 2)
Shape of data/retail_db\orders\part-00000 is (68883, 4)
Shape of data/retail_db\order_items\part-00000 is (172198, 6)
Shape of data/retail_db\products\part-00000 is (1345, 6)


# 146. Get Column Names using Schemas File

In [14]:
import json

In [15]:
def get_column_names(schemas, df_name, sorting_key='column_position'):
    """Return the column names of one data set, ordered by ``sorting_key``.

    Args:
        schemas: Mapping from data set name to a collection of column-detail
            dicts. Each dict must contain ``'column_name'`` and the key named
            by ``sorting_key``.
        df_name: Name of the data set whose columns are wanted.
        sorting_key: Key inside each column-detail dict used for ordering.

    Returns:
        A list of column names in ascending ``sorting_key`` order.

    Raises:
        KeyError: If ``df_name`` is not in ``schemas`` or a column-detail
            dict lacks ``sorting_key`` or ``'column_name'``.
    """
    # Work on a copy so the caller's schema entries keep their original order.
    ordered_details = list(schemas[df_name])
    ordered_details.sort(key=lambda detail: detail[sorting_key])

    names = []
    for detail in ordered_details:
        names.append(detail['column_name'])
    return names

In [16]:
# Load the schema definitions. The context manager closes the file handle as
# soon as parsing finishes instead of leaving it open until garbage collection.
with open('data/retail_db/schemas.json') as schemas_file:
    schemas = json.load(schemas_file)

In [17]:
order_columns = get_column_names(schemas, 'orders')

In [18]:
order_columns

['order_id', 'order_date', 'order_customer_id', 'order_status']

In [19]:
import pandas as pd

In [20]:
# Supply the schema-derived names as column labels for the orders file.
orders = pd.read_csv('data/retail_db/orders/part-00000', names=order_columns)

In [21]:
orders

Unnamed: 0,order_id,order_date,order_customer_id,order_status
0,1,2013-07-25 00:00:00.0,11599,CLOSED
1,2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT
2,3,2013-07-25 00:00:00.0,12111,COMPLETE
3,4,2013-07-25 00:00:00.0,8827,CLOSED
4,5,2013-07-25 00:00:00.0,11318,COMPLETE
...,...,...,...,...
68878,68879,2014-07-09 00:00:00.0,778,COMPLETE
68879,68880,2014-07-13 00:00:00.0,1117,COMPLETE
68880,68881,2014-07-19 00:00:00.0,2518,PENDING_PAYMENT
68881,68882,2014-07-22 00:00:00.0,10000,ON_HOLD


# 147. Get Data Set Names from File Names or Paths using regular expressions

In [22]:
# Load the schema definitions. The context manager closes the file handle as
# soon as parsing finishes instead of leaving it open until garbage collection.
with open('data/retail_db/schemas.json') as schemas_file:
    schemas = json.load(schemas_file)

In [23]:
import glob

In [24]:
src_file_names = glob.glob('data/retail_db/*/part*')

In [25]:
src_file_names

['data/retail_db\\categories\\part-00000',
 'data/retail_db\\customers\\part-00000',
 'data/retail_db\\departments\\part-00000',
 'data/retail_db\\orders\\part-00000',
 'data/retail_db\\order_items\\part-00000',
 'data/retail_db\\products\\part-00000']

In [26]:
import pandas as pd

In [27]:
import re

In [32]:
# Split each path on either separator: the glob results mix '/' (from the
# pattern) with '\' (added by the OS). In a raw string a single escaped
# backslash (\\) is all the character class needs.
for file in src_file_names:
    print(re.split(r'[/\\]', file))

['data', 'retail_db', 'categories', 'part-00000']
['data', 'retail_db', 'customers', 'part-00000']
['data', 'retail_db', 'departments', 'part-00000']
['data', 'retail_db', 'orders', 'part-00000']
['data', 'retail_db', 'order_items', 'part-00000']
['data', 'retail_db', 'products', 'part-00000']


# 148. Read CSV Data into Pandas Dataframe with Schema Dynamically

# 149. Generate File Paths for Target JSON Files Dynamically

In [33]:
import re

In [34]:
# Break every source path into its components, accepting both '/' and '\'
# as separators. In a raw string a single escaped backslash (\\) is all the
# character class needs.
for file in src_file_names:
    file_path_list = re.split(r'[/\\]', file)
    print(file_path_list)

['data', 'retail_db', 'categories', 'part-00000']
['data', 'retail_db', 'customers', 'part-00000']
['data', 'retail_db', 'departments', 'part-00000']
['data', 'retail_db', 'orders', 'part-00000']
['data', 'retail_db', 'order_items', 'part-00000']
['data', 'retail_db', 'products', 'part-00000']


In [44]:
tgt_base_dir = 'data/retail_db_json/'

In [36]:
file = src_file_names[0]

In [38]:
# Path components of the chosen file, split on either '/' or '\'.
file_list = re.split(r'[/\\]', file)

In [41]:
file_name = file_list[-1]

In [42]:
ds_name = file_list[-2]

In [45]:
# tgt_base_dir may already end with '/'; strip it before joining so the
# resulting path does not contain a doubled separator.
f'{tgt_base_dir.rstrip("/")}/{ds_name}/{file_name}'

'data/retail_db_json/categories/part-00000'

# 150. Recap of Writing Pandas Dataframe to JSON File

In [46]:
import pandas as pd

In [47]:
# Column labels to apply to the orders data set when reading it.
columns = ['order_id', 'order_date', 'order_customer_id', 'order_status']

In [48]:
# Read the orders file, labelling its columns with the names listed above.
df = pd.read_csv('data/retail_db/orders/part-00000', names=columns)

In [49]:
df

Unnamed: 0,order_id,order_date,order_customer_id,order_status
0,1,2013-07-25 00:00:00.0,11599,CLOSED
1,2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT
2,3,2013-07-25 00:00:00.0,12111,COMPLETE
3,4,2013-07-25 00:00:00.0,8827,CLOSED
4,5,2013-07-25 00:00:00.0,11318,COMPLETE
...,...,...,...,...
68878,68879,2014-07-09 00:00:00.0,778,COMPLETE
68879,68880,2014-07-13 00:00:00.0,1117,COMPLETE
68880,68881,2014-07-19 00:00:00.0,2518,PENDING_PAYMENT
68881,68882,2014-07-22 00:00:00.0,10000,ON_HOLD


In [50]:
import os
os.makedirs('data/retail_db_json/orders', exist_ok=True)

In [51]:
# Write the frame as line-delimited JSON (orient='records' with lines=True).
df.to_json('data/retail_db_json/orders/part-00000', orient='records', lines=True)

# 151. Write Pandas Dataframe to JSON Files

# 152. Modularize File Format Converter for Dataset

# 153. Wrapper to Process all Data Sets

# 154. Setup Project for File Format Converter using Python

# 155. Install Dependencies for the Python Project using pip

# 156. Add Core Logic to Python Application

# 157. Overview of Run-time Arguments and Environment Variables

# 158. Using Run Time Arguments in Python Applications

# 159. Overview of Environment Variables

# 160. Setting Environment Variables on Windows or Mac or Linux