# Objective
- Extract data from csv, json, and xml files
- Transform the price data to 2 decimal places
- Load the data to a csv file to prepare for uploading to a DB

In [29]:
import glob 
import pandas as pd  
from datetime import datetime 
import os
import urllib.request
import zipfile


In [30]:

# Remote archive holding the raw source files, and the local name it is saved under.
# (The URL literal previously ended with a stray space.)
url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-PY0221EN-SkillsNetwork/labs/module%206/Lab%20-%20Extract%20Transform%20Load/data/datasource.zip'
local_filename = "source.zip"

# Download the archive into the current working directory.
urllib.request.urlretrieve(url, local_filename)

# Unpack every member of the archive into the "extracted_files" folder;
# the context manager closes the archive even if extraction fails.
with zipfile.ZipFile(local_filename, 'r') as zip_ref:
    zip_ref.extractall("extracted_files")

# Module-level settings shared by the functions in this notebook.

# Text file that log_progress() appends its timestamped messages to.
log_file = "log_file.txt"

# CSV file that receives the final, transformed data (ready to load into a database).
target_file = "transformed_data.csv"

In [31]:
#Extract data from csv, json, xml

def extract_from_csv(file):
    """Read a CSV source into a DataFrame.

    Args:
        file: Path or file-like object, passed straight to ``pd.read_csv``.

    Returns:
        The parsed DataFrame.
    """
    return pd.read_csv(file)

def extract_from_json(file):
    """Read a JSON Lines source (one JSON object per line) into a DataFrame.

    Args:
        file: Path or file-like object, passed straight to ``pd.read_json``.

    Returns:
        The parsed DataFrame, one row per input line.
    """
    return pd.read_json(file, lines=True)

def extract_from_xml(file):
    """Read an XML source into a DataFrame.

    ``pd.read_xml`` uses the optional ``lxml`` package by default and raises
    ``ImportError`` when it is not installed. In that case the read is retried
    with the standard-library ``etree`` parser so extraction still works.

    Args:
        file: Path or file-like object, passed straight to ``pd.read_xml``.

    Returns:
        The parsed DataFrame.
    """
    try:
        return pd.read_xml(file)
    except ImportError:
        # lxml is missing: fall back to the parser that ships with Python.
        return pd.read_xml(file, parser="etree")



In [32]:

def extract(directory):
    """Combine every CSV, JSON and XML file in *directory* into one DataFrame.

    Files are read with ``extract_from_csv``, ``extract_from_json`` and
    ``extract_from_xml``: all CSV files first, then JSON, then XML. Within
    each type the files are processed in sorted name order so the row order
    is reproducible from run to run.

    Args:
        directory: Folder to scan for ``*.csv``, ``*.json`` and ``*.xml``
            files (sub-folders are not searched).

    Returns:
        A DataFrame with a fresh 0..n-1 index. The columns ``car_model``,
        ``year_of_manufacture``, ``price`` and ``fuel`` are always present and
        come first; any other columns found in the files follow. When no file
        matches, an empty frame with just those four columns is returned.
    """
    expected_columns = ['car_model', 'year_of_manufacture', 'price', 'fuel']

    def matching(pattern):
        # Sorted so the result does not depend on the OS directory order.
        return sorted(glob.glob(os.path.join(directory, pattern)))

    # Collect the per-file frames and concatenate once at the end, instead of
    # growing a DataFrame inside the loops (which re-copies all rows each time).
    frames = []
    frames.extend(extract_from_csv(path) for path in matching("*.csv"))
    frames.extend(extract_from_json(path) for path in matching("*.json"))
    frames.extend(extract_from_xml(path) for path in matching("*.xml"))

    if not frames:
        return pd.DataFrame(columns=expected_columns)

    combined = pd.concat(frames, ignore_index=True)

    # Keep the documented layout: expected columns first (created as NaN if a
    # source lacks them), followed by whatever else the files contained.
    extra_columns = [col for col in combined.columns if col not in expected_columns]
    return combined.reindex(columns=expected_columns + extra_columns)


In [33]:
def transform(data):
    """Return a copy of *data* with the ``price`` column rounded to 2 decimals.

    The input frame is left untouched; callers should use the returned frame.
    ``price`` is converted to a numeric dtype first, so numeric values stored
    as strings or in an object-dtype column are rounded as well.

    Args:
        data: DataFrame containing a ``price`` column.

    Returns:
        A new DataFrame, identical to *data* except for the rounded ``price``.

    Raises:
        KeyError: If *data* has no ``price`` column.
        ValueError: If a price value cannot be interpreted as a number.
    """
    transformed = data.copy()
    # Coerce before rounding: rounding an object-dtype column is not reliable.
    transformed['price'] = pd.to_numeric(transformed['price']).round(2)
    return transformed

In [34]:
def load_data(target_file, transformed_data):
    """Write *transformed_data* to *target_file* in CSV format.

    Args:
        target_file: Destination path (or writable text buffer).
        transformed_data: DataFrame to save; its index is written as the
            first, unnamed column.
    """
    transformed_data.to_csv(path_or_buf=target_file)

In [35]:
def log_progress(message):
    """Append a timestamped *message* to the log file.

    Each call adds one line of the form ``<timestamp>,<message>`` to the file
    named by the module-level ``log_file`` variable, creating it if needed.

    Args:
        message: Text to record.
    """
    # Year-Monthname-Day-Hour:Minute:Second. %b (abbreviated month name) is the
    # portable spelling of %h, which not every platform's strftime accepts.
    timestamp_format = '%Y-%b-%d-%H:%M:%S'
    timestamp = datetime.now().strftime(timestamp_format)
    # Explicit encoding so the log does not depend on the platform default.
    with open(log_file, "a", encoding="utf-8") as f:
        f.write(f"{timestamp},{message}\n")

In [36]:
        
# Run the ETL pipeline end to end. A timestamped line is appended to the log
# before and after each phase so the progress of a run can be traced afterwards.

# Mark the start of the whole job.
log_progress("ETL Job Started") 
 
# Extract: read every CSV/JSON/XML file found in the 'extracted_files' folder
# into a single DataFrame.
log_progress("Extract phase Started") 
extracted_data = extract('extracted_files') 
 
# Extraction finished.
log_progress("Extract phase Ended") 
 
# Transform: round the price column to 2 decimal places, then print the
# result for a quick visual check.
log_progress("Transform phase Started") 
transformed_data = transform(extracted_data) 
print("Transformed Data") 
print(transformed_data) 
 
# Transformation finished.
log_progress("Transform phase Ended") 
 
# Load: write the transformed data to the CSV file named by target_file.
log_progress("Load phase Started") 
load_data(target_file,transformed_data) 
 
# Loading finished.
log_progress("Load phase Ended") 
 
# Mark the end of the whole job.
log_progress("ETL Job Ended") 

Transformed Data
        car_model year_of_manufacture     price    fuel
0            ritz                2014   5000.00  Petrol
1             sx4                2013   7089.55  Diesel
2            ciaz                2017  10820.90  Petrol
3         wagon r                2011   4253.73  Petrol
4           swift                2014   6865.67  Diesel
..            ...                 ...       ...     ...
85     etios liva                2014   7089.55  Diesel
86         innova                2017  29477.61  Petrol
87       fortuner                2010  13805.97  Diesel
88  corolla altis                2011   6492.54  Petrol
89  corolla altis                2016  21268.66  Petrol

[90 rows x 4 columns]
