In [37]:
import glob 
import pandas as pd 
from bs4 import BeautifulSoup
from datetime import datetime 
import os
from itertools import chain


In [12]:
# Output locations used by the pipeline (relative to the working directory).
log_file = "log_file.txt"  # progress log; log_progress() appends one line per call
target_file = "transformed_data.csv"  # destination handed to load_data()

In [36]:
def extract_from_csv(file_to_process):
    """Read one CSV file into a DataFrame.

    Parameters
    ----------
    file_to_process : str or path-like
        Location of the CSV file; forwarded unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents, using ``pd.read_csv`` defaults.
    """
    return pd.read_csv(file_to_process)

In [4]:
def extract_from_json(file_to_process):
    """Read one JSON Lines file into a DataFrame.

    Parameters
    ----------
    file_to_process : str or path-like
        Location of the file; it is read with ``lines=True``, i.e. one JSON
        object per line.

    Returns
    -------
    pandas.DataFrame
        One row per JSON object in the file.
    """
    return pd.read_json(file_to_process, lines=True)

In [19]:
def extract_from_xml(file_to_process):
    """Parse an XML file of ``<row>`` records into a DataFrame.

    Every ``<row>`` element becomes one row of the result; the text of its
    ``car_model``, ``year_of_manufacture``, ``price`` and ``fuel`` child tags
    fills the columns of the same names.

    Parameters
    ----------
    file_to_process : str or path-like
        Location of the XML file.

    Returns
    -------
    pandas.DataFrame
        Columns ``car_model``, ``year_of_manufacture``, ``price``, ``fuel``.
        All values are plain strings exactly as they appear in the XML (no
        numeric conversion is done here). A field whose tag is missing from a
        row, or has no single text child, is ``None``.
    """
    fields = ['car_model', 'year_of_manufacture', 'price', 'fuel']

    def field_text(row, field):
        """Return the text of ``row``'s child tag ``field`` as ``str``, or ``None``."""
        tag = row.find(field)
        if tag is None or tag.string is None:
            # Missing/empty field: record a gap instead of aborting the whole file.
            return None
        # str() detaches the value from the parse tree; a NavigableString keeps
        # a reference to the entire BeautifulSoup document alive.
        return str(tag.string)

    # Binary mode hands raw bytes to Beautiful Soup so it can detect the
    # document's encoding itself instead of decoding with the locale default.
    with open(file_to_process, 'rb') as fp:
        soup = BeautifulSoup(fp, 'xml')

    # Build all records first and create the frame once.
    records = [
        {field: field_text(row, field) for field in fields}
        for row in soup.find_all('row')
    ]
    return pd.DataFrame(records, columns=fields)

In [35]:
def extract(source_dir='datasource'):
    """Load every CSV, JSON and XML file in ``source_dir`` into one DataFrame.

    Files are read with ``extract_from_csv``, ``extract_from_json`` and
    ``extract_from_xml`` respectively. CSV frames come first, then JSON, then
    XML; within each format the files are processed in sorted path order so
    the row order of the result does not depend on the order in which the
    operating system happens to list the directory.

    Parameters
    ----------
    source_dir : str, optional
        Directory to scan (absolute or relative). Defaults to ``'datasource'``.

    Returns
    -------
    pandas.DataFrame
        All extracted rows concatenated with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``source_dir`` contains no ``.csv``, ``.json`` or ``.xml`` file.
    """
    readers = (
        ('*.csv', extract_from_csv),
        ('*.json', extract_from_json),
        ('*.xml', extract_from_xml),
    )
    frames = [
        reader(path)
        for pattern, reader in readers
        for path in sorted(glob.glob(os.path.join(source_dir, pattern)))
    ]
    if not frames:
        # pd.concat would raise an opaque "No objects to concatenate" here;
        # keep the exception type but say what is actually wrong.
        raise ValueError(
            f"No .csv, .json or .xml files found in {source_dir!r}"
        )
    return pd.concat(frames, ignore_index=True)
    
    

In [40]:
def transform(data):
    """Return a copy of ``data`` with ``price`` as floats rounded to 2 decimals.

    The input frame is left untouched, so the extracted data stays available
    in its original form and re-running a later cell cannot see a
    half-transformed frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``price`` column whose values can be cast to ``float``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns in the same order; only ``price``
        differs from the input.
    """
    # Vectorised cast + round over the whole column.
    rounded_price = data['price'].astype(float).round(2)
    # assign() builds a new frame instead of writing into the caller's object.
    return data.assign(price=rounded_price)

In [41]:
def load_data(target_file, transformed_data):
    """Write ``transformed_data`` to ``target_file`` as CSV.

    Parameters
    ----------
    target_file : str or path-like
        Destination path; an existing file is overwritten.
    transformed_data : pandas.DataFrame
        Frame to persist. ``DataFrame.to_csv`` defaults are used, so the
        index is written as the first (unnamed) column.
    """
    transformed_data.to_csv(path_or_buf=target_file)

In [42]:
def log_progress(message):
    """Append a timestamped entry to the file named by the global ``log_file``.

    Each call writes one line of the form
    ``<Year>-<MonthAbbrev>-<Day>-<Hour>:<Minute>:<Second>,<message>``.

    Parameters
    ----------
    message : str
        Text to record after the timestamp.
    """
    # %b is the documented directive for the abbreviated month name. The
    # previous %h is a C-library alias that Python does not document and that
    # is not accepted on every platform.
    timestamp_format = '%Y-%b-%d-%H:%M:%S'
    timestamp = datetime.now().strftime(timestamp_format)
    # Append mode keeps entries from earlier calls and earlier runs.
    with open(log_file, "a") as f:
        f.write(timestamp + ',' + message + '\n')

In [47]:
# Run the ETL job end to end; log_progress() records the start and end of
# the job and of each phase.
log_progress("ETL Job Started")

# Extract phase
log_progress("Extract phase Started")
extracted_data = extract()
log_progress("Extract phase Ended")

# Transform phase
log_progress("Transform phase Started")
transformed_data = transform(extracted_data)
print("Transformed Data")
# The display limits are lifted only inside this context so that every row
# and column is printed; the previous pandas options are restored afterwards.
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.precision', 3):
    print(transformed_data)
log_progress("Transform phase Ended")

# Load phase
log_progress("Load phase Started")
load_data(target_file, transformed_data)
log_progress("Load phase Ended")

log_progress("ETL Job Ended")

Transformed Data
        car_model year_of_manufacture     price    fuel
0            ritz                2014   5000.00  Petrol
1             sx4                2013   7089.55  Diesel
2            ciaz                2017  10820.90  Petrol
3         wagon r                2011   4253.73  Petrol
4           swift                2014   6865.67  Diesel
5   vitara brezza                2018  13805.97  Diesel
6            ciaz                2015  10074.63  Petrol
7         s cross                2015   9701.49  Diesel
8            ciaz                2016  13059.70  Diesel
9            ciaz                2015  11119.40  Diesel
10       alto 800                2017   4253.73  Petrol
11           ciaz                2015  10223.88  Diesel
12           ciaz                2015  11194.03  Petrol
13         ertiga                2015   9104.48  Petrol
14          dzire                2009   3358.21  Petrol
15         ertiga                2016  11567.16  Diesel
16         ertiga              