In [2]:
import glob 
import pandas as pd 
from bs4 import BeautifulSoup
from datetime import datetime 
import os
from itertools import chain


In [3]:
log_file = "log_file.txt" 
target_file = "transformed_data.csv" 

In [4]:
def extract_from_csv(file_to_process):
    """Read one CSV file and return its contents as a pandas DataFrame.

    Parameters
    ----------
    file_to_process : path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame parsed with ``pd.read_csv`` defaults.
    """
    return pd.read_csv(file_to_process)

In [5]:
def extract_from_json(file_to_process):
    """Read a line-delimited JSON file and return it as a pandas DataFrame.

    Parameters
    ----------
    file_to_process : path of the JSON file to read; it is parsed with
        ``lines=True``, i.e. one JSON object per line.

    Returns
    -------
    pandas.DataFrame with one row per line of the file.
    """
    return pd.read_json(file_to_process, lines=True)

In [6]:
def extract_from_xml(file_to_process):
    """Parse an XML file of ``person`` elements into a pandas DataFrame.

    Every ``person`` element is read for its ``name``, ``height`` and
    ``weight`` child tags; the text of each becomes one cell of the result.

    Parameters
    ----------
    file_to_process : path of the XML file to read.

    Returns
    -------
    pandas.DataFrame with columns ``name``, ``height``, ``weight`` (in that
    order) holding plain ``str`` values, or ``None`` where a tag has no single
    text child. A file without ``person`` elements yields an empty frame that
    still has the three columns.

    Raises
    ------
    AttributeError if a ``person`` element lacks one of the three child tags.
    """
    # Binary mode hands the raw bytes to the parser so the encoding declared
    # in the XML document is honoured instead of the platform's locale default.
    with open(file_to_process, 'rb') as fp:
        soup = BeautifulSoup(fp, 'xml')

    def _text(tag):
        # tag.string is a NavigableString that keeps a reference to the whole
        # parse tree; copy it into a plain str so the DataFrame does not pin
        # the soup in memory.
        value = tag.string
        return None if value is None else str(value)

    records = [
        {
            # person.name is the tag's own name ('person'), so the child tag
            # called 'name' has to be looked up with find().
            'name': _text(person.find('name')),
            'height': _text(person.height),
            'weight': _text(person.weight),
        }
        for person in soup.find_all('person')
    ]

    # Build the frame once from the collected records; the explicit column
    # list fixes the column order and keeps the schema for empty input.
    return pd.DataFrame(records, columns=['name', 'height', 'weight'])
        

In [7]:
def extract(source_dir='source'):
    """Collect every CSV, JSON and XML file in a directory into one DataFrame.

    Files are read with ``extract_from_csv``, ``extract_from_json`` and
    ``extract_from_xml`` respectively, CSV files first, then JSON, then XML.
    Within each file type the paths are processed in sorted order, because
    ``glob.glob`` returns matches in an OS-dependent order and the row order
    of the result would otherwise not be reproducible.

    Parameters
    ----------
    source_dir : directory to scan (absolute or relative). Defaults to
        ``'source'``.

    Returns
    -------
    pandas.DataFrame: all per-file frames concatenated with a fresh
    0..n-1 index.

    Raises
    ------
    ValueError if the directory contains no matching file.
    """
    readers = (
        ('*.csv', extract_from_csv),
        ('*.json', extract_from_json),
        ('*.xml', extract_from_xml),
    )

    frames = []
    for pattern, reader in readers:
        for path in sorted(glob.glob(os.path.join(source_dir, pattern))):
            frames.append(reader(path))

    if not frames:
        # pd.concat would raise a ValueError here anyway; raise the same
        # exception type with a message that names the actual problem.
        raise ValueError(
            "No .csv, .json or .xml files found in " + repr(source_dir)
        )

    return pd.concat(frames, ignore_index=True)
    
    

In [8]:

def transform(data):
    """Return a copy of ``data`` with height in meters and weight in kilograms.

    ``height`` is treated as inches and ``weight`` as pounds; both columns are
    cast to float (they may arrive as strings), converted, and rounded to two
    decimals. The input frame is left untouched, so calling this function
    repeatedly on the same extracted data always gives the same result.

    Parameters
    ----------
    data : pandas.DataFrame with ``height`` and ``weight`` columns.

    Returns
    -------
    pandas.DataFrame: a new frame with the two columns converted; all other
    columns are carried over unchanged.
    """
    meters_per_inch = 0.0254
    kilograms_per_pound = 0.45359237

    # assign() builds a new DataFrame instead of writing into the caller's.
    return data.assign(
        height=(data['height'].astype(float) * meters_per_inch).round(2),
        weight=(data['weight'].astype(float) * kilograms_per_pound).round(2),
    )

In [9]:
def load_data(target_file, transformed_data):
    """Write the transformed DataFrame to a CSV file.

    Parameters
    ----------
    target_file : destination path of the CSV file.
    transformed_data : pandas.DataFrame to save; written with ``to_csv``
        defaults, so the index is included as the first column.
    """
    transformed_data.to_csv(path_or_buf=target_file)

In [10]:
def log_progress(message, log_path=None):
    """Append one ``timestamp,message`` line to the log file.

    Parameters
    ----------
    message : text to record.
    log_path : file to append to. When omitted, the module-level
        ``log_file`` is used.

    The timestamp has the form Year-MonthAbbreviation-Day-Hour:Minute:Second,
    e.g. ``2024-Jan-05-13:45:10``.
    """
    # %b is the standard directive for the abbreviated month name; %h is a
    # platform-specific alias for it and is not guaranteed to exist everywhere.
    timestamp_format = '%Y-%b-%d-%H:%M:%S'
    timestamp = datetime.now().strftime(timestamp_format)

    destination = log_file if log_path is None else log_path
    with open(destination, "a") as f:
        f.write(timestamp + ',' + message + '\n')

In [11]:
# Mark the start of the whole ETL run
log_progress("ETL Job Started")

# Extract: gather the source files into a single DataFrame
log_progress("Extract phase Started")
extracted_data = extract()
log_progress("Extract phase Ended")

# Transform: run the unit conversion, then show the result
log_progress("Transform phase Started")
transformed_data = transform(extracted_data)
print("Transformed Data", transformed_data, sep="\n")
log_progress("Transform phase Ended")

# Load: write the transformed data to the target CSV file
log_progress("Load phase Started")
load_data(target_file, transformed_data)
log_progress("Load phase Ended")

# Mark the end of the whole ETL run
log_progress("ETL Job Ended")

Transformed Data
     name  height  weight
0    alex    1.67   51.25
1    ajay    1.82   61.91
2   alice    1.76   69.41
3    ravi    1.73   64.56
4     joe    1.72   65.45
5    alex    1.67   51.25
6    ajay    1.82   61.91
7   alice    1.76   69.41
8    ravi    1.73   64.56
9     joe    1.72   65.45
10   alex    1.67   51.25
11   ajay    1.82   61.91
12  alice    1.76   69.41
13   ravi    1.73   64.56
14    joe    1.72   65.45
15   jack    1.74   55.93
16    tom    1.77   64.18
17  tracy    1.78   61.90
18   john    1.72   50.97
19   jack    1.74   55.93
20    tom    1.77   64.18
21  tracy    1.78   61.90
22   john    1.72   50.97
23   jack    1.74   55.93
24    tom    1.77   64.18
25  tracy    1.78   61.90
26   john    1.72   50.97
27  simon    1.72   50.97
28  jacob    1.70   54.73
29  cindy    1.69   57.81
30   ivan    1.72   51.77
31  simon    1.72   50.97
32  jacob    1.70   54.73
33  cindy    1.69   57.81
34   ivan    1.72   51.77
35  simon    1.72   50.97
36  jacob    1.70   5