In [None]:
'''Reference: https://www.geeksforgeeks.org/xml-parsing-python/
              https://python.plainenglish.io/converting-xml-to-csv-using-python-d723a3df3de1
'''

<h2> Creating a bucket in AWS </h2>

In [3]:
!pip install boto3 

Collecting boto3
  Using cached https://files.pythonhosted.org/packages/23/d8/3b41ce8c96dedbb449f24de21eee0742786f414fea176f984b1101154f30/boto3-1.24.84-py3-none-any.whl
Installing collected packages: boto3
Successfully installed boto3-1.24.84


In [4]:
import boto3

In [2]:
# S3 client used by the bucket-creation cell below.
client = boto3.client(service_name='s3')

In [9]:
# Create the bucket with an explicit ap-south-1 location constraint; the call
# is left as the last expression so the service response is displayed.
client.create_bucket(
    Bucket='my-dummy-bucket1',
    CreateBucketConfiguration={'LocationConstraint': 'ap-south-1'},
)

{'ResponseMetadata': {'RequestId': 'WZWWCFRG0ARCKYA1',
  'HostId': 'lkqOL7Ya9mG9l9wQQiqyyCzSFh0qGytkA7jepQR1lCuoZKljkg11siD0AlKuFRApIWsAtOVH1C4=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'lkqOL7Ya9mG9l9wQQiqyyCzSFh0qGytkA7jepQR1lCuoZKljkg11siD0AlKuFRApIWsAtOVH1C4=',
   'x-amz-request-id': 'WZWWCFRG0ARCKYA1',
   'date': 'Sat, 01 Oct 2022 21:22:22 GMT',
   'location': 'http://my-dummy-bucket1.s3.amazonaws.com/',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'Location': 'http://my-dummy-bucket1.s3.amazonaws.com/'}

<h2> List all buckets </h2>

In [2]:
# Build an S3 client and keep the raw list_buckets response for inspection.
client = boto3.client(service_name='s3')
response = client.list_buckets()

In [3]:
# Display only the 'Buckets' entry of the list_buckets response.
response['Buckets']

[{'Name': 'my-dummy-bucket1',
  'CreationDate': datetime.datetime(2022, 10, 1, 21, 22, 23, tzinfo=tzutc())},
 {'Name': 'my-test-case-bucket',
  'CreationDate': datetime.datetime(2022, 9, 30, 18, 49, 48, tzinfo=tzutc())}]

<h2> Functions for connecting to bucket and pushing csv to bucket </h2>

In [7]:
import json
import pandas as pd
import os
import logging
import sys
from io import BytesIO
from pandas import DataFrame
from s3_bucket import S3Bucket

# Configure the root logger at INFO so the logging.info() progress messages
# emitted by the functions below are shown.
logging.basicConfig(level = logging.INFO)

def connect_to_buffer_bucket(BUCKET_NAME=os.environ.get('BUCKET_NAME', 'my-dummy-bucket1'),
                             Access_key=os.environ.get('AWS_ACCESS_KEY_ID'),
                             Secret_key=os.environ.get('AWS_SECRET_ACCESS_KEY')):
    '''
    Build an S3Bucket wrapper for the given bucket and credentials.

    Note: the defaults are read from the environment once, when this function
    is defined, so environment changes made afterwards are not picked up
    unless the values are passed explicitly.

    Params:
            BUCKET_NAME: name of s3 bucket
            Access_key: Access key credentials of user to access the s3 bucket
            Secret_key: Secret key credentials of user to access the s3 bucket

    Returns:
            The S3Bucket object on success, None if constructing it raised
            (the failure is logged, not re-raised).
    '''
    try:
        bucket = S3Bucket(
            bucket=BUCKET_NAME,
            access_key_id=Access_key,
            secret_access_key=Secret_key
        )
    except Exception as e:
        # Log at ERROR level so the failure stays visible even when the
        # logger is configured above INFO.
        logging.error(
            f'{e}\nError connecting to bucket "{BUCKET_NAME}": Please check the credentials again.'
        )
        return None

    logging.info(f'Successfully connected to the bucket "{BUCKET_NAME}"')
    return bucket


def push_csv_to_buffer_bucket(bucket: S3Bucket, dataframe: DataFrame, rel_path: str):
    '''
    Serialise a dataframe to CSV in memory and upload it to the bucket.

    Params:
           bucket: S3Bucket object to upload through (as returned by
                   connect_to_buffer_bucket)
           dataframe: dataframe generated through code; written with its
                      index and header
           rel_path: key (path inside the bucket) the CSV is stored under

    Return:
           None. Success is logged at INFO level; any exception raised while
           serialising or uploading is logged at ERROR level and swallowed,
           so callers are not interrupted.
    '''
    try:
        # Encode the CSV text as UTF-8 and wrap it in a file-like object,
        # which is what upload_file is given as `fileobj`.
        payload = dataframe.to_csv(index=True, header=True).encode("utf-8")
        bucket.upload_file(fileobj=BytesIO(payload), key=rel_path)
    except Exception as e:
        # One ERROR record per failure (the exception text is already part
        # of the message).
        logging.error(f'{e}\nError occured while pushing file to bucket path "{rel_path}"')
        return

    logging.info(f'Successfully pushed the csv file to "{rel_path}"')

<h2>Installing Selenium package</h2>

In [1]:
conda install selenium

Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\Users\Asmita\Anaconda3_second

  added / updated specs:
    - selenium


The following packages will be UPDATED:

  anaconda                                    custom-py37_1 --> 2019.03-py37_0

The following packages will be DOWNGRADED:

  ca-certificates                     2022.07.19-haa95532_0 --> 2019.1.23-0
  certifi                          2022.9.14-py37haa95532_0 --> 2019.3.9-py37_0
  openssl                                 1.1.1c-he774522_1 --> 1.1.1b-he774522_1


Preparing transaction: ...working... done
Verifying transaction: ...working... done
Executing transaction: ...working... done

Note: you may need to restart the kernel to use updated packages.




  current version: 4.12.0
  latest version: 22.9.0

Please update conda by running

    $ conda update -n base -c defaults conda




<h2> Importing necessary libraries </h2>

In [8]:
import csv
import requests
import xml.etree.ElementTree as ET
import xml.etree.ElementTree as Xet
import pandas as pd
import xmltodict
import os
import time
import requests
import pydoc
import zipfile
import logging
from selenium import webdriver
from selenium.webdriver.common.by import By
from datetime import date



In [10]:
class CSV_conversion:
    '''
    Pipeline that saves the XML served at `url`, downloads the zip files it
    lists through a Chrome session, extracts them and pushes one CSV per
    extracted XML file to the S3 bucket.

    The machine-specific locations are class attributes so they can be
    overridden on an instance or subclass without editing the methods.
    '''

    # chromedriver binary handed to Selenium.
    CHROMEDRIVER_PATH = r'C:/Users/Asmita/chromedriver_win32/chromedriver.exe'
    # Folder Chrome is told to download into (no trailing separator).
    DOWNLOAD_DIR = 'C:\\Users\\Asmita\\AAIC_ASSIGNMENTS\\Assignment\\raw_directory'
    # Folder (with trailing separator) the extracted XML files are read from.
    EXTRACTED_DIR = 'C:\\Users\\Asmita\\AAIC_ASSIGNMENTS\\Assignment\\target_dir\\'
    # Record wrappers looked up on each FinInstrm entry; the first one found wins.
    RECORD_KEYS = ('TermntdRcrd', 'ModfdRcrd', 'NewRcrd')

    def __init__(self, url):
        '''
             url: url given in main question
        '''
        self.url = url

    def get_xml(self):
        '''
            Save the data of provided link in a xml file

            Return:
                  Nothing; the response body is written to 'my_file.xml'

            Raises:
                  requests.HTTPError if the server answers with an error status
        '''
        resp = requests.get(self.url)
        # Fail loudly instead of saving an HTTP error page as if it were XML.
        resp.raise_for_status()

        # saving the xml file
        with open('my_file.xml', 'wb') as f:
            f.write(resp.content)
        logging.info("Got the data in xml format from given URL")

    def parseXML(self, xmlfile):
        '''
             Traverse xml file, open the zip download links in Chrome and
             return the remaining zip entries

             Params:
                     xmlfile: xml file generated in previous step
             Return:
                     list of the zip entries found at odd positions among the
                     <str> elements (main() uses them as file names inside
                     the download folder)
        '''
        # Parse first, so a missing or malformed file does not leave a
        # browser window behind.
        root = ET.parse(xmlfile).getroot()

        # item.text is None for empty <str/> elements, hence the guard.
        zip_entries = [
            item.text
            for item in root.iter('str')
            if item.text and item.text.endswith('zip')
        ]

        # Entries at even positions are opened in the browser, entries at odd
        # positions are returned (presumably each document lists its download
        # link followed by its file name; verify against the feed).
        download_links = zip_entries[0::2]
        new_lst = zip_entries[1::2]

        if download_links:
            chrome_options = webdriver.ChromeOptions()
            prefs = {'download.default_directory': self.DOWNLOAD_DIR}
            chrome_options.add_experimental_option('prefs', prefs)

            # NOTE(review): `executable_path` is deprecated in Selenium 4;
            # confirm the installed version still accepts it.
            wd = webdriver.Chrome(executable_path=self.CHROMEDRIVER_PATH, options=chrome_options)

            # The driver is intentionally not quit here: closing the browser
            # right away could abort downloads that are still in progress.
            for link in download_links:
                wd.get(link)

        logging.info("Parsed and got the files whose file_type is DLTINS")
        return new_lst

    def unzip_files(self, item, path):
        '''
            Extract the contents of the zipped files in a specific folder

            Params:
                    item: file name of the zip archive
                    path: folder path (including trailing separator) where
                          the zipped files are located

            Return:
                    Nothing; the archive is extracted into "target_dir"
        '''
        # Fixed pause before opening the archive (presumably to give the
        # browser download time to finish; confirm).
        time.sleep(30)
        file = path + item
        with zipfile.ZipFile(file, "r") as zip_ref:
            zip_ref.extractall("target_dir")
        logging.info("Extracted the zipped files in a folder")

    def df_contents(self, i, key1, ID, FullNm, ClssfctnTp, CmmdtyDerivInd, NtnlCcy, Issr):
        '''
             Append the values of one record to the column lists

             Exactly one value is appended to every list per record (None
             when the attribute is absent), so the lists always stay the
             same length and rows stay aligned.

             Params:
                     i:               nested dictionary value
                     key1:            key of dictionary
                     ID:              list of IDs elements
                     FullNm:          list of FullNm elements
                     ClssfctnTp:      list of ClssfctnTp elements
                     CmmdtyDerivInd:  list of CmmdtyDerivInd elements
                     NtnlCcy:         list of NtnlCcy elements
                     Issr:            list of Issr elements
             Return:
                     the six lists, in the order they were passed in
        '''
        if key1 not in i:
            return ID, FullNm, ClssfctnTp, CmmdtyDerivInd, NtnlCcy, Issr

        record = i[key1]
        attrs = record['FinInstrmGnlAttrbts']

        ID.append(attrs.get('Id'))
        FullNm.append(attrs.get('FullNm'))
        ClssfctnTp.append(attrs.get('ClssfctnTp'))
        CmmdtyDerivInd.append(attrs.get('CmmdtyDerivInd'))
        NtnlCcy.append(attrs.get('NtnlCcy'))
        # One issuer per record, independent of how many other attribute
        # keys the record carries.
        Issr.append(record.get('Issr'))
        return ID, FullNm, ClssfctnTp, CmmdtyDerivInd, NtnlCcy, Issr

    def savetoCSV(self, item):
        '''
            Convert contents of xml file to csv and push to bucket

            Params:
                    item: name of the .xml file

            Return:
                    Nothing; the upload helper logs success or failure
        '''

        bucket = connect_to_buffer_bucket()
        path = 'my-dummy-bucket1/outputs/' + item.split(".")[0] + '_' + str(date.today()) + '.csv'

        file_path = self.EXTRACTED_DIR + item
        with open(file_path, 'r', encoding="utf8") as file:
            filedata = file.read()

        # Converting xml to python dictionary (ordered dict)
        data_dict = xmltodict.parse(filedata)

        lst = data_dict['BizData']['Pyld']['Document']['FinInstrmRptgRefDataDltaRpt']['FinInstrm']
        # xmltodict yields a single mapping instead of a list when the
        # document holds only one FinInstrm element.
        if not isinstance(lst, list):
            lst = [lst]

        ID, FullNm, ClssfctnTp, CmmdtyDerivInd, NtnlCcy, Issr = ([] for _ in range(6))
        for instrument in lst:
            # Only the first matching record type of an entry is exported.
            for record_key in self.RECORD_KEYS:
                if record_key in instrument:
                    self.df_contents(instrument, record_key, ID, FullNm, ClssfctnTp,
                                     CmmdtyDerivInd, NtnlCcy, Issr)
                    break

        data = pd.DataFrame({
            'FinInstrmGnlAttrbts.Id': ID,
            'FinInstrmGnlAttrbts.FullNm': FullNm,
            'FinInstrmGnlAttrbts.ClssfctnTp': ClssfctnTp,
            'FinInstrmGnlAttrbts.CmmdtyDerivInd': CmmdtyDerivInd,
            'FinInstrmGnlAttrbts.NtnlCcy': NtnlCcy,
            'Issr': Issr,
        })

        push_csv_to_buffer_bucket(bucket, data, path)

    def main(self):
        '''
            main function that gives call to all internal functions to get to the necessary output

        '''
        start_time = time.time()
        self.get_xml()

        os.makedirs('raw_directory', exist_ok=True)

        # parse xml file
        result = self.parseXML('my_file.xml')

        raw_directory_path = self.DOWNLOAD_DIR + '\\'

        # unzip downloaded files
        for item in result:
            self.unzip_files(item, raw_directory_path)

        # save csv to bucket; "target_dir" only exists once at least one
        # archive has been extracted
        if os.path.isdir("target_dir"):
            for files in os.listdir("target_dir"):
                if files.endswith('.xml'):
                    self.savetoCSV(files)
        logging.info("Process finished --- %s seconds ---" % (time.time() - start_time))
      
      
if __name__ == "__main__":
    # Build the converter for the file-index query (split over several
    # literals for readability) and run the whole pipeline.
    obj = CSV_conversion(
        'https://registers.esma.europa.eu/solr/esma_registers_firds_files/select'
        '?q=*'
        '&fq=publication_date:%5B2021-01-17T00:00:00Z+TO+2021-01-19T23:59:59Z%5D'
        '&wt=xml&indent=true&start=0&rows=100'
    )
    obj.main()

INFO:root:Got the data in xml format from given URL
INFO:root:Parsed and got the files whose file_type is DLTINS
INFO:root:Extracted the zipped files in a folder
INFO:root:Extracted the zipped files in a folder
INFO:root:Extracted the zipped files in a folder
INFO:root:Extracted the zipped files in a folder
INFO:root:Successfully connected to the bucket "my-dummy-bucket1"
INFO:root:Successfully pushed the csv file to "my-dummy-bucket1/outputs/DLTINS_20210117_01of01_2022-10-04.csv"
INFO:root:Successfully connected to the bucket "my-dummy-bucket1"
INFO:root:Successfully pushed the csv file to "my-dummy-bucket1/outputs/DLTINS_20210118_01of01_2022-10-04.csv"
INFO:root:Successfully connected to the bucket "my-dummy-bucket1"
INFO:root:Successfully pushed the csv file to "my-dummy-bucket1/outputs/DLTINS_20210119_01of02_2022-10-04.csv"
INFO:root:Successfully connected to the bucket "my-dummy-bucket1"
INFO:root:Successfully pushed the csv file to "my-dummy-bucket1/outputs/DLTINS_20210119_02of02