In [4]:
%run DataCleaning.ipynb

In [5]:
import unittest
import pandas
import shutil
import os

from pyspark.sql import SparkSession
import azure.storage.blob
from azure.storage.blob import BlobServiceClient
from pyspark.sql.types import StructField, StructType, StringType, FloatType, IntegerType, ShortType

class TestDataCleaning(unittest.TestCase):
    
    @classmethod
    def setUpClass(cls):
        global blob_service_client, container1, container2, container3, \
        blob_location1, blob_location2, blob_location3
        
        # connect to storage account 
        blob_service_client = connect_blob()
        
        # input test data
        ## 1. hospital csv dataset
        container1 = "hospitals"
        blob_location1 = "raw_data/facilityprofile_2021-08-10.csv"
        
        ## 2. insurance excel dataset
        container2 = "insurance"
        blob_location2 = "raw_data/CAInsurances.xlsx"
        
        ## 3. container/file doesn't exist
        container3 = "patients"
        blob_location3 = "raw_data/patients.csv"
    
    @classmethod
    def tearDownClass(cls):
        
        
    def test_spark_session(self):
        spark = spark_session()
        self.assertIsInstance(spark, SparkSession)
        spark.stop()
        
    def test_connect_blob(self):
        blob_service_client = connect_blob()
        self.assertIsInstance(blob_service_client, BlobServiceClient)
    
    def test_get_blob_data(self):
        # Case 1: get blob data of hospital csv 
        container_client1, blob_data1 = get_blob_data(blob_service_client, container1, blob_location1)
        self.assertTrue(blob_data1.blob_name == blob_location1)
        
        # Case 2: get blob data of insurance excel
        container_client2, blob_data2 = get_blob_data(blob_service_client, container2, blob_location2)
        self.assertTrue(blob_data2.blob_name == blob_location2)
        
        # Case 3: get non-existent file
        self.assertRaises(FileNotFoundError, get_blob_data, blob_service_client, container3, blob_location3)
        
    def test_blob_to_temp(self):
        # Case 1: download hospital csv data to local temp file
        self.setup_test_blob_to_temp()        
        container_client1, blob_data1 = get_blob_data(blob_service_client, container1, blob_location1)
        blob_to_temp('csv', blob_data1)
        self.assertTrue(os.path.exists("temp.txt"))
        
        # Case 2: download insurance excel data to local temp file 
        self.setup_test_blob_to_temp()        
        container_client2, blob_data2 = get_blob_data(blob_service_client, container2, blob_location2)
        blob_to_temp('excel', blob_data2)
        self.assertTrue(os.path.exists("temp.xlsx"))
        
        # Case 3: give wrong file format
        self.assertRaises(ValueError, blob_to_temp, 'pdf', blob_data2)

    def setup_test_blob_to_temp(self):
        # delete temp files
        try:
            os.remove("temp.txt")
            os.remove("temp.xlsx")
        except OSError:
            pass

    def test_read_blob_data(self):
        # Case 1: hospital csv dataset
        container_client1, blob_data1 = get_blob_data(blob_service_client, container1, blob_location1)
        blob_to_temp('csv', blob_data1)
        pdf1 = read_blob_data('csv', dtype='unicode')
        self.assertIsInstance(pdf1, pd.core.frame.DataFrame)
        self.assertFalse(pdf1.empty)
        
        # Case 2: insurance excel dataset
        container_client2, blob_data2 = get_blob_data(blob_service_client, container2, blob_location2)
        blob_to_temp('excel', blob_data2)
        pdf2 = read_blob_data('excel')
        self.assertIsInstance(pdf2, pd.core.frame.DataFrame)
        self.assertFalse(pdf2.empty)
        
        # Case 3: give wrong file format 
        self.assertRaises(ValueError, read_blob_data, 'pdf')
        
    def test_export_to_blob(self):
        # Case 1: Upload local files (downloaded by setup_case1_test_export_to_blob) in test_export_to_blob_data
        container_client, folder = self.setup_case1_test_export_to_blob()
        export_to_blob(folder, '%s/output' % (folder), container_client)
        files = [l for l in container_client.list_blobs(name_starts_with='test_export_to_blob_data/output/')]
        filenames = [file['name'].split('/')[-1] for file in files]
        self.assertTrue(filenames == ['PPRRVU20_OCT.csv', 'PPRRVU20_OCT.txt', 'PPRRVU20_OCT.xlsx'])
        
        # Case 2: local folder doesn't exist
        self.setup_case1_test_export_to_blob()
        folder = 'data'
        self.assertRaises(FileNotFoundError, export_to_blob, folder, '%s/output' % (folder), container_client)    
        
        # Case 3: A file instead of a folder is given
        filepath = "./PPRRVU20_OCT.xlsx"
        self.assertRaises(NotADirectoryError, export_to_blob, filepath, '%s/output' % (folder), container_client)
        
    def setup_case1_test_export_to_blob(self):
        # add files to dbfs
        container = "testdata"
        folder = "test_export_to_blob_data"
        os.makedirs(folder, exist_ok=True)
        container_client = blob_service_client.get_container_client(container)
        
        for file in ['PPRRVU20_OCT.csv', 'PPRRVU20_OCT.txt', 'PPRRVU20_OCT.xlsx']:
            blob_location = "%s/%s" % (folder, file)
            blob_data = container_client.get_blob_client(blob_location)
            with open(blob_location, "wb") as blob:
                data = blob_data.download_blob()
                data.readinto(blob)
                blob.close()
        return container_client, folder
    
    def setup_case2_test_export_to_blob(self):
        if os.path.exists('data'):
            shutil.rmtree('data')
    
    def test_sheet_names(self):
        # set filepath as testdata container
        container = "testdata"
        container_client = blob_service_client.get_container_client(container)
        
        # Case 1: read excel file
        file = "test_sheet_names_data/20LOCCO.xlsx"
        self.assertEqual(sheet_names((file, container_client)), ("test_sheet_names_data/20LOCCO.xlsx" , ['09LOCCO']))
        
        # Case 2: read empty excel file
        file = "test_sheet_names_data/empty_excel.xlsx"
        self.assertEqual(sheet_names((file, container_client)), ("test_sheet_names_data/empty_excel.xlsx" , ['Sheet1']))
        
        # Case 3: read non-excel file
        file = "test_sheet_names_data/20LOCCO.txt"
        self.assertEqual(sheet_names((file, container_client)), (None, None))

    def test_read_files(self):
        # set filepath as testdata container
        container = "testdata"
        container_client = blob_service_client.get_container_client(container)
        
        # Case 1: excel sheet contains target form
        file = "test_read_files/106580996_CDM_All_2020.xlsx"
        sheet = "Common OP Procedures"
        self.assertEqual(read_files((file, sheet, container_client)), ("test_read_files/106580996_CDM_All_2020.xlsx" , "Common OP Procedures"))
   
        # Case 2: excel sheet doesn't contains target form
        file = "test_read_files/106580996_CDM_All_2020.xlsx"
        sheet = "Hospital CDM"
        self.assertEqual(read_files((file, sheet, container_client)), (None, None))
        
        # Case 3: non-excel file
        file = "test_read_files/106190521_Comments_2020.docx"
        self.assertEqual(read_files((file, sheet, container_client)), (None, None))
        

In [6]:
# Run every TestCase defined in this notebook.
# argv=[''] makes unittest ignore the kernel's own command-line arguments;
# exit=False stops it from calling sys.exit() when the run finishes, which
# would otherwise shut down the notebook kernel. verbosity=2 prints one line per test.
unittest.main(argv=[''], verbosity=2, exit=False)

  blob_service_client = connect_blob()
test_blob_to_temp (__main__.TestDataCleaning) ... ok
test_connect_blob (__main__.TestDataCleaning) ... ok
test_export_to_blob (__main__.TestDataCleaning) ... ok
test_get_blob_data (__main__.TestDataCleaning) ... ok
test_read_blob_data (__main__.TestDataCleaning) ... ok
test_read_files (__main__.TestDataCleaning) ... ok
test_sheet_names (__main__.TestDataCleaning) ... ok
test_spark_session (__main__.TestDataCleaning) ... ok

----------------------------------------------------------------------
Ran 8 tests in 14.314s

OK


<unittest.main.TestProgram at 0x1e8973933a0>