# In [164]:
import os
import io
import boto3
from azure.storage.blob import BlobServiceClient
from google.cloud import storage
import PyPDF2

class FileReaderBase:
    """Interface shared by all file readers.

    Stores the path (or object key) of the file and declares the two
    operations every concrete reader must provide: ``read_file`` and
    ``parse_text``.
    """

    def __init__(self, file_path):
        """Remember *file_path* for later use by the reader methods."""
        self.file_path = file_path

    def read_file(self):
        """Return the file's content; subclasses must override this."""
        message = "read_file method not implemented in base class"
        raise NotImplementedError(message)

    def parse_text(self):
        """Return text extracted from the file; subclasses must override this."""
        message = "parse_text method not implemented in base class"
        raise NotImplementedError(message)

class LocalFileReader(FileReaderBase):
    """Reader for files on the local filesystem."""

    def read_file(self) -> bytes:
        """Return the raw bytes of the file at ``self.file_path``."""
        with open(self.file_path, 'rb') as f:
            return f.read()

    def parse_text(self) -> str:
        """Return the text of every PDF page, joined by newlines.

        The file is read via ``read_file`` and parsed in memory with
        ``PyPDF2.PdfReader``; PyPDF2 raises its own errors if the bytes
        are not a valid PDF.
        """
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(self.read_file()))
        # Iterate the pages directly instead of indexing by position.
        return '\n'.join(page.extract_text() for page in pdf_reader.pages)

class AzureBlobFileReader(FileReaderBase):
    """Reader for blobs stored in an Azure Blob Storage container."""

    def __init__(self, connection_string: str, container_name: str, file_path: str):
        """Create the service and container clients for later downloads.

        Args:
            connection_string: Passed to
                ``BlobServiceClient.from_connection_string``.
            container_name: Name of the container holding the blob.
            file_path: Name of the blob inside the container.
        """
        super().__init__(file_path)
        self.blob_service_client = BlobServiceClient.from_connection_string(connection_string)
        self.container_client = self.blob_service_client.get_container_client(container_name)

    def read_file(self) -> bytes:
        """Download the blob named ``self.file_path`` and return its bytes."""
        blob_client = self.container_client.get_blob_client(self.file_path)
        downloader = blob_client.download_blob()
        # The original called a misspelled ``cosntent_as_bytes``, which does
        # not exist; ``readall`` returns the whole blob content.
        return downloader.readall()

    def parse_text(self) -> str:
        """Return the text of every PDF page, joined by newlines."""
        # ``PdfReader`` matches LocalFileReader; the legacy ``PdfFileReader``
        # name is no longer usable in PyPDF2 3.x.
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(self.read_file()))
        return '\n'.join(page.extract_text() for page in pdf_reader.pages)

class AWSFileReader(FileReaderBase):
    """Reader for objects stored in an AWS S3 bucket."""

    def __init__(self, bucket_name: str, file_path: str):
        """Create an S3 client and remember the bucket to read from.

        Args:
            bucket_name: Name of the S3 bucket.
            file_path: Key of the object inside the bucket.
        """
        super().__init__(file_path)
        self.s3 = boto3.client('s3')
        self.bucket_name = bucket_name

    def read_file(self) -> bytes:
        """Fetch the object at ``self.file_path`` and return its body bytes."""
        response = self.s3.get_object(Bucket=self.bucket_name, Key=self.file_path)
        return response['Body'].read()

    def parse_text(self) -> str:
        """Return the text of every PDF page, joined by newlines."""
        # ``PdfReader`` matches LocalFileReader; the legacy ``PdfFileReader``
        # name is no longer usable in PyPDF2 3.x.
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(self.read_file()))
        return '\n'.join(page.extract_text() for page in pdf_reader.pages)

class GCPFileReader(FileReaderBase):
    """Reader for objects stored in a Google Cloud Storage bucket."""

    def __init__(self, bucket_name: str, file_path: str):
        """Create a storage client and remember the bucket to read from.

        Args:
            bucket_name: Name of the Cloud Storage bucket.
            file_path: Name of the blob inside the bucket.
        """
        super().__init__(file_path)
        self.storage_client = storage.Client()
        self.bucket_name = bucket_name

    def read_file(self) -> bytes:
        """Download the blob named ``self.file_path`` and return its bytes."""
        bucket = self.storage_client.get_bucket(self.bucket_name)
        blob = bucket.blob(self.file_path)
        return blob.download_as_bytes()

    def parse_text(self) -> str:
        """Return the text of every PDF page, joined by newlines."""
        # ``PdfReader`` matches LocalFileReader; the legacy ``PdfFileReader``
        # name is no longer usable in PyPDF2 3.x.
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(self.read_file()))
        return '\n'.join(page.extract_text() for page in pdf_reader.pages)

# Example usage of LocalFileReader; these statements run as soon as the
# cell/module is executed.
# Read 'file.csv' from the local filesystem and print its raw bytes.
local_reader = LocalFileReader('file.csv')
file_content = local_reader.read_file()
print(file_content)
# Point a new reader at 'file.pdf'. read_file() returns the raw bytes, while
# parse_text() reads the file again itself and extracts its text with PyPDF2.
local_reader = LocalFileReader('file.pdf')
file_content = local_reader.read_file()
parsed_text = local_reader.parse_text()
print(parsed_text)


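# Output of the example above:
#   PdfReadError: EOF marker not found
# PyPDF2 raises this when it cannot find the PDF "%%EOF" trailer in the bytes
# it is given, so 'file.pdf' is likely truncated or not a real PDF; verify
# the input file.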