# In [66]:
import io
import os
from abc import ABC, abstractmethod

import boto3
import PyPDF2
import requests
from azure.identity import DefaultAzureCredential
from azure.storage.blob import BlobServiceClient
from google.cloud import storage

# Configure exactly one of the file sources below.

# --- Local file ----------------------------------------
file_path = "file.pdf"

# --- File in Azure Blob Storage ------------------------
account_url = "https://bpf7701sa.blob.core.windows.net/"
container_name = "content"
blob_name = "file.pdf"
credential = DefaultAzureCredential()  # 'mycredential'

# --- File in AWS S3 ------------------------------------
bucket_name = "content"
# NOTE(review): this object is a .csv although the readers in this file parse
# PDFs — confirm the intended object name.
object_name = "file.csv"
aws_access_key_id = "myaccesskey"
aws_secret_access_key = "mysecretkey"
# --------------------------------------------------------

# Fetch the raw file bytes through the storage factory.
# The result is bound to `file_bytes` rather than `storage` so that it does not
# shadow the `google.cloud.storage` module that GcpStorage depends on.
# NOTE(review): `factory` is not defined in this cell — presumably it is created
# elsewhere (e.g. a StorageFactory instance); confirm before running.
file_bytes = factory.get_storage('local').get_file('path', 'file.csv')
# text1 = read_pdf_local(file_path)
# print(text1)
# text2 = read_pdf_azure(container_name, blob_name, account_url, credential)
# print(text2)
# text3 = read_pdf_s3(bucket_name, object_name, aws_access_key_id, aws_secret_access_key)
# print(text3)

# # Call the appropriate function based on the source of the PDF
# if os.path.isfile(file_path):
#     text = read_pdf_local(file_path)
# elif account_url and credential:
#     text = read_pdf_azure(container_name, blob_name, account_url, credential)
# elif aws_access_key_id and aws_secret_access_key:
#     text = read_pdf_s3(bucket_name, object_name, aws_access_key_id, aws_secret_access_key)
# print(text)

# Function to read PDF from local file
def read_pdf_local(file_path):
    """Extract the text of every page of a PDF stored on the local file system.

    Args:
        file_path: Path of the PDF file to open.

    Returns:
        The extracted text of all pages, concatenated in page order.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(file_path, 'rb') as f:
        # Only the `PyPDF2` module is imported at file level, so the reader
        # class has to be reached through it; a bare `PdfReader` is undefined.
        pdf = PyPDF2.PdfReader(f)
        # Extract while the file is still open. `or ""` is a defensive guard in
        # case a page yields no text object.
        return "".join(page.extract_text() or "" for page in pdf.pages)

# Function to read PDF from Azure Blob Storage
def read_pdf_azure(container_name, blob_name, account_url, credential):
    """Extract the text of every page of a PDF stored in Azure Blob Storage.

    Args:
        container_name: Name of the blob container.
        blob_name: Name of the blob inside the container.
        account_url: URL of the storage account.
        credential: Credential passed through to ``BlobServiceClient``.

    Returns:
        The extracted text of all pages, concatenated in page order.
    """
    blob_service_client = BlobServiceClient(account_url=account_url, credential=credential)
    blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_name)
    # Download the whole blob into memory; a fresh BytesIO starts at offset 0,
    # so no explicit seek is required before parsing.
    stream = io.BytesIO(blob_client.download_blob().readall())
    # Only the `PyPDF2` module is imported at file level, so the reader class
    # has to be reached through it; a bare `PdfReader` is undefined.
    pdf = PyPDF2.PdfReader(stream)
    # `or ""` is a defensive guard in case a page yields no text object.
    return "".join(page.extract_text() or "" for page in pdf.pages)

# Function to read PDF from AWS S3 Storage
def read_pdf_s3(bucket_name, object_name, aws_access_key_id, aws_secret_access_key):
    """Extract the text of every page of a PDF stored in an AWS S3 bucket.

    Args:
        bucket_name: Name of the S3 bucket.
        object_name: Key of the object inside the bucket.
        aws_access_key_id: Access key id passed to the boto3 client.
        aws_secret_access_key: Secret access key passed to the boto3 client.

    Returns:
        The extracted text of all pages, concatenated in page order.
    """
    s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
    obj = s3.get_object(Bucket=bucket_name, Key=object_name)
    # A BytesIO built from bytes already starts at offset 0, so no seek is needed.
    stream = io.BytesIO(obj['Body'].read())
    # Only the `PyPDF2` module is imported at file level, so the reader class
    # has to be reached through it; a bare `PdfReader` is undefined.
    pdf = PyPDF2.PdfReader(stream)
    # `or ""` is a defensive guard in case a page yields no text object.
    return "".join(page.extract_text() or "" for page in pdf.pages)

class BaseStorage(ABC):
    """Common interface for storage backends that return a file's raw bytes."""

    @abstractmethod
    def get_file(self, bucket_name: str, blob_name: str) -> bytes:
        """Return the content of a stored file as bytes.

        Concrete backends must override this method. The body below is a
        fallback that an override can reach through ``super().get_file(...)``:
        it treats ``blob_name`` as a local path and ignores ``bucket_name``.

        Args:
            bucket_name: Container/bucket identifier (unused by the fallback).
            blob_name: Name of the file to read.

        Returns:
            The file content as bytes.

        Raises:
            OSError: If the fallback cannot open ``blob_name``.
        """
        with open(blob_name, 'rb') as f:
            return f.read()

    # @abstractmethod
    # def get_local_file(self, file_path: str) -> bytes:
    #     pass

class AwsStorage(BaseStorage):
    """Storage backend that reads objects from AWS S3 through boto3."""

    def __init__(self, access_key: str, secret_key: str):
        """Create the S3 client from an explicit access key / secret key pair."""
        credentials = {
            'aws_access_key_id': access_key,
            'aws_secret_access_key': secret_key,
        }
        self.client = boto3.client('s3', **credentials)

    def get_file(self, bucket_name: str, blob_name: str) -> bytes:
        """Return everything read from the body of object ``blob_name`` in ``bucket_name``."""
        s3_object = self.client.get_object(Bucket=bucket_name, Key=blob_name)
        body = s3_object['Body']
        return body.read()

    # def get_local_file(self, file_path: str) -> bytes:
    #     with open(file_path, 'rb') as f:
    #         return f.read()

class AzureStorage(BaseStorage):
    """Storage backend that reads blobs from Azure Blob Storage."""

    def __init__(self, connection_string: str):
        """Create the blob service client from a connection string."""
        service = BlobServiceClient.from_connection_string(connection_string)
        self.client = service

    def get_file(self, bucket_name: str, blob_name: str) -> bytes:
        """Return the result of ``readall()`` for blob ``blob_name`` in container ``bucket_name``."""
        downloader = (
            self.client
            .get_container_client(bucket_name)
            .get_blob_client(blob_name)
            .download_blob()
        )
        return downloader.readall()

    # def get_local_file(self, file_path: str) -> bytes:
    #     with open(file_path, 'rb') as f:
    #         return f.read()

class GcpStorage(BaseStorage):
    """Storage backend that reads blobs through the ``google.cloud.storage`` client."""

    def __init__(self, credentials_file: str):
        """Create the storage client from a service-account JSON file."""
        gcs_client = storage.Client.from_service_account_json(credentials_file)
        self.client = gcs_client

    def get_file(self, bucket_name: str, blob_name: str) -> bytes:
        """Return the result of ``download_as_bytes()`` for blob ``blob_name`` in bucket ``bucket_name``."""
        target = self.client.bucket(bucket_name).blob(blob_name)
        return target.download_as_bytes()

    # def get_local_file(self, file_path: str) -> bytes:
    #     with open(file_path, 'rb') as f:
    #         return f.read()

class LocalStorage(BaseStorage):
    """Storage backend that reads files from the local file system."""

    def get_file(self, bucket_name: str, blob_name: str) -> bytes:
        """Return the bytes of the local file at ``blob_name``.

        ``bucket_name`` is accepted for interface compatibility and is not used.
        """
        with open(blob_name, mode='rb') as source:
            payload = source.read()
        return payload

    # def get_local_file(self, file_path: str) -> bytes:
    #     with open(file_path, 'rb') as f:
    #         return f.read()

class StorageFactory:
    """Factory that builds the storage backend matching a type name."""

    @staticmethod
    def get_storage(storage_type: str, **kwargs) -> BaseStorage:
        """Instantiate the backend selected by ``storage_type``.

        Args:
            storage_type: One of 'aws', 'azure', 'gcp' or 'local'.
            **kwargs: Forwarded unchanged to the backend's constructor.

        Returns:
            A new instance of the selected backend.

        Raises:
            ValueError: If ``storage_type`` is not a known backend name.
        """
        # Resolve the class first, then construct it in a single place.
        if storage_type == 'aws':
            backend_cls = AwsStorage
        elif storage_type == 'azure':
            backend_cls = AzureStorage
        elif storage_type == 'gcp':
            backend_cls = GcpStorage
        elif storage_type == 'local':
            backend_cls = LocalStorage
        else:
            raise ValueError(f'Invalid storage type: {storage_type}')
        return backend_cls(**kwargs)

# Notebook output: PdfReadError: EOF marker not found