In [1]:
from src.data_pre_processing import (
    DataPreProcessingFactory,
    RemoveLinks, 
    RemoveDateTime,
    RemoveIrrelevantSentences,
    RemovePageNumbers,
    LowerCaseAndRemoveExtraSpaces
)

In [2]:
from src.data_extraction import (
    DataExtractionFactory,
    PDFTextExtractorForBreeds,
    MergeShortElements
)

In [3]:
from src.data_ingestion import (
    DataIngestionFactory, 
    TextDataIngestion
)

In [4]:
# Root folder of the source PDFs; the extraction cell further down joins it
# with a category name and a file name to build each PDF's path.
folder_path = "data/pdf_data"
# Wrap the text-ingestion strategy for that folder in the ingestion factory.
ingestor = DataIngestionFactory(strategy=TextDataIngestion(folder_path))
# Later cells treat the result as a mapping of category name -> list of PDF
# file names — TODO confirm against TextDataIngestion.ingest_data().
file_dict = ingestor.ingest_data()

In [5]:
# for i, j in file_dict.items(): 
#     print(f"{i}, count: {len(j)}")

In [6]:
# file_dict maps each category name to the list of PDF files found for it.
# The categories are split into two groups; the breed group is the one the
# extraction cell later feeds to PDFTextExtractorForBreeds.
breed_categories = ['Dogs', 'Cats', 'Horses', 'Reptiles']

# One entry per line, each with a trailing comma. Python concatenates adjacent
# string literals, so a missing comma between 'Pigs' and 'Prairie Dogs'
# silently yields the single entry 'PigsPrairie Dogs' and both categories
# vanish from blogs_dict.
blogs_categories = [
    'Bees',
    'Birds',
    'Fish',
    'Ferrets',
    'Chicken and Ducks',
    'Chinchillas',
    'Gerbils',
    'Guinea Pigs',
    'Hamsters',
    'Pigs',
    'Prairie Dogs',
    'Rabbits',
    'Rats',
    'Sugar Gliders',
]


# Partition file_dict into two sub-dicts, each keeping only the entries whose
# category name appears in the corresponding category list.
breed_dict = dict(
    (category, pdf_names)
    for category, pdf_names in file_dict.items()
    if category in breed_categories
)
blogs_dict = dict(
    (category, pdf_names)
    for category, pdf_names in file_dict.items()
    if category in blogs_categories
)

In [7]:
# A notebook cell only renders the value of its final expression, so the
# breed keys have to be printed explicitly; as a bare expression on the first
# line they were evaluated and silently discarded.
print(breed_dict.keys())
blogs_dict.keys()

dict_keys(['Bees', 'Birds', 'Chicken and Ducks', 'Chinchillas', 'Ferrets', 'Fish', 'Gerbils', 'Guinea Pigs', 'Hamsters', 'Rabbits', 'Rats', 'Sugar Gliders'])

In [8]:
# blogs
# Hand-written mock-up of a per-category list of chunk records; no other
# visible cell reads it.
sample = [
    dict(
        Dogs=[
            dict(
                unique_id=12,
                text="para-para para para ....",
                pet_type='dogs',
                blog_name='Labrador',
                topic='Labrador training',
                keywords=["training", "breeding", "food"],
                token_count=200,
                chunk_size=300,
                age_group='adult',
            ),
            dict(),  # ... (further records elided)
        ],
        Cats=[],  # ... (records elided)
    )
]


# Second hand-written mock-up of the same record layout, this time with blog
# categories as the top-level keys; no other visible cell reads it.
sample_v1 = [
    dict(
        birds=[
            dict(
                unique_id=1,
                text="para-para para para ....",
                pet_type='birds',
                blog_name='Diet for birds',
                topic='is wheat good for birds?',
                keywords=["diet", "breeding", "food"],
                token_count=200,
                chunk_size=300,
                age_group='adult',
            ),
            dict(
                unique_id=2,
                # .... (remaining fields elided)
            ),
        ],
        Rats=[],  # .... (records elided)
    )
]

import os


# Run the breed PDF extractor over every PDF in every breed category and
# collect one record per PDF, grouped by category.
# NOTE(review): despite its name, blogs_data is filled from breed_dict (not
# blogs_dict) — confirm the naming/intent.
blogs_data = {}
for pet_type, blogs_list in breed_dict.items():
    blogs_data[pet_type] = []
    for pdf in blogs_list:
        # PDFs are addressed as <folder_path>/<category>/<file name>.
        pdf_path = os.path.join(folder_path, pet_type, pdf)
        # NOTE: `extractor` stays bound to the last PDF's factory after the
        # loop; the MergeShortElements cell further down reuses it through
        # set_strategy().
        extractor = DataExtractionFactory(strategy=PDFTextExtractorForBreeds(pdf_path))
        text, subheadings = extractor.extract()
        blogs_data[pet_type].append({
            'text': text, # list
            'subheadings': subheadings, # list
            # File name up to the first occurrence of '.pdf'.
            'topic': pdf.split('.pdf')[0]
        })

In [17]:
# Pull the text elements of one extracted 'Dogs' document to experiment with.
# The index 100 is hard-coded — TODO confirm it exists for the current data.
# NOTE: .get() returns None for a missing category, so a missing 'Dogs' key
# surfaces here as a TypeError rather than a KeyError.
text_list = blogs_data.get('Dogs')[100]['text']
print(len(text_list))

28


In [18]:
# Subheadings of the same 'Dogs' document (same hard-coded index 100 as the
# text_list cell). This rebinds `subheadings`, which the extraction loop also
# assigns on every iteration.
subheadings = blogs_data.get('Dogs')[100]['subheadings']
print(len(subheadings))  

28


In [21]:
# Reuse the factory left bound by the extraction loop, swapping in the
# MergeShortElements strategy for the sampled text/subheading lists.
extractor.set_strategy(MergeShortElements(text_list, subheadings))
# NOTE(review): new_list and sh are not read by any later visible cell; the
# pre-processing cells continue from text_list — confirm that is intended.
new_list, sh = extractor.extract()

Pre-processing

In [6]:
# URL strings handed to the RemoveLinks strategy.
# NOTE(review): the petmd entries are not symmetric with the printfriendly
# ones ('https://petmd.com' has no 'www.', and 'https://www.petmd.com' is
# absent) — confirm whether that is intentional.
links = ['https://www.printfriendly.com', 'http://www.printfriendly.com',
            'https://petmd.com', 'http://www.petmd.com']

# Create the pre-processing factory with the RemoveLinks strategy; the
# following cells reuse `processor` through set_strategy(). Each step rebinds
# text_list to the step's result, so the cells must run in order.
processor = DataPreProcessingFactory(strategy=RemoveLinks(links, text_list))
text_list = processor.process_data()

In [7]:
# Remove date/time: swap the RemoveDateTime strategy into the existing
# processor and rebind text_list to its result.
processor.set_strategy(strategy=RemoveDateTime(text_list))
text_list = processor.process_data()

In [None]:
# Remove irrelevant sentences: swap the RemoveIrrelevantSentences strategy
# into the existing processor and rebind text_list to its result.
sent = ['Hypothyroidism Hypothyroidism'] # placeholder phrase, just for testing
processor.set_strategy(RemoveIrrelevantSentences(text_list, sent))
text_list = processor.process_data()

In [11]:
# Remove page numbers: swap the RemovePageNumbers strategy into the existing
# processor and rebind text_list to its result.
processor.set_strategy(RemovePageNumbers(text_list))
text_list = processor.process_data()

In [13]:
# Lower-case and remove extra spaces: swap the LowerCaseAndRemoveExtraSpaces
# strategy into the existing processor and rebind text_list to its result.
processor.set_strategy(LowerCaseAndRemoveExtraSpaces(text_list))
text_list = processor.process_data()

In [16]:
# Display text_list after the pre-processing cells above.
# NOTE(review): this renders the entire list; a slice would keep the saved
# output short.
text_list

['briard dog breed health and care petmd.com /dog/breeds/briard the briard is an ancient french herding dog with a rugged appearance and star- studded history. also called chien berger de brie (named after the region of france responsible for producing brie cheese), the briard dog breed dates back to at least the 8th century, as they can be seen depicted in tapestries alongside emperor charlemagne. roughly 1,000 years later, another emperor—napoleon bonaparte—reportedly kept briards, and thomas jefferson received a briard as a gift for serving as an u.s. ambassador to france. briard dogs were bred to both guard and herd sheep, and they’re exceptionally equipped for the role. recognized by the american kennel club in 1928, the official breed standard describes the briard’s movements as being like “quicksilver, permitting him to make [the] abrupt turns, springing starts, and sudden stops required of the sheepherding dog.” this description is even more impressive when you consider the bre