In [2]:
from src.data_pre_processing import (
    DataPreProcessingFactory,
    RemoveLinks, 
    RemoveDateTime,
    RemoveIrrelevantSentences,
    RemovePageNumbers,
    LowerCaseAndRemoveExtraSpaces
)

In [3]:
from src.data_extraction import (
    DataExtractionFactory,
    PDFTextExtractorForBreeds,
    MergeShortElements
)

In [4]:
from src.data_ingestion import (
    DataIngestionFactory, 
    TextDataIngestion
)

In [5]:
folder_path = "pdf_data"
ingestor = DataIngestionFactory(strategy=TextDataIngestion(folder_path))
file_dict = ingestor.ingest_data()

In [6]:
for i, j in file_dict.items(): 
    print(f"{i}, count: {len(j)}")

Bees, count: 1
Birds, count: 105
Cats, count: 60
Chicken and Ducks, count: 10
Chinchillas, count: 38
Dogs, count: 241
Ferrets, count: 81
Fish, count: 92
Gerbils, count: 18
Guinea Pigs, count: 43
Hamsters, count: 46
Horses, count: 133
Pigs, count: 3
Prairie Dogs, count: 12
Rabbits, count: 112
Rats, count: 33
Reptiles, count: 12
Sugar Gliders, count: 4


In [7]:
# file_dict -> dict that has all categories and list of pdf
# dict

breed_categories = ['Dogs', 'Cats', 'Horses', 'Reptiles']
blogs_categories = ['Bees', 'Birds', 'Fish', 'Ferrets', 'Chicken and Ducks', 'Chinchillas', 'Gerbils', 'Guinea Pigs', 'Hamsters', 'Pigs'
                    'Prairie Dogs', 'Rabbits', 'Rats', 'Sugar Gliders']


# Separate the dict for specific categories 
breed_dict = {k: v for k, v in file_dict.items() if k in breed_categories}
blogs_dict = {k: v for k, v in file_dict.items() if k in blogs_categories}

In [8]:
breed_dict.keys()
blogs_dict.keys()

dict_keys(['Bees', 'Birds', 'Chicken and Ducks', 'Chinchillas', 'Ferrets', 'Fish', 'Gerbils', 'Guinea Pigs', 'Hamsters', 'Rabbits', 'Rats', 'Sugar Gliders'])

In [None]:
# blogs 
sample = [
    {
        'Dogs': [
            {
                'unique_id': 12,
                'text': "para-para para para ....",
                'pet_type': 'dogs',
                'pet_breed': 'Labrador',
                'topic': 'Labrador training',
                'keywords': ["training", "breeding", "food"],
                'token_count': 200,
                'chunk_size': 300,
                'age_group': 'adult'
            },
            {
                # ...
            }
        ],
        'Cats': [
                # ... 
        ] 
    }
]


sample_v1 = [
    {
        'birds': [
            { 
                'unique_id': 1,
                'text': "para-para para para ....",
                'pet_type': 'birds',
                'blog_name': 'Diet for birds',
                'topic': 'is wheat good for birds?',
                'keywords': ["diet", "breeding", "food"],
                'token_count': 200, 
                'chunk_size': 300, 
                'age_group': 'adult'
            }, 
            {
                'unique_id': 2,
                # .... 
            }
        ],
        'Rats': [
            # ....
        ]
    }
]

import os


blogs_data = {}
for pet_type, blogs_list in blogs_dict.items():
    blogs_data[pet_type] = []
    for pdf in blogs_list:
        pdf_path = os.path.join(folder_path, pet_type, pdf)
        extractor = DataExtractionFactory(strategy=PDFTextExtractorForBreeds(pdf_path))
        text, subheadings = extractor.extract()
        blogs_data[pet_type].append({
            'text': text, # list
            'subheadings': subheadings, # list
            'topic': pdf.split('.pdf')[0]
        })

In [25]:
# blogs_data['Birds']

['10 Bird Toys That Are Safe petmd.com /bird/care/10-bird-toys-are-safe When shopping for your bird, you’ll find so many types of bird toys—mirrors, ladders, swings and ropes—that the choices are almost overwhelming. Some of the bird toys might seem flimsy, making you wonder if your bird will break them on the first use. Other toys look like fun, but you remember that she hid from the last thirty-dollar toy you got her. So, how do you decide what to buy? How many bird toys does a bird really need, anyhow? Are all of those ropes and artificial colors safe?',
 'Picking Bird Toys Based on Cage Size and Function One of the first things to decide is where your bird’s new toy will go—inside or outside the cage—and in what section of the cage. Make sure your bird’s cage is large enough for the bird toys you want to introduce. Bird toys that are going to stay outside the cage can obviously be larger and more elaborate than those inside the cage. For larger birds, there are many bird playstands

In [None]:
blogs_data['Birds'][0]['subheadings'] # category

['10 Bird Toys That Are Safe',
 'Picking Bird Toys Based on Cage Size and Function',
 'Which Type of Bird Toys Should You Get?',
 'How Many Bird Toys Should a Bird Have?',
 'What Should You Look for in a Bird Toy?',
 'Things to Consider When Choosing the Best Bird Toys',
 'Bird Toy Options by Size']

In [3]:
# get pdf
pdf_path = "pdf_data\dogs\Dogs_Breeds_Briard.pdf"
# extract 
extractor = DataExtractionFactory(strategy=PDFTextExtractorForBreeds(pdf_path))
text_list, sh = extractor.extract()

In [4]:
extractor.set_strategy(MergeShortElements(text_list))
text_list = extractor.extract()

In [5]:
text_list

['Briard Dog Breed Health and Care petmd.com /dog/breeds/briard The Briard is an ancient French herding dog with a rugged appearance and star- studded history. Also called Chien Berger de Brie (named after the region of France responsible for producing Brie cheese), the Briard dog breed dates back to at least the 8th century, as they can be seen depicted in tapestries alongside Emperor Charlemagne. Roughly 1,000 years later, another emperor—Napoleon Bonaparte—reportedly kept Briards, and Thomas Jefferson received a Briard as a gift for serving as an U.S. ambassador to France. Briard dogs were bred to both guard and herd sheep, and they’re exceptionally equipped for the role. Recognized by the American Kennel Club in 1928, the official breed standard describes the Briard’s movements as being like “quicksilver, permitting him to make [the] abrupt turns, springing starts, and sudden stops required of the sheepherding dog.” This description is even more impressive when you consider the bre

Pre processing 

In [6]:
links = ['https://www.printfriendly.com', 'http://www.printfriendly.com',
            'https://petmd.com', 'http://www.petmd.com']

processor = DataPreProcessingFactory(strategy=RemoveLinks(links, text_list))
text_list = processor.process_data()

In [7]:
# remove date time 
processor.set_strategy(strategy=RemoveDateTime(text_list))
text_list = processor.process_data()

In [None]:
# Remove irrelevant Sentences 
sent = ['Hypothyroidism Hypothyroidism'] # just for testing 
processor.set_strategy(RemoveIrrelevantSentences(text_list, sent))
text_list = processor.process_data()

In [11]:
# Remove Page Numbers
processor.set_strategy(RemovePageNumbers(text_list))
text_list = processor.process_data()

In [13]:
# Lower Case and Remove Extra Spaces
processor.set_strategy(LowerCaseAndRemoveExtraSpaces(text_list))
text_list = processor.process_data()

In [16]:
text_list

['briard dog breed health and care petmd.com /dog/breeds/briard the briard is an ancient french herding dog with a rugged appearance and star- studded history. also called chien berger de brie (named after the region of france responsible for producing brie cheese), the briard dog breed dates back to at least the 8th century, as they can be seen depicted in tapestries alongside emperor charlemagne. roughly 1,000 years later, another emperor—napoleon bonaparte—reportedly kept briards, and thomas jefferson received a briard as a gift for serving as an u.s. ambassador to france. briard dogs were bred to both guard and herd sheep, and they’re exceptionally equipped for the role. recognized by the american kennel club in 1928, the official breed standard describes the briard’s movements as being like “quicksilver, permitting him to make [the] abrupt turns, springing starts, and sudden stops required of the sheepherding dog.” this description is even more impressive when you consider the bre