In [1]:
from src.data_pre_processing import (
    DataPreProcessingFactory,
    RemoveLinks, 
    RemoveDateTime,
    RemoveIrrelevantSentences,
    RemovePageNumbers,
    LowerCaseAndRemoveExtraSpaces
)

In [2]:
from src.data_extraction import (
    DataExtractionFactory,
    PDFTextExtractorForBreeds,
    MergeShortElements
)

In [3]:
from src.data_ingestion import (
    DataIngestionFactory, 
    TextDataIngestion
)

In [4]:
folder_path = "data/pdf_data"
ingestor = DataIngestionFactory(strategy=TextDataIngestion(folder_path))
file_dict = ingestor.ingest_data()

In [5]:
# for i, j in file_dict.items(): 
#     print(f"{i}, count: {len(j)}")

In [6]:
# file_dict -> dict that has all categories and list of pdf
# dict

breed_categories = ['Dogs', 'Cats', 'Horses', 'Reptiles']
blogs_categories = ['Bees', 'Birds', 'Fish', 'Ferrets', 'Chicken and Ducks', 'Chinchillas', 'Gerbils', 'Guinea Pigs', 'Hamsters', 'Pigs'
                    'Prairie Dogs', 'Rabbits', 'Rats', 'Sugar Gliders']


# Separate the dict for specific categories 
breed_dict = {k: v for k, v in file_dict.items() if k in breed_categories}
blogs_dict = {k: v for k, v in file_dict.items() if k in blogs_categories}

In [7]:
breed_dict.keys()
blogs_dict.keys()

dict_keys(['Bees', 'Birds', 'Chicken and Ducks', 'Chinchillas', 'Ferrets', 'Fish', 'Gerbils', 'Guinea Pigs', 'Hamsters', 'Rabbits', 'Rats', 'Sugar Gliders'])

In [16]:
# blogs 
sample = [
    {
        'Dogs': [
            {
                'unique_id': 12,
                'text': "para-para para para ....",
                'pet_type': 'dogs',
                'blog_name': 'Labrador',
                'topic': 'Labrador training',
                'keywords': ["training", "breeding", "food"],
                'token_count': 200,
                'chunk_size': 300,
                'age_group': 'adult'
            },
            {
                # ...
            }
        ],
        'Cats': [
                # ... 
        ] 
    }
]


sample_v1 = [
    {
        'birds': [
            { 
                'unique_id': 1,
                'text': "para-para para para ....",
                'pet_type': 'birds',
                'blog_name': 'Diet for birds',
                'topic': 'is wheat good for birds?',
                'keywords': ["diet", "breeding", "food"],
                'token_count': 200, 
                'chunk_size': 300, 
                'age_group': 'adult'
            }, 
            {
                'unique_id': 2,
                # .... 
            }
        ],
        'Rats': [
            # ....
        ]
    }
]

import os


blogs_data = {}
for pet_type, blogs_list in breed_dict.items():
    blogs_data[pet_type] = []
    for pdf in blogs_list:
        pdf_path = os.path.join(folder_path, pet_type, pdf)
        extractor = DataExtractionFactory(strategy=PDFTextExtractorForBreeds(pdf_path))
        text, subheadings = extractor.extract()
        merger = DataExtractionFactory(strategy=MergeShortElements(text, subheadings))
        text, topic = merger.extract()
        blogs_data[pet_type].append({
            'text': text, # list
            'topic': topic, # list
            'blog_name': pdf.split('.pdf')[0]
        })

sample

In [9]:
text_list = blogs_data.get('Dogs')[100]['text']
print(len(text_list))

27


In [10]:
topic = blogs_data.get('Dogs')[100]['topic']
print(len(topic))  

27


In [11]:
blogs_data = {
    'Cats': [
        {
            'text': ['para1', 'para2'],
            'topic': ['topic1','topic2'],
            'blog_name': 'blog1'
        },
        {
            # ....
        }
        # ... 
    ],
    'Dogs': [
        {
            # ...
        }
    ]
}

Pre processing 

In [59]:
data_pre_processing_pipeline = [
        RemoveLinks,
        RemoveDateTime,
        RemoveIrrelevantSentences,
        RemovePageNumbers,
        LowerCaseAndRemoveExtraSpaces
    ]

In [60]:
processed_data = {}
for step in data_pre_processing_pipeline:
        # loop in data dict
        for pet_type, blog_list in blogs_data.items():
            processed_data[pet_type] = []
            for e, blog in enumerate(blog_list):
                # print(f"blog: {blog}")
                processor = DataPreProcessingFactory(strategy=step(blog.get('text')))
                processed_text = processor.process_data()
                processed_data[pet_type].append(
                    {
                        "text": processed_text,
                        "topic": blog.get('topic'),
                        "blog_name": blog.get('blog_name')
                    }
                )
                # print(f"processing {pet_type} blog {e}")

In [61]:
processed_data['Horses'][0]

{'text': ['abtenauer | petmd petmd.com /horse/breeds/c_hr_abtenauer the abtenauer is a horse breed of german extraction. an easy trotter, it is particularly graceful and fluid in its movements. although of average size, the abtenauer is actually a draft horse and accustomed to pulling heavy loads on mountainous terrain. unfortunately, the abtenauer has become a rare breed.',
  'physical characteristics standing at about 14.2 to 15.1 hands high (57-60 inches, 144-155 centimeters), the abtenauer is rather small and stocky breed with a well-defined head. it has powerful, muscular legs and great balance. it is also agile, strong, and sure-footed. these characteristics make it especially suitable for navigating mountainous, rugged terrain. curiously, the abtenauer has curly hair when it is born, but this is shed along with its baby hair in the first year. it is found in various colors, including rich brown and chestnut, but there are also a lot of blue roans and blacks. spotted abtenauers, 

In [62]:
# code to separate the lists inside the 'text' key into individual dicts along with their respective topic and the blog name and save all these lists in their respective key in a new dict
gigo = processed_data.copy()

In [63]:
siso = {}
for pet_name, blog_list in gigo.items(): 
    # pet_name -> 'Cats'
    # blog_list -> [('text', 'topic', 'blog_name'), ('text', 'topic', 'blog_name')....]
    siso[pet_name] = []
    for blog in blog_list:
        text_list = blog.get('text')
        topic_list = blog.get('topic')
        blog_name = blog.get('blog_name')
        for i, text in enumerate(text_list):
            siso[pet_name].append(
                {
                    'text': text,
                    'topic': topic_list[i],
                    'blog_name': blog_name
                }
            )


In [74]:
siso.get('Cats')

[{'text': 'abyssinian cat breed health and care petmd.com /cat/breeds/abyssinian abyssinian cats, also known as abys, have a striking resemblance to the cats depicted in ancient egyptian murals. they have almond-shaped eyes, perky ears, and a slender build. some people believe abyssinians are the direct descendants of these revered ancient cats, while others suggest they originated in what is now ethiopia and journeyed alongside british soldiers to england, according to the abyssinian cat club. all abyssinian cats have an agouti coat, which means each strand of hair flaunts multiple colors: a dark band, a light band, and a dark tip. this gives their silky fur a salt-and-pepper, or ticked, appearance.',
  'topic': 'Abyssinian Cat Breed Health and Care',
  'blog_name': 'Cats_Breeds_Abyssinian'},
 {'text': "caring for an abyssinian abys are medium-size kitties who stand 8–10 inches tall and weigh 8–12 pounds. they’re loyal, intelligent, and energetic. while they require plenty of mental a

In [1]:
import os 
import json 
import pandas as pd 

In [2]:
dummy_data = {
    'cats': [
        {
            'id': 1,
            'text': 'para1',
            'metadata': {
                'topic': 'topic1',
                'blog_name': 'blog1'
            }
        },
        {
            'id': 2,
            'text': 'para2',
            'metadata': {
                'topic': 'topic2',
                'blog_name': 'blog2'
            }
        }
    ],
    'dogs': [
        {
            'id': 3,
            'text': 'para3',
            'metadata': {
                'topic': 'topic3',
                'blog_name': 'blog3'
            }
        }
    ]
}

In [4]:
path = 'dummy/data'

for category, items in dummy_data.items():
        for item in items:
            item.update(item.pop('metadata'))

In [8]:
dummy_data.get('dogs')

[{'id': 3, 'text': 'para3', 'topic': 'topic3', 'blog_name': 'blog3'}]

In [9]:
for key, value_list in dummy_data.items():
        df = pd.DataFrame(value_list)
        df.to_csv(f"temp/{key}.csv", index=False)