# Import Required Libraries
Import the necessary libraries, including PDFScraper.

In [49]:
# Import the necessary libraries
import os
import re
import pandas as pd
from PyPDF2 import PdfReader

# Find All Disney Lorcana PDF Files
Find All PDF Files in `data/raw/disney_lorcana`

In [53]:
# Assign the directory where the PDF files are stored
pdf_directory = "data/raw/disney_lorcana/set_checklist"

# List all the PDF files in the directory
pdf_files = [f for f in os.listdir(pdf_directory) if f.endswith('.pdf')]

# Verify the PDF files were found
print(f"Found PDF files: {pdf_files}")

Found PDF files: ['into-the-inklands.pdf', 'rise-of-the-floodborn.pdf', 'shimmering-skies.pdf', 'the-first-chapter.pdf', 'ursulas-return.pdf']


# Load and Process Each PDF File
Load and Process Each PDF File in `pdf_files`

In [83]:
# Create an empty DataFrame to store the extracted data
df = pd.DataFrame()

# Load and Process Each PDF File
for pdf_file in pdf_files:
    # Create the path to the PDF file
    pdf_path = os.path.join(pdf_directory, pdf_file)

    # Verify the PDF file path
    print(f"Processing PDF file: {pdf_path}")

    # Extract the Set Name from the PDF File
    set_name = pdf_file.replace(".pdf", "")

    # Verify the Set Name
    print(f"Set Name: {set_name}")

    # Open the PDF file
    reader = PdfReader(pdf_path)

    # Extract the number of pages in the PDF file
    num_pages = len(reader.pages)

    # Verify the number of pages in the PDF file
    print(f"Number of pages: {num_pages}")

    # Extract the text from each page in the PDF file
    pdf_text = []
    for page_num in range(num_pages):
        page = reader.pages[page_num]
        pdf_text.append(page.extract_text())

    # Verify the PDF text
    print(f"Extracted text: {pdf_text}")

    # Define the pattern to match 
    pattern = r"\d+\s.*"

    # Find all the match in the PDF text
    matches = re.findall(pattern, pdf_text[0])

    # Verify the matches
    print(f"Matches: {matches}")

    # Define the pattern to split the matches
    pattern = r"\s"

    # Split the matches using the pattern and limit to 1 split
    split_matches = [re.split(pattern, match, 1) for match in matches]

    # Verify the split matches
    print(f"Split matches: {split_matches}")

    # Load the split matches into a DataFrame
    df = pd.DataFrame(split_matches, columns=["card_number", "card_name"])

    # Add the Set Name to the DataFrame
    df["set_name"] = set_name

    # Verify the DataFrame by counting the number of rows
    print(f"DataFrame row count: {len(df)}")

    # Create the path to save the extracted data
    csv_file = f"data/processed/disney_lorcana/sets/{set_name}.csv"

    # Verify the CSV file path
    print(f"Saving CSV file: {csv_file}")

    # Create CSV directory if it does not exist
    os.makedirs(os.path.dirname(csv_file), exist_ok=True)

    # Verify the CSV directory was created
    print(f"Created CSV directory: {os.path.dirname(csv_file)}")
    
    # Save the extracted data to a CSV file
    df.to_csv(csv_file, index=False)

    # Verify the CSV file was saved
    print(f"Saved CSV file: {csv_file}")

Processing PDF file: data/raw/disney_lorcana/set_checklist/into-the-inklands.pdf
Set Name: into-the-inklands
Number of pages: 1
Extracted text: ["AMBER EMERALD\nSAPPHIRE35 Alice - Tea Alchemist\n36 Chernabog's Followers - Creatures of Evil\n37 Diablo - Faithful Pet\n38 Genie - Supportive Friend\n39 Hydros - Ice Titan\n40 Iago - Pretty Polly\n41 Jafar - Lamp Thief\n42 Jafar - Striking Illusionist \n43 Lena Sabrewing - Rebellious Teenager\n44 Magic Broom - Dancing Duster\n45 Magic Broom - Swift Cleaner\n46 Magic Broom - The Big Sweeper\n47 Magic Carpet - Flying Rug\n48 Magica De Spell - Ambitious Witch \n49 Magica De Spell - The Midas Touch\n50 Magica De Spell - Thieving Sorceress\n51 Maleficent - Mistress of All Evil\n52 Mama Odie - Voice of Wisdom\n53 Pua - Potbellied Buddy\n54 Rafiki - Mystical Fighter\n55 Stratos - Tornado Titan\n56 The Firebird - Force of Destruction\n57 The Queen - Hateful Rival\n58 Treasure Guardian - Protector of the Cave\n59 Ursula - Sea Witch\n60 Bestow a Gift\

# Find All Disney Lorcana CSV Files
Find All PDF Files in `data/processed/disney_lorcana/sets`

In [81]:
# Assign the directory where the PDF files are stored
csv_directory = "data/processed/disney_lorcana/sets"

# List all the CSV files in the directory
csv_files = [f for f in os.listdir(csv_directory) if f.endswith('.csv')]

# Verify the PDF files were found
print(f"Found CSV files: {csv_files}")

Found CSV files: ['into-the-inklands.csv', 'rise-of-the-floodborn.csv', 'shimmering-skies.csv', 'the-first-chapter.csv', 'ursulas-return.csv']


# Data Preprocessing
Clean and preprocess the data, handling any missing or inconsistent values.

In [None]:
# Load and Process Each CSV File
for csv_file in csv_files:
    # Create the path to the CSV file
    csv_path = os.path.join(csv_directory, csv_file)

    # Verify the PDF file path
    print(f"Processing CSV file: {csv_path}")

    # Load the CSV file into a DataFrame
    df = pd.read_csv(csv_path)

    # Remove Ink Types from the Card Name
    ink_types = ['AMBER', 'AMETHYST', 'EMERALD', 'RUBY', 'SAPPHIRE', 'STEEL']
    for ink_type in ink_types:
        df['card_name'] = df['card_name'].apply(lambda x: x.replace(ink_type, '').strip())
        print(f'Removed {ink_type} from the card name')

    # Remove `TOTAL` from the Card Name
    df['card_name'] = df['card_name'].apply(lambda x: x.replace('TOTAL', '').strip())
    print('Removed TOTAL from the card name')

    # Save the updated DataFrame to the CSV file
    df.to_csv(csv_path, index=False)

    # Verify the CSV file was saved
    print(f"Saved CSV file: {csv_path}")

        

Processing CSV file: data/processed/disney_lorcana/sets/into-the-inklands.csv
Processing CSV file: data/processed/disney_lorcana/sets/rise-of-the-floodborn.csv
Processing CSV file: data/processed/disney_lorcana/sets/shimmering-skies.csv
Processing CSV file: data/processed/disney_lorcana/sets/the-first-chapter.csv
Processing CSV file: data/processed/disney_lorcana/sets/ursulas-return.csv


# Process Extracted Data
Process and analyze the extracted text data.

In [85]:
# Process Extracted Data

# Convert the extracted text into a list of lines
lines = extracted_text.split('\n')

# Create a DataFrame from the list of lines
df = pd.DataFrame(lines, columns=['Text'])

# Display the first few rows of the DataFrame
df.head()

# Perform basic text analysis
# Count the number of words in each line
df['Word Count'] = df['Text'].apply(lambda x: len(x.split()))

# Display the DataFrame with the word count
df.head()

# Calculate the total number of words in the extracted text
total_words = df['Word Count'].sum()

# Display the total number of words
print(f"Total number of words: {total_words}")

# Identify the most common words
from collections import Counter

# Flatten the list of words
all_words = [word for line in df['Text'] for word in line.split()]

# Count the frequency of each word
word_counts = Counter(all_words)

# Display the 10 most common words
common_words = word_counts.most_common(10)
print("10 most common words:", common_words)

NameError: name 'extracted_text' is not defined