<a href="https://colab.research.google.com/github/asifdotpy/BinanceFeedTwitterBot/blob/main/Copy_of_prm_letter_notion_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Display structure

### Subtask:
Display the mapped directory structure.


In [None]:
# Install the unrar tool
!apt-get install unrar

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
unrar is already the newest version (1:6.1.5-1ubuntu0.1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


## Locate the latest Notion export ZIP in Google Drive


In [None]:
import os
import subprocess
import zipfile

def notion_export(drive_dir="/content/drive/MyDrive/PRM/"):
    """
    Return the path of the most recently modified ZIP file in ``drive_dir``.

    The search is not recursive. Entries are matched on a case-insensitive
    ".zip" suffix, and directories are ignored even if their name ends in
    ".zip". A message describing the outcome is printed in every case.

    Args:
        drive_dir: Directory to search. Defaults to the PRM folder of a
            Google Drive mounted at /content/drive.

    Returns:
        The full path of the newest ZIP file (by modification time), or None
        when the directory is missing, empty, holds no ZIP file, or cannot be
        read.
    """
    try:
        files = os.listdir(drive_dir)
        if not files:
            print(f"No files found in {drive_dir}")
            return None

        zip_paths = [
            os.path.join(drive_dir, name)
            for name in files
            if name.lower().endswith('.zip')
        ]
        # Skip folders that merely happen to be named "*.zip".
        zip_paths = [path for path in zip_paths if os.path.isfile(path)]

        if not zip_paths:
            print(f"Could not determine the latest zip file in {drive_dir}")
            return None

        # max() keeps the first of several equally new files and does not
        # depend on a sentinel timestamp.
        latest_file = max(zip_paths, key=os.path.getmtime)
        print(f"Latest zip file found: {latest_file}")
        return latest_file

    except FileNotFoundError:
        print(f"Error: Directory not found at {drive_dir}. Make sure Google Drive is mounted and the path is correct.")
        return None
    except Exception as e:
        # Deliberately broad: any other failure is reported and turned into None.
        print(f"An error occurred: {e}")
        return None

# Keep the result: the extraction cell further down reads `latest_zip_file`.
latest_zip_file = notion_export()
latest_zip_file  # bare expression so the path is still shown as the cell output

Latest zip file found: /content/drive/MyDrive/PRM/d426b22c-7b6a-43c1-b9d9-97fb7604ede3_Export-d5ac4479-6dcd-4f90-9223-47cad57998ae.zip


'/content/drive/MyDrive/PRM/d426b22c-7b6a-43c1-b9d9-97fb7604ede3_Export-d5ac4479-6dcd-4f90-9223-47cad57998ae.zip'

## Extract nested zip

In [None]:
def extract_nested_zip(zip_file_path, extraction_base_dir):
    """
    Extract a ZIP archive and every ZIP archive found inside it.

    The outer archive is unpacked into ``<extraction_base_dir>/first_level_extract``.
    All ZIP files found anywhere below that directory (for example the
    "...-Part-1.zip", "...-Part-2.zip" pieces of a multi-part export) are then
    unpacked, in sorted path order, into
    ``<extraction_base_dir>/second_level_extract``.

    Note: neither target directory is cleared first, so files left behind by
    an earlier run stay in place and leftover nested archives are unpacked again.

    Args:
        zip_file_path: Path of the outer ZIP archive.
        extraction_base_dir: Directory under which the two extraction
            directories are created (parents are created as needed).

    Returns:
        The second-level directory when at least one nested ZIP was found and
        all of them extracted, the first-level directory when there is no
        nested ZIP, or None when the input is missing or any extraction fails.
    """
    if not zip_file_path or not os.path.exists(zip_file_path):
        print(f"Error: ZIP file not found at {zip_file_path}")
        return None

    first_extraction_dir = os.path.join(extraction_base_dir, "first_level_extract")
    os.makedirs(first_extraction_dir, exist_ok=True)

    print(f"Extracting {zip_file_path} to {first_extraction_dir}...")
    try:
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(first_extraction_dir)
        print("First-level ZIP file extracted successfully.")
    except Exception as e:
        print(f"Error during first-level ZIP extraction: {e}")
        return None

    # Collect every nested archive; sorting makes the extraction order deterministic.
    nested_zip_files = sorted(
        os.path.join(root, file)
        for root, _, files in os.walk(first_extraction_dir)
        for file in files
        if file.lower().endswith('.zip')
    )

    if not nested_zip_files:
        print("No nested ZIP file found.")
        return first_extraction_dir  # Return the first level if no nested zip

    second_extraction_dir = os.path.join(extraction_base_dir, "second_level_extract")
    os.makedirs(second_extraction_dir, exist_ok=True)

    for nested_zip_file in nested_zip_files:
        print(f"Found nested ZIP file: {nested_zip_file}. Extracting to {second_extraction_dir}...")
        try:
            with zipfile.ZipFile(nested_zip_file, 'r') as zip_ref:
                zip_ref.extractall(second_extraction_dir)
        except Exception as e:
            # A partial export is treated as a failure rather than silently accepted.
            print(f"Error during nested ZIP extraction: {e}")
            return None
        print("Nested ZIP file extracted successfully.")

    return second_extraction_dir

# Unpack the archive referenced by `latest_zip_file` under /content;
# the returned directory is displayed as the cell output.
extract_nested_zip(zip_file_path=latest_zip_file, extraction_base_dir="/content")

Extracting /content/drive/MyDrive/PRM/d426b22c-7b6a-43c1-b9d9-97fb7604ede3_Export-d5ac4479-6dcd-4f90-9223-47cad57998ae.zip to /content/first_level_extract...
First-level ZIP file extracted successfully.
Found nested ZIP file: /content/first_level_extract/Export-d5ac4479-6dcd-4f90-9223-47cad57998ae-Part-1.zip. Extracting to /content/second_level_extract...
Nested ZIP file extracted successfully.


'/content/second_level_extract'

## Get all .md files

In [1]:
import os

def get_md_files(startpath):
    """Walk ``startpath`` recursively and return the paths of all files whose names end in ".md"."""
    return [
        os.path.join(folder, name)
        for folder, _, names in os.walk(startpath)
        for name in names
        if name.endswith(".md")
    ]

# The nested archive is expected to have been unpacked to /content/second_level_extract.
extraction_dir = "/content/second_level_extract"
md_files = get_md_files(extraction_dir)

# Header first, then one path per line (nothing more when the list is empty).
print("List of .md files:")
if md_files:
    print("\n".join(md_files))

List of .md files:


## Categorize the .md files

In [None]:
import re

# Exported Notion pages are named "<title> <32 hex digits>.md"; the ID is not part of the title.
_NOTION_ID_SUFFIX = re.compile(r"(?:^|\s+)[0-9a-f]{32}$")

# Category -> keywords, in the order the categories are reported.
_CATEGORY_KEYWORDS = (
    # Legal/Case related keywords
    ('legal/case related',
     ('case', 'arrest', 'abduct', 'detention', 'complaint', 'issue', 'attacked', 'death', 'kurban', 'legal')),
    # Personal/Account related keywords
    ('personal/account related',
     ('account', 'loan', 'aziz', 'binance', 'archive', 'accessories', 'personal')),
    # Project/Work related keywords (including AI, prompts, development)
    ('project/work related',
     ('project', 'work', 'lab', 'system prompts', 'ai prompts', 'talent acquisition', 'job platform',
      'interview question', 'arc dev', 'chatbot', 'ai', 'research', 'proposal', 'tool integration',
      'feature request', 'design', 'build', 'roadmap', 'white paper', 'game idea', 'prompt idea',
      'documentation', 'marketing strategy', 'data room', 'github', 'kanban', 'task', 'wireframes',
      'tests', 'visuals', 'iconography', 'technical spec', 'user feedback', 'optimize', 'brainstorm',
      'deploy', 'validate', 'implement', 'integrate', 'user testing', 'website', 'app',
      'employee training', 'user support')),
    # Communication/Social Media related keywords
    ('communication/social media related',
     ('facebook', 'social media', 'bbc', 'communication', 'advertisement', 'marketing')),
    # Calendar/Event related keywords
    ('calendar/event related',
     ('calendar', '@june', '@may', 'event')),
    # Bihari Community related keywords
    ('bihari community',
     ('bihari', 'community', 'forgotten ones')),
)


def _keyword_pattern(keyword):
    """Compile a pattern matching ``keyword`` only where it starts a word."""
    # A keyword may not be preceded by an ASCII letter or digit, so 'ai' no longer
    # fires inside "complaint" and 'work' no longer fires inside "network".
    pattern = r"(?<![a-z0-9])" + re.escape(keyword)
    if len(keyword) <= 3:
        # Very short keywords ('ai', 'app', 'lab', 'bbc') must be whole words;
        # longer ones stay prefixes so 'arrest' still matches "arrested".
        pattern += r"(?![a-z0-9])"
    return re.compile(pattern)


_CATEGORY_PATTERNS = tuple(
    (category, tuple(_keyword_pattern(keyword) for keyword in keywords))
    for category, keywords in _CATEGORY_KEYWORDS
)


def categorize_md_file(filename):
    """
    Categorize an MD file based on the title part of its filename.

    The name is lower-cased, a trailing ".md" and a trailing 32-hex-digit
    Notion page ID are removed, and the remaining title is checked against
    each category's keywords. Keywords match at the start of a word; keywords
    of three characters or fewer must match a whole word.

    Args:
        filename: File name (not a full path) of the markdown file.

    Returns:
        A list with every matching category name, in the fixed category
        order, or ['miscellaneous'] when no category matches.
    """
    title = re.sub(r"\.md$", "", filename.lower())
    # Drop the page ID so hex digits such as "bbc" inside it cannot match a keyword.
    title = _NOTION_ID_SUFFIX.sub("", title)

    categories = [
        category
        for category, patterns in _CATEGORY_PATTERNS
        if any(pattern.search(title) for pattern in patterns)
    ]

    # Catch-all for files not matching specific categories
    if not categories:
        categories.append('miscellaneous')
    return categories

In [2]:
# Requires `md_files` and `categorize_md_file` from the earlier cells.
# Builds a mapping: category name -> list of markdown paths assigned to it.
categorized_files = {}

for file_path in md_files:
    filename = os.path.basename(file_path)
    for category in categorize_md_file(filename):
        # A file is listed once under each category it matches.
        categorized_files.setdefault(category, []).append(file_path)

# Print the categorized files for review
for category, files in categorized_files.items():
    print(f"\nCategory: {category}")
    if not files:
        print("- No files in this category.")
        continue
    for file_path in files:
        print(f"- {file_path}")

# --- Main Plan Execution ---

In [3]:
# Step 1: Find and extract the latest Notion export ZIP file (potentially nested)
# Failures raise instead of calling exit(): inside a notebook exit() shuts the kernel
# down and discards all state, whereas an exception just stops this cell (and Run All).
print("### Step 1: Locating and Extracting Notion Export ###")
latest_zip = notion_export()
if not latest_zip:
    raise RuntimeError("Failed to find a Notion export ZIP file.")

base_extraction_dir = "/content/notion_extracted_data"
final_extraction_dir = extract_nested_zip(latest_zip, base_extraction_dir)

if not final_extraction_dir or not os.path.exists(final_extraction_dir):
    raise RuntimeError("Failed to extract Notion data.")

print(f"\nFinal Notion data extracted to: {final_extraction_dir}")
print("---")

### Step 1: Locating and Extracting Notion Export ###


NameError: name 'notion_export' is not defined

## Identify files for removal based on criteria

In [4]:
# -------------------------------
# Step 2: Build the deletion plan (this cell only collects paths; nothing is removed here)
# -------------------------------
print("### Step 2: Identifying Files for Deletion ###")

# Markdown pages to drop regardless of category (matched on the exact file name).
specific_files_to_remove = [
    'Blockchain articles 1c36566cd850809cb130c33224cd2edf.md',
    'AI in Talent Acquisition Beyond the Buzzwords to B 1fb6566cd85080559b9eeb0a6daeb481.md',
    'Aziz loan payment 2196566cd85080cbab52d1997ea18c7d.md',
    'Facebook post 21b6566cd85080259b5ae7e937d9a3cd.md',
    '🛡️ Managing Reactions to Sensitive Social Media Ad 2066566cd850809ca4adc083ba40a19b.md',
    'প্রজেক্ট আশা 1da6566cd8508059ab3ce5f854f9a42a.md'
]

image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp', '.svg']
video_extensions = ['.mp4', '.avi', '.mov', '.wmv', '.flv', '.mkv', '.webm']

files_to_delete = set()
all_md_files_before_deletion = get_md_files(final_extraction_dir)

# Pass 1: select files by exact name, by media extension, or because they are not markdown.
for root, _, files in os.walk(final_extraction_dir):
    for file in files:
        file_path = os.path.join(root, file)
        filename_only = os.path.basename(file_path)
        file_extension = os.path.splitext(file)[1].lower()

        is_listed = filename_only in specific_files_to_remove
        is_media = file_extension in image_extensions or file_extension in video_extensions
        is_not_markdown = not file.endswith(".md")
        if is_listed or is_media or is_not_markdown:
            files_to_delete.add(file_path)

# Pass 2: select markdown files whose name maps to one of the unwanted categories.
unwanted_categories = {'personal/account related', 'project/work related'}
for file_path in all_md_files_before_deletion:
    filename = os.path.basename(file_path)
    categories = categorize_md_file(filename)
    if unwanted_categories.intersection(categories):
        files_to_delete.add(file_path)

# Summarise the rules, then list the concrete paths relative to the extraction directory.
print("\n--- Proposed Deletion Plan ---")
print("The following files and types will be removed:")
print("  - All image files (extensions: .jpg, .jpeg, .png, .gif, .bmp, .tiff, .webp, .svg)")
print("  - All video files (extensions: .mp4, .avi, .mov, .wmv, .flv, .mkv, .webm)")
print("  - Any file that is not an .md file (unless it's an image/video already listed).")
print("  - Specific markdown files:")
for f in specific_files_to_remove:
    print(f"    - {f}")
print("  - All .md files categorized as 'personal/account related' or 'project/work related'.")

print("\nList of individual files identified for deletion:")
if not files_to_delete:
    print("  - No additional individual files identified for deletion based on the criteria.")
else:
    for f in sorted(files_to_delete):
        print(f"  - {f.replace(final_extraction_dir, '')}")
print("---")

### Step 2: Identifying Files for Deletion ###


NameError: name 'final_extraction_dir' is not defined

## Step 3: Execute Deletion

In [None]:
# Ask for explicit confirmation before removing anything. Surrounding whitespace is
# ignored, and only the literal answer "yes" proceeds.
user_confirmation = input("\nDo you review this deletion plan and confirm to proceed? (yes/no): ").strip().lower()

if user_confirmation == 'yes':
    print("\n### Step 3: Executing Deletion ###")
    deleted_count = 0
    for file_path in files_to_delete:
        try:
            os.remove(file_path)
            deleted_count += 1
        except OSError as e:
            # Report the failure and keep going; one bad path should not abort the run.
            print(f"Error deleting {file_path.replace(final_extraction_dir, '')}: {e}")
    print(f"Successfully deleted {deleted_count} files.")
    print("---")
else:
    # Make the no-op visible instead of ending the cell silently.
    print("\nDeletion cancelled: no files were removed.")


Do you review this deletion plan and confirm to proceed? (yes/no): yes

### Step 3: Executing Deletion ###
Successfully deleted 360 files.
---


## Step 4: Categorize remaining .md files

In [5]:
print("\n### Step 4: Categorizing Remaining .md Files ###")
remaining_md_files = get_md_files(final_extraction_dir)

# Buckets for the categories that are kept; each starts empty so it is always reported.
categorized_final_md_files = {
    'legal/case related': [],
    'communication/social media related': [],
    'calendar/event related': [],
    'bihari community': [],
    'miscellaneous': []
}

for file_path in remaining_md_files:
    # Paths that were part of the deletion plan are skipped.
    if file_path in files_to_delete:
        continue
    filename = os.path.basename(file_path)
    categories = categorize_md_file(filename)
    for category in categories:
        if category in categorized_final_md_files:
            # Only add to categories we want to keep
            categorized_final_md_files[category].append(file_path)
        elif category not in ['personal/account related', 'project/work related']:
            # Any other category that is not one of the removed ones falls back to miscellaneous
            categorized_final_md_files.setdefault('miscellaneous', []).append(file_path)


print("\n--- Final Categorization of .md Files ---")
for category, files in categorized_final_md_files.items():
    if not files:
        print(f"\nCategory: {category} (0 files)")
        continue
    print(f"\nCategory: {category} ({len(files)} files)")
    for file_path in sorted(files):
        print(f"  - {os.path.basename(file_path)}")
print("---")


### Step 4: Categorizing Remaining .md Files ###


NameError: name 'final_extraction_dir' is not defined

**TODO**: Manually review the files in the 'miscellaneous' category and assign them to appropriate categories or create new ones if necessary.