# Code for Extracting a ZIP File

In [4]:
import zipfile
import os

# Helper that checks the ZIP file's location and then extracts it
def extract_zip(zip_filename, download_dir, extract_to):
    """Extract a ZIP archive found in ``download_dir`` into ``extract_to``.

    Args:
        zip_filename: File name of the archive inside ``download_dir``.
        download_dir: Directory expected to contain the archive.
        extract_to: Destination directory; created if it does not exist.

    Returns:
        bool: True when the archive was extracted, False otherwise. The
        reason for a failure is printed rather than raised.
    """
    # Construct full path to zip file
    zip_path = os.path.join(download_dir, zip_filename)

    # isfile (rather than exists) so a directory that happens to carry the
    # archive's name is reported as "not found" instead of failing later.
    if not os.path.isfile(zip_path):
        print(f"ZIP file not found at: {zip_path}")
        # Only list the download directory when it is really there; calling
        # os.listdir on a missing directory would raise from this
        # diagnostic branch instead of returning False.
        if os.path.isdir(download_dir):
            print("\nFiles in download directory:")
            for file in os.listdir(download_dir):
                print(file)
        else:
            print(f"Download directory not found: {download_dir}")
        return False

    try:
        # Creating the destination is part of the guarded section so that a
        # bad destination (e.g. an existing file) is reported like any other
        # extraction failure.
        os.makedirs(extract_to, exist_ok=True)
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_to)
    except zipfile.BadZipFile:
        print("The file is not a valid ZIP file")
        return False
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return False

    print(f"Successfully extracted to: {extract_to}")
    return True

# Usage: extract the downloaded dataset archive.
# NOTE(review): the paths below are absolute and machine-specific; adjust
# them before running on another machine.
download_dir = r"C:\Users\Nexgen\Downloads"  # Directory that contains the ZIP file
zip_filename = "unsplash-research-dataset-lite-latest.zip"  # Archive name inside download_dir
extract_to = r"C:\Users\Nexgen\Downloads\genAI_project_dataset"  # Destination directory for the extracted files

# extract_zip returns True on success and False otherwise.
success = extract_zip(zip_filename, download_dir, extract_to)

Successfully extracted to: C:\Users\Nexgen\Downloads\genAI_project_dataset


# Code for Listing Files and Folders

In [5]:
import os

def list_files_and_folders(directory):
    """Print every folder and file found under ``directory``, recursively.

    For each level visited by ``os.walk`` the sub-folders are printed first
    (prefixed with ``[Folder]``), followed by the files (prefixed with
    ``[File]``). Each entry is shown with its path joined to the level it
    was found in. If ``directory`` does not exist, a message is printed and
    nothing else happens.

    Args:
        directory: Path of the directory to list.

    Returns:
        None.
    """
    if os.path.exists(directory):
        print(f"Contents of directory: {directory}")
        for current, subfolders, filenames in os.walk(directory):
            # Folders of this level come before its files
            folder_lines = [f"[Folder] {os.path.join(current, sub)}" for sub in subfolders]
            file_lines = [f"[File] {os.path.join(current, leaf)}" for leaf in filenames]
            for line in folder_lines + file_lines:
                print(line)
    else:
        print(f"Directory not found: {directory}")

# Usage: list everything that was extracted from the archive.
# NOTE(review): absolute, machine-specific path; it matches the extraction
# directory used in the ZIP-extraction cell.
unzipped_dir = r"C:\Users\Nexgen\Downloads\genAI_project_dataset"  # Replace with the unzipped directory path
list_files_and_folders(unzipped_dir)


Contents of directory: C:\Users\Nexgen\Downloads\genAI_project_dataset
[Folder] C:\Users\Nexgen\Downloads\genAI_project_dataset\__MACOSX
[File] C:\Users\Nexgen\Downloads\genAI_project_dataset\collections.tsv000
[File] C:\Users\Nexgen\Downloads\genAI_project_dataset\colors.tsv000
[File] C:\Users\Nexgen\Downloads\genAI_project_dataset\conversions.tsv000
[File] C:\Users\Nexgen\Downloads\genAI_project_dataset\DOCS.md
[File] C:\Users\Nexgen\Downloads\genAI_project_dataset\keywords.tsv000
[File] C:\Users\Nexgen\Downloads\genAI_project_dataset\photos.tsv000
[File] C:\Users\Nexgen\Downloads\genAI_project_dataset\README.md
[File] C:\Users\Nexgen\Downloads\genAI_project_dataset\TERMS.md
[File] C:\Users\Nexgen\Downloads\genAI_project_dataset\__MACOSX\._collections.tsv000
[File] C:\Users\Nexgen\Downloads\genAI_project_dataset\__MACOSX\._colors.tsv000
[File] C:\Users\Nexgen\Downloads\genAI_project_dataset\__MACOSX\._conversions.tsv000
[File] C:\Users\Nexgen\Downloads\genAI_project_dataset\__MACOSX\

# Code for Previewing .TSV Files
This Python script defines a function `preview_tsv_files` to preview the first five rows of multiple `.tsv000` files from a specified directory. It uses `pandas` for data loading and displays the top rows for each file.


In [6]:
import pandas as pd
import os

def preview_tsv_files(directory, file_list):
    """Print the first rows of each ``.tsv000`` file named in ``file_list``.

    Names that do not end with ``.tsv000`` are skipped without any output.
    For every remaining name a header line is printed, then at most five
    rows are read as tab-separated data and printed. A failure while loading
    or printing one file is reported and does not stop the remaining files.

    Args:
        directory: Directory that contains the files.
        file_list: Iterable of file names relative to ``directory``.

    Returns:
        None.
    """
    for file_name in file_list:
        tsv_path = os.path.join(directory, file_name)
        if not file_name.endswith(".tsv000"):
            continue

        try:
            print(f"\nPreview of {file_name}:")
            # Only the first 5 rows are read from disk
            preview = pd.read_csv(tsv_path, sep="\t", nrows=5)
            print(preview.head())
        except Exception as err:
            print(f"Error loading {file_name}: {str(err)}")

# Names of the TSV files to preview; every entry ends with ".tsv000",
# which is the suffix preview_tsv_files requires.
tsv_files = [
    "collections.tsv000",
    "colors.tsv000",
    "conversions.tsv000",
    "keywords.tsv000",
    "photos.tsv000",
]

# Usage: preview each listed file from the extracted dataset directory.
# NOTE(review): absolute, machine-specific path.
preview_tsv_files(r"C:\Users\Nexgen\Downloads\genAI_project_dataset", tsv_files)



Preview of collections.tsv000:
      photo_id collection_id    collection_title          photo_collected_at
0  --2IBUMom1I       1230101              Travel  2017-09-27 11:24:17.575047
1  --2IBUMom1I       9832457            business  2020-04-04 14:26:10.506402
2  --2IBUMom1I       2143051     Travel / Places  2018-05-22 23:20:05.898545
3  --2IBUMom1I   FBJEaBSjBvg            Settings  2022-06-04 03:56:40.892078
4  --2IBUMom1I        162470  Majestical Sunsets  2016-03-15 17:04:25.089589

Preview of colors.tsv000:
      photo_id     hex  red  green  blue         keyword  ai_coverage  \
0  XDPk8ndzNho  5B534C   91     83    76  darkolivegreen     0.065067   
1  IfL3QovlAbI  371511   55     21    17           black     0.105533   
2  GKzgF32piaE  8ACCD5  138    204   213         skyblue     0.044867   
3  T5WR9adosj8  A59A99  165    154   153        darkgray     0.050800   
4  T5WR9adosj8  7F7575  127    117   117            gray     0.050533   

   ai_score  
0  0.030752  
1  0.203291 

# Code for Chunk-wise Merging of Large TSV Files
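This Python script reads the large `photos.tsv000` file in chunks, left-joins each chunk with `collections.tsv000` on `photo_id`, and writes the merged rows to `merged_data.csv`. It uses `pandas`; chunking keeps only part of the photos file in memory at a time, which is useful when that file exceeds memory capacity. Note that the collections file is still loaded in full.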


In [9]:
import pandas as pd
import os

# Chunk-wise left join of photos.tsv000 with collections.tsv000 on
# "photo_id"; the merged rows are written to merged_data.csv.

# Define paths to the data files
data_dir = r"C:\Users\Nexgen\Downloads\genAI_project_dataset"
photos_file = os.path.join(data_dir, "photos.tsv000")
collections_file = os.path.join(data_dir, "collections.tsv000")

# Number of photo rows held in memory at a time (adjust based on your
# system's memory)
chunk_size = 100000

# Output file for merged data
output_file = os.path.join(data_dir, "merged_data.csv")

# The collections table does not change between chunks, so it is read once
# up front instead of being re-parsed for every chunk.
collections = pd.read_csv(collections_file, sep="\t")

# Process the photos file in chunks
with pd.read_csv(photos_file, sep="\t", chunksize=chunk_size) as photos_reader:
    for chunk_index, photos_chunk in enumerate(photos_reader):
        # Merge the chunk with the collections data
        chunk_merged = photos_chunk.merge(collections, on="photo_id", how="left")

        # The first chunk overwrites any output left by a previous run and
        # writes the header; later chunks append without a header. This
        # keeps the cell idempotent: re-running it no longer appends
        # duplicate rows to an existing merged_data.csv.
        is_first_chunk = chunk_index == 0
        chunk_merged.to_csv(
            output_file,
            mode="w" if is_first_chunk else "a",
            index=False,
            header=is_first_chunk,
        )


# Code for Efficient Large-Scale Data Merging Using Dask
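This Python script demonstrates how to use Dask, a parallel computing library, to handle large `.tsv000` files for merging operations. Dask provides an efficient way to work with large datasets that may not fit into memory. Note, however, that the final `compute()` call in this cell materializes the merged result as a single pandas DataFrame, so the merged result itself must fit in memory.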


In [15]:
import dask.dataframe as dd
import os
import logging

# Dask-based merge: left-joins photos with collections and then with colors
# on "photo_id", computes the result, saves it as merged_result.csv and
# prints the first rows. On any failure it prints diagnostics instead.

# Enable verbose logging for Dask
# basicConfig configures the root logger, so DEBUG messages from other
# libraries (e.g. the fsspec "open file" lines in the cell output) are
# shown as well.
logging.basicConfig(level=logging.DEBUG)

# Specify the directory where your TSV files are located
data_dir = r"C:\Users\Nexgen\Downloads\genAI_project_dataset"

# Construct full file paths with long path prefix (if needed)
# NOTE(review): these literals repeat the directory stored in data_dir
# instead of being derived from it; keep them in sync when data_dir changes.
photos_path = r"\\?\C:\Users\Nexgen\Downloads\genAI_project_dataset\photos.tsv000"
collections_path = r"\\?\C:\Users\Nexgen\Downloads\genAI_project_dataset\collections.tsv000"
colors_path = r"\\?\C:\Users\Nexgen\Downloads\genAI_project_dataset\colors.tsv000"

# Define dtypes for problematic columns
# These two columns are read as generic objects; the mapping is only passed
# when loading the photos file.
dtypes = {
    'exif_aperture_value': 'object',
    'exif_focal_length': 'object'
}

# Load the TSV files using Dask with specified dtypes
try:
    photos = dd.read_csv(
        photos_path,
        delimiter="\t",
        assume_missing=True,
        blocksize="16MB",  # Reduced blocksize to avoid memory issues
        dtype=dtypes
    )
    
    collections = dd.read_csv(
        collections_path,
        delimiter="\t",
        assume_missing=True,
        blocksize="16MB"  # Reduced blocksize to avoid memory issues
    )
    
    colors = dd.read_csv(
        colors_path,
        delimiter="\t",
        assume_missing=True,
        blocksize="16MB"  # Reduced blocksize to avoid memory issues
    )

    # Perform the merge operation
    # Two successive left joins on "photo_id": every photo row is kept and
    # is repeated for each matching collection row and each matching color row.
    merged_df = photos.merge(collections, on="photo_id", how="left")
    merged_df = merged_df.merge(colors, on="photo_id", how="left")

    # Compute the result using 'threads' scheduler for better performance
    # NOTE(review): compute() returns the whole merged result as one pandas
    # DataFrame, so the merged data must fit in memory at this point.
    result = merged_df.compute(scheduler='threads')

    # Save the result
    output_path = os.path.join(data_dir, "merged_result.csv")
    result.to_csv(output_path, index=False)

    print(f"Successfully saved merged result to: {output_path}")
    print("\nFirst few rows of the merged data:")
    print(result.head())

# Any failure above (loading, merging, computing or saving) ends up here;
# the error is printed and diagnostics follow instead of a traceback.
except Exception as e:
    print(f"An error occurred: {str(e)}")
    
    # Print memory usage information
    # NOTE(review): psutil is imported only on this error path; if it is not
    # installed, the import itself raises from inside this handler.
    import psutil
    process = psutil.Process()
    print("\nCurrent memory usage:")
    # rss is reported in bytes; dividing by 1024 twice converts it to MB.
    print(f"Memory used: {process.memory_info().rss / 1024 / 1024:.2f} MB")

    # If there's an error, let's examine the structure of the files
    print("\nExamining file structure:")
    import pandas as pd
    
    try:
        # Read just a few rows to examine structure
        # Only 5 rows per file are read, and only the inferred column dtypes
        # are printed.
        print("\nPhotos file structure:")
        photos_sample = pd.read_csv(photos_path, delimiter="\t", nrows=5)
        print(photos_sample.dtypes)
        
        print("\nCollections file structure:")
        collections_sample = pd.read_csv(collections_path, delimiter="\t", nrows=5)
        print(collections_sample.dtypes)
        
        print("\nColors file structure:")
        colors_sample = pd.read_csv(colors_path, delimiter="\t", nrows=5)
        print(colors_sample.dtypes)
    # This handler reuses the name "e" of the enclosing handler.
    except Exception as e:
        print(f"Error while examining file structure: {str(e)}")


DEBUG:fsspec.local:open file: //?/C:/Users/Nexgen/Downloads/genAI_project_dataset/photos.tsv000
DEBUG:fsspec.local:open file: //?/C:/Users/Nexgen/Downloads/genAI_project_dataset/collections.tsv000
DEBUG:fsspec.local:open file: //?/C:/Users/Nexgen/Downloads/genAI_project_dataset/colors.tsv000
DEBUG:fsspec.local:open file: //?/C:/Users/Nexgen/Downloads/genAI_project_dataset/collections.tsv000
DEBUG:fsspec.local:open file: //?/C:/Users/Nexgen/Downloads/genAI_project_dataset/collections.tsv000
DEBUG:fsspec.local:open file: //?/C:/Users/Nexgen/Downloads/genAI_project_dataset/collections.tsv000
DEBUG:fsspec.local:open file: //?/C:/Users/Nexgen/Downloads/genAI_project_dataset/collections.tsv000
DEBUG:fsspec.local:open file: //?/C:/Users/Nexgen/Downloads/genAI_project_dataset/collections.tsv000
DEBUG:fsspec.local:open file: //?/C:/Users/Nexgen/Downloads/genAI_project_dataset/collections.tsv000
DEBUG:fsspec.local:open file: //?/C:/Users/Nexgen/Downloads/genAI_project_dataset/collections.tsv000


Successfully saved merged result to: C:\Users\Nexgen\Downloads\genAI_project_dataset\merged_result.csv

First few rows of the merged data:
      photo_id                                photo_url  \
0  3OeUD6_-I4I  https://unsplash.com/photos/3OeUD6_-I4I   
1  3OeUD6_-I4I  https://unsplash.com/photos/3OeUD6_-I4I   
2  3OeUD6_-I4I  https://unsplash.com/photos/3OeUD6_-I4I   
3  3OeUD6_-I4I  https://unsplash.com/photos/3OeUD6_-I4I   
4  3OeUD6_-I4I  https://unsplash.com/photos/3OeUD6_-I4I   

                                     photo_image_url  \
0  https://images.unsplash.com/photo-1544918334-2...   
1  https://images.unsplash.com/photo-1544918334-2...   
2  https://images.unsplash.com/photo-1544918334-2...   
3  https://images.unsplash.com/photo-1544918334-2...   
4  https://images.unsplash.com/photo-1544918334-2...   

           photo_submitted_at photo_featured  photo_width  photo_height  \
0  2018-12-16 00:02:59.709851              t       8192.0        5461.0   
1  2018-12-16 00:02