In [3]:
!pip install jupyter-dash dash plotly pillow

from jupyter_dash import JupyterDash
from dash import html, dcc, Input, Output, State
import plotly.express as px
import plotly.graph_objects as go
from PIL import Image
import numpy as np
import pandas as pd  # Import pandas to handle CSV operations
import IPython.display as display

# Read the page image from disk and expose its pixels as a NumPy array.
image_path = '/content/img/img.jpg'
img = Image.open(image_path)
img_array = np.array(img)
height, width = img_array.shape[:2]

# Build a Plotly figure that displays the pixel array.
fig = px.imshow(img_array)

# Corners of the rectangle shown before the user moves any slider.
initial_x0 = 140
initial_y0 = 164
initial_x1 = 933
initial_y1 = 1424

# The starting rectangle, drawn in red like the shapes the user can draw.
initial_rect = go.layout.Shape(
    type="rect",
    x0=initial_x0, y0=initial_y0,
    x1=initial_x1, y1=initial_y1,
    line=dict(color="red"),
)

# Hide axes and margins so only the image and the rectangle are visible;
# dragging on the figure draws new rectangles.
fig.update_layout(
    shapes=[initial_rect],
    dragmode="drawrect",
    newshape=dict(line_color="red"),
    xaxis=dict(visible=False),
    yaxis=dict(visible=False),
    margin=dict(l=0, r=0, t=0, b=0),
)

# Create the Dash application that hosts the selector UI.
app = JupyterDash(__name__)

# Image viewer; the mode bar is hidden for a cleaner display.
image_graph = dcc.Graph(
    id='image-graph',
    figure=fig,
    style={'height': '100vh', 'width': '100vw', 'margin': 'auto'},
    config={'displayModeBar': False},
)

# Sliders that drive the rectangle: its size first, then the position of
# its (x0, y0) corner. Ranges are bounded by the image dimensions.
slider_panel = html.Div(
    [
        html.Label("Width"),
        dcc.Slider(id='width-slider', min=10, max=width,
                   value=initial_x1 - initial_x0, step=1),
        html.Label("Height"),
        dcc.Slider(id='height-slider', min=10, max=height,
                   value=initial_y1 - initial_y0, step=1),
        html.Label("X position"),
        dcc.Slider(id='x-slider', min=0, max=width, value=initial_x0, step=1),
        html.Label("Y position"),
        dcc.Slider(id='y-slider', min=0, max=height, value=initial_y0, step=1),
    ],
    style={'width': '50%', 'margin': 'auto'},
)

# Page structure: title, image, sliders, validation button, status area.
app.layout = html.Div([
    html.H1("Interactive Bounding Box Selector"),
    image_graph,
    slider_panel,
    html.Button("Validate Bounding Box", id="validate-button", n_clicks=0),
    html.Div(id='output'),
])

@app.callback(
    Output('image-graph', 'figure'),
    Input('width-slider', 'value'),
    Input('height-slider', 'value'),
    Input('x-slider', 'value'),
    Input('y-slider', 'value')
)
def update_rectangle(width_value, height_value, x_value, y_value):
    """Redraw the red rectangle whenever one of the four sliders changes.

    Args:
        width_value: Rectangle width from the width slider.
        height_value: Rectangle height from the height slider.
        x_value: Requested x coordinate of the rectangle's first corner.
        y_value: Requested y coordinate of the rectangle's first corner.

    Returns:
        The module-level ``fig``, whose ``shapes`` layout entry has been
        replaced by the new rectangle.
    """
    # Pull the corner back so that corner + size never exceeds the image.
    left = min(x_value, width - width_value)
    top = min(y_value, height - height_value)

    rectangle = go.layout.Shape(
        type="rect",
        x0=left, y0=top,
        x1=left + width_value, y1=top + height_value,
        line=dict(color="red"),
    )
    # The shared figure is updated in place and handed back to the graph.
    fig.update_layout(shapes=[rectangle])
    return fig

@app.callback(
    Output('output', 'children'),
    Input('validate-button', 'n_clicks'),
    State('width-slider', 'value'),
    State('height-slider', 'value'),
    State('x-slider', 'value'),
    State('y-slider', 'value')
)
def validate_bounding_box(n_clicks, width_value, height_value, x_value, y_value):
    """Save the currently displayed bounding box to a CSV file.

    Args:
        n_clicks: Number of times the validate button has been clicked.
        width_value: Rectangle width from the width slider.
        height_value: Rectangle height from the height slider.
        x_value: Requested x coordinate from the x slider.
        y_value: Requested y coordinate from the y slider.

    Returns:
        An ``html.Div`` confirming where the CSV was written, or an empty
        string when the button has not been clicked yet.
    """
    # Nothing to do until the user actually clicks (also covers None).
    if not n_clicks:
        return ""

    # Apply the same clamping as update_rectangle so the saved coordinates
    # match the rectangle drawn on screen instead of the raw slider values.
    x_value = min(x_value, width - width_value)
    y_value = min(y_value, height - height_value)

    # Calculate the bounding box coordinates
    right = x_value + width_value
    bottom = y_value + height_value
    upper_left = (x_value, y_value)
    upper_right = (right, y_value)
    lower_left = (x_value, bottom)
    lower_right = (right, bottom)

    # One (x, y) row per corner, in the order:
    # upper-left, upper-right, lower-left, lower-right.
    bounding_box_df = pd.DataFrame(
        [list(upper_left), list(upper_right), list(lower_left), list(lower_right)]
    )

    # Save the DataFrame as a CSV file without header and index
    csv_path = '/content/bounding_box_coordinates.csv'
    bounding_box_df.to_csv(csv_path, index=False, header=False)

    return html.Div([
        html.H4("Bounding box coordinates saved to CSV."),
        html.P(f"File saved to: {csv_path}")
    ])

# Clear any output already shown for this cell before the app is displayed
display.clear_output(wait=True)

# Start the Dash server and render the app inline in the notebook output
app.run_server(mode='inline')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [8]:
!pip install Pillow
!pip install pdf2image
!apt-get install -y poppler-utils

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 45 not upgraded.
Need to get 186 kB of archives.
After this operation, 696 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.5 [186 kB]
Fetched 186 kB in 1s (356 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 123594 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.5_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.5) ...
Setting up poppler-utils (22.02.0-2ubuntu0.5) ...
Processing triggers for man-db (2.10.2-1) ...


In [9]:
import os
from pdf2image import convert_from_path
from pdf2image.exceptions import PDFPageCountError

# Folder containing the source PDFs
pdf_folder = "/content/pdf"

# Output folder for the generated JPG images
output_folder = "/content/img"

# Create the output folder if it does not exist yet
os.makedirs(output_folder, exist_ok=True)

# Walk the PDF folder in sorted order so repeated runs process files identically
for pdf_file in sorted(os.listdir(pdf_folder)):
    # Compare the extension case-insensitively so "SCAN.PDF" is not skipped
    if not pdf_file.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(pdf_folder, pdf_file)

    try:
        # Convert the PDF into one image per page
        pages = convert_from_path(pdf_path)

        # Save each page as "<pdf name>_page_<n>.jpg", numbering from 1
        for i, page in enumerate(pages):
            jpg_file = f"{os.path.splitext(pdf_file)[0]}_page_{i+1}.jpg"
            jpg_path = os.path.join(output_folder, jpg_file)
            page.save(jpg_path, "JPEG")

        print(f"Conversion terminée pour {pdf_file}")
    except PDFPageCountError:
        # Reported for this file only; the loop moves on to the next PDF
        print(f"Erreur lors de la conversion de {pdf_file}: PDF non valide ou corrompu")
    except Exception as e:
        # Best-effort batch: report the failure and keep converting the rest
        print(f"Erreur inattendue lors de la conversion de {pdf_file}: {str(e)}")

print("Toutes les conversions sont terminées.")

Conversion terminée pour page_i.pdf
Toutes les conversions sont terminées.


In [10]:
import os
import pandas as pd
from PIL import Image

# Read the saved bounding box; the file has no header row.
bounding_box_path = '/content/bounding_box_coordinates.csv'
bounding_box_df = pd.read_csv(bounding_box_path, header=None)

# Each of the first four rows is one (x, y) corner. The names below assume
# the order upper-left, upper-right, lower-left, lower-right.
corners = [
    (bounding_box_df.iloc[row, 0], bounding_box_df.iloc[row, 1])
    for row in range(4)
]
(
    (top_left_x, top_left_y),
    (top_right_x, top_right_y),
    (lower_left_x, lower_left_y),
    (bottom_right_x, bottom_right_y),
) = corners

# Crop box in the (left, upper, right, lower) order: only the two opposite
# corners are needed.
crop_box = (top_left_x, top_left_y, bottom_right_x, bottom_right_y)

def crop_image(input_path, output_path, crop_box):
    """Crop one image file and write the result to another path.

    Args:
        input_path: Path of the image to open.
        output_path: Path where the cropped image is saved.
        crop_box: Box passed to ``Image.crop``; the caller builds it as
            (left, upper, right, lower).
    """
    with Image.open(input_path) as source_image:
        # Crop and save in one step while the source file is still open.
        source_image.crop(crop_box).save(output_path)

# Folder holding the page images and folder receiving the crops.
input_dir = '/content/img'
output_dir = '/content/cropped_images'

# Make sure the destination folder exists before writing into it.
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Crop every file whose name ends in '.jpg', keeping the same file name
# in the destination folder.
jpg_names = [name for name in os.listdir(input_dir) if name.endswith('.jpg')]
for img_file in jpg_names:
    input_path = os.path.join(input_dir, img_file)
    output_path = os.path.join(output_dir, img_file)
    crop_image(input_path, output_path, crop_box)

print(f"Cropping complete. Cropped images are saved in {output_dir}.")

Cropping complete. Cropped images are saved in /content/cropped_images.


In [9]:
from google.colab import drive
# Mount Google Drive in the Colab filesystem at /content/experimentation.
drive.mount('/content/experimentation')

Mounted at /content/experimentation


In [23]:
import os
import shutil

# Source directory in Colab
source_dir = '/content/cropped_images'

# Mount point of Google Drive; must match the path given to drive.mount(...)
drive_mount = '/content/experimentation'

# Destination directory in Google Drive
# Change 'MyDrive/here' to your preferred path in Google Drive
dest_dir = os.path.join(drive_mount, 'MyDrive', 'here')

# Fail early if Drive is not mounted: otherwise the files would be written to
# the local Colab disk while the messages below claim they were uploaded.
if not os.path.isdir(os.path.join(drive_mount, 'MyDrive')):
    raise FileNotFoundError(
        f"Google Drive does not appear to be mounted at {drive_mount}"
    )

# Create the destination directory if it doesn't exist
os.makedirs(dest_dir, exist_ok=True)

# Iterate through all entries in the source directory
for filename in os.listdir(source_dir):
    source_file = os.path.join(source_dir, filename)
    dest_file = os.path.join(dest_dir, filename)

    # shutil.copy2 only handles regular files, so skip sub-directories
    if not os.path.isfile(source_file):
        continue

    # Copy each file (with its metadata) to the destination directory
    shutil.copy2(source_file, dest_file)
    print(f"Uploaded: {filename}")

print("All files have been uploaded to Google Drive.")

Uploaded: page_i_page_1.jpg
Uploaded: img.jpg
Uploaded: page_i_page_2.jpg
All files have been uploaded to Google Drive.
