<h1 style="color:#FF00A6; font-weight:bold; font-family:sans-serif;">
  Snappfood Data Analyst Task
</h1>

<h3 style="color:#F9FAFB; font-family:sans-serif;">
  Task 3 – <em>OCR Verification</em>
</h3>

<p style="color:#EBEDF0; font-size:14px; font-family:sans-serif;">
  Extract restaurant names and contact numbers from storefront images using OCR, and verify consistency with structured records.
</p>
<br>
<hr style="color:#FF00A6;">

# Imports & Setup

In [51]:
# Import the importlib module to check if the required libraries are installed
import importlib.util

# List of required libraries
required_libraries = ['requests', 'pandas', 'tqdm']

# Install the required libraries if they are not already installed
for lib in required_libraries:
    if importlib.util.find_spec(lib) is None:
        %pip install {lib}

import requests
import pandas as pd
import os
import re
from tqdm import tqdm


# Prepare Dataset

## Download from Google Sheets

In [32]:
def download_sheet_as_csv(spreadsheet_id, gid, output_path, timeout=30):
    """Export one tab of a Google Sheet as CSV and write it to disk.

    Args:
        spreadsheet_id: ID of the spreadsheet, interpolated into the export URL.
        gid: ID of the sheet tab to export, sent as the ``gid`` query parameter.
        output_path: Local path the downloaded bytes are written to.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True when the CSV was written to ``output_path``, False when the
        server answered with a non-200 status or with an HTML page.

    Raises:
        requests.RequestException: On network-level failures, including a
            timeout.
    """
    url = f"https://docs.google.com/spreadsheets/d/{spreadsheet_id}/export"
    params = {
        "format": "csv",
        "gid": gid
    }
    # Without an explicit timeout, requests can wait forever on a stalled
    # connection and freeze the notebook.
    response = requests.get(url, params=params, timeout=timeout)

    if response.status_code != 200:
        print(f"Failed to download sheet. Status code: {response.status_code}")
        return False

    # An HTML body is never a CSV export; writing it out would only defer the
    # failure to whichever cell parses the file next.
    content_type = response.headers.get("Content-Type", "")
    if content_type.lower().startswith("text/html"):
        print("Failed to download sheet. Received an HTML page instead of CSV.")
        return False

    with open(output_path, "wb") as f:
        f.write(response.content)
    print(f"Sheet downloaded and saved to '{output_path}'")
    return True

# Identifiers of the Google Sheet tab to export.
# Replace with your actual spreadsheet ID and sheet GID
SPREADSHEET_ID = "1ic4RLD_r4ASfl7nRk2ctagH_98j2sPDYN7IB4n6n9e8"
GID = "912424133"  # GID of Task 3 sheet
OUTPUT_FILE = "./task3_dataset.csv"  # local path the exported CSV is written to

# Export the sheet tab to OUTPUT_FILE.
download_sheet_as_csv(SPREADSHEET_ID, GID, OUTPUT_FILE)

# Correct the names and save the dataset again:
# round-trip the CSV through pandas, reading and writing with the "utf-8-sig"
# codec (which strips a leading BOM on read and prepends one on write).
# NOTE(review): presumably so the Persian names render correctly in tools that
# rely on the BOM — confirm.
# NOTE(review): the two literal paths below point at the same file as
# OUTPUT_FILE; they must be kept in sync by hand if OUTPUT_FILE changes.
df = pd.read_csv("./task3_dataset.csv", encoding="utf-8-sig")
df.to_csv("task3_dataset.csv", index=False, encoding="utf-8-sig")


Sheet downloaded and saved to './task3_dataset.csv'


## Image Download

In [33]:
# Config
DATA_PATH = "task3_dataset.csv"  # CSV file loaded below
IMAGE_DIR = "images"  # directory that downloaded images are written to
# Create the image directory up front; exist_ok=True makes re-running the cell safe.
os.makedirs(IMAGE_DIR, exist_ok=True)

# Load Dataset
# "utf-8-sig" decodes UTF-8 and drops a leading BOM if the file has one.
df = pd.read_csv(DATA_PATH, encoding="utf-8-sig")

# Utility: Extract File ID from Google Drive Link
def extract_drive_file_id(link):
    """Pull the file ID out of a Google Drive link.

    Recognises the path form (``.../d/<id>/...``) and the query form
    (``...?id=<id>`` or ``...&id=<id>``).

    Args:
        link: URL to inspect. Non-string values (for example the NaN that
            pandas produces for an empty cell) are treated as "no link".

    Returns:
        The file ID as a string, or None when no ID can be found.
    """
    # re.search raises TypeError on non-strings, which would abort a whole
    # batch run on the first row with a missing link.
    if not isinstance(link, str):
        return None

    # The path form is tried first so links that already matched keep
    # resolving to exactly the same ID.
    for pattern in (r"/d/([a-zA-Z0-9_-]+)", r"[?&]id=([a-zA-Z0-9_-]+)"):
        match = re.search(pattern, link)
        if match:
            return match.group(1)
    return None

# Download Function
def download_drive_image(file_id, output_path, timeout=30):
    """Download a file from Google Drive by file ID and save it to disk.

    Args:
        file_id: Drive file ID, sent as the ``id`` query parameter.
        output_path: Local path the response body is streamed into.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True when the body was written to ``output_path``; False when the
        server answered with a non-200 status or with an HTML page.

    Raises:
        requests.RequestException: On network-level failures, including a
            timeout.
    """
    URL = "https://drive.google.com/uc?export=download"

    # Context managers release the pooled connection even on early return or
    # error; with stream=True it otherwise stays open until fully consumed.
    with requests.Session() as session:
        with session.get(
            URL, params={"id": file_id}, stream=True, timeout=timeout
        ) as response:
            if response.status_code != 200:
                return False

            # An HTML body is a web page, not image data; saving it under an
            # image filename would silently poison later processing.
            content_type = response.headers.get("Content-Type", "")
            if content_type.lower().startswith("text/html"):
                return False

            with open(output_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
    return True

# Fetch each row's photo into IMAGE_DIR and remember where it was saved
print("Downloading images...")

image_paths = []
for row_label, record in tqdm(df.iterrows(), total=len(df)):
    drive_id = extract_drive_file_id(record['Photo'])

    # No usable file ID in the link: record a gap and move on
    if not drive_id:
        image_paths.append(None)
        continue

    # Images are named after the row's index label
    target = os.path.join(IMAGE_DIR, f"{row_label}.jpg")
    downloaded = download_drive_image(drive_id, target)
    image_paths.append(target if downloaded else None)

# One entry per row (saved path, or None on failure) becomes a new column
df["Image_Path"] = image_paths
print("All images processed and stored in 'images/'")

# Optional: persist the dataset together with the image paths
df.to_csv("task3_dataset_with_image_paths.csv", index=False, encoding="utf-8-sig")


Downloading images...


100%|██████████| 10/10 [00:54<00:00,  5.49s/it]

All images processed and stored in 'images/'





## Text Normalization

In [None]:
# === Character normalization ===
def normalize_text(text):
    """Return a cleaned, whitespace-collapsed string form of ``text``.

    Missing values (as judged by ``pd.isna``) become an empty string. Any
    other value is converted with ``str`` and then:

    1. selected Arabic-script letter variants are replaced by the letters
       listed in the mapping below,
    2. characters in the range U+064B–U+0652 are removed,
    3. every character that is not a word character, whitespace, or in the
       range U+0600–U+06FF is replaced by a space,
    4. runs of whitespace are collapsed to one space and the ends trimmed.
    """
    if pd.isna(text):
        return ""

    # (variant, replacement) pairs, applied one after another
    letter_pairs = (
        ("ي", "ی"), ("ك", "ک"),  # Arabic to Persian
        ("ئ", "ی"), ("أ", "ا"), ("إ", "ا"), ("ۀ", "ه"), ("ؤ", "و"),
    )

    cleaned = str(text)
    for variant, replacement in letter_pairs:
        cleaned = cleaned.replace(variant, replacement)

    # Strip Arabic diacritics (harakat)
    cleaned = re.sub(r"[\u064B-\u0652]", "", cleaned)

    # Punctuation and other symbols turn into spaces ...
    cleaned = re.sub(r"[^\w\s\u0600-\u06FF]", " ", cleaned)
    # ... which are then squeezed together and trimmed
    return re.sub(r"\s+", " ", cleaned).strip()


## Number Normalization

In [39]:
# === Convert Persian/Arabic numerals to English digits ===
def normalize_numbers(text):
    """Split a raw phone-number field into a list of ASCII digit strings.

    Persian (U+06F0–U+06F9) and Arabic-Indic (U+0660–U+0669) digits are
    converted to ASCII digits. The text is then split on runs of whitespace
    and dashes (hyphen, en dash, em dash), non-digit characters are stripped
    from every piece, and pieces left empty are dropped.

    Args:
        text: Raw field value. Non-string values are converted with ``str``;
            an integral float such as ``33236321.0`` is treated as the
            integer ``33236321``.

    Returns:
        A list of digit strings (possibly empty). For a missing value (as
        judged by ``pd.isna``) the empty string ``""`` is returned instead,
        which is kept as-is for backward compatibility.
    """
    if pd.isna(text):
        return ""

    # A numeric cell read as float stringifies as "33236321.0"; the ".0"
    # would otherwise survive as a spurious trailing 0 digit.
    if isinstance(text, float) and text.is_integer():
        text = int(text)

    # Map both non-ASCII digit families to ASCII. Built from code points so
    # the two look-alike digit sets cannot be confused in the source.
    digit_table = {}
    for value in range(10):
        digit_table[0x06F0 + value] = str(value)  # Persian digits
        digit_table[0x0660 + value] = str(value)  # Arabic-Indic digits
    text = str(text).translate(digit_table)

    # Whitespace and dashes separate individual numbers
    pieces = re.split(r"[\s\-–—]+", text)
    digit_strings = (re.sub(r"\D", "", piece) for piece in pieces)

    return [digits for digits in digit_strings if digits]


## Apply Cleaning

In [43]:
# Cleaned restaurant name, one string per row
df["Normalized_Name"] = df["Name"].map(normalize_text)

# Cleaned contact numbers, one collection of digit strings per row
df["Normalized_Numbers"] = df["Number"].map(normalize_numbers)

# Show raw and cleaned values side by side
preview_columns = ["Name", "Normalized_Name", "Number", "Normalized_Numbers"]
df[preview_columns]


Unnamed: 0,Name,Normalized_Name,Number,Normalized_Numbers
0,فودلند,فودلند,-,[]
1,پیتزا فلای لند,پیتزا فلای لند,33290149 - 09116467781,"[33290149, 09116467781]"
2,آشپزخانه قصر,آشپزخانه قصر,33236321,[33236321]
3,فست فود نود,فست فود نود,2133239281,[2133239281]
4,کبابسرای غفاردایی,کبابسرای غفاردایی,9112815611,[9112815611]
5,آشپزخانه مهران,آشپزخانه مهران,1733230254,[1733230254]
6,ریزوتو,ریزوتو,33290352,[33290352]
7,Fahi yemek,Fahi yemek,09360934063 - 33241720,"[09360934063, 33241720]"
8,فست فود 20 فود,فست فود 20 فود,33331212,[33331212]
9,کبابی محمدی,کبابی محمدی,1733435687,[1733435687]
