In [1]:
import os
import time

import pandas as pd
import pymysql
from sqlalchemy import create_engine, text

# Step 1: Establish Connection to MySQL
# The connection URL is read from the SAKILA_DB_URL environment variable so that
# real credentials never have to be saved in the notebook. The literal below is
# only a fallback for a local demo database.
db_url = os.environ.get("SAKILA_DB_URL", "mysql+pymysql://root:mysql@localhost/sakila")
engine = create_engine(db_url)

# create_engine() does not open a connection by itself, so run a trivial query
# to confirm the server is reachable before reporting success.
with engine.connect() as conn:
    conn.execute(text("SELECT 1"))
print("Connected to MySQL successfully!")

# Step 2: Drop and Recreate Employees Table
NUM_EMPLOYEES = 20  # number of sample rows to generate

# Build the sample rows before touching the database: pure Python, no I/O.
# Departments cycle through Department0..Department4, salaries rise by 1000 per
# employee, and hire dates cycle through years 2021-2023 and months 01-09.
employees_data = [
    {
        "name": f"Employee{i}",
        "department": f"Department{i % 5}",
        "salary": 50000 + i * 1000,
        # :02d keeps the month zero-padded without relying on it being < 10.
        "hire_date": f"{2021 + i % 3}-{i % 9 + 1:02d}-15",
    }
    for i in range(1, NUM_EMPLOYEES + 1)
]

# engine.begin() commits when the block exits normally and rolls back if it
# raises, so no explicit commit() call is needed.
# NOTE: MySQL DDL (DROP/CREATE TABLE) commits implicitly, so in practice only
# the INSERT is protected by the transaction.
with engine.begin() as conn:
    conn.execute(text("DROP TABLE IF EXISTS employees;"))
    conn.execute(text("""
        CREATE TABLE employees (
            id INT AUTO_INCREMENT PRIMARY KEY,
            name VARCHAR(100),
            department VARCHAR(50),
            salary DECIMAL(10,2),
            hire_date DATE
        );
    """))
    print("Employees table dropped and recreated successfully.")

    # Passing a list of dicts executes the INSERT once per row (executemany).
    conn.execute(text("""
        INSERT INTO employees (name, department, salary, hire_date) VALUES
        (:name, :department, :salary, :hire_date);
    """), employees_data)

# Reported only after the transaction has committed; the count comes from the
# data itself so the message cannot drift from what was inserted.
print(f"Inserted {len(employees_data)} sample employee records successfully.")

Connected to MySQL successfully!
Employees table dropped and recreated successfully.
Inserted 20 sample employee records successfully.


In [2]:
# Step 3: Define Function for Pagination Using LIMIT/OFFSET
def fetch_data_with_pagination(limit, offset):
    """Fetch one page of rows from the ``employees`` table.

    Rows are ordered by the ``id`` primary key so that consecutive pages are
    stable and do not overlap. Without an ORDER BY clause the server is free
    to return rows in any order, which can make LIMIT/OFFSET pages repeat or
    skip rows.

    Args:
        limit: Maximum number of rows to return.
        offset: Number of rows to skip before the first returned row.

    Returns:
        A pandas DataFrame whose columns are the columns of the result set.
        It is empty (but still has those columns) when the query returns
        no rows.
    """
    query = text("SELECT * FROM employees ORDER BY id LIMIT :limit OFFSET :offset;")
    # Open a short-lived connection per page; it is closed when the block exits.
    with engine.connect() as conn:
        result = conn.execute(query, {"limit": limit, "offset": offset})
        df = pd.DataFrame(result.fetchall(), columns=result.keys())
    return df

# Step 4: Fetch Large Dataset in Chunks (Batch Processing)
# Walk through the table one page at a time, advancing the offset by the page
# size, until a page comes back empty.
chunk_size = 5  # Adjust chunk size as needed
offset = 0
print("Fetching data in batches using OFFSET:")

# Priming read: fetch the first page, then keep going while pages have rows.
df_chunk = fetch_data_with_pagination(chunk_size, offset)
while not df_chunk.empty:
    batch_number = offset // chunk_size + 1  # 1-based page index
    print(f"\nBatch {batch_number}:")
    print(df_chunk)
    offset += chunk_size
    df_chunk = fetch_data_with_pagination(chunk_size, offset)

print("\nAll data retrieved using batch processing.")

Fetching data in batches using OFFSET:

Batch 1:
   id       name   department    salary   hire_date
0   1  Employee1  Department1  51000.00  2022-02-15
1   2  Employee2  Department2  52000.00  2023-03-15
2   3  Employee3  Department3  53000.00  2021-04-15
3   4  Employee4  Department4  54000.00  2022-05-15
4   5  Employee5  Department0  55000.00  2023-06-15

Batch 2:
   id        name   department    salary   hire_date
0   6   Employee6  Department1  56000.00  2021-07-15
1   7   Employee7  Department2  57000.00  2022-08-15
2   8   Employee8  Department3  58000.00  2023-09-15
3   9   Employee9  Department4  59000.00  2021-01-15
4  10  Employee10  Department0  60000.00  2022-02-15

Batch 3:
   id        name   department    salary   hire_date
0  11  Employee11  Department1  61000.00  2023-03-15
1  12  Employee12  Department2  62000.00  2021-04-15
2  13  Employee13  Department3  63000.00  2022-05-15
3  14  Employee14  Department4  64000.00  2023-06-15
4  15  Employee15  Department0  6500