## Crawl captions and images from https://www.reddit.com/r/progresspics/search/?q=F&type=link&cId=857c3e1c-67e4-4d52-b1be-070dac965979&iId=f9ce14d0-f25f-4ec8-8acc-3bfec57647ce

In [1]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import os
import pandas as pd

def create_directory(directory):
    """Create ``directory`` (and any missing parents) if it does not exist.

    Args:
        directory: Path of the directory to create.

    Raises:
        FileExistsError: If ``directory`` exists but is not a directory.
    """
    # exist_ok=True replaces the separate "exists?" check, which could race
    # with another process creating the directory in between.
    os.makedirs(directory, exist_ok=True)

def download_image(url, filename, directory, timeout=30):
    """Download ``url`` and save the response body as ``directory/filename``.

    Args:
        url: Address of the image to fetch.
        filename: Name of the file to write inside ``directory``.
        directory: Existing directory that receives the file.
        timeout: Seconds to wait for the server before giving up; without a
            timeout ``requests.get`` can block indefinitely.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail before opening the file so an HTTP error page is never saved as
    # if it were an image.
    response.raise_for_status()
    with open(os.path.join(directory, filename), 'wb') as f:
        f.write(response.content)

# Search-results page of r/progresspics that will be crawled
url = "https://www.reddit.com/r/progresspics/search/?q=F&type=link&cId=857c3e1c-67e4-4d52-b1be-070dac965979&iId=f9ce14d0-f25f-4ec8-8acc-3bfec57647ce"

# Start a Chrome session through Selenium and navigate to that page
driver = webdriver.Chrome()
driver.get(url)

def scroll_down_page(driver):
    """Scroll the browser window to the bottom of the page, then pause.

    Args:
        driver: Selenium web driver whose current page is scrolled.
    """
    scroll_to_bottom_js = "window.scrollTo(0, document.body.scrollHeight);"
    pause_seconds = 2  # wait after scrolling before returning

    driver.execute_script(scroll_to_bottom_js)
    time.sleep(pause_seconds)

# Keep scrolling until the page height stops growing, i.e. a scroll no longer
# loads additional content.
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    # Reuse the helper (scroll to the bottom + 2 second wait) instead of
    # repeating its body inline.
    scroll_down_page(driver)
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break  # Nothing new was loaded by the last scroll
    # Carry the measured height into the next iteration rather than querying
    # the browser for it a second time.
    last_height = new_height

# Grab the fully loaded HTML, then close the browser straight away: the
# downloads below do not need it, and the finally clause guarantees the Chrome
# process is shut down even if something fails later.
try:
    page_source = driver.page_source
finally:
    driver.quit()

# Use BeautifulSoup to parse the captured HTML
soup = BeautifulSoup(page_source, 'html.parser')

# Find all lightbox 'img' tags in the HTML
image_tags = soup.find_all('img', class_='media-lightbox-img')

# Find all post title links (their text is used as the caption)
post_links = soup.find_all('a', slot='title')

# Directory to store images
image_directory = 'progress_pics'
create_directory(image_directory)

# Lists to store image file names and captions (kept the same length)
image_names = []
captions = []

# NOTE(review): images and titles are paired purely by position. If a post has
# no matching image tag (or has several), later captions shift onto the wrong
# images - confirm against the page structure.
for i, (tag, post_link) in enumerate(zip(image_tags, post_links)):
    if 'src' not in tag.attrs:  # Skip tags without a 'src' attribute
        continue
    img_name = f"image_{i+1}.jpg"  # Name derived from the position in the page
    try:
        download_image(tag['src'], img_name, image_directory)
    except requests.RequestException as exc:
        # One unreachable image should not discard everything collected so
        # far; the CSV is only written after the loop.
        print(f"Skipping {img_name}: {exc}")
        continue
    image_names.append(img_name)
    captions.append(post_link.text.strip())  # Text of the post link (caption)

# Create a DataFrame
df = pd.DataFrame({'Image_Name': image_names, 'Caption': captions})

# Save the DataFrame to a CSV file
df.to_csv('image_data.csv', index=False)

print("CSV file created successfully!")


CSV file created successfully!


## Split captions into Sex, Age, Height and Weight

In [None]:
import pandas as pd

# Load the crawl output ('Image_Name' and 'Caption' columns) into a DataFrame
df = pd.read_csv('image_data.csv')

def extract_gender(caption):
    """Return the first '/'-separated field of ``caption``, stripped.

    Args:
        caption: Post title, e.g. ``F/25/5'4" [200lbs > 150lbs = 50lbs]``.

    Returns:
        The text before the first '/' without surrounding whitespace (the
        whole caption if it has no '/'), or None when ``caption`` is not a
        string - pandas represents empty CSV cells as NaN, a float.
    """
    if not isinstance(caption, str):
        return None
    return caption.split('/')[0].strip()

def extract_age(caption):
    """Return the first word of the second '/'-separated field of ``caption``.

    Args:
        caption: Post title, e.g. ``F/25/5'4" [200lbs > 150lbs = 50lbs]``.

    Returns:
        The age as a string (e.g. ``"25"``), or None when ``caption`` is not a
        string, has no second field, or that field is blank.
    """
    if not isinstance(caption, str):
        return None
    parts = caption.split('/')
    if len(parts) < 2:
        return None
    # split() without arguments ignores leading whitespace, so "F/ 25 /..."
    # yields "25" rather than an empty string.
    tokens = parts[1].split()
    return tokens[0] if tokens else None

def extract_height(caption):
    """Return the height token from the third '/'-separated field of ``caption``.

    Args:
        caption: Post title, e.g. ``F/25/5'4" [200lbs > 150lbs = 50lbs]``.

    Returns:
        The first whitespace-delimited word of the third field that appears
        before any '[' (e.g. ``5'4"``), or None when ``caption`` is not a
        string, has fewer than three fields, or has no such word.
    """
    if not isinstance(caption, str):
        return None
    parts = caption.split('/')
    if len(parts) < 3:
        return None
    # Keep only the text before '[' and take its first word; the list is
    # empty when the field is blank or starts with '['.
    tokens = parts[2].split('[')[0].split()
    return tokens[0] if tokens else None


def extract_weight(caption):
    """Return the weight that follows the first '>' in ``caption``.

    Captions look like ``F/25/5'4" [200lbs > 150lbs = 50lbs]``; the value
    between '>' and the following '=' is returned.

    Args:
        caption: Post title to parse.

    Returns:
        The weight as an int, or None when ``caption`` is not a string, has
        no '>', or the first word after '>' is not a whole number (optionally
        suffixed with ``lb``/``lbs``).
    """
    if not isinstance(caption, str):
        return None
    start_index = caption.find('>')
    if start_index == -1:
        return None
    # Only an '=' located after the '>' can end the weight; an earlier one
    # (e.g. "SW=200 > 150") must not produce an empty slice.
    end_index = caption.find('=', start_index + 1)
    if end_index == -1:
        weight_str = caption[start_index + 1:]
    else:
        weight_str = caption[start_index + 1:end_index]
    tokens = weight_str.split()
    if not tokens:
        return None
    # Take the first word after '>', dropping a closing bracket or trailing
    # punctuation that is glued to the number.
    token = tokens[0].rstrip('])}.,;:').lower()
    for unit in ('lbs', 'lb'):
        if token.endswith(unit):
            token = token[:-len(unit)]
            break
    return int(token) if token.isdecimal() else None


# Derive one new column per extractor from the 'Caption' column
caption_series = df['Caption']
for column_name, extractor in (
    ('Gender', extract_gender),
    ('Age', extract_age),
    ('Height', extract_height),
    ('Weight', extract_weight),
):
    df[column_name] = caption_series.apply(extractor)

# Write the enriched table to a new CSV file
df.to_csv('new_image_data.csv', index=False)

print("CSV file created successfully!")


## Calculate BMI

In [None]:
import pandas as pd

# Load the CSV that holds the extracted 'Gender', 'Age', 'Height' and 'Weight' columns
df = pd.read_csv('new_image_data.csv')

def convert_height_to_meters(height):
    """Convert a feet-and-inches height such as ``5'4"`` to meters.

    Args:
        height: Height text using ``'`` (or a curly single quote) after the
            feet, optionally followed by inches and a double-quote mark.

    Returns:
        The height in meters as a float, or None when ``height`` is not a
        string, contains no feet mark, or the feet part is not a number.
        Inches that cannot be parsed are treated as 0.
    """
    if not isinstance(height, str):
        # pandas represents empty CSV cells as NaN (a float), not None.
        return None
    # Normalise typographic quotes so straight and curly marks share one path.
    normalized = (
        height.replace("’", "'")
        .replace("‘", "'")
        .replace("”", '"')
        .replace("“", '"')
    )
    if "'" not in normalized:
        return None
    feet_str, _, remainder = normalized.partition("'")
    try:
        feet = float(feet_str)
    except ValueError:
        return None
    # Inches are whatever sits between the feet mark and the next quote mark.
    inches_str = remainder.split("'")[0].replace('"', '').strip()
    try:
        inches = float(inches_str) if inches_str else 0.0
    except ValueError:
        inches = 0.0
    total_inches = feet * 12 + inches
    return total_inches * 0.0254  # Convert inches to meters


def calculate_bmi(height, weight):
    """Compute the body-mass index from a feet/inches height and a weight in lbs.

    Args:
        height: Height text accepted by ``convert_height_to_meters``.
        weight: Weight in pounds.

    Returns:
        BMI in kg/m^2, or None when either input is missing or the height
        cannot be converted.
    """
    # Values read back from CSV are NaN (not None) when missing, so an
    # "is not None" test alone lets them through to the string handling.
    if not isinstance(height, str) or pd.isna(weight):
        return None
    height_m = convert_height_to_meters(height)
    if not height_m:
        return None
    weight_kg = weight * 0.453592  # Convert weight from lbs to kg
    return weight_kg / (height_m ** 2)

def _bmi_from_row(row):
    """Return the BMI for one row, computed from its 'Height' and 'Weight'."""
    return calculate_bmi(row['Height'], row['Weight'])


# Add the 'BMI' column by evaluating every row
df['BMI'] = df.apply(_bmi_from_row, axis=1)

# Overwrite the same CSV file with the extended table
df.to_csv('new_image_data.csv', index=False)

print("CSV file updated with BMI successfully!")


## Classify BMI into categories

In [None]:
def classify_bmi_with_gender(bmi, gender):
    """Map a BMI value to its weight category for rows tagged 'M' or 'F'.

    The same cut-offs apply to both genders: below 18.5 is 'Underweight',
    18.5 to below 25 is 'Normal weight', 25 to below 30 is 'Overweight',
    and 30 or above is 'Obese'.

    Args:
        bmi: Body-mass index in kg/m^2; may be None or NaN when unknown.
        gender: 'M' or 'F' (case and surrounding whitespace are ignored).

    Returns:
        The category name, or None when the BMI is missing or the gender is
        not 'M'/'F'.
    """
    import math

    if bmi is None or not isinstance(gender, str):
        return None
    # NaN compares False against every threshold and would otherwise fall
    # through to the last category.
    if math.isnan(bmi):
        return None
    if gender.strip().upper() not in ('M', 'F'):
        return None
    if bmi < 18.5:
        return 'Underweight'
    if bmi < 25:
        return 'Normal weight'
    if bmi < 30:
        return 'Overweight'
    return 'Obese'

def _classification_from_row(row):
    """Return the BMI category for one row from its 'BMI' and 'Gender'."""
    return classify_bmi_with_gender(row['BMI'], row['Gender'])


# Add the 'BMI Classification with Gender' column by evaluating every row
df['BMI Classification with Gender'] = df.apply(_classification_from_row, axis=1)

# Overwrite the same CSV file with the extended table
df.to_csv('new_image_data.csv', index=False)

print("CSV file updated with BMI classification successfully!")

