In [4]:
from bs4 import BeautifulSoup
import pandas as pd
import os
import re
import requests

In [29]:
# csv_file_path = '/kaggle/input/job-ad-html/links.csv'
# df = pd.read_csv(csv_file_path)
    
# Define a function to process HTML content
def process_html_content(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    
    job_title_element = soup.find('span', class_='w-100 text-truncate font-weight-bold mb-1')
    company_name_element = soup.find('span', class_='w-100 text-truncate')
    
    location_element = soup.find('div', class_='text-black mb-1', string=re.compile('Location|محل کار'))
    location_value_element = location_element.find_next('div', class_='col-auto px-0 ng-star-inserted')

    city_name = location_value_element.find('span').get_text(strip=True) if location_value_element.find('span') else None
    neighborhood = location_value_element.get_text(strip=True).replace(city_name, '').strip()

    
    if neighborhood == city_name:
        neighborhood = None

    location = {
        'City': city_name,
        'Neighborhood': neighborhood
    }
    
    working_days_element = soup.find('div', class_='text-black mb-1', string=re.compile('Working days and hours|روز و ساعت کاری'))
    working_days = working_days_element.find_next('div').get_text(strip=True) if working_days_element else None
    
    job_type_element = soup.find('div', class_='text-black mb-1', string=re.compile('Contract type|نوع همکاری'))
    job_type = job_type_element.find_next('div').get_text(strip=True) if job_type_element else None
    

    key_indicator_section = soup.find('h1', string=re.compile('شاخص های کلیدی از نظر کارفرما|key Requirements'))
    key_indicators = []

    if key_indicator_section:
        parent_divs = key_indicator_section.find_all_next('div', class_=re.compile('col-12 mb-3 (pl-0|pr-0) ng-star-inserted'))

        for parent_div in parent_divs:
            key_requirement = parent_div.find('span', class_='col ml-2 px-0 word-break')
            if key_requirement:
                key_indicators.append(key_requirement.get_text(strip=True))
            else:
                key_requirement = parent_div.find('div', class_='row col-11 px-0')
                if key_requirement:
                    sub_items = key_requirement.find_all(['span', 'div'], recursive=False)
                    for item in sub_items:
                        key_indicators.append(item.get_text(strip=True))

    

    job_description_section = soup.find('h1', string=re.compile('Job Description|شرح شغل و وظایف'))
    job_description_list = []
    if job_description_section:
        job_description_items = job_description_section.find_next('div', class_='col-12 row text-black px-0 mb-3')
        if job_description_items:
            job_description_element = job_description_items.find('div', class_='col px-0 mr-2')
            if job_description_element:
                paragraphs = job_description_element.find_all('p')
                for p in paragraphs:
                    job_description_list.extend(p.stripped_strings)
        else:
            job_description_element = job_description_section.find_next('p')
            if job_description_element:
                job_description_list.extend(job_description_element.stripped_strings)
    else:
        job_description_list.append(None)  # Append None for rows without Job Description section


    
    condition_section = soup.find('h1', string=re.compile('شرایط احراز شغل|Job Requirements'))
    condition_list = []
    if condition_section:
        condition_items = condition_section.find_next('div', class_='col-12 row px-0')
        for item in condition_items.find_all('div', class_='col-12 row align-items-center justify-content-between px-0 mb-3 ng-star-inserted'):
            title = item.find('div', class_='requirement-title').get_text(strip=True)
            value = item.find('div', class_='requirement-value').get_text(strip=True)
            condition_list.append((title, value))

    
    data = {'Job Title': job_title_element.get_text(strip=True) if job_title_element else None,
            'Company Name': company_name_element.get_text(strip=True) if company_name_element else None,
            'Location': location,
            'Working Days': working_days,
            'Job Type': job_type,
            'Key Indicators': key_indicators,
            'Job Description': job_description_list,
            'Job Conditions': condition_list}
    
    return data

# Directory containing your text files with HTML content
html_files_dir = '/kaggle/input/job-ad-html'

# List to store processed data
data_list = []

# Process each text file in the directory
for filename in os.listdir(html_files_dir):
    if filename.endswith('.txt'):
        file_path = os.path.join(html_files_dir, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            html_content = file.read()
        processed_data = process_html_content(html_content)
        data_list.append(processed_data)
        
# # Process each URL in the DataFrame
# for url in df['URL']:
#     response = requests.get(url)
#     html_content = response.content
#     processed_data = process_html_content(html_content)
#     data_list.append(processed_data)

# Create a DataFrame from the processed data
df = pd.DataFrame(data_list)

# Print the DataFrame
df.head(50)


Unnamed: 0,Job Title,Company Name,Location,Working Days,Job Type,Key Indicators,Job Description,Job Conditions
0,تحلیلگر داده,ارکید فارمد,"{'City': 'تهران', 'Neighborhood': '، ونک'}",شنبه تا چهارشنبه,تمام وقت,"[Python - پیشرفته, T-Sql - پیشرفته]",[خلق ارزش و آگاهی دادن به واحد های مختلف با کم...,"[(سن, 25 - 35 سال), (جنسیت, تفاوتی ندارد), (تح..."
1,کارشناس تحلیل داده و هوش تجاری,صنایع بسته بندی به بند,"{'City': 'تهران', 'Neighborhood': '، عباس آباد...",شنبه تا چهارشنبه ساعت 8:00 تا 17:00,تمام وقت,"[Sql Server - پیشرفته, PowerBI - پیشرفته, SSIS...","[وظایف اصلی, -طراحی انبار داده (Data Warehouse...","[(جنسیت, تفاوتی ندارد), (خدمت سربازی, اتمام خد..."
2,کارشناس آمار,ارکید فارمد,"{'City': 'تهران', 'Neighborhood': '، ونک'}",شنبه تا چهارشنبه,تمام وقت,"[Stata - متوسط, R - متوسط]","[·, کارشناسی ارشد آمار زیستی، ریاضی، بیمه و اق...","[(سن, 25 - 35 سال), (جنسیت, تفاوتی ندارد), (تح..."
3,برنامه نویس هوش مصنوعی,فراهوش,"{'City': 'شیراز', 'Neighborhood': '، آب جوار'}",شنبه تا پنجشنبه ساعت 8 تا 15,تمام وقت یا پاره وقت,[Python - متوسط],"[جای شما در تیم استارتاپی فراهوش خالیه...!, شر...","[(سن, 24 - 40 سال), (جنسیت, ترجیحاً خانم), (نر..."
4,کارشناس هوش تجاری (BI),گروه صنعتی زر,"{'City': 'کرج', 'Neighborhood': ''}",شنبه تا چهارشنبه 8 الی 17,تمام وقت,[PowerBI - متوسط],"[تسلط به نرم افزار هوش تجاری (POWER BI), آشن...","[(سن, 25 - 36 سال), (جنسیت, تفاوتی ندارد), (خد..."
5,Data Analyst,Esmiran International Trading Co,"{'City': 'Tehran', 'Neighborhood': ', Andarzgoo'}",Saturday to Wednesday From 8 to 16:30- Thursda...,Full Time,[4 years experience in similar position],[We are seeking a talented Data Analyst to joi...,"[(Age, 20 - 35 Years Old), (Gender, Men / Wome..."
6,Data Designer and Developer,Hafhashtad,"{'City': 'Tehran', 'Neighborhood': ', Vanak'}",شنبه تا چهارشنبه,Full Time,[2 years experience in similar position],[Our Data team at Haf-hashtad is looking for a...,"[(Age, 25 - 45 Years Old), (Gender, Men / Wome..."
7,کارشناس Power BI,انتخاب الکترونیک,"{'City': 'اصفهان', 'Neighborhood': ''}",شنبه تا چهارشنبه 8 الی 17,تمام وقت,[],[],"[(سن, 23 - 38 سال), (جنسیت, تفاوتی ندارد), (خد..."
8,Senior Data Engineer,Sheypoor,"{'City': 'Tehran', 'Neighborhood': ', Vanak'}",Saturday to Wednesday 9 till 18,Full Time,[3 years experience in similar position],[Sheypoor is seeking an accomplished senior da...,"[(Age, 23 - 33 Years Old), (Gender, Men / Wome..."
9,کارشناس تحلیلگر داده,هیژا دارو,"{'City': 'تهران', 'Neighborhood': '، میرداماد'}",شنبه تا چهارشنبه ساعت 8:00 الی 16:00 و پنجشن...,تمام وقت,"[Microsoft Word - متوسط, Microsoft Excel - متوسط]","[تحصیلات:, مدرک کارشناسی رشته صنایع، آمار و سا...","[(سن, 22 - 30 سال), (جنسیت, تفاوتی ندارد), (تح..."
