# In [9]: (Jupyter notebook cell 9)
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Step 1: Scrape the job listings and store them unmodified in 'rawdata.csv'.

# URL of the Craigslist page to scrape.
# NOTE: everything after '#' is a URL fragment; fragments are resolved on the
# client side and are not part of the HTTP request sent to the server.
url = 'https://bangalore.craigslist.org/search/jjj#search=1~list~0~0'

# Column order of the raw output. Passing it to the DataFrame explicitly means
# the CSV still gets a header row when no listing is found, so reading
# 'rawdata.csv' back does not fail on an empty file.
RAW_COLUMNS = ['Job Title', 'Location', 'Salary', 'Link']

# Make an HTTP request to the website. The timeout keeps the script from
# hanging forever on an unresponsive server, and raise_for_status() turns an
# HTTP error response (4xx/5xx) into an exception instead of silently parsing
# an error page as if it contained listings.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Find every <li> element carrying the 'cl-static-search-result' class; each
# one is treated as a single job listing below.
job_listings = soup.find_all('li', class_='cl-static-search-result')

# Empty list to store job data (one dict per listing)
jobs = []

# Loop through each job listing and extract the relevant details
for job in job_listings:
    title = job.find('div', class_='title')
    location = job.find('div', class_='location')
    price = job.find('div', class_='price')
    link = job.find('a', href=True)

    # A listing without a title or a link is unusable, so skip it.
    if not (title and link):
        continue

    # Location and salary are optional; 'N/A' marks a missing value.
    job_data = {
        'Job Title': title.get_text(strip=True),
        'Location': location.get_text(strip=True) if location else 'N/A',
        'Salary': price.get_text(strip=True) if price else 'N/A',
        'Link': link['href'],
    }

    jobs.append(job_data)

# Create a pandas DataFrame from the job data
df_raw = pd.DataFrame(jobs, columns=RAW_COLUMNS)

# Save the raw data to 'rawdata.csv'
df_raw.to_csv('rawdata.csv', index=False)

print("Data scraped and saved to 'rawdata.csv'.")

# Step 2: Data Cleaning


def clean_job_data(raw: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the scraped job table.

    The input frame is expected to have 'Job Title', 'Location' and 'Salary'
    columns; any other columns are carried through unchanged. The input frame
    itself is not modified.

    Cleaning steps, in order:
      1. Drop fully duplicated rows.
      2. Replace missing values with '<Column> Not Provided' placeholders.
      3. Strip '$' and ',' from 'Salary'.
      4. Trim surrounding whitespace from 'Location' and 'Job Title'.
      5. Drop rows whose 'Job Title' is the missing-value placeholder.
      6. Title-case 'Job Title' and 'Location'.
    """
    # 1. Remove duplicates. Work on an explicit copy so the column assignments
    #    below never write into (or warn about) a view of the caller's frame.
    cleaned = raw.drop_duplicates().copy()

    # 2. Handle missing values.
    #    Assign the result back instead of calling fillna(inplace=True) on a
    #    column selection: the in-place form operates on a temporary object
    #    and does not update the frame under pandas Copy-on-Write.
    cleaned['Salary'] = cleaned['Salary'].fillna('Salary Not Provided')
    cleaned['Location'] = cleaned['Location'].fillna('Location Not Provided')
    cleaned['Job Title'] = cleaned['Job Title'].fillna('Job Title Not Provided')

    # 3. Standardize the Salary format (remove '$' and ',').
    #    The pattern is a raw string so '\$' reaches the regex engine as an
    #    escaped dollar sign rather than an invalid Python string escape.
    cleaned['Salary'] = cleaned['Salary'].replace({r'\$': '', ',': ''}, regex=True)

    # 4. Remove leading and trailing spaces from Location and Job Title.
    cleaned['Location'] = cleaned['Location'].str.strip()
    cleaned['Job Title'] = cleaned['Job Title'].str.strip()

    # 5. Remove rows whose 'Job Title' is the placeholder from step 2, i.e.
    #    rows that had no title in the raw data.
    has_title = cleaned['Job Title'] != 'Job Title Not Provided'
    cleaned = cleaned[has_title].copy()

    # 6. Ensure consistent casing (title case).
    cleaned['Job Title'] = cleaned['Job Title'].str.title()
    cleaned['Location'] = cleaned['Location'].str.title()

    return cleaned


# Run the file-based pipeline when executed as a script or notebook cell
# (where __name__ is '__main__'); importing this code elsewhere only defines
# clean_job_data and does not touch the filesystem.
if __name__ == "__main__":
    # Read the raw data from the 'rawdata.csv' file
    df = pd.read_csv('rawdata.csv')

    df = clean_job_data(df)

    # Save the cleaned data to 'cleaned_data.csv'
    df.to_csv('cleaned_data.csv', index=False)

    print("Data cleaned and saved to 'cleaned_data.csv'.")


# Output:
# Data scraped and saved to 'rawdata.csv'.
# Data cleaned and saved to 'cleaned_data.csv'.
