<a href="https://colab.research.google.com/github/ashish26meshram/Assigment-5/blob/main/scrap_data_code5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import re
import pandas as pd

def extract_person_name(html_content):
    """Return the first person-name candidate found in ``html_content``.

    The markup shapes are tried in priority order: an ``<h1>`` heading, a
    ``<span>`` whose class starts with ``me-2``, then a ``<font>`` element
    carrying the ``vertical-align: inherit;`` style.

    Returns:
        The captured text with surrounding whitespace removed, or ``''``
        when none of the shapes occurs in the markup.
    """
    candidate_regexes = (
        re.compile(r'<h1[^>]*>\s*([^<]+)\s*</h1>', re.IGNORECASE),
        re.compile(r'<span class="me-2[^>]*>\s*([^<]+)\s*</span>', re.IGNORECASE),
        re.compile(r'<font style="vertical-align: inherit;">\s*([^<]+)\s*</font>', re.IGNORECASE),
    )

    # Pattern order is the priority order, not document order.
    hits = (regex.search(html_content) for regex in candidate_regexes)
    first_hit = next((hit for hit in hits if hit), None)
    if first_hit is None:
        return ''
    return first_hit.group(1).strip()

def extract_person_title(html_content):
    """Return the first job-title candidate found in ``html_content``.

    Each regular expression describes one page layout; they are tried from
    top to bottom and the first one that matches anywhere in the markup wins.

    Returns:
        The stripped text of capture group 1, or ``''`` if nothing matched.
    """
    layout_regexes = (
        # Heading text, optionally followed by a nested <span> inside the <h1>.
        re.compile(r'<h1[^>]*>\s*([^<]+)\s*(?:<span class="[^"]*">([^<]+)</span>)?\s*</h1>', re.IGNORECASE),
        re.compile(r'<strong>\s*([^<]+)\s*</strong>', re.IGNORECASE),
        re.compile(r'<p class="lawyer-titles">\s*([^<]+)\s*</p>', re.IGNORECASE),
        re.compile(r'<span class="card--role">\s*([^<]+)\s*</span>', re.IGNORECASE),
        re.compile(r'<p class="grade">\s*([^<]+)\s*</p>', re.IGNORECASE),
        re.compile(r'<h3 class="">\s*([^<]+)\s*</h3>', re.IGNORECASE),
    )

    title = ''
    for regex in layout_regexes:
        found = regex.search(html_content)
        if found is not None:
            title = found.group(1).strip()
            break
    return title

def extract_person_email(html_content):
    """Return the first e-mail address found in ``html_content``.

    Two sources are tried in order: the target of a ``mailto:`` link, then an
    address appearing in the text of a plain ``<p>`` element.

    Returns:
        The stripped address, or ``''`` when no address is found.
    """
    email_patterns = [
        re.compile(r'<a[^>]*href="mailto:([^"]+)"[^>]*>', re.IGNORECASE),
        # The paragraph fallback must contain something shaped like an
        # address; accepting any paragraph text would report arbitrary prose
        # as the person's e-mail.
        re.compile(r'<p>[^<]*?([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,})', re.IGNORECASE),
    ]

    for pattern in email_patterns:
        match = pattern.search(html_content)
        if match:
            return match.group(1).strip()

    return ''

def extract_person_phone(html_content):
    """Return the first phone-number candidate found in ``html_content``.

    Patterns are tried in priority order: a ``tel:`` link target, then several
    layout-specific elements, then free text introduced by ``M`` or ``Phone``
    followed by a ``+``-prefixed number made of at least five digit groups.

    Returns:
        The stripped text of capture group 1 of the first matching pattern,
        or ``''`` when nothing matches.
    """
    phone_patterns = [
        re.compile(r'<a[^>]*href="tel:([^"]+)"[^>]*>', re.IGNORECASE),
        re.compile(r'<p>\s*([^<]+)\s*</p>', re.IGNORECASE),
        re.compile(r'<div[^>]*>\s*([^<]+)\s*</div>', re.IGNORECASE),
        re.compile(r'<font style="vertical-align: inherit;">\s*Tel:\s*([^<]+)\s*</font>', re.IGNORECASE),
        re.compile(r'<span class="desktop">\s*([^<]+)\s*</span>', re.IGNORECASE),
        # Every pattern must expose the number as group 1, because the loop
        # below reads group(1). The '+' is matched without \b around it: a
        # word boundary never exists between whitespace and '+', so a
        # boundary-wrapped '+' cannot match text such as "Phone +49 ...".
        re.compile(r'\bM\b.*?(\+.*\b\d{2,}\b.*\b\d{2,}\b.*\b\d{2,}\b.*\b\d{2,}\b.*\b\d{2,}\b)', re.IGNORECASE),
        re.compile(r'\bPhone\b.*?(\+.*\b\d{2,}\b.*\b\d{2,}\b.*\b\d{2,}\b.*\b\d{2,}\b.*\b\d{2,}\b)', re.IGNORECASE),
    ]

    for pattern in phone_patterns:
        match = pattern.search(html_content)
        if match:
            return match.group(1).strip()

    return ''

def extract_person_city(html_content):
    """Return the first city/location candidate found in ``html_content``.

    The layouts checked, in priority order, are: a ``<span>`` with class
    ``icon location``, the first ``<b>`` after an ``info_person_content``
    ``<div>``, an ``<a>`` whose class starts with
    ``text-decoration-underline-hover``, and finally any plain ``<p>``.

    Returns:
        The stripped capture of the first layout that matches, else ``''``.
    """
    location_regexes = (
        re.compile(r'<span class="icon location">([^<]+)</span>', re.IGNORECASE),
        re.compile(r'<div class="info_person_content">.*?<b>([^<]+)</b>', re.IGNORECASE),
        re.compile(r'<a[^>]*class="text-decoration-underline-hover[^"]*"[^>]*>\s*([^<]+)\s*</a>', re.IGNORECASE),
        re.compile(r'<p>\s*([^<]+)\s*</p>', re.IGNORECASE),
    )

    found = None
    for regex in location_regexes:
        found = regex.search(html_content)
        if found:
            break

    return found.group(1).strip() if found else ''

def extract_practice_area(html_content):
    """Collect the text that follows known section keywords in ``html_content``.

    A match starts at one of the section keywords (for example
    ``Areas of focus`` or ``Experience``), skips ahead to the next ``>``, and
    captures the text up to the following ``<``.

    Returns:
        A list with the stripped capture of every match, in the order the
        matches occur; empty when no keyword is found.
    """
    section_regexes = (
        re.compile(r'\b(?:Areas of focus|Key Activities|Technical Expertise|Technical focus|Experience|Overview|Detailed profile|Legal expertises|Focus of legal work|Training|Technical areas)\b[^>]*>\s*([^<]+)\s*<', re.IGNORECASE),
    )

    return [
        hit.group(1).strip()
        for regex in section_regexes
        for hit in regex.finditer(html_content)
    ]

def extract_legal_activities(html_content):
    """Flag which legal-activity categories are mentioned in ``html_content``.

    Each category owns a list of keywords; the category counts as present when
    any of its keywords occurs in the content as a case-insensitive substring.

    Returns:
        A dict with the keys ``'Prosecution'``, ``'Litigation'`` and
        ``'Licensing'``. Each value is the category name when it was found
        and ``'Not Found'`` otherwise.
    """
    keywords_by_category = {
        'Prosecution': ['Prosecution', 'Application Perpetration', 'Drafting', 'Filing', 'patent Filing',
                        'Negotiating', 'Patentability Prosecutions and registration', 'Protection',
                        'Patent Searches', 'Novelty (Patentability)', 'Freedom To Operate (FTO)',
                        'Non-Infringement', 'Validity', 'patent analyses', 'Patent Application Proceeding',
                        'Patent counselling', 'patent analysing'],
        'Litigation': ['Litigation', 'Infringement', 'infringement cases', 'enforcement'],
        'Licensing': ['licensing', 'licenses', 'License'],
    }

    haystack = html_content.lower()

    result = {}
    for category, terms in keywords_by_category.items():
        mentioned = any(term.lower() in haystack for term in terms)
        result[category] = category if mentioned else 'Not Found'
    return result

def _collect_markup(soup, tag_names):
    """Join, with spaces, the markup of the ``tag_names`` elements selected with ``text=True``."""
    return ' '.join(str(element) for element in soup.find_all(tag_names, text=True))


def scrape_person_data(url, timeout=30):
    """Download one profile page and extract a record of person details.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds handed to ``requests.get``. Without a timeout a
            stalled server would block the whole scraping run; on expiry the
            existing ``Timeout`` handler below reports the failure.

    Returns:
        A dict with the keys ``Url_link``, ``Persone_name``,
        ``persone_Title``, ``persone_Email_id``, ``IP``, ``Patent``,
        ``Treadmakr``, ``Prosecution``, ``Litigation``, ``Licensing``,
        ``persone phone number`` and ``Persone_city``. An empty dict is
        returned when the request fails; the error is printed, not raised.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Check for HTTP errors
    except requests.exceptions.HTTPError as errh:
        print(f"HTTP Error: {errh}")
        return {}
    except requests.exceptions.ConnectionError as errc:
        print(f"Error Connecting: {errc}")
        return {}
    except requests.exceptions.Timeout as errt:
        print(f"Timeout Error: {errt}")
        return {}
    except requests.exceptions.RequestException as err:
        print(f"Error: {err}")
        return {}

    soup = BeautifulSoup(response.text, 'html.parser')

    # Each extractor is fed only the markup of the tag types listed for it.
    person_name = extract_person_name(_collect_markup(soup, ['h1', 'span', 'font']))
    person_title = extract_person_title(_collect_markup(soup, ['h1', 'strong', 'p', 'span', 'h3']))
    person_email = extract_person_email(_collect_markup(soup, ['a', 'p']))
    person_phone = extract_person_phone(_collect_markup(soup, ['a', 'p', 'div', 'font', 'span']))
    person_city = extract_person_city(_collect_markup(soup, ['span', 'div', 'a', 'p']))

    # Practice areas and legal activities read the same tag set, so the
    # markup is collected once and shared.
    section_content = _collect_markup(soup, ['h2', 'h3', 'h1', 'p', 'font'])
    practice_areas = extract_practice_area(section_content)
    legal_activities = extract_legal_activities(section_content)

    # Keyword flags are case-insensitive substring checks on the joined
    # practice areas, so 'ip' is also found inside longer words.
    areas_text = ' '.join(practice_areas).lower()
    ip_found = 'ip' in areas_text
    patent_found = 'patent' in areas_text
    trademark_found = 'trademark' in areas_text

    return {
        'Url_link': url,  # Link column
        'Persone_name': person_name,
        'persone_Title': person_title,
        'persone_Email_id': person_email,
        'IP': 'IP' if ip_found else 'Not Found',
        'Patent': 'Patent' if patent_found else 'Not Found',
        'Treadmakr': 'Trademark' if trademark_found else 'Not Found',
        'Prosecution': legal_activities['Prosecution'],
        'Litigation': legal_activities['Litigation'],
        'Licensing': legal_activities['Licensing'],
        'persone phone number': person_phone,
        'Persone_city': person_city
    }

def scrape_multiple_links(urls):
    """Scrape every URL in ``urls`` and return the non-empty records.

    URLs are processed in the given order through ``scrape_person_data``;
    results that are falsy (such as the empty dict it returns on a failed
    request) are left out.

    Returns:
        A list of the remaining records, in input order.
    """
    records = (scrape_person_data(link) for link in urls)
    return [record for record in records if record]

def save_to_csv(data_list, csv_file_path):
    """Write the scraped records to ``csv_file_path`` as UTF-8 CSV.

    When ``data_list`` is empty, a notice is printed and no file is opened.
    Otherwise a header row is written, followed by one row per record with
    the values in the fixed column order below. A record missing one of the
    columns raises ``KeyError``.
    """
    if not data_list:
        print("No data to save.")
        return

    columns = ['Url_link', 'Persone_name', 'persone_Title', 'persone_Email_id', 'IP', 'Patent', 'Treadmakr', 'Prosecution', 'Litigation', 'Licensing', 'persone phone number', 'Persone_city']

    with open(csv_file_path, 'w', newline='', encoding='utf-8') as handle:
        writer = csv.writer(handle)
        writer.writerow(columns)
        writer.writerows([record[name] for name in columns] for record in data_list)

def main():
    """Scrape the example profile URLs and save the records to ``output_data.csv``."""
    # Example URLs
    profile_urls = [
        'https://example.com/person1',
        'https://example.com/person2',
        'https://example.com/person3',
        # Add more URLs as needed
    ]

    save_to_csv(scrape_multiple_links(profile_urls), 'output_data.csv')

# Run the scraper only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()


In [None]:
import csv
import requests
from bs4 import BeautifulSoup
import re

def extract_person_name(html_content):
    """Return the inner markup of the first name-bearing element.

    Checked in priority order: an ``<h1>``, a ``<span>`` whose class starts
    with ``me-2``, and a ``<font>`` whose style starts with
    ``vertical-align: inherit;``. The capture is everything up to the first
    matching closing tag, so nested tags are returned as-is.

    Returns:
        The stripped capture, or ``'Not Found'`` when no element matches.
    """
    name_regexes = (
        re.compile(r'<h1[^>]*>(.*?)</h1>', re.IGNORECASE),
        re.compile(r'<span[^>]*class=["\']?me-2[^>]*>(.*?)</span>', re.IGNORECASE),
        re.compile(r'<font[^>]*style=["\']?vertical-align: inherit;[^>]*>(.*?)</font>', re.IGNORECASE),
    )

    hits = (regex.search(html_content) for regex in name_regexes)
    winner = next((hit for hit in hits if hit), None)
    return winner.group(1).strip() if winner else 'Not Found'

def extract_person_title(html_content):
    """Return the inner markup of the first title-bearing element.

    The candidates are tried in the order listed below; the first pattern
    that matches anywhere in ``html_content`` decides the result.

    Returns:
        The stripped capture, or ``'Not Found'`` when no pattern matches.
    """
    title_regexes = (
        re.compile(r'<h1[^>]*>(.*?)</h1>', re.IGNORECASE),
        re.compile(r'<strong[^>]*>(.*?)</strong>', re.IGNORECASE),
        re.compile(r'<p[^>]*class=["\']?card--role[^>]*>(.*?)</p>', re.IGNORECASE),
        re.compile(r'<span[^>]*class=["\']?styles__type__position[^>]*>(.*?)</span>', re.IGNORECASE),
        re.compile(r'<h3[^>]*class=["\']?page-attorney-headline[^>]*>(.*?)</h3>', re.IGNORECASE),
    )

    title = 'Not Found'
    for regex in title_regexes:
        found = regex.search(html_content)
        if found is not None:
            title = found.group(1).strip()
            break
    return title

def extract_person_email(html_content):
    """Return the first e-mail candidate found in ``html_content``.

    Checked in priority order: the target of a ``mailto:`` link, the text
    after the colon in a paragraph mentioning "email"/"e-mail", and the
    content of a ``desktop email--obfuscate`` ``<span>``.

    Returns:
        The stripped capture, or ``'Not Found'`` when no pattern matches.
    """
    patterns = [
        # The address is captured greedily up to the closing quote, bracket
        # or whitespace. A lazy ``(.*?)`` followed only by optional tokens
        # would always settle on the empty string.
        re.compile(r'<a[^>]*href=["\']?mailto:([^"\'>\s]+)', re.IGNORECASE),
        re.compile(r'<p[^>]*>[^<]*[Ee]-?mail[^<]*:([^<]*)</p>', re.IGNORECASE),
        re.compile(r'<span[^>]*class=["\']?desktop email--obfuscate[^>]*>(.*?)</span>', re.IGNORECASE)
    ]

    for pattern in patterns:
        match = pattern.search(html_content)
        if match:
            return match.group(1).strip()

    return 'Not Found'

def extract_person_phone(html_content):
    """Return the first phone-number candidate found in ``html_content``.

    Checked in priority order: the target of a ``tel:`` link, the text after
    the colon in a paragraph starting with "Tel", the content of a
    ``left_assign`` ``<div>``, and finally a generic digit sequence with
    optional ``-``, ``.`` or ``●`` separators.

    Returns:
        The stripped capture, or ``'Not Found'`` when no pattern matches.
    """
    patterns = [
        # Captured greedily up to the closing quote or bracket; a lazy
        # ``(.*?)`` followed only by optional tokens always matches ''.
        re.compile(r'<a[^>]*href=["\']?tel:([^"\'>]+)', re.IGNORECASE),
        re.compile(r'<p[^>]*>Tel[^<]*:([^<]*)</p>', re.IGNORECASE),
        re.compile(r'<div[^>]*class=["\']?left_assign[^>]*>(.*?)</div>', re.IGNORECASE),
        # The whole number is wrapped in a capture group because the loop
        # below reads group(1); without a group that lookup raises IndexError.
        re.compile(r'\b((?:\+?\d{1,4}[-.●])?(?:\(\d{1,4}\)[-.\●])?\d{1,12}[-.●]?\d{1,12}[-.●]?\d{1,9})\b', re.IGNORECASE)
    ]

    for pattern in patterns:
        match = pattern.search(html_content)
        if match:
            return match.group(1).strip()

    return 'Not Found'

def extract_person_city(html_content):
    """Return the first city/office candidate found in ``html_content``.

    Checked in priority order: an ``icon location`` ``<span>``, a
    ``styles__type__officeName`` link, a paragraph whose text mentions
    "office", and finally one of a few hard-coded place names anywhere in
    the markup.

    Returns:
        The stripped capture, or ``'Not Found'`` when no pattern matches.
    """
    patterns = [
        re.compile(r'<span[^>]*class=["\']?icon location[^>]*>(.*?)</span>', re.IGNORECASE),
        re.compile(r'<a[^>]*class=["\']?styles__type__officeName[^>]*>(.*?)</a>', re.IGNORECASE),
        re.compile(r'<p[^>]*>([^<]*?office[^<]*?)</p>', re.IGNORECASE),
        # The alternation is a capturing group because the loop below reads
        # group(1); a non-capturing group would make that lookup raise
        # IndexError whenever this fallback matched.
        re.compile(r'\b(London|New York|Germany|Australia)\b', re.IGNORECASE)
    ]

    for pattern in patterns:
        match = pattern.search(html_content)
        if match:
            return match.group(1).strip()

    return 'Not Found'

def extract_practice_area(html_content):
    """Collect the inner markup of heading-like elements in ``html_content``.

    Every ``<h2>``, ``<h3>``, ``<h1>``, ``lawyer-titles`` paragraph and
    ``vertical-align: inherit;`` ``<font>`` element contributes one entry.
    Results are grouped by pattern in that order, not by document position.

    Returns:
        A list of stripped captures; empty when nothing matches.
    """
    element_regexes = (
        re.compile(r'<h2[^>]*>(.*?)</h2>', re.IGNORECASE),
        re.compile(r'<h3[^>]*>(.*?)</h3>', re.IGNORECASE),
        re.compile(r'<h1[^>]*>(.*?)</h1>', re.IGNORECASE),
        re.compile(r'<p[^>]*class=["\']?lawyer-titles[^>]*>(.*?)</p>', re.IGNORECASE),
        re.compile(r'<font[^>]*style=["\']?vertical-align: inherit;[^>]*>(.*?)</font>', re.IGNORECASE),
    )

    collected = []
    for regex in element_regexes:
        collected.extend(hit.group(1).strip() for hit in regex.finditer(html_content))
    return collected

def extract_legal_activities(html_content):
    """Report which legal-activity categories ``html_content`` mentions.

    A category is present when at least one of its keywords appears in the
    content as a case-insensitive substring.

    Returns:
        A dict keyed ``'Prosecution'``, ``'Litigation'``, ``'Licensing'``
        whose values are the category name when present, else ``'Not Found'``.
    """
    lowered = html_content.lower()

    def label_if_mentioned(label, keywords):
        # True as soon as any keyword occurs somewhere in the lowered content.
        for keyword in keywords:
            if keyword.lower() in lowered:
                return label
        return 'Not Found'

    return {
        'Prosecution': label_if_mentioned('Prosecution', [
            'Prosecution', 'Application Perpetration', 'Drafting', 'Filing', 'patent Filing',
            'Negotiating', 'Patentability Prosecutions and registration', 'Protection',
            'Patent Searches', 'Novelty (Patentability)', 'Freedom To Operate (FTO)',
            'Non-Infringement', 'Validity', 'patent analyses', 'Patent Application Proceeding',
            'Patent counselling', 'patent analysing',
        ]),
        'Litigation': label_if_mentioned('Litigation', [
            'Litigation', 'Infringement', 'infringement cases', 'enforcement',
        ]),
        'Licensing': label_if_mentioned('Licensing', ['licensing', 'licenses', 'License']),
    }

def scrape_person_data(url, timeout=30):
    """Download one profile page and extract a record of person details.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds handed to ``requests.get``. Without a timeout a
            stalled server would block the whole run; on expiry the
            ``Timeout`` handler below reports the failure.

    Returns:
        A dict with the keys ``Url_link``, ``Persone_name``,
        ``persone_Title``, ``persone_Email_id``, ``IP``, ``Patent``,
        ``Treadmakr``, ``Prosecution``, ``Litigation``, ``Licensing``,
        ``persone phone number`` and ``Persone_city``. An empty dict is
        returned when the request fails; the error is printed, not raised.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Check for HTTP errors
    except requests.exceptions.HTTPError as errh:
        print(f"HTTP Error: {errh}")
        return {}
    except requests.exceptions.ConnectionError as errc:
        print(f"Error Connecting: {errc}")
        return {}
    except requests.exceptions.Timeout as errt:
        print(f"Timeout Error: {errt}")
        return {}
    except requests.exceptions.RequestException as err:
        print(f"Something went wrong: {err}")
        return {}

    # Serialise the parsed document once; every extractor reads the same text.
    page_markup = str(BeautifulSoup(response.content, 'html.parser'))

    # NOTE(review): the IP, Patent and Treadmakr columns all receive the same
    # joined practice-area text - confirm this is intended rather than
    # per-keyword flags.
    practice_areas = ', '.join(extract_practice_area(page_markup))

    return {
        'Url_link': url,
        'Persone_name': extract_person_name(page_markup),
        'persone_Title': extract_person_title(page_markup),
        'persone_Email_id': extract_person_email(page_markup),
        'IP': practice_areas,
        'Patent': practice_areas,
        'Treadmakr': practice_areas,
        **extract_legal_activities(page_markup),
        'persone phone number': extract_person_phone(page_markup),
        'Persone_city': extract_person_city(page_markup)
    }

def scrape_multiple_links_from_csv(csv_file_path):
    """Scrape every URL listed in the ``Link`` column of a CSV file.

    Args:
        csv_file_path: Path to a CSV file whose header contains ``Link``.

    Returns:
        A list with one record per successfully scraped URL, in file order.
        Rows with a blank link and URLs whose scrape returned an empty
        result are skipped, so the output contains no all-empty rows.

    Raises:
        KeyError: If a data row is read and the header has no ``Link`` column.
    """
    data_list = []

    # newline='' lets the csv module handle line endings itself, and
    # utf-8-sig drops a leading byte-order mark that would otherwise become
    # part of the first header name.
    with open(csv_file_path, 'r', newline='', encoding='utf-8-sig') as csv_file:
        reader = csv.DictReader(csv_file)
        for row in reader:
            url = (row['Link'] or '').strip()
            if not url:
                continue
            scraped_data = scrape_person_data(url)
            if scraped_data:
                data_list.append(scraped_data)

    return data_list

def save_to_csv(data_list, output_file):
    """Write the scraped records to ``output_file`` as UTF-8 CSV.

    A header row is always written, followed by one row per dict in
    ``data_list`` with the columns in the fixed order below. Keys missing
    from a record produce empty cells.
    """
    columns = ['Url_link', 'Persone_name', 'persone_Title', 'persone_Email_id',
               'IP', 'Patent', 'Treadmakr', 'Prosecution', 'Litigation',
               'Licensing', 'persone phone number', 'Persone_city']

    with open(output_file, 'w', newline='', encoding='utf-8') as handle:
        writer = csv.DictWriter(handle, fieldnames=columns)
        writer.writeheader()
        writer.writerows(data_list)

def main():
    """Scrape the URLs listed in ``input_links.csv`` and save the records to ``output_data.csv``."""
    source_path = 'input_links.csv'
    target_path = 'output_data.csv'

    save_to_csv(scrape_multiple_links_from_csv(source_path), target_path)

# Run the scraper only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()
