In [1]:
!pip install Zenrows
import pandas as pd
import urllib.parse
from bs4 import BeautifulSoup
from zenrows import ZenRowsClient
import re

# Read the source spreadsheet; each row supplies one mailing address to look up
df = pd.read_excel("./Project EXCEL FAST PEOPLE SEARCH (2) (1) (1).xlsx")


def _as_text(value):
    """Return *value* as a string, mapping empty/NaN cells to ""."""
    # str(NaN) would otherwise put the literal text "nan" into the search URL
    return "" if pd.isna(value) else str(value)


def _build_search_url(street_address, city, state):
    """Return the address-search URL for one mailing address."""
    formatted_street_address = urllib.parse.quote(_as_text(street_address))
    formatted_citystatezip = urllib.parse.quote(f"{_as_text(city)},{_as_text(state)}")
    return (
        "https://www.truepeoplesearch.com/resultaddress"
        f"?streetaddress={formatted_street_address}&citystatezip={formatted_citystatezip}"
    )


# Build every URL first and create the DataFrame once: calling pd.concat inside
# the loop copies the whole frame on each iteration.
# Exactly one URL is produced per row of `df`, in row order, so position i in
# `url_df` always corresponds to position i in `df`.
urls = [
    _build_search_url(street_address, city, state)
    for street_address, city, state in zip(
        df['Mailing Address'], df['Mailing City'], df['Mailing State']
    )
]
url_df = pd.DataFrame({'URL': urls})

# Save the DataFrame to a CSV file
url_df.to_csv("urls.csv", index=False)

import os

# Create a ZenRowsClient instance.
# NOTE(review): this API key is committed in plain text; it should be rotated
# and supplied through the environment. ZENROWS_API_KEY, when set, takes
# precedence over the literal fallback below.
client = ZenRowsClient(
    os.environ.get("ZENROWS_API_KEY", "840911f2101dafeeb3b4c3f882bd083d4673a02e")
)

# Request options shared by every ZenRows call. Defined once at cell level so
# that code reusing `params` does not depend on the loop below having run.
params = {"premium_proxy": "true"}


def _clean_first_name(value):
    """Return the stripped first name, or None when the cell is empty/NaN."""
    if pd.isna(value):
        return None
    cleaned = str(value).strip()
    return cleaned or None


def _filter_by_first_name(df_names, first_name):
    """Return the rows of *df_names* whose 'Name' contains *first_name*.

    The match is case-insensitive and anchored on word boundaries, so "Ann"
    does not match "Joanne".
    """
    pattern = r"\b" + re.escape(first_name.lower()) + r"\b"
    return df_names[df_names['Name'].str.lower().str.contains(pattern, regex=True)]


# Create an empty list to store the matched records
matched_records = []

# Fetch each search URL and keep the results whose name matches the owner
# recorded in the same row of the source spreadsheet (`df`)
for index, row in url_df.iterrows():
    url = row['URL']

    try:
        # Perform the request to fetch the HTML content
        response = client.get(url, params=params)
        response.raise_for_status()  # Check for any HTTP errors

        # Parse the HTML using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # Text of the <div class="h4"> inside each <div class="col-md-8">
        h4_texts = []
        for div_col_md_8 in soup.find_all('div', class_='col-md-8'):
            h4_div = div_col_md_8.find('div', class_='h4')
            if h4_div is not None:
                h4_texts.append(h4_div.get_text().strip())

        # Value of the data-detail-link attribute on every element that has one
        links = [
            element.get('data-detail-link')
            for element in soup.find_all(attrs={'data-detail-link': True})
        ]

        # Names and links are collected independently; if their counts differ,
        # DataFrame construction raises ValueError and the URL is reported and
        # skipped by the handler below rather than being paired up wrongly.
        df_extracted = pd.DataFrame({"Name": h4_texts, "Link": links})

        # url_df holds one URL per row of df in the same order, so a
        # positional lookup returns the matching spreadsheet row
        df_short_data_entry = df.iloc[index]

        # Missing names become None (not a placeholder string) so that the
        # owner-2 fallback below is actually reachable
        owner1_first_name = _clean_first_name(df_short_data_entry['Owner 1 First Name'])
        owner2_first_name = _clean_first_name(df_short_data_entry['Owner 2 First Name'])

        # Filter the extracted DataFrame to match the first name
        if owner1_first_name:
            df_matched_records = _filter_by_first_name(df_extracted, owner1_first_name)
        elif owner2_first_name:
            df_matched_records = _filter_by_first_name(df_extracted, owner2_first_name)
        else:
            print("Not found")
            df_matched_records = pd.DataFrame(columns=["Name", "Link"])  # Create an empty DataFrame

        # Print the matched records to check
        print(f"Extracted Data for URL {index}:\n{df_extracted}")
        print(f"Matched Records for URL {index}:\n{df_matched_records}")

        # Append the matched records to the main list
        matched_records.append(df_matched_records)

    except Exception as e:
        # Best effort: report the failing URL and continue with the next one
        print(f"Error occurred for URL {index}: {e}")

# Concatenate all matched records into the final DataFrame. pd.concat raises
# ValueError on an empty list, which happens when every URL failed.
if matched_records:
    df_final = pd.concat(matched_records, ignore_index=True)
else:
    df_final = pd.DataFrame(columns=["Name", "Link"])

# Print the matched records and the final DataFrame
print("Matched Records:")
print(df_final)

df_final.to_csv("./final_links.csv")

# Rest of the code...
# (Continue with the rest of the program as before)




Extracted Data for URL 0:
                        Name                                Link
0           Kasey M Chambers   /find/person/p2nl490268uuu9r6u08n
1         Gregory L Pleasant   /find/person/p660rr0r48ul494l8rl6
2         Lonnie A Carpenter  /find/person/pxluu4ulr840nllu00rr2
3           Thomas E Shannon   /find/person/p209r28rlu2l86846r86
4          Demarcus Pleasant   /find/person/pn026206642r4rnl84u2
5          Bernice S Simpson  /find/person/px8r2l2ruln922690n202
6                 Owen Carty  /find/person/px22264uun2l42llru8u2
7  Magi Carpenter Carpenters  /find/person/pxu9nru88rnru462r8r82
8             Dennis C Scott    /find/person/p48nnn4r242n84l2r2l
9         Gregory L Pleasant    /find/person/pu298nn8292u0uln8l9
Matched Records for URL 0:
                 Name                               Link
1  Gregory L Pleasant  /find/person/p660rr0r48ul494l8rl6
9  Gregory L Pleasant   /find/person/pu298nn8292u0uln8l9
Extracted Data for URL 1:
                       Name        

KeyboardInterrupt: 

In [None]:
# Load the matched search results (the first CSV column is the saved index)
# and discard the 'Name' column, since only the links are used from here on
final_url = pd.read_csv("./final_links.csv", index_col=0).drop(columns=["Name"])

# Turn each site-relative link into an absolute URL
final_url = final_url.assign(
    Link="https://www.truepeoplesearch.com" + final_url['Link']
)

# Show the result, then persist it for the detail-page scraping step
print(final_url)

final_url.to_csv("./final_urls_ready.csv")



In [None]:
# Read the detail-page URLs from the provided CSV file
url_df = pd.read_csv("./final_urls_ready.csv")

# Placeholder written to any field that cannot be located on a page
NOT_FOUND = "Not found"

# Output columns, in the order they appear in the final spreadsheet
COLUMNS = [
    "Name", "Age", "Address", "City", "State", "Zip Code",
    "Home Specs", "Residential Area", "Living Time Period",
    "Phone1", "Phone1 Type", "Phone2", "Phone2 Type", "Phone3", "Phone3 Type",
]

# Request options for ZenRows. Defined here so this cell does not rely on a
# `params` variable left behind by an earlier cell's loop.
# `client` is still expected to exist from the earlier cell.
params = {"premium_proxy": "true"}


def _text_at(elements, position):
    """Return the stripped text of elements[position], or "Not found" if absent."""
    if len(elements) > position:
        return elements[position].get_text(strip=True)
    return NOT_FOUND


def _itemprop_text(soup, itemprop):
    """Return the text of the first <span itemprop=...>, or "Not found"."""
    element = soup.find('span', itemprop=itemprop)
    return element.get_text(strip=True) if element is not None else NOT_FOUND


def _extract_title_and_ages(soup):
    """Return (title, span texts) from the header card of the second content container.

    Any missing level of the nested structure yields ("Not found", []) instead
    of raising AttributeError on a None intermediate result.
    """
    containers = soup.find_all(class_="content-container container-fluid row pl-0 pr-0")
    if len(containers) < 2:
        return NOT_FOUND, []

    node = containers[1]
    for css_class in ("content-center", "card card-body shadow-form pt-2", "row pl-md-2", "col"):
        node = node.find(class_=css_class)
        if node is None:
            return NOT_FOUND, []

    heading = node.find("h1")
    title = heading.get_text(strip=True) if heading is not None else NOT_FOUND
    age_text_list = [span.get_text(strip=True) for span in node.find_all("span")]
    return title, age_text_list


def _parse_person_page(soup):
    """Return one output row (a dict keyed by COLUMNS) extracted from *soup*."""
    title, age_text_list = _extract_title_and_ages(soup)

    # Each selector is queried once; the individual fields are picked by position
    detail_spans = soup.find_all('span', class_="dt-sb")
    phone_spans = soup.find_all('span', itemprop='telephone')
    phone_type_spans = soup.find_all('span', class_="smaller")

    return {
        "Name": title,
        "Age": ", ".join(age_text_list),
        "Address": _itemprop_text(soup, 'streetAddress'),
        "City": _itemprop_text(soup, 'addressLocality'),
        "State": _itemprop_text(soup, 'addressRegion'),
        "Zip Code": _itemprop_text(soup, 'postalCode'),
        "Home Specs": _text_at(detail_spans, 0),
        "Residential Area": _text_at(detail_spans, 1),
        "Living Time Period": _text_at(detail_spans, 2),
        "Phone1": _text_at(phone_spans, 0),
        "Phone1 Type": _text_at(phone_type_spans, 0),
        "Phone2": _text_at(phone_spans, 1),
        "Phone2 Type": _text_at(phone_type_spans, 1),
        "Phone3": _text_at(phone_spans, 2),
        "Phone3 Type": _text_at(phone_type_spans, 2),
    }


# Iterate over the URLs in the DataFrame, producing one output row per URL
records = []
for url in url_df['Link']:
    try:
        # Make a request using ZenRowsClient
        html = client.get(url, params=params).text
    except Exception as e:
        # Best effort: a failed request must not discard the rows already
        # scraped. Parsing an empty document yields an all-"Not found" row,
        # which keeps the output aligned with url_df.
        print(f"Error occurred for URL {url}: {e}")
        html = ""

    # Parse the HTML content using BeautifulSoup
    records.append(_parse_person_page(BeautifulSoup(html, 'html.parser')))

# Create the DataFrame; passing `columns` fixes the column order and keeps the
# headers even when there were no URLs to process
df_finalized = pd.DataFrame(records, columns=COLUMNS)

df_finalized.to_excel("./Ready data file.xlsx")

# Last expression of the cell, so the notebook displays the DataFrame
df_finalized