In [1]:
import pandas as pd
import urllib.parse
from bs4 import BeautifulSoup
from zenrows import ZenRowsClient
import re

# Read the DataFrame from the provided Excel file
df = pd.read_excel("./Project EXCEL FAST PEOPLE SEARCH (2).xlsx")

# Create an empty DataFrame to store the URLs
url_df = pd.DataFrame(columns=['URL'])

# Iterate over the rows of the DataFrame
for index, row in df.iterrows():
    # Get the address components from the DataFrame
    street_address = row['Mailing Address']
    city = row['Mailing City']
    state = row['Mailing State']
    
    # Format the address components for the URL
    formatted_street_address = urllib.parse.quote(street_address)
    formatted_citystatezip = urllib.parse.quote(f"{city},{state}")
    
    # Construct the URL
    url = f"https://www.truepeoplesearch.com/resultaddress?streetaddress={formatted_street_address}&citystatezip={formatted_citystatezip}"
    
    # Create a temporary DataFrame for the URL
    temp_df = pd.DataFrame({'URL': [url]})
    
    # Concatenate the temporary DataFrame with the main DataFrame
    url_df = pd.concat([url_df, temp_df], ignore_index=True)

# Save the DataFrame to a CSV file
url_df.to_csv("urls.csv", index=False)

# Create a ZenRowsClient instance
client = ZenRowsClient("840911f2101dafeeb3b4c3f882bd083d4673a02e")

# Create an empty list to store the matched records
matched_records = []

# Iterate over the URLs in 'url_df' and compare with the corresponding entries in the 'short_data (1) (2)' DataFrame
for index, row in url_df.iterrows():
    url = row['URL']
    params = {"premium_proxy": "true"}
    
    try:
        # Perform the request to fetch the HTML content
        response = client.get(url, params=params)
        response.raise_for_status()  # Check for any HTTP errors
        
        # Assume `html` contains the HTML content of the page
        html = response.text
        
        # Parse the HTML using BeautifulSoup
        soup = BeautifulSoup(html, 'html.parser')
        
        # Find all <div> elements with class="col-md-8"
        div_col_md_8_list = soup.find_all('div', class_='col-md-8')
        
        # Extract the text from each <div> element with class="h4" inside the <div class="col-md-8">
        h4_texts = []
        for div_col_md_8 in div_col_md_8_list:
            h4_div = div_col_md_8.find('div', class_='h4')
            if h4_div is not None:
                h4_text = h4_div.get_text().strip()
                h4_texts.append(h4_text)
        
        # Find all elements with the data-detail-link attribute
        elements = soup.find_all(attrs={'data-detail-link': True})
        
        # Extract the href attribute from each element
        links = []
        for element in elements:
            href_link = element.get('data-detail-link')
            links.append(href_link)
        
        # Create a dictionary with the extracted data
        data = {
            "Name": h4_texts,
            "Link": links
        }
        
        # Create the DataFrame for the extracted data
        df_extracted = pd.DataFrame(data)
        
        # Get the corresponding entry from the 'short_data (1) (2)' DataFrame
        df_short_data_entry = df.loc[index]

        # Get the first name from the 'short_data (1) (2)' DataFrame entry and strip whitespaces
        owner1_first_name = df_short_data_entry['Owner 1 First Name']
        if not isinstance(owner1_first_name, float):
            owner1_first_name = owner1_first_name.strip()
        else:
            owner1_first_name = "Not found"
        
        owner2_first_name = df_short_data_entry['Owner 2 First Name']
        if not isinstance(owner2_first_name, float):
            owner2_first_name = owner2_first_name.strip()
        else:
            owner2_first_name = "Not found"

        # Filter the extracted DataFrame to match the first name
        if owner1_first_name and not isinstance(owner1_first_name, float):
            # Filter using regular expression to match the first name
            regex = re.compile(r"\b" + re.escape(owner1_first_name.lower()) + r"\b")
            df_matched_records = df_extracted[df_extracted['Name'].str.lower().str.contains(regex)]
        elif owner2_first_name:
            # Filter using regular expression to match the first name
            regex = re.compile(r"\b" + re.escape(owner2_first_name.lower()) + r"\b")
            df_matched_records = df_extracted[df_extracted['Name'].str.lower().str.contains(regex)]
        else:
            print("Not found")
            df_matched_records = pd.DataFrame(columns=["Name", "Link"])  # Create an empty DataFrame

        # Print the matched records to check
        print(f"Extracted Data for URL {index}:\n{df_extracted}")
        print(f"Matched Records for URL {index}:\n{df_matched_records}")

        # Append the matched records to the main list
        matched_records.append(df_matched_records)
        
    except Exception as e:
        # If there's an error with the current URL, handle it here
        # You can print an error message or log the error for further analysis
        print(f"Error occurred for URL {index}: {e}")
        # You can choose to skip this URL and continue with the next one, or take any other appropriate action

# Concatenate all matched records into the final DataFrame
df_final = pd.concat(matched_records, ignore_index=True)

# Print the matched records and the final DataFrame
print("Matched Records:")
print(df_final)

df_final.to_csv("./final_links.csv")

# Rest of the code...
# (Continue with the rest of the program as before)


Extracted Data for URL 0:
                  Name                                Link
0   Bonnie J McCartney   /find/person/pnl6469849ru6l04n00u
1   Beverly J Johnston  /find/person/px6u2n8rl0l022n026042
2         Levi G Burns  /find/person/pxlu244u88ru4n68n48ul
3      Mark Levi Baird   /find/person/pnu9uuu69244ull6u440
4  Phillip D McCartney   /find/person/p69rn0r042u42uru8096
5  Richard P McCartney   /find/person/pxr688uln6rr88229424
Matched Records for URL 0:
Empty DataFrame
Columns: [Name, Link]
Index: []
Extracted Data for URL 1:
                  Name                                Link
0    Deirdre F Johnson  /find/person/px2490l4unnl8lrr9866n
1         Renee H Munz   /find/person/p2484r24222ll2800r89
2       William J Munz    /find/person/p2269u06266n22llun4
3    David P Cooprider    /find/person/p4ln2nlrl66nr220009
4   Lillie J Cooprider   /find/person/p448n860u242ur0n02r2
5   Francisco R Garcia   /find/person/pn942r4unnrnn90n8nnl
6         William Manz  /find/person/pxlr9048nl

Extracted Data for URL 12:
                  Name                                Link
0     Daniel R Sanchez    /find/person/p224624248u24l486l4
1     Angela Y Sanchez  /find/person/px6nnl9nl9n6n0l99080r
2     Briana A Sanchez  /find/person/pxu8r62ru80u28r292u20
3     Joshua G Sanchez   /find/person/pl088099u9lu0nlr6u48
4         Luis Sanchez   /find/person/pxnuu099lu6n2846l6n0
5    Elaine T Williams  /find/person/pxn0r9884u48094n4r4l2
6         Luis Persaud   /find/person/p46669lur04rrun0n08l
7     Samantha Morales  /find/person/px244u00449u0u9r2969r
8         Rose Morales   /find/person/pr68498904800l4n2u60
9  Luis Thomas Sanchez  /find/person/px488l9982un8un9999r9
Matched Records for URL 12:
                  Name                                Link
4         Luis Sanchez   /find/person/pxnuu099lu6n2846l6n0
6         Luis Persaud   /find/person/p46669lur04rrun0n08l
9  Luis Thomas Sanchez  /find/person/px488l9982un8un9999r9
Extracted Data for URL 13:
                   Name          

Extracted Data for URL 23:
                  Name                                Link
0        Martha M Lott   /find/person/p80460290220828un8l9
1    Sparkman Christle   /find/person/p6096ulu942u6nn69u99
2    Sylvester Lott Jr  /find/person/pxrur64nrr980llnrr404
3       Sylvester Lott  /find/person/px9lr028l68r8n8n424l4
4          Sylvia Lott  /find/person/pxu69r0406l996u0990l2
5       Delia Sparkman   /find/person/p2uuln2nr9n2u680urrl
6          Sylvia Lott   /find/person/p2028r02nrr06r24uu9l
7       Rachel Pittman  /find/person/px8nn9l9ln9629046902l
8        Paul Prescott  /find/person/px89u6r2ul9rnn899lu24
9  Percy P Williams Sr   /find/person/pn6266nl49869uu20ur4
Matched Records for URL 23:
            Name                               Link
0  Martha M Lott  /find/person/p80460290220828un8l9
Extracted Data for URL 24:
                Name                                Link
0  Ruben D Mendez Sr  /find/person/pxur4lrl99n906nlr96l0
1       Julio E Ruiz   /find/person/p66url22nu9l448

Extracted Data for URL 34:
                  Name                                Link
0        James P Conte   /find/person/p88r66n09969u24lllrn
1    Victoria S Beaver    /find/person/p4l08u94242rr2l92u4
2      Brian L Daniels  /find/person/pxlrl8l99688r62r96660
3      Heather V Conte   /find/person/pr0un282lll8ur8nlrun
4     Cynthia B Hammen  /find/person/px200luu49499lr4ll44u
5  Velma L Whittington   /find/person/prl08u6ul2n0n44nnlr6
6      Brian C Daniels  /find/person/px6442422040rnu9ulnn2
7     Brenda J Daniels   /find/person/p84nlnnr099lu029u080
8   Robert O Armstrong  /find/person/px4629l04020nu0l0l9u4
9     Raymond D Daniel  /find/person/px2996u04ur49442r009u
Matched Records for URL 34:
Empty DataFrame
Columns: [Name, Link]
Index: []
Extracted Data for URL 35:
                        Name                                Link
0          Michelle D Melton  /find/person/px88280444l606u4uunr8
1          Shahaan A Bennett    /find/person/p2n2408r499n9002008
2              Louis R Cla

Extracted Data for URL 46:
                              Name                                Link
0                 Ralph J Bienaime   /find/person/pllrlulun6046u800u94
1                Kayla M Bien Aime   /find/person/pnnrll88ll4nnn6n6lu8
2              Francisco Maldonado  /find/person/pxun2ru66n8n2unu98r00
3                  Aime Kayla Bien  /find/person/px8r008u24l4l06n68rl6
4              Ralph J Bienaime Jr    /find/person/p8n44r0n0ur4l984nn0
5              Christopher E Perez  /find/person/pxnrlu92684nu2lr88uuu
6                 Ralph J Bienaime   /find/person/pu89988n6864r2u4l88u
7  Yessenia Sieliciana Ramos-colon  /find/person/pxrn686n20r6n0209r406
8                    Yanira Aviles  /find/person/pxun98n6r88024rl99082
9                     Mercy Alonso  /find/person/px6ln2r00u008286r22nu
Matched Records for URL 46:
                  Name                                Link
2  Francisco Maldonado  /find/person/pxun2ru66n8n2unu98r00
Extracted Data for URL 47:
                 Na

In [2]:
# Read the 'final_url' DataFrame from the CSV file and set the index to be the first column
final_url = pd.read_csv("./final_links.csv", index_col=0)

# Drop the 'Name' column from the DataFrame
final_url.drop("Name", axis=1, inplace=True)

# Add "https://www.truepeoplesearch.com" to each URL
final_url['Link'] = "https://www.truepeoplesearch.com" + final_url['Link']

# Print the updated DataFrame
print(final_url)

final_url.to_csv("./final_urls_ready.csv")



                                                 Link
0   https://www.truepeoplesearch.com/find/person/p...
1   https://www.truepeoplesearch.com/find/person/p...
2   https://www.truepeoplesearch.com/find/person/p...
3   https://www.truepeoplesearch.com/find/person/p...
4   https://www.truepeoplesearch.com/find/person/p...
5   https://www.truepeoplesearch.com/find/person/p...
6   https://www.truepeoplesearch.com/find/person/p...
7   https://www.truepeoplesearch.com/find/person/p...
8   https://www.truepeoplesearch.com/find/person/p...
9   https://www.truepeoplesearch.com/find/person/p...
10  https://www.truepeoplesearch.com/find/person/p...
11  https://www.truepeoplesearch.com/find/person/p...
12  https://www.truepeoplesearch.com/find/person/p...
13  https://www.truepeoplesearch.com/find/person/p...
14  https://www.truepeoplesearch.com/find/person/p...
15  https://www.truepeoplesearch.com/find/person/p...
16  https://www.truepeoplesearch.com/find/person/p...
17  https://www.truepeoplese

In [3]:
# Read the DataFrame from the provided CSV file
url_df = pd.read_csv("./final_urls_ready.csv")

# Initialize empty lists to store the data
titles = []
ages = []
addresses = []  # Corrected list name
cities = []  # Add new list for city
states = []  # Add new list for state
zip_codes = []  # Add new list for zip code
home_specs = []  # Add new list for home specs
residential_areas = []  # Add new list for residential area
living_time_periods = []  # Add new list for living time period
phone_1=[]
phone_1_type=[]
phone_2=[]
phone_2_type=[]
phone_3=[]
phone_3_type=[]

# Iterate over the URLs in the DataFrame
for url in url_df['Link']:
    # Make a request using ZenRowsClient
    response = client.get(url, params=params)

    # Parse the HTML content using BeautifulSoup
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')

    elements = soup.find_all(class_="content-container container-fluid row pl-0 pr-0")
    if len(elements) >= 2:
        element2 = elements[1]
        element_depth = element2.find(class_="content-center")
        element_more_depth = element_depth.find(class_="card card-body shadow-form pt-2")
        element_more_more_depth = element_more_depth.find(class_="row pl-md-2")
        element_more_more_more_depth = element_more_more_depth.find(class_="col")

        title = element_more_more_more_depth.find("h1").get_text(strip=True) if element_more_more_more_depth else "Not found"
        age_text_list = [span.get_text(strip=True) for span in element_more_more_more_depth.find_all("span")] if element_more_more_more_depth else []
    else:
        title = "Not found"
        age_text_list = []

    address_element = soup.find('span', itemprop='streetAddress')

    # Get the text of the element (address) or set to "Not found" if element is not found
    address = address_element.get_text(strip=True) if address_element else "Not found"
    # Additional elements for city, state, zip code, home specs, area, living time period, and phone numbers
    city_element = soup.find('span', itemprop='addressLocality')
    city = city_element.get_text(strip=True) if city_element else "Not found"
    cities.append(city)

    state_element = soup.find('span', itemprop='addressRegion')
    state = state_element.get_text(strip=True) if state_element else "Not found"
    states.append(state)

    zip_element = soup.find('span', itemprop='postalCode')
    zip_code = zip_element.get_text(strip=True) if zip_element else "Not found"
    zip_codes.append(zip_code)
    
    home_specs_element = soup.find('span', class_="dt-sb")
    home_specs_value = home_specs_element.get_text(strip=True) if home_specs_element else "Not found"
    home_specs.append(home_specs_value)

    area_elements = soup.find_all('span', class_="dt-sb")
    residential_area = area_elements[1].get_text(strip=True) if len(area_elements) >= 2 else "Not found"
    residential_areas.append(residential_area)

    living_time_period = area_elements[2].get_text(strip=True) if len(area_elements) >= 3 else "Not found"
    living_time_periods.append(living_time_period)

    phone1_element = soup.find('span', itemprop='telephone')
    phone1 = phone1_element.get_text(strip=True) if phone1_element else "Not found"
    phone_1.append(phone1)

    phone_type_element = soup.find('span', class_="smaller")
    phone_type = phone_type_element.get_text(strip=True) if phone_type_element else "Not found"
    phone_1_type.append(phone_type)

    phone2_element = soup.find_all('span', itemprop='telephone')
    phone2 = phone2_element[1].get_text(strip=True) if len(phone2_element) >= 2 else "Not found"
    phone_2.append(phone2)

    phone2_type_element = soup.find_all('span', class_='smaller')
    phone2_type = phone2_type_element[1].get_text(strip=True) if len(phone2_type_element) >= 2 else "Not found"
    phone_2_type.append(phone2_type)

    phone3_element = soup.find_all('span', itemprop='telephone')
    phone3 = phone3_element[2].get_text(strip=True) if len(phone3_element) >= 3 else "Not found"
    phone_3.append(phone3)

    phone3_type_element = soup.find_all('span', class_='smaller')
    phone3_type = phone3_type_element[2].get_text(strip=True) if len(phone3_type_element) >= 3 else "Not found"
    phone_3_type.append(phone3_type)
    # Append the data to the lists
    addresses.append(address)  # Corrected list name
    titles.append(title)
    ages.append(", ".join(age_text_list))

# Create the DataFrame
df_finalized = pd.DataFrame({"Name": titles, "Age": ages, "Address": addresses,"City": cities,
    "State": states,
    "Zip Code": zip_codes,"Home Specs": home_specs,
    "Residential Area": residential_areas,
    "Living Time Period": living_time_periods,
                  "Phone1":phone_1,
                  "Phone1 Type":phone_1_type,
                  "Phone2":phone_2,
                  "Phone2 Type":phone_2_type,
                  "Phone3":phone_3,
                  "Phone3 Type":phone_3_type,})  # Added "Address" column

# Print the DataFrame

df_finalized.to_excel("./Ready data file.xlsx")

df_finalized

Unnamed: 0,Name,Age,Address,City,State,Zip Code,Home Specs,Residential Area,Living Time Period,Phone1,Phone1 Type,Phone2,Phone2 Type,Phone3,Phone3 Type
0,Renee H Munz,Age 62 (Oct 1960),9055 SW 102nd Pl,Ocala,FL,34481,"2 Bed | 2 Bath | 1,328 Sq Ft | Built 1983",Marion County,(Apr 2011 - Jul 2023),(352) 205-0927,Wireless,(352) 414-4295,Landline,(352) 347-2757,Landline
1,William E Collado,Age 75 (Jul 1947),2534 10th Ave N #114,Lake Worth,FL,33461,Palm Beach County,(Jan 2023 - Jul 2023),Possible Primary Phone,(561) 502-6293,Wireless,(561) 737-4174,Landline,(561) 364-2437,Landline
2,Bernice Wooten,Age Unknown,1665 40th St,West Palm Beach,FL,33407,"3 Bed | 2 Bath | 2,135 Sq Ft | Built 1969",Palm Beach County,(Sep 2005 - Jul 2023),Not found,Not found,Not found,Not found,Not found,Not found
3,Carlton C Jones,Age 46 (Feb 1977),5422 Marcia Pl,West Palm Beach,FL,33407,Palm Beach County,(May 1995 - Jul 2023),Possible Primary Phone,(561) 707-5911,Wireless,(561) 686-0930,Landline,(240) 988-1499,Wireless
4,Emma D Paulk,Age 76 (Apr 1947),1161 W 4th St,Riviera Beach,FL,33404,"3 Bed | 2 Bath | 1,777 Sq Ft | Built 1981",Palm Beach County,(Jan 1978 - Jul 2023),(561) 667-2794,Wireless,(561) 667-2792,Wireless,(561) 615-8241,Landline
5,James L Lacouture,Age 50 (Sep 1972),651 Everest Rd,Venice,FL,34293,"3 Bed | 2 Bath | 1,060 Sq Ft | Built 1977",Sarasota County,(Sep 2006 - Jul 2023),(941) 493-2608,Landline,(941) 716-7163,Wireless,(941) 727-0059,Landline
6,Jerline S Weems,Age 94 (Dec 1928),8325 Endive Ave,Tampa,FL,33619,"2 Bed | 1 Bath | 1,084 Sq Ft | Built 1960",Hillsborough County,(Jun 1975 - Jul 2023),(813) 677-3682,Landline,(813) 671-8231,Landline,(786) 286-1908,Wireless
7,Connie W Seals,Age 76 (Jul 1946),3108 W Ballast Point Blvd,Tampa,FL,33611,"2 Bed | 1 Bath | 1,190 Sq Ft | Built 1960",Hillsborough County,(Jun 1996 - Jul 2023),(813) 805-7765,Landline,(813) 839-8268,Landline,(813) 837-7045,Landline
8,Luis Sanchez,Age 52 (Nov 1970),2709 N Morgan St,Tampa,FL,33602,2 Bed | 1 Bath | 896 Sq Ft | Built 1914,Hillsborough County,(Mar 1991 - Jul 2023),(813) 229-5973,Landline,(813) 888-6544,Landline,(352) 567-3931,Landline
9,Luis Persaud,Age 33 (Sep 1989),919 S Heron Cir,Winter Haven,FL,33884,"3 Bed | 2 Bath | 1,908 Sq Ft | Built 1980",Polk County,(Aug 2009 - Jul 2023),(916) 698-2804,Wireless,(813) 229-5973,Landline,(813) 391-6072,Wireless
