# Craigslist Platform Notes

### get_links()

`lansing`
- https://lansing.craigslist.org/search/cta?postal=48933&search_distance=200#search=1~gallery~0~0

- https://lansing.craigslist.org/search/cta?postal=48933&search_distance=200#search=1~gallery~1~0

`UP`
- https://up.craigslist.org/search/cta?postal=49855&search_distance=200

Other useful parameters
- `bundleDuplicates`
- `postedToday` - might be tricky if it is not based on a rolling 24-hour window; for now, crawl everything and filter afterwards
- `purveyor` - capturing in get_data() to save a listing call

- https://lansing.craigslist.org/search/cta?auto_fuel_type=1&auto_fuel_type=2&auto_fuel_type=3&auto_fuel_type=4&bundleDuplicates=1&postal=48933&postedToday=1&purveyor=dealer&search_distance=200#search=1~gallery~0~0

In [1]:
from util import get_soup
from datetime import datetime
import requests
from bs4 import BeautifulSoup
import csv


def get_links(date, city, zip_code, radius, max_page, ev=False):
    """Collect listing links from Craigslist cars+trucks search pages into a CSV.

    Writes ``./data/{date}_{city}_links_p{max_page}.csv`` with a ``url,title``
    header followed by one row per distinct listing URL found.

    Args:
        date: Date string used as the output file-name prefix.
        city: Craigslist subdomain to query (e.g. ``'lansing'``).
        zip_code: Value sent as the ``postal`` search parameter.
        radius: Value sent as the ``search_distance`` search parameter.
        max_page: Number of page indices (``0 .. max_page - 1``) to request.
        ev: When true, ``auto_fuel_type=4`` is added to the query.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    output_file = f'./data/{date}_{city}_links_p{max_page}.csv'
    fuel_filter = 'auto_fuel_type=4&' if ev else ''
    cars_css = 'li[class="cl-static-search-result"] a'
    # URLs already written, so repeated results are not duplicated in the CSV.
    seen_urls = set()

    # newline='' lets the csv module control line endings itself (otherwise
    # blank rows appear on platforms that translate '\n').
    with open(output_file, 'w', newline='') as f:
        wtr = csv.writer(f)
        wtr.writerow(('url', 'title'))

        for page in range(max_page):
            # NOTE(review): the page index sits in the URL fragment
            # ('#search=...'), which is not part of the HTTP request, so every
            # page index may return the same static results - confirm.
            url = (f'https://{city}.craigslist.org/search/cta?{fuel_filter}bundleDuplicates=1'
                   f'&postal={zip_code}&search_distance={radius}#search=1~gallery~{page}~0')

            try:
                # A timeout keeps one stalled connection from hanging the run.
                data_res = requests.get(url, timeout=30)
            except requests.RequestException as err:
                print(f'request failed: {err}')
                continue

            if data_res.status_code != 200:
                print(f'request failed: {data_res.reason}')
                # ADD SAVING FAILED SOUP
                continue

            soup = BeautifulSoup(data_res.content, 'html.parser')
            cars = soup.select(cars_css)
            if not cars:
                print(f'ERROR: css element for car postings {cars_css} is not captured')
                continue

            buffer = []
            for car in cars:
                href = car.get('href')
                if href in seen_urls:
                    continue
                seen_urls.add(href)
                # A result without a title element is kept with an empty title
                # instead of aborting the whole page.
                title = car.select_one('div.title')
                buffer.append((href, title.text if title is not None else ''))

            wtr.writerows(buffer)
            print(f'page {page}: {len(buffer)} links added to {output_file}')

    # Reported once, after the file has actually been closed.
    print(f'finished writing {output_file}')
    


### get_data()

In [2]:
import json
import pandas as pd
from util import get_soup, get_soup_text, append_to_json
import traceback
import re

# Parse a sales link for the data we require
def get_sales_data(link):
    """Scrape a single listing page into a flat dict.

    The dict holds ``post_id``, ``post_datetime``, ``post_expire`` (when the
    robots meta tag carries it), every ``key: value`` attribute span, an
    ``other`` entry joining attribute spans without a key, the contents of
    the ``ld_posting_data`` JSON script (when present), ``seller_type``,
    ``seller_notes`` and ``url``.

    Args:
        link: URL of the listing page.

    Returns:
        dict with the fields described above.

    Raises:
        AttributeError, IndexError: when the page lacks the posting-id
            paragraph or the ``<time>`` element (callers in this notebook
            catch ``Exception`` per link).
    """
    soup = get_soup(link)

    info = {}
    # adding post metadata - id, timestamp, expire time
    id_css = 'div[class="postinginfos"] p[class="postinginfo"]'
    info['post_id'] = soup.select_one(id_css).text.split(': ')[1]
    info['post_datetime'] = soup.time.get('datetime')

    # The robots meta content only sometimes has a '<label>: <value>' form
    # (recorded runs show plain 'noindex'), so look before indexing instead of
    # relying on a bare except whose handler could itself raise.
    robots_meta = soup.head.find('meta', {'name': 'robots'}) if soup.head is not None else None
    robots_content = robots_meta.get('content') if robots_meta is not None else None
    expire_parts = robots_content.split(': ') if robots_content else []
    if len(expire_parts) > 1:
        info['post_expire'] = expire_parts[1]
    else:
        # save page source ---- TO DO
        print(link)
        print(robots_content)

    # get listing attributes
    # presumably get_soup_text returns the text of each matched span - verify in util
    info_list = get_soup_text(soup, 'div[class=mapAndAttrs] p[class=attrgroup] span')
    for item in info_list:
        sep_idx = item.find(':')
        if sep_idx > 0:
            info[item[:sep_idx].strip()] = item[sep_idx + 1:].strip()
        elif 'other' in info:
            # spans without a 'key:' prefix are accumulated, '|'-separated
            info['other'] += f'|{item}'
        else:
            info['other'] = item

    # adding price, listing geo attributes
    json_css = 'script[id="ld_posting_data"]'
    data = soup.select_one(json_css)
    if data is not None:
        info.update(json.loads(data.get_text(strip=True)))

    # adding seller type
    # The None check must be on the element: select_one() returns None when
    # nothing matches, and calling get_text() on that raises.
    purveyor_css = 'li[class="crumb section"] a'
    crumb = soup.select_one(purveyor_css)
    if crumb is not None:
        info['seller_type'] = crumb.get_text().split(' ')[-1]

    # adding seller free-form text and url
    body_css = 'section[id="postingbody"]'
    body = soup.select_one(body_css)
    if body is not None:
        # collapse runs of newlines into single spaces
        info['seller_notes'] = re.sub(r'\n+', ' ', body.get_text()).strip()

    info['url'] = link

    ### LOGIC FOR SAVING FAILED HTML

    return info

## Runs

### US runs

In [3]:
# config

# Search settings per Craigslist subdomain: postal code, number of result
# pages to request, and search radius.
# NOTE(review): the recorded scrape output for 'southbend' lists
# detroit.craigslist.org URLs - confirm the subdomain/zip pairing is intended.
city_dict = {
    'southbend': {'zip': 48226, 'max_page': 1, 'radius': 50},
    'sfbay': {'zip': 94103, 'max_page': 1, 'radius': 50},
}

# Date stamp (YYYY_MM_DD) used in output file names.
date = f'{datetime.today():%Y_%m_%d}'

# EV-only link collection for every configured city.
for city, info in city_dict.items():
    get_links(
        date,
        city,
        zip_code=info['zip'],
        radius=info['radius'],
        max_page=info['max_page'],
        ev=True,
    )

page 0: 15 links added to ./data/2023_09_25_southbend_links_p1.csv
finished writing ./data/2023_09_25_southbend_links_p1.csv
page 0: 284 links added to ./data/2023_09_25_sfbay_links_p1.csv
finished writing ./data/2023_09_25_sfbay_links_p1.csv


In [5]:
# Load the sfbay listing URLs collected on 2023-09-25 (only the 'url' column).
all_item_links = pd.read_csv(
    './data/2023_09_25_sfbay_links_p1.csv',
    usecols=['url'],
)

# Scrape every listing and append its record to the date-stamped JSON file;
# a failing listing is reported (message, traceback, exception) and skipped.
# TODO: maybe check the post date, or only scrape links new since the day before?
for item_link in all_item_links['url']:
    try:
        data_dict = get_sales_data(item_link)
        #print(f'data: {data_dict}')
        append_to_json(f'data/{date}_sfbay_data.json', data_dict)
    except Exception as e:
        print(f'{item_link} failed')
        traceback.print_exc()
        print(e)

print('\ndone!')

https://sfbay.craigslist.org/eby/ctd/d/oakland-2015-nissan-leaf-electric/7668836564.html
noindex
https://sfbay.craigslist.org/eby/ctd/d/albany-2017-fiat-500e-gray-black-loaded/7667537991.html
noindex
https://sfbay.craigslist.org/eby/ctd/d/albany-2013-nissan-leaf-silver-black/7667537423.html
noindex
https://sfbay.craigslist.org/eby/ctd/d/albany-2010-tesla-roadster-sport-30/7667537381.html
noindex
https://sfbay.craigslist.org/eby/ctd/d/albany-2015-ford-focus-electric/7667534217.html
noindex
https://sfbay.craigslist.org/eby/ctd/d/concord-used-2021-ford-mustang-mach/7667304071.html
noindex
https://sfbay.craigslist.org/eby/ctd/d/albany-2017-nissan-leaf-white-black/7667090953.html
noindex
https://sfbay.craigslist.org/sby/ctd/d/san-jose-nissan-leaf-ev-33k-very-low/7666712519.html
noindex
https://sfbay.craigslist.org/eby/ctd/d/oakland-2021-audi-tron-awd-all-wheel/7666619656.html
noindex
https://sfbay.craigslist.org/eby/ctd/d/albany-2016-nissan-leaf-black-black-58k/7665741886.html
noindex

done

In [6]:
# Load the southbend listing URLs collected on 2023-09-25 (only the 'url' column).
all_item_links = pd.read_csv(
    './data/2023_09_25_southbend_links_p1.csv',
    usecols=['url'],
)

# Scrape every listing and append its record to the date-stamped JSON file;
# a failing listing is reported (message, traceback, exception) and skipped.
# NOTE(review): 'detriot' in the output name looks like a typo for 'detroit';
# left unchanged because existing data files may already use this name.
# TODO: maybe check the post date, or only scrape links new since the day before?
for item_link in all_item_links['url']:
    try:
        data_dict = get_sales_data(item_link)
        #print(f'data: {data_dict}')
        append_to_json(f'data/{date}_detriot_data.json', data_dict)
    except Exception as e:
        print(f'{item_link} failed')
        traceback.print_exc()
        print(e)

print('\ndone!')

https://detroit.craigslist.org/wyn/ctd/d/redford-2018-tesla-75d-super-clean-must/7668580125.html
noindex
https://detroit.craigslist.org/wyn/ctd/d/redford-2021-tesla-plus-standard-range/7664123923.html
noindex

done!


### US run 1

In [7]:
# config

# Search settings per Craigslist subdomain: postal code, number of result
# pages to request, and search radius.
city_dict = {
    'southbend': {'zip': 48226, 'max_page': 1, 'radius': 50},
    'sfbay': {'zip': 94103, 'max_page': 1, 'radius': 50},
}

# Date stamp (YYYY_MM_DD) used in output file names.
date = f'{datetime.today():%Y_%m_%d}'

# Link collection (all fuel types) for every configured city.
for city, info in city_dict.items():
    get_links(
        date,
        city,
        zip_code=info['zip'],
        radius=info['radius'],
        max_page=info['max_page'],
    )

page 0: 325 links added to ./data/2023_08_20_southbend_links_p1.csv
finished writing ./data/2023_08_20_southbend_links_p1.csv
page 0: 357 links added to ./data/2023_08_20_sfbay_links_p1.csv
finished writing ./data/2023_08_20_sfbay_links_p1.csv


In [12]:
# Load the southbend listing URLs collected on 2023-08-20 (only the 'url' column).
all_item_links = pd.read_csv(
    './data/2023_08_20_southbend_links_p1.csv',
    usecols=['url'],
)

# Scrape every listing and append its record to the date-stamped JSON file;
# a failing listing is reported (message, traceback, exception) and skipped.
# NOTE(review): 'detriot' in the output name looks like a typo for 'detroit';
# left unchanged because existing data files may already use this name.
# TODO: maybe check the post date, or only scrape links new since the day before?
for item_link in all_item_links['url']:
    try:
        data_dict = get_sales_data(item_link)
        #print(f'data: {data_dict}')
        append_to_json(f'data/{date}_detriot_data.json', data_dict)
    except Exception as e:
        print(f'{item_link} failed')
        traceback.print_exc()
        print(e)

print('\ndone!')

https://detroit.craigslist.org/wyn/cto/d/detroit-06-chevy-trailblazer-lt-suv-4x4/7656800612.html failed
list index out of range


Traceback (most recent call last):
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_46693/1502061816.py", line 7, in <module>
    data_dict = get_sales_data(item_link)
                ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_46693/3195065662.py", line 17, in get_sales_data
    info['post_expire'] = soup.head.find('meta', {'name': 'robots'}).get('content').split(': ')[1]
                          ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^
IndexError: list index out of range


https://detroit.craigslist.org/wyn/ctd/d/redford-2021-chevrolet-equinox-lt-super/7656662867.html failed
list index out of range


Traceback (most recent call last):
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_46693/1502061816.py", line 7, in <module>
    data_dict = get_sales_data(item_link)
                ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_46693/3195065662.py", line 17, in get_sales_data
    info['post_expire'] = soup.head.find('meta', {'name': 'robots'}).get('content').split(': ')[1]
                          ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^
IndexError: list index out of range


https://detroit.craigslist.org/wyn/ctd/d/redford-2019-gmc-terrain-slt-super/7656657568.html failed
list index out of range


Traceback (most recent call last):
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_46693/1502061816.py", line 7, in <module>
    data_dict = get_sales_data(item_link)
                ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_46693/3195065662.py", line 17, in get_sales_data
    info['post_expire'] = soup.head.find('meta', {'name': 'robots'}).get('content').split(': ')[1]
                          ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^
IndexError: list index out of range


https://detroit.craigslist.org/wyn/ctd/d/redford-2020-tesla-long-range-super/7656655132.html failed
list index out of range


Traceback (most recent call last):
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_46693/1502061816.py", line 7, in <module>
    data_dict = get_sales_data(item_link)
                ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_46693/3195065662.py", line 17, in get_sales_data
    info['post_expire'] = soup.head.find('meta', {'name': 'robots'}).get('content').split(': ')[1]
                          ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^
IndexError: list index out of range


https://detroit.craigslist.org/wyn/ctd/d/redford-2012-ford-edge-limited-super/7656653104.html failed
list index out of range


Traceback (most recent call last):
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_46693/1502061816.py", line 7, in <module>
    data_dict = get_sales_data(item_link)
                ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_46693/3195065662.py", line 17, in get_sales_data
    info['post_expire'] = soup.head.find('meta', {'name': 'robots'}).get('content').split(': ')[1]
                          ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^
IndexError: list index out of range


https://detroit.craigslist.org/wyn/ctd/d/redford-2021-ram-1500-big-horn-super/7656650273.html failed
list index out of range


Traceback (most recent call last):
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_46693/1502061816.py", line 7, in <module>
    data_dict = get_sales_data(item_link)
                ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_46693/3195065662.py", line 17, in get_sales_data
    info['post_expire'] = soup.head.find('meta', {'name': 'robots'}).get('content').split(': ')[1]
                          ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^
IndexError: list index out of range



done!


In [11]:
# Load the sfbay listing URLs collected on 2023-08-20 (only the 'url' column).
all_item_links = pd.read_csv(
    './data/2023_08_20_sfbay_links_p1.csv',
    usecols=['url'],
)

# Scrape every listing and append its record to the date-stamped JSON file;
# a failing listing is reported (message, traceback, exception) and skipped.
# TODO: maybe check the post date, or only scrape links new since the day before?
for item_link in all_item_links['url']:
    try:
        data_dict = get_sales_data(item_link)
        #print(f'data: {data_dict}')
        append_to_json(f'data/{date}_sfbay_data.json', data_dict)
    except Exception as e:
        print(f'{item_link} failed')
        traceback.print_exc()
        print(e)

print('\ndone!')

https://sfbay.craigslist.org/sby/ctd/d/santa-clara-2017-ford-mustang-gt/7657063577.html failed
list index out of range


Traceback (most recent call last):
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_46693/3319604147.py", line 7, in <module>
    data_dict = get_sales_data(item_link)
                ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_46693/3195065662.py", line 17, in get_sales_data
    info['post_expire'] = soup.head.find('meta', {'name': 'robots'}).get('content').split(': ')[1]
                          ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^
IndexError: list index out of range


https://sfbay.craigslist.org/pen/ctd/d/san-mateo-2008-toyota-avalon-limited/7657055313.html failed
list index out of range


Traceback (most recent call last):
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_46693/3319604147.py", line 7, in <module>
    data_dict = get_sales_data(item_link)
                ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_46693/3195065662.py", line 17, in get_sales_data
    info['post_expire'] = soup.head.find('meta', {'name': 'robots'}).get('content').split(': ')[1]
                          ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^
IndexError: list index out of range


https://sfbay.craigslist.org/eby/ctd/d/fairfield-2002-lexus-sc430-luxury-sport/7656966828.html failed
list index out of range


Traceback (most recent call last):
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_46693/3319604147.py", line 7, in <module>
    data_dict = get_sales_data(item_link)
                ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_46693/3195065662.py", line 17, in get_sales_data
    info['post_expire'] = soup.head.find('meta', {'name': 'robots'}).get('content').split(': ')[1]
                          ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^
IndexError: list index out of range


https://sfbay.craigslist.org/sby/cto/d/san-jose-1964-cutlass-f85/7656959725.html failed
list index out of range


Traceback (most recent call last):
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_46693/3319604147.py", line 7, in <module>
    data_dict = get_sales_data(item_link)
                ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_46693/3195065662.py", line 17, in get_sales_data
    info['post_expire'] = soup.head.find('meta', {'name': 'robots'}).get('content').split(': ')[1]
                          ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^
IndexError: list index out of range



done!


### Lansing runs

In [28]:
# config

# Search settings per Craigslist subdomain: postal code and number of result
# pages to request. The radius is shared by both cities.
city_dict = {
    'lansing': {'zip': 48933, 'max_page': 1},
    'up': {'zip': 49855, 'max_page': 1},
}

radius = 200

# Date stamp (YYYY_MM_DD) used in output file names.
date = f'{datetime.today():%Y_%m_%d}'

# Link collection (all fuel types) for every configured city.
for city, info in city_dict.items():
    get_links(
        date,
        city,
        zip_code=info['zip'],
        radius=radius,
        max_page=info['max_page'],
    )

In [355]:
# Load the lansing listing URLs collected on 2023-08-01 (only the 'url' column).
all_item_links = pd.read_csv(
    './data/2023_08_01_lansing_links_p3.csv',
    usecols=['url'],
)

# Scrape every listing and append its record to the date-stamped JSON file;
# a failing listing is reported (message, traceback, exception) and skipped.
# TODO: maybe check the post date, or only scrape links new since the day before?
for item_link in all_item_links['url']:
    try:
        data_dict = get_sales_data(item_link)
        #print(f'data: {data_dict}')
        append_to_json(f'data/{date}_lansing_data.json', data_dict)
    except Exception as e:
        print(f'{item_link} failed')
        traceback.print_exc()
        print(e)

print('\ndone!')

https://chicago.craigslist.org/nch/ctd/d/kenilworth-1965-ford-thunderbird/7650101836.html failed
'NoneType' object has no attribute 'get_text'


Traceback (most recent call last):
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_23206/3961468879.py", line 7, in <module>
    data_dict = get_sales_data(item_link)
                ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_23206/273795941.py", line 34, in get_sales_data
    info.update(json.loads(data.get_text(strip=True)))
                           ^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'get_text'


https://chicago.craigslist.org/nch/ctd/d/libertyville-377-mo-ford-edge-titanium/7650020784.html failed
'NoneType' object has no attribute 'get_text'


Traceback (most recent call last):
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_23206/3961468879.py", line 7, in <module>
    data_dict = get_sales_data(item_link)
                ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_23206/273795941.py", line 34, in get_sales_data
    info.update(json.loads(data.get_text(strip=True)))
                           ^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'get_text'


https://chicago.craigslist.org/chc/cto/d/schaumburg-2013-volkswagen-jetta/7650000456.html failed
list index out of range


Traceback (most recent call last):
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_23206/3961468879.py", line 7, in <module>
    data_dict = get_sales_data(item_link)
                ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_23206/273795941.py", line 16, in get_sales_data
    info['post_expire'] = soup.head.find('meta', {'name': 'robots'}).get('content').split(': ')[1]
                          ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^
IndexError: list index out of range


https://monroemi.craigslist.org/ctd/d/highland-park-2017-dodge-grand-caravan/7649992995.html failed
'NoneType' object has no attribute 'get_text'


Traceback (most recent call last):
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_23206/3961468879.py", line 7, in <module>
    data_dict = get_sales_data(item_link)
                ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_23206/273795941.py", line 34, in get_sales_data
    info.update(json.loads(data.get_text(strip=True)))
                           ^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'get_text'


https://chicago.craigslist.org/nch/ctd/d/kenilworth-1965-ford-thunderbird/7650101836.html failed
'NoneType' object has no attribute 'get_text'


Traceback (most recent call last):
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_23206/3961468879.py", line 7, in <module>
    data_dict = get_sales_data(item_link)
                ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_23206/273795941.py", line 34, in get_sales_data
    info.update(json.loads(data.get_text(strip=True)))
                           ^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'get_text'


https://chicago.craigslist.org/nch/ctd/d/libertyville-377-mo-ford-edge-titanium/7650020784.html failed
'NoneType' object has no attribute 'get_text'


Traceback (most recent call last):
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_23206/3961468879.py", line 7, in <module>
    data_dict = get_sales_data(item_link)
                ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_23206/273795941.py", line 34, in get_sales_data
    info.update(json.loads(data.get_text(strip=True)))
                           ^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'get_text'


https://chicago.craigslist.org/chc/cto/d/schaumburg-2013-volkswagen-jetta/7650000456.html failed
list index out of range


Traceback (most recent call last):
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_23206/3961468879.py", line 7, in <module>
    data_dict = get_sales_data(item_link)
                ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_23206/273795941.py", line 16, in get_sales_data
    info['post_expire'] = soup.head.find('meta', {'name': 'robots'}).get('content').split(': ')[1]
                          ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^
IndexError: list index out of range


https://monroemi.craigslist.org/ctd/d/highland-park-2017-dodge-grand-caravan/7649992995.html failed
'NoneType' object has no attribute 'get_text'


Traceback (most recent call last):
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_23206/3961468879.py", line 7, in <module>
    data_dict = get_sales_data(item_link)
                ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_23206/273795941.py", line 34, in get_sales_data
    info.update(json.loads(data.get_text(strip=True)))
                           ^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'get_text'


https://chicago.craigslist.org/nch/ctd/d/kenilworth-1965-ford-thunderbird/7650101836.html failed
'NoneType' object has no attribute 'get_text'


Traceback (most recent call last):
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_23206/3961468879.py", line 7, in <module>
    data_dict = get_sales_data(item_link)
                ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_23206/273795941.py", line 34, in get_sales_data
    info.update(json.loads(data.get_text(strip=True)))
                           ^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'get_text'


https://chicago.craigslist.org/nch/ctd/d/libertyville-377-mo-ford-edge-titanium/7650020784.html failed
'NoneType' object has no attribute 'get_text'


Traceback (most recent call last):
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_23206/3961468879.py", line 7, in <module>
    data_dict = get_sales_data(item_link)
                ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_23206/273795941.py", line 34, in get_sales_data
    info.update(json.loads(data.get_text(strip=True)))
                           ^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'get_text'


https://chicago.craigslist.org/chc/cto/d/schaumburg-2013-volkswagen-jetta/7650000456.html failed
list index out of range


Traceback (most recent call last):
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_23206/3961468879.py", line 7, in <module>
    data_dict = get_sales_data(item_link)
                ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_23206/273795941.py", line 16, in get_sales_data
    info['post_expire'] = soup.head.find('meta', {'name': 'robots'}).get('content').split(': ')[1]
                          ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^
IndexError: list index out of range


https://monroemi.craigslist.org/ctd/d/highland-park-2017-dodge-grand-caravan/7649992995.html failed
'NoneType' object has no attribute 'get_text'


Traceback (most recent call last):
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_23206/3961468879.py", line 7, in <module>
    data_dict = get_sales_data(item_link)
                ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/0d/dxss4k9j18j9pg4ythbk89y40000gn/T/ipykernel_23206/273795941.py", line 34, in get_sales_data
    info.update(json.loads(data.get_text(strip=True)))
                           ^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'get_text'



done!


### Pagination Testing

In [52]:
def get_soup(url):
    """Fetch ``url`` and return its HTML parsed with BeautifulSoup.

    Prints a status line either way. Returns ``None`` when the response
    status is anything other than 200.
    """
    response = requests.get(url)
    if response.status_code == 200:
        print(f'html retrieved: {url}')
        return BeautifulSoup(response.content, 'html.parser')

    print(f'request failed: {response.reason}')
    return None
#soup

In [54]:
# CSS selector for the anchor inside each static search-result list item.
cars_css = 'li[class="cl-static-search-result"] a'

In [62]:
# Pagination probe: list view, page index 1 in the '#search=...' fragment.
# NOTE(review): a URL fragment is not part of the HTTP request, which would
# explain why this cell and the page-index-2 cell show the same first results.
url_1 = 'https://lansing.craigslist.org/search/cta?bundleDuplicates=1&postal=48933&search_distance=200#search=1~list~1~0'
soup = get_soup(url_1)
cars = soup.select(cars_css) 
# Display the first two result anchors.
cars[:2]

html retrieved: https://lansing.craigslist.org/search/cta?bundleDuplicates=1&postal=48933&search_distance=200#search=1~list~1~0


[<a href="https://chicago.craigslist.org/wcl/cto/d/hines-2006-cadillac-dts/7652250157.html">
 <div class="title">2006 Cadillac DTS</div>
 <div class="details">
 <div class="price">$5,000</div>
 <div class="location">
                         Westchester
                     </div>
 </div>
 </a>,
 <a href="https://cleveland.craigslist.org/cto/d/cleveland-1979-chevrolet-camaro/7652249578.html">
 <div class="title">1979 Chevrolet Camaro</div>
 <div class="details">
 <div class="price">$17,500</div>
 <div class="location">
                         Parma
                     </div>
 </div>
 </a>]

In [63]:
# Pagination probe: list view, page index 2 in the '#search=...' fragment.
# NOTE(review): a URL fragment is not part of the HTTP request, which would
# explain why this cell and the page-index-1 cell show the same first results.
url_1 = 'https://lansing.craigslist.org/search/cta?bundleDuplicates=1&postal=48933&search_distance=200#search=1~list~2~0'
soup = get_soup(url_1)
cars = soup.select(cars_css) 
# Display the first two result anchors.
cars[:2]

html retrieved: https://lansing.craigslist.org/search/cta?bundleDuplicates=1&postal=48933&search_distance=200#search=1~list~2~0


[<a href="https://chicago.craigslist.org/wcl/cto/d/hines-2006-cadillac-dts/7652250157.html">
 <div class="title">2006 Cadillac DTS</div>
 <div class="details">
 <div class="price">$5,000</div>
 <div class="location">
                         Westchester
                     </div>
 </div>
 </a>,
 <a href="https://cleveland.craigslist.org/cto/d/cleveland-1979-chevrolet-camaro/7652249578.html">
 <div class="title">1979 Chevrolet Camaro</div>
 <div class="details">
 <div class="price">$17,500</div>
 <div class="location">
                         Parma
                     </div>
 </div>
 </a>]

In [61]:
# Pagination probe: same search with only '#search=1' (no view/page suffix).
url_1 = 'https://lansing.craigslist.org/search/cta?bundleDuplicates=1&postal=48933&search_distance=200#search=1'
soup = get_soup(url_1)
cars = soup.select(cars_css) 
# Display the first two result anchors.
cars[:2]

html retrieved: https://lansing.craigslist.org/search/cta?bundleDuplicates=1&postal=48933&search_distance=200#search=1


[<a href="https://milwaukee.craigslist.org/ctd/d/milwaukee-2018-isuzu-npr-hd-16-box/7652239352.html">
 <div class="title">2018 Isuzu NPR HD 16 Box</div>
 <div class="details">
 <div class="price">$27,999</div>
 <div class="location">
                         Milwaukee
                     </div>
 </div>
 </a>,
 <a href="https://detroit.craigslist.org/okl/cto/d/lincoln-park-jaguar/7652238832.html">
 <div class="title">Jaguar</div>
 <div class="details">
 <div class="price">$11,800</div>
 <div class="location">
                         Downriver
                     </div>
 </div>
 </a>]

In [49]:
# Date stamp (YYYY_MM_DD) used in output file names.
date = f'{datetime.today():%Y_%m_%d}'

# Re-run link collection for every city in the current city_dict, using the
# notebook-level radius.
for city, info in city_dict.items():
    get_links(
        date,
        city,
        zip_code=info['zip'],
        radius=radius,
        max_page=info['max_page'],
    )

html retrieved: https://lansing.craigslist.org/search/cta?bundleDuplicates=1&postal=48933&search_distance=200#search=1~gallery~0~0
[<a href="https://chicago.craigslist.org/nwc/ctd/d/chicago-2022-nissan-altima-for-285-mo/7652233232.html">
<div class="title">2022 Nissan Altima  for $285/mo BAD CREDIT &amp; NO MONEY DOWN</div>
<div class="details">
<div class="price">$285</div>
<div class="location">
                        BAD CREDIT OK!
                    </div>
</div>
</a>, <a href="https://chicago.craigslist.org/chc/cto/d/romeoville-2008-pontiac-torrent/7652232917.html">
<div class="title">2008 Pontiac torrent</div>
<div class="details">
<div class="price">$3,400</div>
<div class="location">
                        city of chicago
                    </div>
</div>
</a>]
page 0: 319 links added to ./data/2023_08_07_lansing_links_p3.csv
finished writing ./data/2023_08_07_lansing_links_p3.csv
html retrieved: https://lansing.craigslist.org/search/cta?bundleDuplicates=1&postal=48933&searc

KeyboardInterrupt: 

## get_links()

In [None]:

def get_search_page(city, zip_code, radius, page):
    """Fetch one cars+trucks search page and save its listing links to a CSV.

    On HTTP 200, writes ``./data/{city}_links_p{page}.csv`` with a ``url``
    header and one listing href per row. On any other status, prints the
    status code and reason instead.

    Args:
        city: Craigslist subdomain to query.
        zip_code: Value sent as the ``postal`` search parameter.
        radius: Value sent as the ``search_distance`` search parameter.
        page: Page index placed in the URL fragment and in the file name.

    Returns:
        None.
    """
    url = f'https://{city}.craigslist.org/search/cta?postal={zip_code}&search_distance={radius}#search=1~gallery~{page}~0'
    response = requests.get(url)
    if response.status_code != 200:
        print(response.status_code, response.reason)
        return

    soup = BeautifulSoup(response.content, 'html.parser')

    # -------- get links.csv for search results --------
    cars_css = 'li[class="cl-static-search-result"] a'
    cars = soup.select(cars_css)
    if cars:
        output_file = f'./data/{city}_links_p{page}.csv'
        # newline='' lets the csv module control line endings itself.
        with open(output_file, 'w', newline='') as f:
            wtr = csv.writer(f)
            # Each row must be a sequence of fields. A bare string such as
            # ('url') or an href would be split into one column per character,
            # so every row is wrapped in a 1-tuple.
            wtr.writerow(('url',))
            wtr.writerows((car.get('href'),) for car in cars)
        print(f'finished writing {output_file}')
    else:
        print(f'ERROR: css element for car postings {cars_css} is no longer available')

    # # -------- get data.json for search results --------
    # json_css = 'script[id="ld_searchpage_results"]'
    # data = soup.select(json_css)
    # if data != []:
    #     json_blob = json.loads(data[0].text)
    #
    #     output_file = f'./data/{city}_data_p{page}.json'
    #     with open(output_file, 'w') as f:
    #         json.dump(json_blob["itemListElement"], f)
    #         print(f'finished writing {output_file}')
    # else:
    #     print(f'ERROR: css element for car posting JSON {json_css} is no longer available')

In [4]:
def get_search_page(city, zip_code, radius, page):
    """Request one cars+trucks search page and return the raw HTTP response.

    The URL targets the ``city`` subdomain with ``zip_code`` as ``postal`` and
    ``radius`` as ``search_distance``; ``page`` is placed in the URL fragment.
    """
    search_url = f'https://{city}.craigslist.org/search/cta?postal={zip_code}&search_distance={radius}#search=1~gallery~{page}~0'
    return requests.get(search_url)

In [51]:
# Fetch search page 0 for Lansing.
# NOTE(review): `city_zip` is not defined in the visible cells (only
# `city_dict` is) - presumably leftover notebook state; confirm.
response = get_search_page('lansing', city_zip['lansing'], radius, 0)

In [180]:
# Inspect the HTTP status code of the search-page response.
response.status_code

200

In [212]:
# Inspect the HTTP reason phrase of the search-page response.
response.reason

'OK'

In [56]:
# Parse the page only when the request succeeded; otherwise `soup` keeps
# whatever value it had from earlier cells.
if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')

In [57]:
#len(soup.find_all('li', class_='cl-static-search-result')) 

288

In [58]:
#  <li class="cl-static-search-result" 
#elements = soup.find_all('li', class_='cl-static-search-result') #titlestring

In [173]:
#cars_css = 'li.cl-static-search-result'
cars_css = 'li[class="cl-static-search-result"] a'
cars = soup.select(cars_css) #titlestring li.cl-static-search-result

In [174]:
cars[:2]

[<a href="https://chicago.craigslist.org/chc/cto/d/chicago-2017-bmw-m240i-drive/7649950932.html">
 <div class="title">2017 BMW M240i X drive</div>
 <div class="details">
 <div class="price">$25,500</div>
 <div class="location">
                         Chicago
                     </div>
 </div>
 </a>,
 <a href="https://detroit.craigslist.org/wyn/cto/d/garden-city-2007-toyota-prius-hybrid/7649950719.html">
 <div class="title">2007 Toyota Prius Hybrid</div>
 <div class="details">
 <div class="price">$3,450</div>
 <div class="location">
                         Westland
                     </div>
 </div>
 </a>]

In [195]:
len(cars)

288

In [198]:
cars[-3:]

[<a href="https://fortwayne.craigslist.org/ctd/d/montpelier-2015-jeep-compass-4d-suv-4wd/7649878846.html">
 <div class="title">2015 Jeep Compass 4d SUV 4WD High Altitude</div>
 <div class="details">
 <div class="price">$11,695</div>
 <div class="location">
                         + AutoSmart Montpelier
                     </div>
 </div>
 </a>,
 <a href="https://akroncanton.craigslist.org/ctd/d/cleveland-2004-toyota-camry-le/7649878779.html">
 <div class="title">2004 TOYOTA CAMRY LE</div>
 <div class="details">
 <div class="price">$8,272</div>
 <div class="location">
                         + Cherry Auto Group
                     </div>
 </div>
 </a>,
 <a href="https://limaohio.craigslist.org/ctd/d/hamler-2019-ram-wd-crew-cab-big-horn/7649878742.html">
 <div class="title">2019 Ram 1500 4WD Crew Cab Big Horn/Lone Star</div>
 <div class="details">
 <div class="price">$35,995</div>
 <div class="location">
                         + AutoSmart Hamler
                     </div>
 </div>
 

In [209]:
# Turn the anchors matched by cars_css into a two-column (url, title) table.
if cars == []:
    # An empty match means the selector no longer fits the page markup.
    print(f'WARNING: css element for car postings {cars_css} is no longer available')
else:
    buffer = [
        (car.get('href'), car.select('div.title')[0].text)
        for car in cars
    ]
    link_df = pd.DataFrame(buffer, columns=['url', 'title'])

In [201]:
# Same table as the compact cell, built step by step so individual fields
# can be inspected while exploring the markup.
if cars == []:
    # An empty match means the selector no longer fits the page markup.
    print(f'WARNING: css element for car postings {cars_css} is no longer available')
else:
    buffer = []
    for car in cars:
        # Each anchor carries the posting URL and a nested div.title;
        # div.price and div.location are also present but not captured.
        link, title = car.get('href'), car.select('div.title')[0].text
        buffer.append((link, title))
    link_df = pd.DataFrame(buffer, columns=['url', 'title'])

In [210]:
link_df.head()

Unnamed: 0,url,title
0,https://chicago.craigslist.org/chc/cto/d/chica...,2017 BMW M240i X drive
1,https://detroit.craigslist.org/wyn/cto/d/garde...,2007 Toyota Prius Hybrid
2,https://chicago.craigslist.org/chc/cto/d/carpe...,2017 Cadillac Escalade Luxury 3rd Row Navi w/B...
3,https://flint.craigslist.org/ctd/d/redford-202...,2021 Nissan SENTRA SV FOR ONLY
4,https://grandrapids.craigslist.org/ctd/d/redfo...,2019 Chevrolet EQUINOX LT FOR ONLY


In [202]:
link_df.head()

Unnamed: 0,url,title
0,https://chicago.craigslist.org/chc/cto/d/chica...,2017 BMW M240i X drive
1,https://detroit.craigslist.org/wyn/cto/d/garde...,2007 Toyota Prius Hybrid
2,https://chicago.craigslist.org/chc/cto/d/carpe...,2017 Cadillac Escalade Luxury 3rd Row Navi w/B...
3,https://flint.craigslist.org/ctd/d/redford-202...,2021 Nissan SENTRA SV FOR ONLY
4,https://grandrapids.craigslist.org/ctd/d/redfo...,2019 Chevrolet EQUINOX LT FOR ONLY


In [203]:
len(link_df)

288

In [204]:
# The search page embeds its results as JSON inside this <script> tag.
json_css = 'script[id="ld_searchpage_results"]'
data = soup.select(json_css) 
if data != []:
    json_blob = json.loads(data[0].text)
else:
    # Report the miss instead of silently leaving json_blob stale/undefined.
    print(f'ERROR: css element for car posting JSON {json_css} is no longer available')

In [205]:
pd.json_normalize(json_blob["itemListElement"])

Unnamed: 0,position,@type,item.image,item.name,item.description,item.offers.availableAtOrFrom.address.addressLocality,item.offers.availableAtOrFrom.address.@type,item.offers.availableAtOrFrom.address.addressRegion,item.offers.availableAtOrFrom.address.addressCountry,item.offers.availableAtOrFrom.address.streetAddress,item.offers.availableAtOrFrom.address.postalCode,item.offers.availableAtOrFrom.geo.longitude,item.offers.availableAtOrFrom.geo.@type,item.offers.availableAtOrFrom.geo.latitude,item.offers.availableAtOrFrom.@type,item.offers.@type,item.offers.priceCurrency,item.offers.price,item.@context,item.@type
0,0,ListItem,[https://images.craigslist.org/00G0G_arb5SHvM0...,2017 BMW M240i X drive,,Chicago,PostalAddress,IL,,,,-87.759598,GeoCoordinates,41.993000,Place,Offer,USD,25500.00,http://schema.org,Product
1,1,ListItem,[https://images.craigslist.org/00K0K_8ZrM1LrAh...,2007 Toyota Prius Hybrid,,Garden City,PostalAddress,MI,,,,-83.332331,GeoCoordinates,42.327153,Place,Offer,USD,3450.00,http://schema.org,Product
2,2,ListItem,[https://images.craigslist.org/00G0G_aV7krRQpR...,2017 Cadillac Escalade Luxury 3rd Row Navi w/B...,,Carpentersville,PostalAddress,IL,,,,-88.260596,GeoCoordinates,42.122998,Place,Offer,USD,18000.00,http://schema.org,Product
3,3,ListItem,[https://images.craigslist.org/01010_gWdln1TNs...,2021 Nissan SENTRA SV FOR ONLY,,Redford,PostalAddress,MI,,,,-83.321935,GeoCoordinates,42.441836,Place,Offer,USD,19970.00,http://schema.org,Product
4,4,ListItem,[https://images.craigslist.org/00O0O_9Kw2fqGqN...,2019 Chevrolet EQUINOX LT FOR ONLY,,Redford,PostalAddress,MI,,,,-83.321935,GeoCoordinates,42.441836,Place,Offer,USD,24870.00,http://schema.org,Product
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262,262,ListItem,[https://images.craigslist.org/00000_3Xvb7PH74...,1966 Triumph Spitfire Mk2,,Kenosha,PostalAddress,WI,,,,-87.876203,GeoCoordinates,42.605801,Place,Offer,USD,2500.00,http://schema.org,Product
263,263,ListItem,[https://images.craigslist.org/00I0I_4SuFPAdnp...,2014 TOYOTA TACOMA *4WD*,,Cleveland,PostalAddress,OH,,,,-81.527413,GeoCoordinates,41.428321,Place,Offer,USD,27976.00,http://schema.org,Product
264,264,ListItem,[https://images.craigslist.org/01717_kKqR1xJ2j...,2015 Jeep Compass 4d SUV 4WD High Altitude,,Montpelier,PostalAddress,OH,,,,-84.614699,GeoCoordinates,41.598202,Place,Offer,USD,11695.00,http://schema.org,Product
265,265,ListItem,[https://images.craigslist.org/00R0R_3VKxf4PvI...,2004 TOYOTA CAMRY LE,,Cleveland,PostalAddress,OH,,,,-81.527413,GeoCoordinates,41.428321,Place,Offer,USD,8272.00,http://schema.org,Product


In [224]:
#print(soup.prettify())

## get_data()

In [16]:
data_url = 'https://lansing.craigslist.org/cto/d/lansing-2010-chevy-cobalt-runs-and/7652215941.html'
# Other listings tried while exploring:
# "https://monroemi.craigslist.org/ctd/d/highland-park-2017-dodge-grand-caravan/7649992995.html"
#https://muncie.craigslist.org/ctd/d/daleville-2004-infinity-qx56-suv/7650052229.html

# A timeout keeps a stalled connection from blocking the notebook forever.
data_res = requests.get(data_url, timeout=30)
if data_res.status_code == 200:
    print('html retrieved')
    soup = BeautifulSoup(data_res.content, 'html.parser')
else:
    # e.g. 410 Gone for a deleted post. `soup` is NOT replaced in this case,
    # so say so loudly rather than letting later cells parse a stale page.
    print(f'request failed: {data_res.status_code} {data_res.reason}')

html retrieved


In [6]:
data_res.status_code  # GONE - deleted post

410

In [279]:
import re
# Collect the formatted text of the element(s) matching a CSS selector
def get_soup_text(soup: BeautifulSoup, search_str: str, one=False):
    """Return the text of elements selected by `search_str`, run through format_str.

    With `one=False` (the default) every match is returned as a list of
    strings. With `one=True` only the first match is used and a single string
    is returned; no check is made that a match exists.
    """
    if not one:
        return [format_str(tag.text) for tag in soup.select(search_str)]
    return format_str(soup.select_one(search_str).text)
    
    
# Collapse each run of newline/tab/carriage-return characters into one "|"
def format_str(s):
    """Return `s` with every run of newlines, tabs or carriage returns replaced by '|'."""
    line_breaks = re.compile("[\n\t\r]+")
    return line_breaks.sub('|', s)

In [318]:
def get_data(soup, link):
    """Flatten one parsed listing page into a dict.

    Keys are added in this order: post metadata (`post_id`, `post_datetime`,
    `post_expire`), one key per labelled attribute span, `other` for
    unlabelled spans, every key of the embedded `ld_posting_data` JSON,
    `seller_notes`, and finally `url` (the `link` argument, stored as given).

    No element lookup is guarded: a page missing any of the expected elements
    raises instead of returning a partial record.
    """
    record = {}

    # --- post metadata: id, timestamp, expiry ---
    posting_info = soup.select_one('div[class="postinginfos"] p[class="postinginfo"]')
    record['post_id'] = posting_info.text.split(': ')[1]

    record['post_datetime'] = soup.time.get('datetime')

    # The expiry is whatever follows ': ' in the robots meta tag's content.
    robots_meta = soup.head.find('meta', {'name': 'robots'})
    record['post_expire'] = robots_meta.get('content').split(': ')[1]

    # --- listing attributes ---
    attr_spans = get_soup_text(soup, 'div[class=mapAndAttrs] p[class=attrgroup] span')
    for attr_text in attr_spans:
        colon_at = attr_text.find(':')
        if colon_at > 0:
            # "label: value" span -> its own key
            record[attr_text[:colon_at].strip()] = attr_text[colon_at + 1:].strip()
        elif 'other' in record:
            # Open question from the original author ("Q for Alex"): unlabelled
            # spans look like the make / model / year headline.
            record['other'] += f'|{attr_text}'
        else:
            record['other'] = attr_text

    # --- price and geo attributes from the embedded JSON blob ---
    posting_json = soup.select_one('script[id="ld_posting_data"]')
    record.update(json.loads(posting_json.get_text(strip=True)))

    # --- seller's free-form text, newline runs collapsed to single spaces ---
    body_text = soup.select_one('section[id="postingbody"]').get_text()
    record['seller_notes'] = re.sub('[\\n]+', ' ', body_text).strip()

    record['url'] = link

    return record

In [319]:
pd.json_normalize(get_data(soup, data_url)).T

Unnamed: 0,0
post_id,7650052424
post_datetime,2023-08-01T19:11:56-0500
post_expire,2023-09-01T00:11:56Z
other,2016 toyota camry hybrid
condition,excellent
cylinders,4 cylinders
drive,fwd
fuel,hybrid
odometer,62000
paint color,grey


In [18]:
purveyor_css = 'li[class="crumb section"] a'
soup.select_one(purveyor_css)

<a href="/search/sss?purveyor=owner">for sale by owner</a>

In [15]:
purveyor_css = 'li[class="crumb section"] a'
# select_one() returns None when the breadcrumb is missing, so the element
# itself must be tested; get_text() always returns a string, never None.
crumb = soup.select_one(purveyor_css)
text = crumb.get_text() if crumb is not None else None
# Last word of the breadcrumb label, e.g. 'for sale by owner' -> 'owner'.
# Falls back to None so `purveyor` is always bound.
purveyor = text.split(' ')[-1] if text is not None else None

purveyor

'dealer'

In [235]:
# time 
soup.time.get('datetime')

'2023-08-01T19:11:56-0500'

In [273]:
attr_css = 'div[class=mapAndAttrs] p[class=attrgroup] span'
soup.select(attr_css)

[<span><b>2016 toyota camry hybrid</b></span>,
 <span>condition: <b>excellent</b></span>,
 <span>cylinders: <b>4 cylinders</b></span>,
 <span>drive: <b>fwd</b></span>,
 <span>fuel: <b>hybrid</b></span>,
 <span>odometer: <b>62000</b></span>,
 <span>paint color: <b>grey</b></span>,
 <span>size: <b>full-size</b></span>,
 <span>title status: <b>clean</b></span>,
 <span>transmission: <b>automatic</b></span>,
 <span>type: <b>sedan</b></span>]

In [271]:
json_css = 'script[id="ld_posting_data"]'
data = soup.select_one(json_css)
json.loads(data.get_text(strip=True))

{'description': 'Clean Carfax &amp; Title! Rides and looks great! No accidents or smokers! Just had a mechanic inspect the whole vehicle and no issues. Hey large folks, 2016 was the last year Toyota offered the full size front seats! 43/39 City/Highway MPG. Would be a perfect choice for Uber/Lyft or delivery drivers.    Bluetooth Audio, Push to Start, Back up Camera, New Tires, Good Brakes, Good Battery, Two Original Key Fobs. Remote Keyless Entry, The car is in exceptional shape, it drives and shifts very well....',
 'image': ['https://images.craigslist.org/00v0v_iXpkgn8Q6lq_0CI0t2_600x450.jpg',
  'https://images.craigslist.org/00000_hTcrwavXJZP_0CI0s3_600x450.jpg',
  'https://images.craigslist.org/00000_hJdc7puPwM5_0tQ0t2_600x450.jpg',
  'https://images.craigslist.org/00303_gxXDEJwJcKi_0CI0t2_600x450.jpg',
  'https://images.craigslist.org/01616_gufBYJM5fXw_0ww0oo_600x450.jpg',
  'https://images.craigslist.org/00p0p_eVLOsaLIm9Q_0ww0oo_600x450.jpg',
  'https://images.craigslist.org/00i

In [265]:
id_css = 'div[class="postinginfos"] p[class="postinginfo"]'
soup.select_one(id_css).text.split(': ')[1]

'7650052424'

In [359]:
body_css = 'section[id="postingbody"]'
text = soup.select_one(body_css).get_text()
re.sub('[\\n]+', ' ', text).strip()



In [263]:
#soup.find_all('p', {'class':'postinginfo'}, string=r'post id:')

[]

In [317]:
unavail = soup.head.find('meta', {'name': 'robots'}).get('content').split(': ')[1]

In [320]:
#print(soup.head.prettify())

In [321]:
#print(soup.prettify())