In [3]:
from uszipcode import SearchEngine

from bs4 import BeautifulSoup

import pandas as pd

import requests
import math


def get_nearby_states(dist_max = 8, homezip = '24016'):
    search = SearchEngine()
    result = search.by_population(lower=0, upper=1000000000, returns = 1000000000)
    
    home = search.by_zipcode(homezip)

    data = list()
    for zipcode in result:
        data.append(
            {'major_city': zipcode.major_city,
             'state': zipcode.state,
             'lat': zipcode.lat,
             'lon': zipcode.lng,
             'dist': math.sqrt((zipcode.lat - home.lat)**2 + (zipcode.lng - home.lng)**2)
            }
        )
    
    df = pd.DataFrame(data = data) 
    
    zips = df[df['dist'] < dist_max]
    
    return list(set(zips['state']))

    
def get_site_urls():

    url = 'https://www.craigslist.org/about/sites'
    
    headers = {'referer': 'https://usa.fishermap.org/', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'}
    
    r = requests.get(url,headers=headers)

    soup = BeautifulSoup(r.content)

    non_cont_us_str = str(soup).split('\n<h4>Territories<')[1]

    urls = list()

    for t in soup.findAll('li'):
        if t.find('a'):
            s = t.find('a').attrs['href']
            if s.endswith('.org') and s not in non_cont_us_str:
                urls.append(s)
                
    return urls


def get_state_urls(state):
    url = f'https://geo.craigslist.org/iso/us/{state}'
    headers = {'referer': 'https://usa.fishermap.org/', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'}
    r = requests.get(url,headers=headers)

    soup = BeautifulSoup(r.content)

    badwords = [
        'help',
        'safety',
        'privacy',
        'feedback',
        'terms',
        'about',
        'craigslist app',
        'cl is hiring'
     ]

    urls = list()

    for i in [x.find('a') for x in soup.findAll('li')]:
        if i:
            s = i.attrs['href']
            if i.string not in badwords and s.endswith('.org'):
                if s.startswith('http'):
                    urls.append(i.attrs['href'])
                else:
                    urls.append('http:' + i.attrs['href'])
            
    
    return urls


def get_soup(url):
    
    url = f'{url}/search/sss?query=pedal+kayak&min_price=&max_price='
    headers = {'referer': 'https://usa.fishermap.org/', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'}
    r = requests.get(url,headers=headers)

    soup = BeautifulSoup(r.content)
    
    return soup

def conds(t, s, a = 'class'):
    return t.attrs and len(t.attrs[a]) > 0 and t.attrs[a][0] == s
    
def get_raw_data(soup):
    
    raw_data = list()
    for x in soup.findAll('li'):
        if conds(x,'result-row'):
            raw_data.append(x)
    
    return raw_data

def add_rows(raw_data):
    
    rows = list()
    for row in raw_data:


        datetime = row.find('time').attrs['datetime']
        
        title_str = ''
        dollar_str = ''
        url = ''
        
        for x in row.findAll('a'):

            if conds(x, 'result-title'):
                title_str = x.string

            if conds(x, 'result-image') and x.find('span'):

                dollar_str = x.find('span').string

                url = x.attrs['href']

    
        
        nearby = ''
        
        for x in row.findAll('span'):
            if conds(x, 'nearby'):
                nearby = x.attrs['title']


        row = {
            'datetime': datetime,
            'title': title_str,
            'amount': dollar_str,
            'nearby': nearby,
            'url': url
        }


        rows.append(row)

    return rows


US_URLS = get_site_urls()

In [None]:
# rows = list()

# for s in US_URLS:
#     soup = get_soup(s)
#     r = add_rows(get_raw_data(soup))
#     rows += r

# df = pd.DataFrame(data=rows)
# df.to_csv('all_us.csv')

In [5]:
rows = list()

states = ['VA', 'MD']
nearby_urls = list()
for s in states:
    nearby_urls += get_state_urls(s)

for s in nearby_urls:
    print(s)
    try:
        soup = get_soup(s)
        r = add_rows(get_raw_data(soup))
        rows += r
    except:
        print(f'Failed URL: {s}')
        
df = pd.DataFrame(data=rows).drop_duplicates()

df['amount'] = df['amount'].apply(lambda x: x.replace('$', '').replace(',', ''))
df['amount'] = pd.to_numeric(df['amount'])

df = df[df['amount'] > 0]

https://charlottesville.craigslist.org
https://danville.craigslist.org
https://easternshore.craigslist.org
https://fredericksburg.craigslist.org
https://harrisonburg.craigslist.org
https://lynchburg.craigslist.org
https://blacksburg.craigslist.org
https://norfolk.craigslist.org
https://richmond.craigslist.org
https://roanoke.craigslist.org
https://swva.craigslist.org
https://winchester.craigslist.org
https://annapolis.craigslist.org
https://baltimore.craigslist.org
https://chambersburg.craigslist.org
https://easternshore.craigslist.org
https://frederick.craigslist.org
https://smd.craigslist.org
https://westmd.craigslist.org


In [15]:
df.to_csv('close_us.csv')