In [1]:
import random
import time
from datetime import date
from bs4 import BeautifulSoup
import requests
import pandas as pd
pd.set_option('display.max_columns', None)

## 生成每个页面的链接

通过链家网的搜索结果可以看出，厦门二手房源数量有3万多个。由于网站最多只显示100个页面、每个页面30个房源（即最多3000个），无法一次采集到厦门所有的二手房数据，因此这里采用分区域采集：思明、湖里、集美、海沧这4个区的房源数量都超过3000个，每个区只能采集到前3000个；翔安、同安的房源不足3000个，可以全部采集到。

In [2]:
# 定义函数，生成网址
def make_urls(location, n):
    '''
    Build the paginated search-result URLs for one district.

    location: district name as it appears in the URL path
    n: number of pages to generate (pages are numbered 1..n)

    Returns a list of n URL strings, in page order.
    '''
    template = 'https://xm.lianjia.com/ershoufang/{}/pg{}/'
    return [template.format(location, page) for page in range(1, n + 1)]


# Siming, Huli, Jimei and Haicang each have 100 result pages
siming, huli, jimei, haicang = (
    make_urls(district, 100)
    for district in ('siming', 'huli', 'jimei', 'haicang')
)
xiangan = make_urls('xiangan', 95)  # Xiang'an has 95 pages
tongan = make_urls('tongan', 46)  # Tong'an has 46 pages

# Concatenate every district's page URLs into one list
xm_urls_list = [*siming, *huli, *jimei, *haicang, *xiangan, *tongan]
len(xm_urls_list)

541

## 处理 User-Agent 和 Cookie 

In [3]:
# 定义函数，用来处理User-Agent和Cookie
def ua_ck():
    '''
    Return the request headers and cookies used for every request.

    The site requires a logged-in session to be scraped, so the User-Agent
    and Cookie are copied from the browser dev tools (Network -> Doc) and the
    Cookie header string is converted into a dict.

    Returns a tuple (user_agent, cookies_dict):
        user_agent: dict with a single 'User-Agent' header
        cookies_dict: dict mapping cookie name -> cookie value
    '''

    user_agent = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'}

    # NOTE(review): this is a hard-coded session cookie; it is a credential and
    # should be loaded from an environment variable or local file, not committed.
    cookies = 'select_city=350200; lianjia_uuid=c6bb527e-c73e-4c18-80a2-0e6c701d4f40; _smt_uid=5f649daa.382e09bd; UM_distinctid=174a107e3f6810-0ca2d4a8dbcaa8-333769-125f51-174a107e3f7c3e; _ga=GA1.2.1661642603.1600429486; _gid=GA1.2.527914271.1600429486; Hm_lvt_9152f8221cb6243a53c83b956842be8a=1600429483,1600476524; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22174a107e556cb9-037ed71a329a2c-333769-1204049-174a107e557d11%22%2C%22%24device_id%22%3A%22174a107e556cb9-037ed71a329a2c-333769-1204049-174a107e557d11%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; lianjia_ssid=0857e41c-7ca8-24c2-7d18-e2b4b02e4d13; User-Realip=140.243.198.151; CNZZDATA1254525948=1557469094-1600424880-%7C1600525641; CNZZDATA1255847100=75444696-1600427554-%7C1600525837; CNZZDATA1255633284=1437618958-1600429218-%7C1600526300; CNZZDATA1255604082=1475741897-1600429003-%7C1600526307; _gat=1; _gat_global=1; _gat_new_global=1; _gat_dianpu_agent=1; Hm_lpvt_9152f8221cb6243a53c83b956842be8a=1600527516; srcid=eyJ0Ijoie1wiZGF0YVwiOlwiYTE5NzE1ZDYzZjIzMDQ0NTQ5NWUzYmFlOTQxYTI1Nzg0YWZlZjk5MDI4NDJjYjU5YWE1YjEyYTg5YzYzODM5ODU3YmJlNTc1NGU3OTlkODU3NzA1NGM2ZGRkY2YxNWYxMjdiMGYyNGQ3ZjEzNTdmOWM1NDRmMjgxYjRhM2NlYjU2MDBlZGUxZDA0YjgxMWFmZTRhODk2ZGM3YjRlMmEwMWVjNDQ4YjQ1NjJmODQ2MWM2MDRiM2Q1N2NlMjk1YmZkYzk0YTFjMjUzYzkyYTM2NDBjOWUzNzg3ZjkyOWI2MmE2MTRlMzVhNmU4NDJmMDlkNWNhNGM4MDM0Y2ZhNzMzYjBlZTViYzg4OTA2YWNmNWNjM2Y3YmQ1MjQwMTA4MDVmMGY4ZTE0OTAzNjNjMjVmZDcwN2RhNWU3YjIzMTE2ZTliM2QyYjM5ZTJmYjQ3NmJkMWQ5MTA0NDA4ZTMzMzJmZVwiLFwia2V5X2lkXCI6XCIxXCIsXCJzaWduXCI6XCJkZmYxZTM4NVwifSIsInIiOiJodHRwczovL3htLmxpYW5qaWEuY29tL2Vyc2hvdWZhbmcvIiwib3MiOiJ3ZWIiLCJ2IjoiMC4xIn0='

    # Convert the Cookie header into a dict. Split each pair on the FIRST '='
    # only: values may themselves contain '=' (the base64 `srcid` value ends
    # with '=' padding), and splitting on every '=' would truncate them.
    cookies_dict = {}
    for pair in cookies.split('; '):
        name, _, value = pair.partition('=')
        cookies_dict[name] = value

    return user_agent, cookies_dict

## 获取每个房源的链接

In [5]:
# 定义函数，获取房源链接
def get_urls(url, u_a, c_d, timeout=10):
    '''
    Fetch one search-result page and return the listing links found on it.

    url: URL of a search-result page
    u_a: request headers dict (User-Agent)
    c_d: cookies dict
    timeout: seconds to wait for the server before giving up; without a
        timeout a single stalled connection would block the crawl indefinitely

    Returns a list with the href of each listing's title link.

    Raises requests.RequestException on a timeout, connection failure or HTTP
    error status, and AttributeError when the page does not contain the
    expected 'sellListContent' list.
    '''

    response = requests.get(url, headers=u_a, cookies=c_d, timeout=timeout)
    # Fail fast on 4xx/5xx instead of parsing an error page
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    items = soup.find('ul', class_='sellListContent').find_all(
        'div', class_='info clear')
    return [item.find('div', class_='title').find('a')['href']
            for item in items]

# get_urls('https://xm.lianjia.com/ershoufang/pg1/', u_a, c_d)

## 获取每个房源详细信息

In [6]:
def get_info(url, u_a, c_d, timeout=10):
    '''
    Fetch one listing's detail page and extract its fields.

    url: URL of a single listing's detail page
    u_a: request headers dict (User-Agent)
    c_d: cookies dict
    timeout: seconds to wait for the server before giving up; without a
        timeout a single stalled connection would block the crawl indefinitely

    Returns a one-element list holding a dict of field name -> text, so that
    callers can `extend` their result list with it.

    Raises requests.RequestException on a timeout, connection failure or HTTP
    error status, and AttributeError/IndexError when the page does not have
    the expected structure.
    '''

    response = requests.get(url, headers=u_a, cookies=c_d, timeout=timeout)
    # Fail fast on 4xx/5xx instead of parsing an error page
    response.raise_for_status()
    # Let requests guess the encoding from the body to avoid garbled text
    response.encoding = response.apparent_encoding
    soup = BeautifulSoup(response.text, 'html.parser')

    info = {}

    # Overview panel: price and location
    overview = soup.find('div', class_='overview').find('div', class_='content')
    info['房源链接'] = url

    price = overview.find('div', class_='price')
    info['总价'] = price.find('span', class_='total').text
    info['总价单位'] = price.find('span', class_='unit').text
    info['单价'] = price.find('div', class_='text').find(
        'div', class_='unitPrice').find('span', class_='unitPriceValue').text

    around = overview.find('div', class_='aroundInfo')
    info['小区名称'] = around.find('div', class_='communityName').find('a').text
    area_links = around.find('div', class_='areaName').find(
        'span', class_='info').find_all('a')
    info['所在大区'] = area_links[0].text
    info['所在详细区域'] = area_links[1].text

    intro = soup.find('div', class_='m-content').find('div', class_='introContent')

    # Basic attributes: the first 4 characters of each <li> text are used as
    # the field name and the remainder as the value
    for item in intro.find('div', class_='base').find_all('li'):
        info[item.text[:4]] = item.text[4:]

    # Transaction attributes: first <span> is the field name, second the value
    for item in intro.find('div', class_='transaction').find_all('li'):
        spans = item.find_all('span')
        info[spans[0].text] = spans[1].text

    return [info]


# get_info('https://xm.lianjia.com/ershoufang/105105063079.html', u_a, c_d)

## 设置主函数

In [6]:
# def main():
#     house_urls = []  # 保存房源链接

#     print('开始采集房源链接！预计耗时1小时......')

#     house_urls_error = []

#     login = ua_ck()
#     u_a = login[0]
#     c_d = login[1]

#     for url in xm_urls_list:
#         try:
#             href = get_urls(url, u_a, c_d)
#             house_urls.extend(href)
#         except:
#             house_urls_error.append(url)
#         time.sleep(random.random()*3)

#     print('房源链接采集完成！共采集{}条链接，开始采集房源信息，预计耗时10小时......'.format(len(house_urls)))

#     time.sleep(5)

#     data = []  # 保存房源详细信息
#     data_error = []
#     for house_url in house_urls:
#         try:
#             info = get_info(house_url, u_a, c_d)
#             data.extend(info)
#         except:
#             data_error.append(house_url)
#         time.sleep(random.random()*3)
#         print('已采集{}条数据'.format(len(data)))  # 监控采集进度

#     print('房源信息采集完成！！！总共采集了{}条数据'.format(len(data)))

#     return data, house_urls, house_urls_error, data_error

直接用主函数采集房源链接和房源信息，容易失败，建议分批采集

## 采集房源链接

In [7]:
# Start scraping: build the request headers and cookies once
login = ua_ck()
u_a, c_d = login

In [13]:
house_urls = []  # collected listing links
error_url = []  # result-page URLs that failed to scrape
for url in xm_urls_list:
    try:
        href = get_urls(url, u_a, c_d)
        house_urls.extend(href)
    except Exception:
        # Record the failed page in error_url (the list defined above).
        # `Exception` rather than a bare except keeps Ctrl-C / kernel
        # interrupt working during the long crawl.
        error_url.append(url)
    time.sleep(random.random()*4)  # random delay between requests

    print('已采集{}条数据'.format(len(house_urls)))  # progress monitor

print('房源链接采集完成！！！总共采集了{}条链接'.format(len(house_urls)))

In [46]:
len(house_urls)

16219

In [27]:
error_url  # 查看采集失败的页面链接

[]

In [None]:
# Save the collected links to an Excel file first; the file name includes today's date
save_list = pd.DataFrame({'url': house_urls})
save_list.to_excel(r'厦门二手房源链接_{}.xlsx'.format(date.today()))

## 分批采集房源信息

In [16]:
data = []  # scraped listing details (one dict per listing)
error_data = []  # listing links that failed to scrape

# Scrape the first 3000 links
for house_url in house_urls[:3000]:
    try:
        info = get_info(house_url, u_a, c_d)
        data.extend(info)
    except Exception:
        # `Exception` rather than a bare except keeps Ctrl-C / kernel
        # interrupt working during the long crawl.
        error_data.append(house_url)
        print('采集失败{}条'.format(len(error_data)))
    time.sleep(random.random()*3)  # random delay between requests

    print('已采集{}条数据'.format(len(data)))  # progress monitor

print('房源信息采集完成！！！总共采集了{}条数据'.format(len(data)))

房源信息采集完成！！！总共采集了1500条数据


In [21]:
# Block for 200 seconds (presumably a cool-down between scraping batches; confirm)
time.sleep(200)

In [62]:
# Scrape the remaining links, appending to the same `data` / `error_data` lists
for house_url in house_urls[3000:]:
    try:
        info = get_info(house_url, u_a, c_d)
        data.extend(info)
    except Exception:
        # `Exception` rather than a bare except keeps Ctrl-C / kernel
        # interrupt working during the long crawl.
        error_data.append(house_url)
        print('采集失败{}条'.format(len(error_data)))
    time.sleep(random.random()*3)  # random delay between requests

    print('已采集{}条数据'.format(len(data)))  # progress monitor

print('房源信息采集完成！！！总共采集了{}条数据'.format(len(data)))

In [65]:
data[:5]

13807

In [None]:
error_data

## 将结果保存为本地文件

In [25]:
# Build a DataFrame from the list of listing dicts: one row per listing
df = pd.DataFrame(data)
df

Unnamed: 0,房源链接,总价,总价单位,单价,小区名称,所在大区,所在详细区域,房屋户型,所在楼层,建筑面积,户型结构,套内面积,建筑类型,房屋朝向,建筑结构,装修情况,梯户比例,配备电梯,挂牌时间,交易权属,上次交易,房屋用途,房屋年限,产权所属,抵押信息,房本备件,房源编码,用水类型,用电类型,燃气价格,别墅类型
0,https://xm.lianjia.com/ershoufang/105104628729...,660,万,51985元/平米,玉成豪园,思明,莲前,3室1厅1厨2卫,低楼层 (共19层),126.96㎡,平层,暂无数据,板楼,东南,钢混结构,精装,两梯两户,有,2020-06-18,商品房,2020-06-20,普通住宅,未满两年,共有,\n 有抵押 30万元 建设银...,已上传房本照片,00381041,,,,
1,https://xm.lianjia.com/ershoufang/105104643967...,789.7,万,53000元/平米,海豚湾,思明,会展中心,3室2厅1厨2卫,低楼层 (共32层),149㎡,错层,暂无数据,塔楼,东南,框架结构,精装,两梯两户,有,2020-06-20,商品房,暂无数据,普通住宅,暂无数据,共有,\n 无抵押\n ...,未上传房本照片,,,,,
2,https://xm.lianjia.com/ershoufang/105104685921...,1080,万,80796元/平米,蓝湾国际,思明,体育中心,3室2厅1厨1卫,低楼层 (共31层),133.67㎡,平层,暂无数据,板楼,东南,钢混结构,精装,三梯三户,有,2020-06-26,商品房,暂无数据,普通住宅,暂无数据,非共有,\n 无抵押\n ...,未上传房本照片,00248646,,,,
3,https://xm.lianjia.com/ershoufang/105104731779...,378,万,48055元/平米,长青路,思明,莲坂,3室2厅1厨1卫,高楼层 (共7层),78.66㎡,平层,暂无数据,板楼,南 北,框架结构,简装,一梯七户,无,2020-07-02,商品房,暂无数据,普通住宅,暂无数据,非共有,\n 无抵押\n ...,未上传房本照片,,,,,
4,https://xm.lianjia.com/ershoufang/105104745249...,360,万,41040元/平米,仙岳东村,思明,仙岳社区,3室1厅1厨1卫,中楼层 (共7层),87.72㎡,平层,暂无数据,板楼,南 北,混合结构,精装,一梯一户,无,2020-07-03,商品房,暂无数据,普通住宅,暂无数据,共有,\n 无抵押\n ...,未上传房本照片,00353448,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10420,https://xm.lianjia.com/ershoufang/105102630770...,2500,万,109228元/平米,中铁元湾,湖里,五缘湾,4室2厅1厨3卫,中楼层 (共10层),228.88㎡,平层,暂无数据,板楼,南 西南,钢混结构,精装,三梯两户,有,2019-07-05,商品房,暂无数据,普通住宅,暂无数据,非共有,\n 无抵押\n ...,未上传房本照片,00358258,,,,
10421,https://xm.lianjia.com/ershoufang/105103545098...,400,万,39790元/平米,金山小区,湖里,金山,3室2厅1厨2卫,高楼层 (共6层),100.53㎡,平层,暂无数据,板楼,北 南,钢混结构,简装,一梯两户,无,2019-12-14,商品房,2004-09-09,普通住宅,满五年,非共有,\n 无抵押\n ...,已上传房本照片,00408340,,,,
10422,https://xm.lianjia.com/ershoufang/105105206165...,137,万,26863元/平米,禹洲大学城,集美,集美其它,2室1厅1厨1卫,高楼层 (共34层),51㎡,平层,暂无数据,塔楼,北,钢混结构,精装,三梯六户,有,2020-08-30,商品房,暂无数据,普通住宅,暂无数据,共有,\n 有抵押 50万元 建设银...,未上传房本照片,00489649,,,,
10423,https://xm.lianjia.com/ershoufang/105105262197...,282,万,24737元/平米,凤凰花城,集美,锦园,3室2厅1厨2卫,低楼层 (共27层),114㎡,平层,暂无数据,板塔结合,东南,钢混结构,精装,两梯四户,有,2020-09-06,商品房,暂无数据,普通住宅,暂无数据,共有,\n 有抵押\n ...,未上传房本照片,00488028,,,,


In [26]:
# Save to a local Excel file; the file name includes the scrape date
df.to_excel(r"厦门链家网二手房爬虫数据_{}.xlsx".format(date.today()))