In [None]:
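# Scrapes second-hand housing listings for Jinan, Shandong Province from the Lianjia
# website and writes them (price, layout and similar fields) to a CSV file.
# Libraries imported for the scraper:
# - BeautifulSoup: commonly used to parse HTML documents
# - lxml: a fast, easy-to-use, low-memory Python library for processing XML and HTML
# - matplotlib: a plotting tool
# - numpy: a numerical computing library
# - pandas: a data analysis tool
# -*- coding: utf-8 -*-
# The line above is a special comment that declares the source file's character encoding
# (Python only honours it on the first or second line of a file; UTF-8 is the Python 3 default).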
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import time
import os
import re
from bs4 import BeautifulSoup
from lxml import etree

def get_data(url, headers):
    """Fetch ``url`` with an HTTP GET request and return the body as text.

    Args:
        url: Address of the page to download.
        headers: Mapping of HTTP request headers (e.g. ``User-Agent``).

    Returns:
        The decoded response body (``Response.text``).
    """
    # ``headers`` must be passed by keyword: the second positional parameter
    # of ``requests.get`` is ``params``, which would append the mapping to the
    # URL as a query string instead of sending it as request headers.
    response = requests.get(url, headers=headers)
    return response.text


# Parse the province headings of the first-level page into a dict {position: province name}
def analyse_province(data):
    """Collect the province headings found in the city-list page.

    Args:
        data: HTML text of the page.

    Returns:
        A dict mapping each heading's 0-based position to the ``.string`` of
        the corresponding ``<div class="city_list_tit c_b">`` tag.
    """
    soup = BeautifulSoup(data, 'lxml')
    heading_tags = soup.find_all('div', {'class': 'city_list_tit c_b'})
    return {position: tag.string for position, tag in enumerate(heading_tags)}


# Parse the cities of the selected province into a dict {position: city name}, plus their listing-URL templates
def analyse_city(data, province):
    """List the cities of one province together with their listing-URL templates.

    Args:
        data: HTML text of the city-list page.
        province: Index of the wanted ``<div class="city_province">`` block.

    Returns:
        A ``(citys, urls)`` tuple: ``citys`` maps a 0-based position to the
        city link text, and ``urls`` holds the link targets with
        ``'ershoufang/pg{}/'`` appended so a page number can be filled in
        with ``str.format``.
    """
    soup = BeautifulSoup(data, 'lxml')
    province_block = soup.find_all('div', {'class': 'city_province'})[province]
    # Re-parse only the selected block so the xpath queries cannot see the
    # other provinces.
    tree = etree.HTML(str(province_block))
    names = tree.xpath('//div[@class="city_province"]/ul/li/a/text()')
    links = tree.xpath('//div[@class="city_province"]/ul/li/a/@href')
    city_by_position = dict(enumerate(names))
    for position in city_by_position:
        links[position] = links[position] + 'ershoufang/pg{}/'
    return city_by_position, links

#解析出该网站的二级页面总页数
def get_page(data):
    """Return the total number of result pages announced by a listing page.

    Args:
        data: HTML text of a listing page.

    Returns:
        The page count as an ``int``.

    Raises:
        ValueError: If the pagination attribute is missing or does not have
            the expected form (for example when the site served a page
            without a pager).
    """
    tree = etree.HTML(data)
    page_data = tree.xpath('//div[@class="page-box fr"]/div/@page-data')
    # The pattern is applied to the string form of the xpath result list and
    # captures whatever sits between 'talPage":' and ',"curPage'.
    found = re.match(r'.*?talPage":(.*?),"curPage', str(page_data))
    if found is None:
        raise ValueError(
            'pagination data not found in page: {!r}'.format(page_data))
    return int(found.group(1))


def get_area(infor):    # floor-area field
    """Return the second ``' | '``-separated field of a house-info string.

    Args:
        infor: Text such as ``'3室2厅 | 120平米 | 南 | ...'``.

    Returns:
        The text between the first and the second separator.

    Raises:
        ValueError: If ``infor`` does not contain two separators.
    """
    # Raw string: ``\s`` and ``\|`` are regex escapes, not string escapes.
    fields = re.match(r'^(.*?)\s\|\s(.*?)\s\|\s', infor)
    if fields is None:
        raise ValueError('unrecognised house info: {!r}'.format(infor))
    return fields.group(2)

def get_follow(atten):  # follower-count field
    """Return the leading follower count of a follow-info string.

    Args:
        atten: Text such as ``'12人关注 / 3天以前发布'``.

    Returns:
        The digits in front of ``'人关注'`` as a string.

    Raises:
        ValueError: If ``atten`` does not start with ``'<digits>人关注'``.
    """
    # Raw string: ``\d`` is a regex escape, not a string escape.
    found = re.match(r'^(\d+)人关注', atten)
    if found is None:
        raise ValueError('unrecognised follow info: {!r}'.format(atten))
    return found.group(1)

def get_day(atten):     # publication-time field
    """Return the part of a follow-info string after ``'关注 / '``.

    Args:
        atten: Text such as ``'12人关注 / 3天以前发布'``.

    Returns:
        Everything following the first ``'关注 / '`` up to the end of the line.

    Raises:
        ValueError: If ``atten`` does not contain ``'关注 / '``.
    """
    # Raw string: ``\s`` is a regex escape, not a string escape.
    found = re.match(r'.*?关注\s/\s(.*?)$', atten)
    if found is None:
        raise ValueError('unrecognised follow info: {!r}'.format(atten))
    return found.group(1)

def get_unitprice(up):  # unit price
    """Convert a unit-price label such as ``'12,345元/平'`` to an integer.

    Any number of thousands separators is accepted, so ``'980元/平'`` and
    ``'1,234,567元/平'`` are parsed as well as the usual single-comma form.

    Args:
        up: Price text ending in ``'元/平'``.

    Returns:
        The price as an ``int`` with the separators removed.

    Raises:
        ValueError: If ``up`` is not of the form ``'<digits[,digits...]>元/平'``.
    """
    found = re.match(r'^(\d+(?:,\d+)*)元/平$', up)
    if found is None:
        raise ValueError('unrecognised unit price: {!r}'.format(up))
    return int(found.group(1).replace(',', ''))

def get_model(infor):   # layout field
    """Return the text in front of the first whitespace of a house-info string.

    Args:
        infor: Text such as ``'3室2厅 | 120平米 | 南 | ...'``.

    Returns:
        The leading run of characters up to the first whitespace character.

    Raises:
        ValueError: If ``infor`` contains no whitespace on its first line.
    """
    # Raw string: ``\s`` is a regex escape, not a string escape.
    found = re.match(r'^(.*?)\s', infor)
    if found is None:
        raise ValueError('unrecognised house info: {!r}'.format(infor))
    return found.group(1)

def analyse_house(data):
    """Extract the listings of one result page into eight parallel columns.

    As a side effect every listing's image URL is handed to ``save_png``.

    Args:
        data: HTML text of a listing page.

    Returns:
        A list of eight lists, in this order: title text, layout, area,
        unit price (``int``), total price (``float``), follower count
        (``int``), publication text and listing link. Only as many listings
        as the shortest of the parsed sequences are processed.
    """
    soup = BeautifulSoup(data, 'lxml')
    tree = etree.HTML(data)

    # Of the matching anchors, every second one starting with the second is
    # used as the title link.
    title_links = soup.find_all(
        'a', {'target': '_blank', 'data-el': 'ershoufang'})[1::2]
    price_tags = soup.find_all(class_='totalPrice totalPrice2')
    image_urls = tree.xpath('//img[@class="lj-lazy"]/@data-original')
    house_infos = tree.xpath('//div[@class="houseInfo"]/text()')
    follow_infos = tree.xpath('//div[@class="followInfo"]/text()')
    unit_labels = tree.xpath('//div[@class="unitPrice"]/span/text()')

    addresses = []
    layouts = []
    areas = []
    unit_prices = []
    total_prices = []
    follower_counts = []
    publish_days = []
    hrefs = []

    listings = zip(title_links, price_tags, house_infos,
                   follow_infos, image_urls, unit_labels)
    for link, price_tag, house_info, follow_info, image_url, unit_label in listings:
        addresses.append(link.string)
        layouts.append(get_model(house_info))
        areas.append(get_area(house_info))
        unit_prices.append(get_unitprice(unit_label))
        total_prices.append(float(str(price_tag.span.string)))
        follower_counts.append(int(get_follow(follow_info)))
        publish_days.append(get_day(follow_info))
        hrefs.append(link['href'])
        save_png(image_url, link)

    return [addresses, layouts, areas, unit_prices,
            total_prices, follower_counts, publish_days, hrefs]


def merge_data(all_data, information):
    """Append one page's eight columns to the accumulated eight columns.

    Args:
        all_data: Eight lists collected so far; extended in place.
        information: Eight lists holding the values of the new page.

    Returns:
        The same ``all_data`` object that was passed in.
    """
    for column in range(8):
        all_data[column].extend(information[column])
    return all_data

def get_max(all_data):
    """Return the unit price and total price of the most-followed listing.

    Args:
        all_data: Column lists where index 3 holds unit prices, index 4
            total prices and index 5 follower counts.

    Returns:
        ``(unit_price, total_price)`` of the first listing whose follower
        count equals the maximum.
    """
    follower_counts = all_data[5]
    # ``max`` keeps the first of several equal maxima, i.e. the lowest position.
    top = max(range(len(follower_counts)), key=lambda pos: follower_counts[pos])
    return all_data[3][top], all_data[4][top]


def save_data(all_data, path=None):
    """Write the scraped columns to a CSV file with a proper header row.

    Args:
        all_data: Eight equally long columns in the order address, layout,
            area, unit price, total price, follower count, publication time
            and link.
        path: Destination file. When omitted, the file is named
            ``'<city>二手房数据.csv'`` where the city name is read from the
            module-level ``citys[city_ind]``.
    """
    name = ['地址', '户型', '面积', '单价', '总价', '关注度', '发布时间', '链接']
    if path is None:
        path = '{}二手房数据.csv'.format(citys[city_ind])
    # The names are used as column labels. Inserting them as the first data
    # row instead makes ``to_csv`` emit an extra "0,1,...,7" header line and
    # forces every value to a string.
    frame = pd.DataFrame(dict(zip(name, all_data)), columns=name)
    frame.to_csv(path, index=False)
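# The CSV is written under a relative file name, i.e. into the current working directory (not specifically the desktop)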


def save_png(png_url, title):
    """Download a listing image and store it as ``'<city>png/<title>.jpg'``.

    The city name comes from the module-level ``citys[city_ind]`` and the
    request headers from the module-level ``headers``. Download and write
    failures are reported with ``print`` and otherwise ignored, so a single
    bad image does not abort the scrape.

    Args:
        png_url: URL of the image to download.
        title: Tag whose ``.string`` is used as the file name.
    """
    # Remove the characters Windows forbids in file names: * \ / : ? " < > |
    name = re.sub(r'[*\\/:?"<>|]', '', str(title.string))
    # Create the folder if needed and then always download. With the download
    # in the ``else`` branch of the existence check, the image that triggers
    # the folder creation is never saved.
    os.makedirs('{}png'.format(citys[city_ind]), exist_ok=True)
    try:
        # ``headers`` must be a keyword argument; positionally it would be
        # taken as query-string ``params``.
        png_data = requests.get(png_url, headers=headers).content
        with open('{}png/'.format(citys[city_ind]) + name + '.jpg', 'wb') as f:
            f.write(png_data)
    except (requests.RequestException, OSError):
        print('爬取图片错误，错误位置：', name)

    
# Script entry point
if __name__ == '__main__':
    # Crawl every listing page of one city, report the most-followed listing
    # and write all rows to a CSV file. ``headers``, ``citys`` and
    # ``city_ind`` are module-level names because save_png and save_data
    # read them.

    city_url = 'https://www.lianjia.com/city/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'
    }
    city_data = get_data(city_url, headers)
    provinces = analyse_province(city_data)
    # Position of the province block on the city page; presumably Shandong,
    # going by the file header (confirm against the live page).
    index = 20
    citys, urls = analyse_city(city_data, index)
    # Position of the city inside that province; presumably Jinan (confirm).
    city_ind = 1
    all_data = [[], [], [], [], [], [], [], []]
    page = get_page(get_data(urls[city_ind].format(1), headers))
    # get_page returns the total page count, so the range must end at
    # page + 1; ``range(1, page)`` leaves out the last page.
    for pn in range(1, page + 1):
        href = urls[city_ind].format(pn)
        print('正在爬取第{}页'.format(pn))
        two_hand_data = get_data(href, headers)
        information = analyse_house(two_hand_data)
        all_data = merge_data(all_data, information)
    max_follow_unitprice, max_follow_price = get_max(all_data)
    print('关注度最高的单价：', max_follow_unitprice, '元/平')
    print('关注度最高的房屋总价：', max_follow_price, '万元')
    print('数据量：', len(all_data[0]))
    save_data(all_data)


正在爬取第1页
