# AIRBNB 评论爬虫

## 思路：

1. 获取AIRBNB房源的基本信息，特别是listing_id（房源唯一编号）, reviews_count（这个房源一共有多少评论）


2. 通过listing_id构造url


3. 通过reviews_count实现翻页

In [None]:
import requests
import pandas as pd
import json
import math
import time
import random
import numpy as np

## 读取dataset

In [3]:
# Load the listing table; the crawl below reads its 'listing_id' and
# 'reviews_count' columns.
df = pd.read_csv('chengdu_listing_complete.csv')
# (rows, columns) of the loaded table.
df.shape

(12532, 10)

In [4]:
# Preview the first two rows to check the columns.
df.head(2)

Unnamed: 0,listing_id,latitude,longitude,neigh,accomdates,price,rate_type,reviews_count,star_rating,create_time
0,36195217,30.67412,103.8193,中国四川省成都市涌泉,2,159.0,nightly,5,5.0,2019-08-05 13:27:00
1,23616507,30.67425,103.81968,中国四川省成都市涌泉,2,139.0,nightly,27,4.89,2019-08-05 13:27:00


## 爬虫结果存储器

In [None]:
# Crawl result accumulators: five parallel lists, one entry per review.
#   listing_id - unique id of the listing the review belongs to
#   comment_id - unique id of the review itself
#   rating     - score given by the guest, in the range 0~5
#   comment    - review text
#   created_at - time the review was created
# The generator yields a fresh list for each name, so nothing is shared.
listing_id, comment_id, rating, comment, created_at = ([] for _ in range(5))

## 爬虫

In [None]:
def myRequestGet(url,num_retries=5):
    """GET *url* with an 8-second timeout, retrying when the request raises.

    Args:
        url: Address to fetch.
        num_retries: Number of extra attempts allowed after the first one
            fails. Values below zero are treated as zero, so at least one
            attempt is always made.

    Returns:
        The ``requests.Response`` of the first attempt that does not raise
        (its status code is not inspected here), or ``None`` when every
        attempt raised. Callers must check for ``None`` before using it.
    """
    attempts = max(num_retries, 0) + 1
    for _ in range(attempts):
        try:
            return requests.get(url, timeout=8)
        except Exception as e:
            # Kept deliberately broad so that any failure of a single
            # download is reported and retried instead of aborting a
            # long-running crawl.
            print('出错重试 {0}'.format(e))
    # Every attempt failed. The previous version fell through to
    # `return html` with `html` never assigned, raising UnboundLocalError.
    return None

In [None]:
id_list = np.array(df['listing_id'][:])
reviews_count = np.array(df['reviews_count'][:])

# Resume window. When the crawl stops, change head/tail and restart it.
# Re-crawling a listing only appends duplicate rows, which the
# de-duplication step removes, so a wrong value does not affect the result.
head = 0
tail = len(id_list)

page_size = 100  # reviews requested per API call (the `_limit` parameter)

# `tail` is clamped so an oversized value cannot index past the arrays.
for i in range(head, min(tail, len(id_list))):
    my_id = id_list[i]
    # Number of result pages this listing's reviews span.
    max_page = math.ceil(reviews_count[i] / page_size)
    url = 'https://zh.airbnb.com/api/v2/reviews?key=d306zoyjsyarp7ifhu67rjxn52tv0t20&currency=CNY&locale=zh&listing_id={}'.format(my_id)
    for k in range(max_page):
        URL = (url + '&role=guest&_format=for_p3&_limit=' + str(page_size)
               + '&_offset=' + str(k * page_size) + '&_order=language_country')
        response = myRequestGet(URL)
        # Skip the page when the download failed or the server did not
        # answer 200, and say so instead of reporting it as OK.
        if response is None or response.status_code != 200:
            print('id:',my_id,' ', 'num of reviews in total:',reviews_count[i],'max_page:',max_page, 'the {} page of id {} FAILED!'.format(k+1,i))
            continue
        answer = response.json()
        buffer = answer['reviews']
        # A page holds at most `page_size` reviews; the last one may be shorter.
        for item in buffer[:page_size]:
            # Read every field before appending so that a missing key can
            # never leave the five parallel lists with different lengths.
            row = (item['comments'], item['id'], item['rating'], item['created_at'])
            comment.append(row[0])
            comment_id.append(row[1])
            rating.append(row[2])
            created_at.append(row[3])
            listing_id.append(my_id)
        print('id:',my_id,' ', 'num of reviews in total:',reviews_count[i],'max_page:',max_page, 'the {} page of id {} is OK!'.format(k+1,i))
    review = pd.DataFrame({'listing_id':listing_id, 'comment_id': comment_id,
                        'rating': rating, 'comment':comment, 'created_at': created_at})
    # After each listing, rewrite the CSV with everything collected so far.
    review.to_csv('chengdu_reviews.csv', index = False, encoding = 'utf-8_sig')
    print('进度：', ((i+1)/df.shape[0])*100,"%")
    time.sleep(random.randint(0,2))  # pause 0-2 seconds between listings

## 评论去重

In [21]:
# (rows, columns) of the crawled reviews before de-duplication.
review.shape

(202961, 5)

In [22]:
# Flag every row whose comment_id has already appeared earlier in the table,
# then keep only the unflagged rows (the first occurrence of each comment_id).
is_repeat = review.duplicated(subset='comment_id')
review_new = review[~is_repeat]
review_new.shape

(202829, 5)