# 豆瓣用户友邻数据分析
## Part1 数据获取
<font color='red'>*若已有数据集可以跳过此部分，从Part2开始*</font>
### 获取目标用户友邻列表
- 爬取用户首页面友邻页面

  - 从settings中导入目标用户的`uid`
  - 查看豆瓣的友邻页面需要登陆账号，所以使用 cookie 模拟登录

In [1]:
import requests
from bs4 import BeautifulSoup

from settings import user_agent
from settings import cookie
from settings import target_user

session = requests.Session()

url = 'https://www.douban.com/people/' + target_user + '/rev_contacts'
headers = {
    'User-Agent': user_agent,
    'Cookie': cookie,
}

response = session.get(url=url, headers=headers)
if response.status_code != 200:
    print('获取失败，请检查cookie, uid')
soup = BeautifulSoup(response.text, 'lxml')
# print(soup)
print('获取成功')


获取成功


- 从爬取的页面中获取用户的友邻总数

In [2]:
# css 选择器
num = soup.select('#db-usr-profile > div.info > h1')
num = BeautifulSoup(str(num[0]), 'lxml').string
# 数字从倒数第二个字符开始
length = len(num) - 2
while '0' <= num[length] <= '9':
    length -= 1
num = int(num[length + 1:len(num) - 1])
print(num)


ValueError: invalid literal for int() with base 10: ''

- 爬取所有友邻列表页面，获取所有友邻的`uid`，保存至本地

In [None]:
from settings import uid_file

with open(uid_file, "w") as file:
    # 每页显示最多 70 个友邻
    for i in range(0, num, 70):
        current_url = url + '?start=' + str(i)
        # print(current_url)
        response = session.get(url=current_url, headers=headers)
        soup = BeautifulSoup(response.text, 'lxml')
        peoples = soup.select('#content > div > div.article > dl > dt > a')
        for people in peoples:
            uid = (BeautifulSoup(str(people), 'lxml').a['href'])[30:-1]
            print(uid)
            file.write(uid + '\n')


### 获取友邻用户数据
- 通过[https://m.douban.com/people/uid/subject_profile](https://m.douban.com/people/uid/subject_profile)获取账户信息
- 通过[https://m.douban.com/people/uid/movie_charts](https://m.douban.com/people/uid/movie_charts)获取观影信息

In [None]:
from json import loads


def get_movie_info(nuid=''):
    nurl = 'https://m.douban.com/rexxar/api/v2/user/' + nuid + '/collection_stats?type=movie&for_mobile=1&ck=5Kvd'
    nreferer = 'https://m.douban.com/people/' + nuid + '/movie_charts'
    nheaders = {
        'Referer': nreferer,
        'User-Agent': user_agent,
    }
    nresponse = session.get(url=nurl, headers=nheaders)
    # 返回的数据为 json 格式，使用 loads 解析
    ndecoded = loads(nresponse.text)
    # print(decoded)
    nrow = []
    # 观影数
    try:
        nrow.append(ndecoded['total_collections'])
    except:
        nrow.append('')
    # 观看时间
    try:
        nrow.append(int(ndecoded['total_spent']))
    except:
        nrow.append('')
    # 消费
    try:
        nrow.append(int(ndecoded['total_cost']))
    except:
        nrow.append('')
    # 平均每周观看时间
    try:
        nrow.append(round(ndecoded['weekly_avg'], 1))
    except:
        nrow.append('')
    # 以下两项为最常观看地区
    try:
        nrow.append(ndecoded['countries'][0]['name'])
    except:
        nrow.append('')
    try:
        nrow.append(ndecoded['countries'][1]['name'])
    except:
        nrow.append('')
    # 以下三项为最常观看类型
    try:
        nrow.append(ndecoded['genres'][0]['name'])
    except:
        nrow.append('')
    try:
        nrow.append(ndecoded['genres'][1]['name'])
    except:
        nrow.append('')
    try:
        nrow.append(ndecoded['genres'][2]['name'])
    except:
        nrow.append('')
    # print(row)
    return nrow


def get_user_info(nuid=''):
    nurl = 'https://m.douban.com/rexxar/api/v2/user/' + nuid + '/archives_summary?for_mobile=1&ck=5Kvd'
    nreferer = 'https://m.douban.com/people/' + nuid + '/subject_profile'
    nheaders = {
        'Referer': nreferer,
        'User-Agent': user_agent,
    }
    nresponse = session.get(url=nurl, headers=nheaders)
    # 返回的数据为 json 格式，使用 loads 解析
    ndecoded = loads(nresponse.text)
    # print(decoded)
    nrow = []
    # 用户所在地区
    try:
        nrow.append(ndecoded['user']['loc']['name'])
    except:
        nrow.append('')
    # 用户广播数
    try:
        nrow.append(ndecoded['user']['statuses_count'])
    except:
        nrow.append('')
    # 用户注册时间
    try:
        nrow.append(ndecoded['user']['reg_time'][:4])
    except:
        nrow.append('')
    # 用户性别
    try:
        nrow.append(ndecoded['user']['gender'])
    except:
        nrow.append('')
    # print(row)
    return nrow


def get_info(nuid=''):
    nrow = []
    nrow += get_user_info(nuid)
    nrow += get_movie_info(nuid)
    print(nrow)
    return nrow


- 读取本地`uid`文件，获取所有友邻数据，写入数据集

In [None]:
import csv
from time import sleep

from settings import csv_title
from settings import dataset_file

with open(uid_file) as infile:
    with open(dataset_file, 'w', encoding='utf-8', newline='') as outfile:
        csv_file = csv.writer(outfile, dialect='excel')
        csv_file.writerow(csv_title)
        for line in infile:
            uid = line[:-1]
            # print(uid)
            csv_file.writerow(get_info(uid))
            sleep(2)
