In [45]:
import os 
import re 
import requests 
import pandas as pd 
import datetime 
import json
from tqdm import tqdm

# 配置表头

在请求网页爬取的时候，输出的text信息中会出现抱歉，无法访问等字眼，这就是禁止爬取，需要通过反爬机制去解决这个问题。

headers是解决requests请求反爬的方法之一，相当于我们进去这个网页的服务器本身，假装自己本身在爬取数据。

对反爬虫网页，可以设置一些headers信息，模拟成浏览器取访问网站 。

In [2]:
# 请求头
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.51",
}

# 通过指定地名爬取数据

## 获取地点对应的containerid

In [46]:
def get_containerid(v_loc):
    """
    :param v_loc: 地点
    :return: containerid
    """
    url = 'https://m.weibo.cn/api/container/getIndex'
    # 请求参数
    params = {
        "containerid": "100103type=92&q={}&t=".format(v_loc),
        "page_type": "searchall",
    }
    r = requests.get(url, headers=headers, params=params)
    cards = r.json()["data"]["cards"]
    scheme = cards[0]['card_group'][0]['scheme']  # 取第一个
    containerid = re.findall(r'containerid=(.*?)&', scheme)[0]
    print('[{}]对应的containerid是：{}'.format(v_loc, containerid))
    return containerid
get_containerid('东城区')

[东城区]对应的containerid是：100808346303016f6ab9bf392b922011a70464_-_lbs


'100808346303016f6ab9bf392b922011a70464_-_lbs'

## 功能函数

In [28]:
def trans_time(v_str):
    """转换GMT时间为标准格式"""
    GMT_FORMAT = '%a %b %d %H:%M:%S +0800 %Y'
    timeArray = datetime.datetime.strptime(v_str, GMT_FORMAT)
    ret_time = timeArray.strftime("%Y-%m-%d %H:%M:%S")
    return ret_time

In [29]:
def data_clean(text):
    # 删除表情符号
    text = re.sub(r'<img.*?>', '', text)
    # 删除URL链接
    text = re.sub(r'<a.*?>|</a>', '', text)
    # 删除地址信息
    text = re.sub(r'<span class=\'url-icon\'>.*?</span><span class="surl-text">(.*?)</span>', '', text)
    # 只保留中文信息
    text = re.sub(r'[^\u4e00-\u9fa5]+', '', text)
    return text

## 爬取微博内容

In [47]:
# 请求地址
url = 'https://m.weibo.cn/api/container/getIndex'
# 请求参数
v_keyword = '东城区'
result_text = []#文本
result_time = []#时间
comments_count = []#评论数量
attitudes_count = []#点赞数量
containerid = get_containerid(v_keyword)
for n in tqdm(range(2,10)):
    params = {
        "containerid": containerid,
        "luicode": "10000011",
        "lcardid": "frompoi",
        "extparam": "frompoi",
        "lfid": "100103type=1&q={}".format(v_keyword),
        "since_id": n}
    r = requests.get(url, headers=headers, params=params) 
    data = r.json()["data"]
    for i in data['cards'][0]['card_group']:
        text = i['mblog']['text']
        if "抱歉，此微博已被删除。查看帮助：" in text:
            continue
        result_text.append(data_clean(text))
        result_time.append(trans_time(i['mblog']['created_at']))
        comments_count.append(i['mblog']['comments_count'])
        attitudes_count.append(i['mblog']['attitudes_count'])

[东城区]对应的containerid是：100808346303016f6ab9bf392b922011a70464_-_lbs


100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:04<00:00,  1.70it/s]


In [48]:
result={"result_text":result_text,"result_time":result_time,"comments_count":comments_count,"attitudes_count":attitudes_count}
data=pd.DataFrame(result)
data.to_csv(str(v_keyword)+'.csv')
data

Unnamed: 0,result_text,result_time,comments_count,attitudes_count
0,喝了一周药渣才发现提示,2023-04-20 11:29:06,4,1
1,驻京办变浙里办的一天感谢黑客酿造从杭州带来的青团和笋干喝过的啤酒数量来到颇具纪念意义的款,2023-04-20 16:26:21,2,1
2,,2023-04-16 10:56:52,0,1
3,江南赋,2023-04-20 00:32:44,0,0
4,红墙蓝天紫藤花孔庙国子监,2023-04-19 18:51:55,0,3
...,...,...,...,...
122,去北京一定要看天气没蓝天的照片效果真的一般,2023-04-20 18:46:26,0,0
123,终于不再是黑白灰了,2023-04-19 09:15:07,1,3
124,,2023-04-19 23:53:49,0,7
125,浅浅的体验了一把北京红衣服的阿姨自从知道我和她顺路之后就一直带着我走怕我找不到给我找座位甚至...,2023-04-20 19:46:40,0,0


# 通过地理坐标进行数据爬取

In [32]:
def get_containerid_by_coordinate(longitude,latitude,zoom=15):
    longitude = '{:.5f}'.format(float(longitude))
    latitude =  '{:.5f}'.format(float(latitude))
    return "2306570043_{}_{}_{}".format(str(longitude),str(latitude),str(zoom))

In [40]:
longitude = '116.45858'
latitude = '39.88772'
zoom = '13'
url='https://m.weibo.cn/api/container/getIndex?'
params = {'containerid':get_containerid_by_coordinate(longitude,latitude),'extparam':'map__',}
r = requests.get(url, headers=headers, params=params)
data = r.json()['data']

In [43]:
result_text = []#文本
result_time = []#时间
comments_count = []#评论数量
attitudes_count = []#点赞数量
for i in data['cards'][1]['card_group']:
        text = i['mblog']['text']
        if "抱歉，此微博已被删除。查看帮助：" in text:
            continue
        result_text.append(data_clean(text))
        result_time.append(trans_time(i['mblog']['created_at']))
        comments_count.append(i['mblog']['comments_count'])
        attitudes_count.append(i['mblog']['attitudes_count'])

In [44]:
result={"result_text":result_text,"result_time":result_time,"comments_count":comments_count,"attitudes_count":attitudes_count}
data=pd.DataFrame(result)
data.to_csv(str(v_keyword)+'.csv')
data

Unnamed: 0,result_text,result_time,comments_count,attitudes_count
0,没有什么比家人在一起更快乐的啦,2023-04-19 23:13:20,0,4
1,我的青春回来啦,2023-04-20 14:28:06,1,1
2,高温预警,2023-04-19 21:09:06,0,1
3,世界读书日读名著真的有用么小姐见影本期想说真的有用阴阳差错在灌篮高手的内地公映首日夜场补看了...,2023-04-19 23:34:08,3,2
4,我的意思是那一刻我好幸福,2023-04-20 15:29:52,2,0
5,十号线劲松劲松二区三家合住精装修看房电话同,2023-04-18 14:39:11,0,0
6,国贸合生汇附近号线双井地铁站号线九龙山地铁站旁边百环家园东区超大正规次卧电话同北京朝阳区租房...,2023-04-18 14:58:17,24,25
7,,2023-04-18 16:45:09,0,3
8,无力,2023-04-19 11:09:18,2,0
9,分享图片,2023-04-19 18:39:36,1,3
