# 方法：用美麗湯直接抓取所有用戶訊息

In [1]:
from random import choice
import json
 
import requests
from bs4 import BeautifulSoup
 
# Default pool of User-Agent strings. InstagramScraper draws a random entry
# from this list whenever it was not given a non-empty list of its own.
_user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
]
 
 
class InstagramScraper:
    """Scrape public Instagram profile pages without using the official API.

    The profile page is fetched over HTTP and the JSON object assigned to
    ``window._sharedData`` inside one of its ``<script>`` tags is decoded.
    The user record is then read from
    ``entry_data -> ProfilePage[0] -> graphql -> user``.
    """

    # JavaScript assignment that precedes the embedded JSON payload.
    _SHARED_DATA_PREFIX = 'window._sharedData ='

    def __init__(self, user_agents=None, proxy=None, timeout=30):
        """Store the request settings.

        Args:
            user_agents: Optional list of User-Agent strings to pick from at
                random. Anything that is not a non-empty list is ignored and
                the module-level ``_user_agents`` pool is used instead.
            proxy: Optional proxy URL, applied to both http and https.
            timeout: Timeout in seconds handed to ``requests.get`` so that a
                stalled connection cannot block forever. ``None`` disables it.
        """
        self.user_agents = user_agents
        self.proxy = proxy
        self.timeout = timeout

    def __random_agent(self):
        """Return a randomly chosen User-Agent string."""
        if self.user_agents and isinstance(self.user_agents, list):
            return choice(self.user_agents)
        return choice(_user_agents)

    def __request_url(self, url):
        """Fetch *url* and return the response body as text.

        Raises:
            requests.HTTPError: ``raise_for_status()`` rejected the response.
                The original error is chained as ``__cause__``.
            requests.RequestException: Any other transport failure; the
                original exception propagates unchanged so its concrete type
                and message are preserved.
        """
        try:
            response = requests.get(
                url,
                headers={'User-Agent': self.__random_agent()},
                proxies={'http': self.proxy, 'https': self.proxy},
                timeout=self.timeout,
            )
            response.raise_for_status()
        except requests.HTTPError as err:
            raise requests.HTTPError('Received non 200 status code from Instagram') from err
        return response.text

    @staticmethod
    def _parse_shared_data(script_text):
        """Decode the JSON payload contained in a script tag's text.

        Only the leading ``window._sharedData =`` assignment and the trailing
        statement terminator are removed. Semicolons inside the JSON itself
        (captions, URLs, ...) are left intact.

        Raises:
            ValueError: The remaining text is not valid JSON
                (``json.JSONDecodeError`` is a ``ValueError`` subclass).
        """
        raw = script_text.strip()
        prefix = InstagramScraper._SHARED_DATA_PREFIX
        if raw.startswith(prefix):
            raw = raw[len(prefix):]
        raw = raw.strip().rstrip(';').rstrip()
        return json.loads(raw)

    @staticmethod
    def extract_json_data(html):
        """Return the decoded ``window._sharedData`` object found in *html*.

        The script tag whose text starts with the ``window._sharedData =``
        assignment is preferred. If no tag carries that marker, the first
        ``<script>`` inside ``<body>`` is used instead.

        Raises:
            ValueError: No usable script tag exists, or its content is not
                valid JSON.
        """
        soup = BeautifulSoup(html, 'html.parser')
        prefix = InstagramScraper._SHARED_DATA_PREFIX
        for script_tag in soup.find_all('script'):
            if script_tag.text.strip().startswith(prefix):
                return InstagramScraper._parse_shared_data(script_tag.text)

        # No marker found: fall back to the first script tag inside <body>.
        body = soup.find('body')
        script_tag = body.find('script') if body is not None else None
        if script_tag is None:
            raise ValueError('No <script> tag containing window._sharedData found')
        return InstagramScraper._parse_shared_data(script_tag.text)

    @staticmethod
    def _flatten_metrics(user):
        """Reduce a raw user record to a flat ``{field: value}`` dict.

        ``edge_owner_to_timeline_media`` and every falsy value are skipped.
        A dict value that has a ``'count'`` key is replaced by that count;
        dict values without one are kept as they are.
        """
        results = {}
        for key, value in user.items():
            if key == 'edge_owner_to_timeline_media' or not value:
                continue
            if isinstance(value, dict) and 'count' in value:
                value = value['count']
            results[key] = value
        return results

    def _profile_user(self, profile_url):
        """Download *profile_url* and return the raw user record of the page."""
        html = self.__request_url(profile_url)
        json_data = self.extract_json_data(html)
        return json_data['entry_data']['ProfilePage'][0]['graphql']['user']

    def profile_page_metrics(self, profile_url):
        """Return the profile-level fields of the account at *profile_url*.

        Returns:
            dict: The user record with ``{'count': n}`` style entries
            collapsed to ``n``, falsy values dropped, and the post list
            (``edge_owner_to_timeline_media``) excluded.

        Raises:
            requests.RequestException: The page could not be fetched.
            ValueError, KeyError, IndexError: The page does not contain the
                expected JSON structure.
        """
        return self._flatten_metrics(self._profile_user(profile_url))

    def profile_page_recent_posts(self, profile_url):
        """Return the post nodes embedded in the profile page at *profile_url*.

        Returns:
            list: The ``node`` dict of every edge under
            ``edge_owner_to_timeline_media``. Edges whose node is missing,
            empty or not a dict are skipped.

        Raises:
            requests.RequestException: The page could not be fetched.
            ValueError, KeyError, IndexError: The page does not contain the
                expected JSON structure.
        """
        user = self._profile_user(profile_url)
        edges = user['edge_owner_to_timeline_media']["edges"]
        results = []
        for edge in edges:
            node = edge.get('node')
            if node and isinstance(node, dict):
                results.append(node)
        return results

In [4]:
from pprint import pprint
 
# Demo: fetch the post nodes embedded in one public profile page and
# pretty-print them. This performs a live HTTP request.
k = InstagramScraper()
results = k.profile_page_recent_posts('https://www.instagram.com/yolayyc/?hl=en') #hl=en/zh makes no difference.
pprint(results)

[{'__typename': 'GraphImage',
  'accessibility_caption': 'Image may contain: 1 person, tree, sky and outdoor',
  'comments_disabled': False,
  'dimensions': {'height': 809, 'width': 1080},
  'display_url': 'https://instagram.ftpe8-2.fna.fbcdn.net/v/t51.2885-15/e35/s1080x1080/90332869_266937764299464_2341524270997098682_n.jpg?_nc_ht=instagram.ftpe8-2.fna.fbcdn.net&_nc_cat=100&_nc_ohc=Id8EGwmTTxMAX8oF9qg&oh=ced8cbeda41bca550e955fa9c4da8084&oe=5EA85C26',
  'edge_liked_by': {'count': 66},
  'edge_media_preview_like': {'count': 66},
  'edge_media_to_caption': {'edges': [{'node': {'text': '-\n'
                                                        '過不去的、過得去的，都會過去的。\n'
                                                        '\n'
                                                        '希望這個多事的2020明朗化，一切雨過天晴。🌸'}}]},
  'edge_media_to_comment': {'count': 3},
  'fact_check_information': None,
  'fact_check_overall_rating': None,
  'gating_info': None,
  'id': '2271180991376429248',
  'is_video':

In [None]:
# Goal: 我的論文資料目標為抓取IG公開帳戶的貼文文案以及圖片，做符號學分類。

# Murmur: 因為今年三月二十之前（沒錯，就是幾天前），IG API被關閉了（IG API可以抓取貼文文案），改成Graph API（只能抓到貼文圖片）。
# 因此，原先我打算硬來，用selenium模擬器抓文案，之後順便可以做instaBOT（有時間的話）。
# 但是使用selenium一直不成功，最後才用python美麗湯試成功了！資料形式也算漂亮整齊。

# 驚喜之處在於，ig竟自動加上一條metadata——accessibility_caption：針對圖片的自動辨識；
# 如第一張圖的caption是"Image may contain: 1 person, tree, sky and outdoor"，也許可以為之後做圖文integration的研究者鋪路。
# 歡迎有興趣者，直接run這個code，很簡單喔:p