In [None]:
import scrapy
from scrapy.http import HtmlResponse
from instaparser.items import InstaparserItem
import re
import json
from urllib.parse import urlencode
from copy import deepcopy


class InstagramSpider(scrapy.Spider):
    """Log in to Instagram and collect followers/followings of target accounts.

    Request flow: ``parse`` (login POST) -> ``login`` (open every target
    profile) -> ``target_user_parse`` (first GraphQL page of both lists) ->
    ``users_parse`` (yield items, follow pagination).
    """

    name = 'instagram'
    allowed_domains = ['instagram.com']
    start_urls = ['http://instagram.com/']

    insta_login_link = 'https://www.instagram.com/accounts/login/ajax/'
    graphql_url = 'https://www.instagram.com/graphql/query/'
    # ``query_hash`` values sent to the GraphQL endpoint for each list type.
    followers_hash = 'c76146de99bb02f6415203be841dd25a'
    followings_hash = 'd04b0a864b4b54837c0d870b0e77e076'

    # Placeholder credentials; replace with a real login / password.
    insta_login = 'логин'
    insta_psw = 'пароль'
    # Accounts whose followers and followings will be scraped.
    target_accounts = ['ai_machine_learning', 'omgames_blog']

    def parse(self, response: HtmlResponse):
        """Extract the CSRF token from the start page and submit the login form."""
        csrf_token = self.fetch_csrf_token(response.text)
        yield scrapy.FormRequest(
            url=self.insta_login_link,
            method='POST',
            callback=self.login,
            formdata={'username': self.insta_login, 'enc_password': self.insta_psw},
            headers={'X-CSRFToken': csrf_token}
        )

    def login(self, response: HtmlResponse):
        """Open every target profile once the login response reports success.

        If the JSON body has no truthy ``authenticated`` value, an error is
        logged and no further requests are scheduled.
        """
        j_body = json.loads(response.text)
        if not j_body.get('authenticated'):
            self.logger.error('Instagram login failed; no accounts will be parsed')
            return

        for account in self.target_accounts:
            yield response.follow(
                f'/{account}',
                callback=self.target_user_parse,
                cb_kwargs={'target_username': account}
            )

    def target_user_parse(self, response: HtmlResponse, target_username):
        """Request the first page of followers and of followings for one profile.

        :param response: the profile page of ``target_username``
        :param target_username: account name the page belongs to
        """
        target_user_id = self.fetch_user_id(response.text, target_username)
        variables = {'id': target_user_id,
                     'first': 24
                     }

        query_hashes = (
            ('followers', self.followers_hash),
            ('followings', self.followings_hash),
        )
        for flag, query_hash in query_hashes:
            url = f'{self.graphql_url}?query_hash={query_hash}&{urlencode(variables)}'
            yield response.follow(
                url,
                callback=self.users_parse,
                # Each request chain gets its own copy because ``users_parse``
                # mutates ``variables`` when it adds the pagination cursor.
                cb_kwargs={'target_username': target_username, 'flag': flag, 'variables': deepcopy(variables)}
            )

    def users_parse(self, response: HtmlResponse, target_username, flag, variables):
        """Yield one item per user on this page and follow the next page, if any.

        :param response: JSON response of the GraphQL query
        :param target_username: account whose list is being scraped
        :param flag: ``'followers'`` or ``'followings'``
        :param variables: query variables used for the current page
        """
        j_data = json.loads(response.text)
        type_field = 'edge_followed_by' if flag == 'followers' else 'edge_follow'
        edge_data = j_data.get('data').get('user').get(type_field)
        page_info = edge_data.get('page_info')

        if page_info['has_next_page']:
            variables['after'] = page_info['end_cursor']

            # Keep only "<endpoint>?query_hash=<hash>" and re-append the
            # updated variables.
            base_url = response.url.split('&', 1)[0]
            url = f'{base_url}&{urlencode(variables)}'
            yield response.follow(
                url,
                callback=self.users_parse,
                cb_kwargs={'target_username': target_username, 'flag': flag, 'variables': deepcopy(variables)}
            )

        # Items are emitted for every page, including the last one (the one
        # without a next page), so no users are lost.
        for user in edge_data.get('edges'):
            node = user.get('node')
            yield InstaparserItem(
                _id=node.get('id'),
                user_name=node.get('username'),
                full_name=node.get('full_name'),
                photo=node.get('profile_pic_url'),
                insert_to_collection=f'{target_username}_{flag}'
            )

    def fetch_csrf_token(self, text):
        """Return the value of ``"csrf_token":"..."`` found in ``text``.

        Raises ``AttributeError`` when the token is not present.
        """
        return re.search(r'"csrf_token":"(\w+)"', text).group(1)

    def fetch_user_id(self, text, username):
        """Return the numeric id (as a string) paired with ``username`` in ``text``.

        The username is escaped so characters such as ``.`` match literally.
        Raises ``AttributeError`` when no matching fragment is present.
        """
        pattern = r'\{"id":"(\d+)","username":"%s"\}' % re.escape(username)
        return re.search(pattern, text).group(1)

In [None]:
import scrapy

class InstaparserItem(scrapy.Item):
    """One Instagram profile taken from a followers or followings list."""

    # Fields are populated by InstagramSpider.users_parse from each edge's node.
    _id = scrapy.Field()  # node 'id'
    user_name = scrapy.Field()  # node 'username'
    full_name = scrapy.Field()  # node 'full_name'
    photo = scrapy.Field()  # node 'profile_pic_url'; InstagramPhotosPipeline overwrites it
    insert_to_collection = scrapy.Field()  # '<target_username>_<flag>'; read and removed by DataBasePipeline

In [None]:
from scrapy import signals

from itemadapter import is_item, ItemAdapter


class InstaparserSpiderMiddleware:
    """Pass-through spider middleware that only logs when a spider opens."""

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and subscribe it to the ``spider_opened`` signal."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        """Accept every response without inspecting it; always returns None."""
        return None

    def process_spider_output(self, response, result, spider):
        """Re-yield every element of ``result`` unchanged."""
        yield from result

    def process_spider_exception(self, response, exception, spider):
        """Take no action on the exception; returns None."""
        return None

    def process_start_requests(self, start_requests, spider):
        """Re-yield every start request unchanged."""
        yield from start_requests

    def spider_opened(self, spider):
        """Write an info-level log line containing the spider's name."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)


class InstaparserDownloaderMiddleware:
    """Pass-through downloader middleware that only logs when a spider opens."""

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and subscribe it to the ``spider_opened`` signal."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        """Leave the request untouched; always returns None."""
        return None

    def process_response(self, request, response, spider):
        """Hand back the very same response object that was received."""
        return response

    def process_exception(self, request, exception, spider):
        """Take no action on the exception; returns None."""
        return None

    def spider_opened(self, spider):
        """Write an info-level log line containing the spider's name."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)

In [None]:
from scrapy.pipelines.images import ImagesPipeline
import scrapy
from pymongo import MongoClient


class InstagramPhotosPipeline(ImagesPipeline):
    """Download each item's profile picture and record the result on the item."""

    def get_media_requests(self, item, info):
        """Yield a download request for the item's ``photo`` URL, if it has one.

        An unset or empty ``photo`` produces no request. A URL that
        ``scrapy.Request`` rejects is logged through the spider's logger and
        skipped, so one bad URL does not stop the item.
        """
        photo_url = item.get('photo')
        if not photo_url:
            return

        try:
            request = scrapy.Request(photo_url, meta=item)
        except (TypeError, ValueError) as e:
            info.spider.logger.warning('Skipping photo %r: %s', photo_url, e)
            return
        yield request

    def item_completed(self, results, item, info):
        """Replace ``item['photo']`` with the first download result.

        ``results`` holds ``(success, payload)`` pairs; the payload of the
        first pair is stored on success, ``None`` on failure. With no results
        the item is returned untouched.
        """
        if results:
            success, payload = results[0]
            item['photo'] = payload if success else None
        return item


class DataBasePipeline:
    """Store scraped profiles in the ``instagram`` database of a local MongoDB."""

    def __init__(self):
        self.client = MongoClient('localhost', 27017)
        self.mongodb = self.client.instagram

    def close_spider(self, spider):
        """Close the MongoDB connection when the spider finishes."""
        self.client.close()

    def __del__(self):
        # Safety net only: finalizers are not guaranteed to run, so the real
        # cleanup lives in close_spider. The getattr guard covers the case
        # where __init__ failed before ``client`` was assigned.
        client = getattr(self, 'client', None)
        if client is not None:
            client.close()

    def process_item(self, item, spider):
        """Insert the item into its target collection unless its ``_id`` exists.

        The collection name is read from ``item['insert_to_collection']``;
        that field is removed from the item before storing and before the item
        is returned.
        """
        # Imported locally so the block does not depend on file-level imports.
        from pymongo.errors import DuplicateKeyError

        collection = item['insert_to_collection']
        del item['insert_to_collection']

        try:
            # Insert a plain dict copy so pymongo works on a plain mapping
            # instead of the Scrapy item object.
            self.mongodb[collection].insert_one(dict(item))
        except DuplicateKeyError:
            # A document with this _id is already stored; keep the existing one.
            pass

        return item

In [None]:
from pymongo import MongoClient

# Followers of the given user (profiles stored in the '<name>_followers' collection)
def get_followers(name: str):
    """Print every profile stored in the ``<name>_followers`` collection.

    :param name: account name used as the collection prefix
    """
    # The context manager closes the connection even if printing fails.
    with MongoClient('localhost', 27017) as client:
        profiles = client.instagram[name + '_followers']
        # A Collection is not iterable by itself; find() returns a cursor
        # over all of its documents.
        for profile in profiles.find():
            print(profile)

# Accounts the given user follows (profiles stored in the '<name>_followings' collection)
def get_followings(name: str):
    """Print every profile stored in the ``<name>_followings`` collection.

    :param name: account name used as the collection prefix
    """
    # The context manager closes the connection even if printing fails.
    with MongoClient('localhost', 27017) as client:
        profiles = client.instagram[name + '_followings']
        # A Collection is not iterable by itself; find() returns a cursor
        # over all of its documents.
        for profile in profiles.find():
            print(profile)

In [None]:
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings


if __name__ == '__main__':
    # Build the crawler settings, then run InstagramSpider until it finishes.
    crawler_settings = Settings()
    # No ``settings`` name is imported here, so the module is referenced by
    # its dotted path instead (setmodule accepts a module or a path string).
    # NOTE(review): assumes the project settings live in instaparser/settings.py
    # -- confirm against the package layout.
    crawler_settings.setmodule('instaparser.settings')
    process = CrawlerProcess(settings=crawler_settings)
    process.crawl(InstagramSpider)
    process.start()

In [None]:
# Scrapy project settings for the ``instaparser`` bot.
BOT_NAME = 'instaparser'

# Storage location setting read by the images pipeline (InstagramPhotosPipeline).
IMAGES_STORE = 'photo'

# Packages that contain the project's spiders.
SPIDER_MODULES = ['instaparser.spiders']
NEWSPIDER_MODULE = 'instaparser.spiders'


# Browser-like User-Agent string sent with requests.
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'

# robots.txt rules are not honoured.
ROBOTSTXT_OBEY = False

# Logging is on, at the most verbose level.
LOG_ENABLED = True
LOG_LEVEL = 'DEBUG'

# Upper bound on simultaneous requests.
CONCURRENT_REQUESTS = 16

# Delay between requests, in seconds.
DOWNLOAD_DELAY = 1.25

# Pipelines run in ascending order of their value: photos are downloaded (200)
# before the item is stored in the database (300).
ITEM_PIPELINES = {
    'instaparser.pipelines.InstagramPhotosPipeline': 200,
    'instaparser.pipelines.DataBasePipeline': 300,
}