In [1]:
import pandas as pd
import numpy as np

import requests
import json
import time
from tqdm import tqdm

# [Введение в анализ социальных сетей на примере VK API](https://habr.com/ru/post/263313/)

Risks:
- A friend's account may be private, in which case its friend list cannot be parsed.
- The same person may own several accounts.
- Some accounts may be fake.

# Data Parsing

In [2]:
# Load the parsing parameters; the context manager guarantees the file
# handle is closed (the bare open() call left it to the garbage collector).
with open('params.json') as params_file:
    params = json.load(params_file)

# Parameters for parsing
source_id = params['source_id']        # id of the account whose friends are crawled first
access_token = params['access_token']  # VK API token, kept out of the notebook itself
version = '5.131'                      # VK API version sent with every request

In [3]:
def get_friends_of_source_id(source_id, access_token=access_token, version=version, collect_data=True):
    """Fetch the friends of one VK user through the ``friends.get`` method.

    Parameters
    ----------
    source_id
        Id of the user whose friend list is requested.
    access_token
        VK API access token (defaults to the notebook-level value).
    version
        VK API version string (defaults to the notebook-level value).
    collect_data : bool
        When False the request is still made, but the payload is not parsed
        and an empty frame with the expected columns is returned.

    Returns
    -------
    pandas.DataFrame
        One row per friend with the columns ``source_id, id, first_name,
        last_name, sex, bdate, country, city, mobile_phone``. Columns the API
        did not return are present and filled with NaN.
    dict
        ``{source_id: 'profile is private'}`` when the payload contains an
        ``error`` key, or ``{source_id: 'no response'}`` when the request
        fails, the HTTP status is not OK, or the body is not valid JSON.
    """
    # Getting data
    fields = [
        'nickname', 'domain', 'sex', 'bdate', 'city', 'country', 'timezone',
        'photo_50', 'photo_100', 'photo_200_orig', 'has_mobile', 'contacts',
        'education', 'online', 'relation', 'last_seen', 'status',
        'can_write_private_message', 'can_see_all_posts', 'can_post',
        'universities',
    ]
    # '%2C%20' is the URL-encoded ", " separator, so the resulting URL is
    # exactly the one the hand-written string produced.
    url = (
        'https://api.vk.com/method/friends.get'
        f'?user_id={source_id}&access_token={access_token}'
        f"&fields={'%2C%20'.join(fields)}&v={version}"
    )
    try:
        # The context manager closes the session even if the request raises;
        # the timeout keeps one stalled connection from hanging the crawl.
        with requests.Session() as session:
            response = session.get(url=url, timeout=30)
    except requests.RequestException:
        return {source_id: 'no response'}

    # Checking response (a Response object is falsy for 4xx/5xx statuses)
    if not response:
        return {source_id: 'no response'}
    try:
        data = json.loads(response.text)
    except ValueError:
        # Body was not JSON, so there is nothing usable to parse.
        return {source_id: 'no response'}
    if 'error' in data:
        # NOTE(review): every API error is reported as a private profile;
        # other causes (rate limit, deleted account) may hide behind this
        # label - confirm against the VK error codes before relying on it.
        return {source_id: 'profile is private'}

    # Collecting data
    columns = ['source_id', 'id', 'first_name', 'last_name', 'sex', 'bdate', 'country', 'city', 'mobile_phone']
    if not collect_data:
        return pd.DataFrame(columns=columns)

    # reindex (instead of df[columns]) tolerates columns that are absent from
    # every returned item as well as an empty friend list.
    df = pd.DataFrame(data['response']['items']).reindex(columns=columns)
    df['source_id'] = source_id

    ## Cleaning data
    ### Column "sex"
    df['sex'] = df['sex'].map({1: 'F', 2: 'M'})

    ### Columns "country" & "city"
    def clean(x):
        """Return the ``title`` of a country/city dict, or ``x`` unchanged."""
        if isinstance(x, dict) and 'title' in x:
            return x['title']
        return x

    df['country'] = df['country'].apply(clean)
    df['city'] = df['city'].apply(clean)

    return df

In [4]:
# Creating a list of source ids: my own friends are the accounts to crawl next
df_my_friends = get_friends_of_source_id(source_id=source_id)
if not isinstance(df_my_friends, pd.DataFrame):
    # The helper returns a dict on failure; stop with a readable message
    # instead of an AttributeError on `.id` below.
    raise RuntimeError(f'could not load friends of source_id={source_id}: {df_my_friends}')
time.sleep(1)
list_source_ids = df_my_friends['id'].to_list()

# Parsing
# Frames are collected in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0 and re-copied the whole frame on every iteration.
friend_frames = [df_my_friends]
problem_ids = []
# The loop variable is deliberately NOT called `source_id`, so the
# notebook-level `source_id` still points at the root account after this cell.
for friend_id in tqdm(list_source_ids):
    response = get_friends_of_source_id(source_id=friend_id)
    if isinstance(response, pd.DataFrame):
        friend_frames.append(response)
    else:
        problem_ids.append(response)
    time.sleep(1)  # pause between requests regardless of the outcome

# Creating DataFrame of people (my friends followed by friends of friends)
df_friends = pd.concat(friend_frames)
good_request = len(friend_frames) - 1  # the first frame is my own friend list
bad_request = len(problem_ids)

# Showing statistics
print('good_request counts:', good_request)
print('bad_request counts:', bad_request)

# Saving results
df_friends.to_csv('vk.csv', index=False)

100%|████████████████████████████████████████████████████████████████████████████████| 173/173 [04:17<00:00,  1.49s/it]

good_request counts: 113
bad_request counts: 60





In [5]:
problem_ids

[{40945093: 'profile is private'},
 {55454130: 'profile is private'},
 {113358110: 'profile is private'},
 {135378262: 'profile is private'},
 {145848546: 'profile is private'},
 {150078667: 'profile is private'},
 {153665352: 'profile is private'},
 {154826490: 'profile is private'},
 {158862836: 'profile is private'},
 {166845403: 'profile is private'},
 {167199106: 'profile is private'},
 {168811919: 'profile is private'},
 {169987301: 'profile is private'},
 {174779041: 'profile is private'},
 {175378845: 'profile is private'},
 {176496919: 'profile is private'},
 {178566794: 'profile is private'},
 {180193399: 'profile is private'},
 {180203136: 'profile is private'},
 {181202978: 'profile is private'},
 {183780294: 'profile is private'},
 {183898980: 'profile is private'},
 {185887345: 'profile is private'},
 {186969528: 'profile is private'},
 {189515792: 'profile is private'},
 {194223839: 'profile is private'},
 {196771639: 'profile is private'},
 {200528828: 'profile is priva