In [1]:
%reset -f
import networkx as nx
import sqlite3
import os
import warnings
import json
import pandas as pd
import tweepy

# Suppress FutureWarning messages so they do not clutter cell output.
warnings.simplefilter(action='ignore', category=FutureWarning)


In [2]:
# Load Twitter credentials from a local JSON file.
# Keys read below: bearer_token, api_key, api_key_secret, access_token,
# access_token_secret.
# A context manager closes the file handle as soon as the keys are parsed.
with open('twitter-api-keys.local.json') as keys_file:
    api_keys = json.load(keys_file)
bearer_token = api_keys["bearer_token"]

# Bearer-token client; waits instead of failing when a rate limit is hit.
client = tweepy.Client(bearer_token=bearer_token, wait_on_rate_limit=True)

# User-context OAuth handler and API object used by the rest of the notebook
# (get_user, get_follower_ids, get_friend_ids, lookup_users).
auth = tweepy.OAuthHandler(
    consumer_key=api_keys['api_key'],
    consumer_secret=api_keys['api_key_secret'],
    access_token=api_keys['access_token'],
    access_token_secret=api_keys['access_token_secret'])
api = tweepy.API(auth, wait_on_rate_limit=True)


In [3]:
# Open (or create) the SQLite database file and make sure the users table exists.
conn = sqlite3.connect('twitter.db')
c = conn.cursor()
# followers / friends / mutuals hold JSON text written by get_followers,
# get_friends and get_mutuals.
# The keyword must be spelled DEFAULT: with the previous misspelling SQLite
# folded the stray tokens into the column type name and no default was applied.
# NOTE: IF NOT EXISTS leaves an already-created table untouched, so a database
# created with the old statement keeps its old column definitions.
c.execute('''CREATE TABLE IF NOT EXISTS users
                (id INTEGER PRIMARY KEY NOT NULL,
                screen_name TEXT NOT NULL,
                followers_count INTEGER,
                following_count INTEGER,
                verified INTEGER,
                followers TEXT DEFAULT '',
                friends TEXT DEFAULT '',
                mutuals TEXT DEFAULT '')''')


<sqlite3.Cursor at 0x1f08913ddc0>

In [22]:
# # drop the paths table (uncomment to reset saved paths)
# c.execute('''DROP TABLE IF EXISTS paths''')
# conn.commit()

In [5]:
# Account the path search starts from; its profile is stored in the users
# table so later lookups by screen name can find it.
me = 'fcx_xm'
me_info = api.get_user(screen_name=me)
c.execute(
    '''INSERT OR IGNORE INTO users
                (id, screen_name, followers_count, following_count, verified)
                VALUES (?, ?, ?, ?, ?)''',
    # Values are read from the profile in the same order as the column list.
    tuple(
        getattr(me_info, field)
        for field in ('id', 'screen_name', 'followers_count', 'friends_count', 'verified')
    ),
)
conn.commit()

def fetch(screen_name, type):
    """Return one cached relationship list for a user from the users table.

    Args:
        screen_name: screen name used to look the user up.
        type: which column to read: 'followers', 'friends' or 'mutuals'.

    Returns:
        The decoded JSON value stored in that column, or None when the column
        is NULL or an empty string (nothing cached yet).

    Raises:
        ValueError: if ``type`` is not one of the three supported names.
        LookupError: if the users table has no row for ``screen_name``.
    """
    # Column names cannot be bound as SQL parameters, so the name is only
    # interpolated after being checked against this fixed whitelist.
    if type not in ('followers', 'friends', 'mutuals'):
        raise ValueError(
            "type must be 'followers', 'friends' or 'mutuals', got {!r}".format(type))

    row = c.execute(
        'SELECT {} FROM users WHERE screen_name = ?'.format(type),
        (screen_name,)).fetchone()
    if row is None:
        raise LookupError('No user {!r} in the users table'.format(screen_name))

    stored = row[0]
    return json.loads(stored) if stored else None


In [17]:
def save_error(method, username, error):
    """Append a failure record to the errors table, creating the table if needed.

    Args:
        method: label of the operation that failed.
        username: screen name that was being processed.
        error: error text to store; text containing 'KeyboardInterrupt' is
            not recorded.
    """
    was_interrupted = 'KeyboardInterrupt' in error
    if not was_interrupted:
        c.execute('''CREATE TABLE IF NOT EXISTS errors
                (id INTEGER PRIMARY KEY NOT NULL,
                method TEXT NOT NULL,
                username TEXT NOT NULL,
                error TEXT NOT NULL,
                timestamp TEXT NOT NULL,
                fixed INTEGER DEFAULT 0)''')
        recorded_at = str(pd.Timestamp.now())
        record = (method, username, error, recorded_at)
        c.execute('''INSERT INTO errors
                (method, username, error, timestamp)
                VALUES (?, ?, ?, ?)''',
                  record)
        conn.commit()


def get_followers(screen_name, override):
    """Download a user's followers and cache them in the users table.

    Follower ids are paged through ``api.get_follower_ids`` and resolved to
    profiles in chunks of 100 with ``api.lookup_users``. Every resolved
    profile is inserted into users (existing rows are kept), and an
    ``{id: screen_name}`` mapping is stored as JSON in the ``followers``
    column of ``screen_name``'s row.

    Args:
        screen_name: account whose followers are fetched.
        override: when false and a follower list is already cached, nothing
            is downloaded.

    A chunk whose lookup fails is logged through ``save_error`` and skipped,
    so the stored list can be incomplete after an API error.
    """
    # Local import: traceback is not part of the notebook's import cell.
    import traceback

    print('Getting followers')

    if fetch(screen_name, 'followers') is not None and not override:
        print('- Already have followers for ' + screen_name)
        return

    ids = list(tweepy.Cursor(
        api.get_follower_ids, screen_name=screen_name, count=5000).items())

    info = []
    for start in range(0, len(ids), 100):
        chunk = ids[start:start + 100]
        print('- Getting followers info for chunk {} to {} of {} for {}'.format(
            start, start + len(chunk), len(ids), screen_name))
        try:
            info.extend(api.lookup_users(user_id=chunk))
        except Exception:
            # Best effort: record the failure and carry on with the next chunk.
            # KeyboardInterrupt is deliberately not caught so the download can
            # be aborted from the keyboard.
            traceback.print_exc()
            print('Something went wrong, skipping...')
            save_error('get_followers', screen_name, traceback.format_exc())

    followers_list = {}
    for user in info:
        followers_list[user.id] = user.screen_name
        c.execute('''INSERT OR IGNORE INTO users
                        (id, screen_name, followers_count, following_count, verified)
                        VALUES (?, ?, ?, ?, ?)''',
                  (user.id, user.screen_name, user.followers_count,
                   user.friends_count, user.verified))
    c.execute('''UPDATE users SET followers = ? WHERE screen_name = ?''',
              (json.dumps(followers_list), screen_name))
    conn.commit()

def get_friends(screen_name, override):
    """Download the accounts a user follows and cache them in the users table.

    Friend ids are paged through ``api.get_friend_ids`` and resolved to
    profiles in chunks of 100 with ``api.lookup_users``. Every resolved
    profile is inserted into users (existing rows are kept), and an
    ``{id: screen_name}`` mapping is stored as JSON in the ``friends``
    column of ``screen_name``'s row.

    Args:
        screen_name: account whose friends are fetched.
        override: when false and a friends list is already cached, nothing
            is downloaded.

    A chunk whose lookup fails is logged through ``save_error`` and skipped,
    so the stored list can be incomplete after an API error.
    """
    # Local import: traceback is not part of the notebook's import cell.
    import traceback

    print('Getting friends')

    if fetch(screen_name, 'friends') is not None and not override:
        print('- Already have friends for ' + screen_name)
        return

    ids = list(tweepy.Cursor(
        api.get_friend_ids, screen_name=screen_name, count=5000).items())

    info = []
    for start in range(0, len(ids), 100):
        chunk = ids[start:start + 100]
        print('- Getting friends info for chunk {} to {} of {} for {}'.format(
            start, start + len(chunk), len(ids), screen_name))
        try:
            info.extend(api.lookup_users(user_id=chunk))
        except Exception:
            # Best effort: record the failure and carry on with the next chunk.
            # KeyboardInterrupt is deliberately not caught so the download can
            # be aborted from the keyboard.
            traceback.print_exc()
            print('Something went wrong, skipping...')
            save_error('get_friends', screen_name, traceback.format_exc())

    friends_list = {}
    for user in info:
        friends_list[user.id] = user.screen_name
        c.execute('''INSERT OR IGNORE INTO users
                        (id, screen_name, followers_count, following_count, verified)
                        VALUES (?, ?, ?, ?, ?)''',
                  (user.id, user.screen_name, user.followers_count,
                   user.friends_count, user.verified))

    c.execute('''UPDATE users SET friends = ? WHERE screen_name = ?''',
              (json.dumps(friends_list), screen_name))
    conn.commit()

def get_mutuals(screen_name, override):
    """Compute and store the accounts that both follow and are followed by a user.

    Refreshes the cached follower and friend lists (passing ``override``
    through to both), intersects them by id, and writes the resulting
    ``{id: screen_name}`` mapping as JSON into the ``mutuals`` column.
    """
    get_followers(screen_name, override)
    followers = fetch(screen_name, 'followers')
    get_friends(screen_name, override)
    friends = fetch(screen_name, 'friends')

    # Ids present on both sides; the screen name is taken from the friends list.
    shared_ids = followers.keys() & friends.keys()
    mutuals = dict((user_id, friends[user_id]) for user_id in shared_ids)

    serialized = json.dumps(mutuals)
    c.execute('''UPDATE users SET mutuals = ? WHERE screen_name = ?''',
              (serialized, screen_name))
    conn.commit()

In [7]:
def build_mutuals_df(screen_name):
    """Return the users rows of every mutual stored for ``screen_name``.

    Reads the JSON mapping in the ``mutuals`` column, then selects all users
    whose id is one of its keys.

    Returns:
        A pandas DataFrame with the columns of the users table, one row per
        mutual that exists in the table.

    Raises:
        ValueError: if a stored mutual id is not an integer.
    """
    stored = c.execute(
        '''SELECT mutuals FROM users WHERE screen_name = ?''',
        (screen_name,)).fetchone()[0]
    mutuals_json = json.loads(stored)
    # The ids are spliced into the SQL text, so each one is coerced to int
    # first; anything that is not a plain integer is rejected instead of
    # becoming part of the query.
    id_list = ','.join(str(int(user_id)) for user_id in mutuals_json.keys())
    return pd.read_sql_query(
        '''SELECT * FROM users WHERE id IN ({})'''.format(id_list), conn)


In [8]:
def get_filtered_users(mutuals_df, rules, exclude):
    """Return the rows of ``mutuals_df`` that satisfy every rule in ``rules``.

    Args:
        mutuals_df: DataFrame with at least ``screen_name``, ``verified`` and
            ``followers_count`` columns.
        rules: dict of optional constraints. ``'verified'`` requires the
            ``verified`` column to equal the given value; ``'min_followers'``
            requires ``followers_count`` to be at least the given value.
            A rule that is absent imposes no constraint.
        exclude: collection of screen names that must never be returned.

    Returns:
        The matching rows, in their original order and with their original
        index.
    """
    keep = ~mutuals_df['screen_name'].isin(list(exclude))
    if 'verified' in rules:
        keep &= mutuals_df['verified'] == rules['verified']
    if 'min_followers' in rules:
        keep &= mutuals_df['followers_count'] >= rules['min_followers']
    return mutuals_df.loc[keep]

In [21]:
def save_path(path, rules):
    """Store a path and the rules it was built with in the paths table.

    The table is created if it does not exist yet. ``path`` and ``rules`` are
    stored as JSON text; ``src`` and ``dst`` are the first and last entries of
    ``path`` and ``length`` is the number of entries in it.
    """
    create_sql = '''CREATE TABLE IF NOT EXISTS paths (
        id INTEGER PRIMARY KEY,
        path TEXT,
        rules TEXT,
        src TEXT,
        dst TEXT,
        length INTEGER
        )'''
    insert_sql = '''INSERT OR IGNORE INTO paths
                    (path, rules, src, dst, length)
                    VALUES (?, ?, ?, ?, ?)'''

    c.execute(create_sql)
    record = (
        json.dumps(path),
        json.dumps(rules),
        path[0],
        path[-1],
        len(path),
    )
    c.execute(insert_sql, record)
    conn.commit()

In [24]:
def shortest_path_to_celeb(exclude, override, max_depth=3, rules=None):
    """Greedily walk mutual connections from ``me`` towards a matching account.

    At each step the current user's mutuals are refreshed and filtered with
    ``rules``. If any mutual matches, the one with the most followers ends
    the path. Otherwise the walk moves to the not-yet-visited mutual with the
    most followers. The walk also stops when no unvisited mutual is left.

    Args:
        exclude: screen names that may never be added to the path. The
            collection is not modified.
        override: passed to ``get_mutuals``; when true, cached follower and
            friend lists are downloaded again.
        max_depth: the loop runs while ``len(path) <= max_depth``, so the
            path holds at most ``max_depth + 1`` screen names.
        rules: constraints handed to ``get_filtered_users``; defaults to
            verified accounts with at least 1e5 followers.

    Returns:
        The list of screen names visited, starting with ``me``. The path is
        saved with ``save_path`` whether or not a matching account was found.
    """
    if rules is None:
        rules = {'verified': True, 'min_followers': 1e5}

    print('Getting shortest path to celeb')
    found = False
    i = 0
    # Starting with me
    current_user = me
    path = [current_user]
    while len(path) <= max_depth and not found:
        i += 1
        print('----------------- Iteration ' + str(i) + ' -----------------')
        print('Current user: {}'.format(current_user))
        get_mutuals(current_user, override)
        mutuals_df = build_mutuals_df(current_user)

        # Users already on the path are skipped as well, so the walk cannot
        # bounce back to an account it has already visited.
        skip = list(exclude) + path
        mutuals_matching = get_filtered_users(mutuals_df, rules, skip)

        # TODO: add backtracking when max_depth is reached and no match is found
        if len(mutuals_matching) == 0:
            print('No matching users found')
            candidates = mutuals_df.loc[~mutuals_df['screen_name'].isin(skip)]
            if candidates.empty:
                # Dead end: there is nobody left to move to.
                print('No unvisited mutuals left for {}, stopping'.format(current_user))
                break
            current_user = candidates.sort_values(
                'followers_count', ascending=False).iloc[0]['screen_name']
            path.append(current_user)
        else:
            most_followers = mutuals_matching.sort_values(
                'followers_count', ascending=False).iloc[0]
            path.append(most_followers['screen_name'])

            print('Matching user with most followers: {}, with {} followers'.format(
                most_followers['screen_name'], most_followers['followers_count']))

            found = True

        print('Built path: {}'.format(path))

    save_path(path, rules)
    return path

# Run one search from `me`, never stepping onto vodafone_es; override=True is
# passed down to get_mutuals so cached follower/friend lists are downloaded again.
shortest_path_to_celeb(['vodafone_es'], True)

Getting shortest path to celeb
----------------- Iteration 1 -----------------
Current user: fcx_xm
Getting followers
- Getting friends info for chunk 0 to 100 of 115 for fcx_xm
- Getting friends info for chunk 100 to 200 of 115 for fcx_xm
Getting friends
- Getting friends info for chunk 0 to 100 of 169 for fcx_xm
- Getting friends info for chunk 100 to 200 of 169 for fcx_xm
No matching users found
Built path: ['fcx_xm', 'Meescapo']
----------------- Iteration 2 -----------------
Current user: Meescapo
Getting followers
- Getting friends info for chunk 0 to 100 of 13103 for Meescapo
- Getting friends info for chunk 100 to 200 of 13103 for Meescapo
- Getting friends info for chunk 200 to 300 of 13103 for Meescapo
- Getting friends info for chunk 300 to 400 of 13103 for Meescapo
- Getting friends info for chunk 400 to 500 of 13103 for Meescapo
- Getting friends info for chunk 500 to 600 of 13103 for Meescapo
- Getting friends info for chunk 600 to 700 of 13103 for Meescapo
- Getting frie

['fcx_xm', 'Meescapo', 'Loulogio_Pi']

In [26]:
# Collect up to max_paths paths starting at `me`. Users that already appear on
# a saved path are excluded from later searches so each new path has to take
# a different route.
max_paths = 3
exclude_users = ['vodafone_es']
paths_from_user = c.execute('''SELECT * FROM paths WHERE src = ?''', (me,)).fetchall()
# get all elements from all paths from paths_from_user
# (column 1 is the JSON-encoded path; its first entry is `me`, which must stay usable)
for saved_row in paths_from_user:
    for name in json.loads(saved_row[1])[1:]:
        if name not in exclude_users:
            exclude_users.append(name)
n_paths = len(paths_from_user)

while n_paths < max_paths:
    print('----------------- Path # ' + str(n_paths) + ' -----------------')
    new_path = shortest_path_to_celeb(exclude_users, False)
    exclude_users += [name for name in new_path[1:] if name not in exclude_users]
    # Count every attempt so the loop always terminates.
    n_paths += 1



1
