In [53]:
import numpy as np, pandas as pd
import sys
import scipy.stats as stats

## Finding Key Connectors

In [54]:
users = [
    { "id": 0, "name": "Hero" },
    { "id": 1, "name": "Dunn" },
    { "id": 2, "name": "Sue" },
    { "id": 3, "name": "Chi" },
    { "id": 4, "name": "Thor" },
    { "id": 5, "name": "Clive" },
    { "id": 6, "name": "Hicks" },
    { "id": 7, "name": "Devin" },
    { "id": 8, "name": "Kate" },
    { "id": 9, "name": "Klein" },
    { "id": 10, "name": "Jen" }
]

In [55]:
friendships = [(0,1), (0,2), (1,2), (1,3), (2,3), (3,4), (4,5), (5,6), (5,7), (6,8), (7,8), (8,9)]

In [56]:
for user in users:
    user['friends'] = []

In [57]:
from pprint import pprint

In [58]:
pprint (users)

[{'friends': [], 'id': 0, 'name': 'Hero'},
 {'friends': [], 'id': 1, 'name': 'Dunn'},
 {'friends': [], 'id': 2, 'name': 'Sue'},
 {'friends': [], 'id': 3, 'name': 'Chi'},
 {'friends': [], 'id': 4, 'name': 'Thor'},
 {'friends': [], 'id': 5, 'name': 'Clive'},
 {'friends': [], 'id': 6, 'name': 'Hicks'},
 {'friends': [], 'id': 7, 'name': 'Devin'},
 {'friends': [], 'id': 8, 'name': 'Kate'},
 {'friends': [], 'id': 9, 'name': 'Klein'},
 {'friends': [], 'id': 10, 'name': 'Jen'}]


In [59]:
# populate the friendships data

for i, j in friendships:
    users[i]['friends'].append(users[j])
    users[j]['friends'].append(users[i])
    
# do it for users[i] AND for users[j] 

#### What is the Average Number of Connections per user

In [60]:
def number_of_friends (user):
    '''
    user is a single user in the users dictionary
    '''
    return len(user['friends'])

In [61]:
friend_count = []
for user in users:
    count = number_of_friends(user)
    friend_count.append(count)

In [62]:
np.mean(friend_count)//1

2.0

In [63]:
users

[{'id': 0,
  'name': 'Hero',
  'friends': [{'id': 1,
    'name': 'Dunn',
    'friends': [{...},
     {'id': 2,
      'name': 'Sue',
      'friends': [{...},
       {...},
       {'id': 3,
        'name': 'Chi',
        'friends': [{...},
         {...},
         {'id': 4,
          'name': 'Thor',
          'friends': [{...},
           {'id': 5,
            'name': 'Clive',
            'friends': [{...},
             {'id': 6,
              'name': 'Hicks',
              'friends': [{...},
               {'id': 8,
                'name': 'Kate',
                'friends': [{...},
                 {'id': 7, 'name': 'Devin', 'friends': [{...}, {...}]},
                 {'id': 9, 'name': 'Klein', 'friends': [{...}]}]}]},
             {'id': 7,
              'name': 'Devin',
              'friends': [{...},
               {'id': 8,
                'name': 'Kate',
                'friends': [{'id': 6,
                  'name': 'Hicks',
                  'friends': [{...}, {...}]},
        

#### Find the users with the most connecions

In [11]:
np.max(friend_count)

3

In [12]:
friend_count_per_user= []

for user in users:
    count = number_of_friends(user)
    friend_count_per_user.append((user['id'], count))

In [13]:
friend_count_per_user

[(0, 2),
 (1, 3),
 (2, 3),
 (3, 3),
 (4, 2),
 (5, 3),
 (6, 2),
 (7, 2),
 (8, 3),
 (9, 1),
 (10, 0)]

In [14]:
sorted(friend_count_per_user, key=lambda x: x[1], reverse=True)

[(1, 3),
 (2, 3),
 (3, 3),
 (5, 3),
 (8, 3),
 (0, 2),
 (4, 2),
 (6, 2),
 (7, 2),
 (9, 1),
 (10, 0)]

### Data Scientists You May Know
Build a suggester by looking at second degree connections

In [15]:
def find_friends_of_friends(user):
    return [foaf['id'] for friend in user['friends'] for foaf in friend['friends']]

In [16]:
find_friends_of_friends(users[0])
# this prints the list of friend id's that are 2 degrees removed for each friend of the user
# notice there are duplicate friends listed here

[0, 2, 3, 0, 1, 3]

In [17]:
# if I want to remove duplicates, I can then add set in front of the list comprehension.
def foaf_set(user):
    return set([foaf['id'] for friend in user['friends'] for foaf in friend['friends']])

In [18]:
foaf_set(users[0])

{0, 1, 2, 3}

**Complete a count of mutual friends**

In [19]:
from collections import Counter

In [20]:
# this will return a boolean to test if the users are the same or not
def not_the_same(user, other_user):
    return user['id'] != other_user['id']


In [21]:
not_the_same(users[0], users[1])

True

In [22]:
def not_friends(user, other_user):
    return all([not_the_same(friend, other_user) for friend in user['friends']])

In [23]:
not_friends(users[0], users[1])

False

In [24]:
def friend_of_friend(user):
    return Counter(foaf['id'] for friend in user['friends'] for foaf in friend['friends'] \
                   if not_the_same(user, foaf) and not_friends(user, foaf))

In [25]:
# this means that for User 3 (Chi) has 2 mutual friends with Hero (users[0]) and 1 mutual friend with Clive (users[5])
friend_of_friend(users[3])

Counter({0: 2, 5: 1})

#### Interests

In [26]:
interests = [
    (0, "Hadoop"), (0, "Big Data"), (0, "HBase"), (0, "Java"),
    (0, "Spark"), (0, "Storm"), (0, "Cassandra"),
    (1, "NoSQL"), (1, "MongoDB"), (1, "Cassandra"), (1, "HBase"),
    (1, "Postgres"), (2, "Python"), (2, "scikit-learn"), (2, "scipy"),
    (2, "numpy"), (2, "statsmodels"), (2, "pandas"), (3, "R"), (3, "Python"),
    (3, "statistics"), (3, "regression"), (3, "probability"),
    (4, "machine learning"), (4, "regression"), (4, "decision trees"),
    (4, "libsvm"), (5, "Python"), (5, "R"), (5, "Java"), (5, "C++"),
    (5, "Haskell"), (5, "programming languages"), (6, "statistics"),
    (6, "probability"), (6, "mathematics"), (6, "theory"),
    (7, "machine learning"), (7, "scikit-learn"), (7, "Mahout"),
    (7, "neural networks"), (8, "neural networks"), (8, "deep learning"),
    (8, "Big Data"), (8, "artificial intelligence"), (9, "Hadoop"),
    (9, "Java"), (9, "MapReduce"), (9, "Big Data")
]

In [27]:
interests[0]

(0, 'Hadoop')

In [28]:
len(interests)

49

** Write a function that finds users with a certain interest**

In [29]:
interests[0][0]

0

In [30]:
interests[0][1]

'Hadoop'

In [31]:
# test the unpacknig of the tuples into 2 values

len([print(user_id) for user_id, user_interest in interests])

0
0
0
0
0
0
0
1
1
1
1
1
2
2
2
2
2
2
3
3
3
3
3
4
4
4
4
5
5
5
5
5
5
6
6
6
6
7
7
7
7
8
8
8
8
9
9
9
9


49

In [32]:
def data_scientists_who_like(target_interest):
    return [user_id for user_id, user_interest in interests if user_interest == target_interest]

In [33]:
data_scientists_who_like('Hadoop')

[0, 9]

In [34]:
data_scientists_who_like('decision trees')

[4]

**This takes a long time since it goes through every interest for every user.**
**Build an index of users and interests into a dictionary** where the keys are the interests and the values is a list of user ids.

In [35]:
from collections import defaultdict

In [36]:
user_ids_by_interest = defaultdict(list)
# name the dictionary user_ids_by_interest
# call on defaultdict and tell it that the values should be of list format

In [37]:
user_ids_by_interest

defaultdict(list, {})

In [38]:
# go through each tuple and write it into the dictionary, where 

# dictionary_name[key_name].append() and I can call the .append() method because the value type is a list as I \
# specified

# then append the user_id that applies to that particular key

for user_id, interest in interests:
    user_ids_by_interest[interest].append(user_id)

In [39]:
user_ids_by_interest

defaultdict(list,
            {'Hadoop': [0, 9],
             'Big Data': [0, 8, 9],
             'HBase': [0, 1],
             'Java': [0, 5, 9],
             'Spark': [0],
             'Storm': [0],
             'Cassandra': [0, 1],
             'NoSQL': [1],
             'MongoDB': [1],
             'Postgres': [1],
             'Python': [2, 3, 5],
             'scikit-learn': [2, 7],
             'scipy': [2],
             'numpy': [2],
             'statsmodels': [2],
             'pandas': [2],
             'R': [3, 5],
             'statistics': [3, 6],
             'regression': [3, 4],
             'probability': [3, 6],
             'machine learning': [4, 7],
             'decision trees': [4],
             'libsvm': [4],
             'C++': [5],
             'Haskell': [5],
             'programming languages': [5],
             'mathematics': [6],
             'theory': [6],
             'Mahout': [7],
             'neural networks': [7, 8],
             'deep learning': 

As an exercise, **invert the keys and the values** where the keys are the user_ids and the values are their list of interests

In [40]:
interests_by_userids = defaultdict(list)

In [41]:
for user_id, user_interest in interests:
    interests_by_userids[user_id].append(user_interest)

In [42]:
interests_by_userids

defaultdict(list,
            {0: ['Hadoop',
              'Big Data',
              'HBase',
              'Java',
              'Spark',
              'Storm',
              'Cassandra'],
             1: ['NoSQL', 'MongoDB', 'Cassandra', 'HBase', 'Postgres'],
             2: ['Python',
              'scikit-learn',
              'scipy',
              'numpy',
              'statsmodels',
              'pandas'],
             3: ['R', 'Python', 'statistics', 'regression', 'probability'],
             4: ['machine learning', 'regression', 'decision trees', 'libsvm'],
             5: ['Python',
              'R',
              'Java',
              'C++',
              'Haskell',
              'programming languages'],
             6: ['statistics', 'probability', 'mathematics', 'theory'],
             7: ['machine learning',
              'scikit-learn',
              'Mahout',
              'neural networks'],
             8: ['neural networks',
              'deep learning',
       

**Now find who has the most interests in common with a given user.**
- Stipulate User
- For each interest a user has, get OTHER users that share the same interest
- Count how many times each of the OTHER users appear
- Return the user that appeared the MOST FREQUENT number of times

In [71]:
def most_common_interests_with (user):
    return Counter(interested_user_id for interest in interested_by_user_id[user['id']] \
                   for interested_user_id in user_ids_by_interest[interest] if interested_user_id != user["id"])
        

In [72]:
most_common_interests_with('0')

NameError: name 'interested_by_user_id' is not defined

In [67]:
len(users)

11

In [66]:
users[0]

{'id': 0,
 'name': 'Hero',
 'friends': [{'id': 1,
   'name': 'Dunn',
   'friends': [{...},
    {'id': 2,
     'name': 'Sue',
     'friends': [{...},
      {...},
      {'id': 3,
       'name': 'Chi',
       'friends': [{...},
        {...},
        {'id': 4,
         'name': 'Thor',
         'friends': [{...},
          {'id': 5,
           'name': 'Clive',
           'friends': [{...},
            {'id': 6,
             'name': 'Hicks',
             'friends': [{...},
              {'id': 8,
               'name': 'Kate',
               'friends': [{...},
                {'id': 7, 'name': 'Devin', 'friends': [{...}, {...}]},
                {'id': 9, 'name': 'Klein', 'friends': [{...}]}]}]},
            {'id': 7,
             'name': 'Devin',
             'friends': [{...},
              {'id': 8,
               'name': 'Kate',
               'friends': [{'id': 6,
                 'name': 'Hicks',
                 'friends': [{...}, {...}]},
                {...},
                {'id