# Finding Key Connectors

## Provided Data

In [1]:
# Every user is a dict with a numeric "id" and a "name"; ids follow the
# position of the name in the list below, starting at 0.
users = [
    {"id": user_id, "name": name}
    for user_id, name in enumerate(
        ["Hero", "Dunn", "Sue", "Chi", "Thor",
         "Clive", "Hicks", "Devin", "Kate", "Klein"]
    )
]

# Each (i, j) pair records that users i and j are friends.
friendship_pairs = [
    (0, 1), (0, 2), (1, 2), (1, 3),
    (2, 3), (3, 4), (4, 5), (5, 6),
    (5, 7), (6, 8), (7, 8), (8, 9),
]

## Preparing data

### Create a dict where keys are user ids and values are lists of friend ids

In [2]:
# Give every user id its own (initially empty) list of friend ids:
friendships = {person["id"]: [] for person in users}

In [3]:
friendships

{0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: [], 9: []}

In [4]:
# Record every friendship pair in both directions
for first_id, second_id in friendship_pairs:
    friendships[first_id].append(second_id)  # second is a friend of first
    friendships[second_id].append(first_id)  # first is a friend of second

In [5]:
# Print the dict
friendships

{0: [1, 2],
 1: [0, 2, 3],
 2: [0, 1, 3],
 3: [1, 2, 4],
 4: [3, 5],
 5: [4, 6, 7],
 6: [5, 8],
 7: [5, 8],
 8: [6, 7, 9],
 9: [8]}

## What's the average number of connections?

### First, we find the total number of connections

In [6]:
# Helper that counts the friends of a single user
def number_of_friends(user):
    """Return how many friend ids are listed for _user_ in friendships."""
    return len(friendships[user["id"]])

# Add up the per-user friend counts over all users
total_connections = sum(map(number_of_friends, users))

In [7]:
print(total_connections)

24


### Then we divide by the number of users

In [8]:
# Average = total friend count divided by how many users there are
num_users = len(users)
avg_connections = total_connections / num_users  # 24 / 10 == 2.4
print(avg_connections)

2.4


In [17]:
# Pair each user id with its friend count, ordered from most to fewest friends
num_friends_by_id = sorted(
    ((user["id"], number_of_friends(user)) for user in users),
    key=lambda pair: pair[1],  # order by the friend count
    reverse=True,
)
print(num_friends_by_id)

[(1, 3), (2, 3), (3, 3), (5, 3), (8, 3), (0, 2), (4, 2), (6, 2), (7, 2), (9, 1)]


One way to think of what we've done is as a way of identifying people who are somehow central to the network. In fact, what we've just computed is the network metric **_degree centrality_**.

# Data Scientists You May Know
**Design a "Data Scientists You May Know" suggester**

In [33]:
# Walk over a user's friends and gather the ids of those friends' friends
def foaf_ids_bad(user):
    """Return the friend ids of each of _user_'s friends, as one flat list.

    "foaf" is short for "friend of a friend". Nothing is filtered out, so
    the result can repeat ids and can contain _user_'s own id.
    """
    direct_friend_ids = friendships[user["id"]]
    return [
        foaf_id
        for friend_id in direct_friend_ids
        for foaf_id in friendships[friend_id]
    ]

The list comprehension in `foaf_ids_bad` is equivalent to this nested loop:
```
foaf_id = []
for friend_id in friendships[user["id"]]:
    for i in friendships[friend_id]:
        foaf_id.append(i)
return foaf_id
```

In [29]:
foaf_ids_bad(users[0])

[0, 2, 3, 0, 1, 3]

In [34]:
print(friendships[0]) # Friends of User 0

[1, 2]


In [35]:
print(friendships[1]) # Friends of User 1

[0, 2, 3]


In [36]:
print(friendships[2]) # Friends of User 2

[0, 1, 3]


Knowing that people are friends of friends in multiple ways seems like interesting information, so maybe instead we should produce a count of mutual friends. And we should probably exclude people already known to the user (i.e. don't include user 1 or 2, since they are already friends of user 0).

In [38]:
from collections import Counter
def friends_of_friends(user):
    """Count mutual friends between _user_ and each friend of a friend.

    Returns a Counter mapping a user id to the number of _user_'s friends
    who also list that id as a friend. _user_ and the people _user_ is
    already friends with are left out.
    """
    user_id = user["id"]
    friend_ids = friendships[user_id]
    # Build the exclusion set once, so each membership test below is a set
    # lookup instead of a scan of the friend list.
    already_known = set(friend_ids) | {user_id}
    return Counter(
        foaf_id
        for friend_id in friend_ids              # for each of my friends,
        for foaf_id in friendships[friend_id]    # find their friends
        if foaf_id not in already_known          # who aren't me or my friends
    )

In [39]:
print(friends_of_friends(users[3]))

Counter({0: 2, 5: 1})


This correctly tells Chi (id 3) that she has two mutual friends with Hero (id 0) but only one mutual friend with Clive (id 5).

Chi (id 3) is not currently connected with Hero (id 0) or Clive (id 5).

In [42]:
# List of (user_id, interest) pairs: each pair records one interest of one
# user, so a user id appears once for every interest listed for that user.
interests = [
    (0, "Hadoop"), (0, "Big Data"), (0, "HBase"), (0, "Java"),
    (0, "Spark"), (0, "Storm"), (0, "Cassandra"),
    (1, "NoSQL"), (1, "MongoDB"), (1, "Cassandra"), (1, "HBase"),
    (1, "Postgres"), (2, "Python"), (2, "scikit-learn"), (2, "scipy"),
    (2, "numpy"), (2, "statsmodels"), (2, "pandas"), (3, "R"), (3, "Python"),
    (3, "statistics"), (3, "regression"), (3, "probability"),
    (4, "machine learning"), (4, "regression"), (4, "decision trees"),
    (4, "libsvm"), (5, "Python"), (5, "R"), (5, "Java"), (5, "C++"),
    (5, "Haskell"), (5, "programming languages"), (6, "statistics"),
    (6, "probability"), (6, "mathematics"), (6, "theory"),
    (7, "machine learning"), (7, "scikit-learn"), (7, "Mahout"),
    (7, "neural networks"), (8, "neural networks"), (8, "deep learning"),
    (8, "Big Data"), (8, "artificial intelligence"), (9, "Hadoop"),
    (9, "Java"), (9, "MapReduce"), (9, "Big Data")
]

In [43]:
# Look up which users listed a given interest
def data_scientists_who_like(target_interest):
    """Return the ids of the users whose interest equals _target_interest_.

    Scans the whole interests list; the comparison is exact, so it is
    case-sensitive.
    """
    return [uid for uid, topic in interests if topic == target_interest]

This function works, but it has to examine the whole list of interests for every search. With a larger number of users and interests, we're better off building an index from interests to users:

In [44]:
from collections import defaultdict

In [45]:
# Index: interest -> ids of the users who listed that interest
user_ids_by_interest = defaultdict(list)

for uid, topic in interests:
    user_ids_by_interest[topic].append(uid)

In [46]:
# Index: user id -> the interests listed for that user
interests_by_user_id = defaultdict(list)

for uid, topic in interests:
    interests_by_user_id[uid].append(topic)

In [47]:
print(user_ids_by_interest)

defaultdict(<class 'list'>, {'Hadoop': [0, 9], 'Big Data': [0, 8, 9], 'HBase': [0, 1], 'Java': [0, 5, 9], 'Spark': [0], 'Storm': [0], 'Cassandra': [0, 1], 'NoSQL': [1], 'MongoDB': [1], 'Postgres': [1], 'Python': [2, 3, 5], 'scikit-learn': [2, 7], 'scipy': [2], 'numpy': [2], 'statsmodels': [2], 'pandas': [2], 'R': [3, 5], 'statistics': [3, 6], 'regression': [3, 4], 'probability': [3, 6], 'machine learning': [4, 7], 'decision trees': [4], 'libsvm': [4], 'C++': [5], 'Haskell': [5], 'programming languages': [5], 'mathematics': [6], 'theory': [6], 'Mahout': [7], 'neural networks': [7, 8], 'deep learning': [8], 'artificial intelligence': [8], 'MapReduce': [9]})


In [48]:
print(interests_by_user_id)

defaultdict(<class 'list'>, {0: ['Hadoop', 'Big Data', 'HBase', 'Java', 'Spark', 'Storm', 'Cassandra'], 1: ['NoSQL', 'MongoDB', 'Cassandra', 'HBase', 'Postgres'], 2: ['Python', 'scikit-learn', 'scipy', 'numpy', 'statsmodels', 'pandas'], 3: ['R', 'Python', 'statistics', 'regression', 'probability'], 4: ['machine learning', 'regression', 'decision trees', 'libsvm'], 5: ['Python', 'R', 'Java', 'C++', 'Haskell', 'programming languages'], 6: ['statistics', 'probability', 'mathematics', 'theory'], 7: ['machine learning', 'scikit-learn', 'Mahout', 'neural networks'], 8: ['neural networks', 'deep learning', 'Big Data', 'artificial intelligence'], 9: ['Hadoop', 'Java', 'MapReduce', 'Big Data']})


Now it's easy to find who has the most interests in common with a given user:
- Iterate over the user's interests
- For each interest, iterate over other users with that interest
- Keep count of how many times we see each other user

In [50]:
def most_common_interests_with(user):
    """Count the interests each other user has in common with _user_.

    Returns a Counter keyed by the other user's id; _user_'s own id is
    never included.
    """
    user_id = user["id"]
    return Counter(
        other_id
        for topic in interests_by_user_id[user_id]   # each of my interests
        for other_id in user_ids_by_interest[topic]  # everyone who shares it
        if other_id != user_id                       # except me
    )

In [51]:
most_common_interests_with(users[0])

Counter({9: 3, 8: 1, 1: 2, 5: 1})

# Salaries and Experience
**Provide some fun facts about how much data scientists earn.**

In [52]:
# (salary, tenure) pairs
salaries_and_tenures = [
    (83000, 8.7),
    (88000, 8.1),
    (48000, 0.7),
    (76000, 6),
    (69000, 6.5),
    (76000, 7.5),
    (60000, 2.5),
    (83000, 10),
    (48000, 1.9),
    (63000, 4.2),
]

In [53]:
# Group the salaries under their exact tenure value
salary_by_tenure = defaultdict(list)
for pay, years in salaries_and_tenures:
    salary_by_tenure[years].append(pay)

# Average the salaries recorded for each distinct tenure
average_salary_by_tenure = {
    years: sum(pays) / len(pays)
    for years, pays in salary_by_tenure.items()
}
print(average_salary_by_tenure)

{8.7: 83000.0, 8.1: 88000.0, 0.7: 48000.0, 6: 76000.0, 6.5: 69000.0, 7.5: 76000.0, 2.5: 60000.0, 10: 83000.0, 1.9: 48000.0, 4.2: 63000.0}


This turns out not to be very useful, since no two users have the same tenure, so we're just reporting each individual user's salary. It may be more helpful to bucket the tenures.

In [54]:
def tenure_bucket(tenure):
    """Return the label of the tenure range that _tenure_ falls into."""
    # Guard clauses, checked from the lowest range upwards
    if tenure < 2:
        return "less than two"
    if tenure < 5:
        return "between two and five"
    return "more than five"

Then we group together the salaries corresponding to each bucket.

In [56]:
# Collect each salary under the label of the tenure bucket it falls into
salary_by_tenure_bucket = defaultdict(list)

for pay, years in salaries_and_tenures:
    salary_by_tenure_bucket[tenure_bucket(years)].append(pay)
print(salary_by_tenure_bucket)

defaultdict(<class 'list'>, {'more than five': [83000, 88000, 76000, 69000, 76000, 83000], 'less than two': [48000, 48000], 'between two and five': [60000, 63000]})


Finally, we compute the average salary for each group:

In [57]:
# Mean salary within each tenure bucket. The loop variables are named so
# that they do not reuse the name of the tenure_bucket function.
average_salary_by_bucket = {
    bucket_label: sum(bucket_salaries) / len(bucket_salaries)
    for bucket_label, bucket_salaries in salary_by_tenure_bucket.items()
}
print(average_salary_by_bucket)

{'more than five': 79166.66666666667, 'less than two': 48000.0, 'between two and five': 61500.0}


Data scientists with more than five years of experience earn 65% more than data scientists with little or no experience.

# Topics of Interest
**What topics are users most interested in?**

A simple method is to count the words:
- Lowercase each interest (since users may or may not capitalize their interests)
- Split it into words
- Count the results

In [58]:
# Count every lowercased, whitespace-separated word across all interests;
# the user id in each pair is not needed here.
words_and_counts = Counter(
    word
    for _user_id, interest in interests
    for word in interest.lower().split()
)
print(words_and_counts)

Counter({'big': 3, 'data': 3, 'java': 3, 'python': 3, 'learning': 3, 'hadoop': 2, 'hbase': 2, 'cassandra': 2, 'scikit-learn': 2, 'r': 2, 'statistics': 2, 'regression': 2, 'probability': 2, 'machine': 2, 'neural': 2, 'networks': 2, 'spark': 1, 'storm': 1, 'nosql': 1, 'mongodb': 1, 'postgres': 1, 'scipy': 1, 'numpy': 1, 'statsmodels': 1, 'pandas': 1, 'decision': 1, 'trees': 1, 'libsvm': 1, 'c++': 1, 'haskell': 1, 'programming': 1, 'languages': 1, 'mathematics': 1, 'theory': 1, 'mahout': 1, 'deep': 1, 'artificial': 1, 'intelligence': 1, 'mapreduce': 1})


This makes it easy to list out the words that occur more than once:

In [59]:
# most_common() yields (word, count) pairs from most to least frequent
for term, occurrences in words_and_counts.most_common():
    if occurrences <= 1:
        continue  # skip words that appear only once
    print(term, occurrences)

big 3
data 3
java 3
python 3
learning 3
hadoop 2
hbase 2
cassandra 2
scikit-learn 2
r 2
statistics 2
regression 2
probability 2
machine 2
neural 2
networks 2
