In [34]:
# One record per user: a sequential integer "id" (starting at 0) and a "name".
users = [
    {"id": user_id, "name": name}
    for user_id, name in enumerate(
        ["Hero", "Dunn", "Sue", "Chi", "Thor",
         "Clive", "Hicks", "Devin", "Kate", "Klein"]
    )
]

In [35]:
# Friendship edges as pairs of user ids; each friendship is listed once.
friendships_pairs = [
    (0, 1), (0, 2),
    (1, 2), (1, 3),
    (2, 3),
    (3, 4),
    (4, 5),
    (5, 6), (5, 7),
    (6, 8),
    (7, 8),
    (8, 9),
]

In [36]:
# Give every user id its own (distinct) empty friend list to start from.
friendships = dict((user["id"], []) for user in users)

In [37]:
# Friendship is mutual, so file each pair under both of its members.
# NOTE: this appends unconditionally; running the loop again without first
# resetting `friendships` records every friend a second time.
for i, j in friendships_pairs:
    friendships[i].append(j) 
    friendships[j].append(i) 

In [38]:
def number_of_friends(user):
    """Return the length of the friend list stored for __user__.

    The list is looked up in the module-level ``friendships`` dict under
    ``user["id"]``.
    """
    return len(friendships[user["id"]])

# Add up the friend counts of all users.
total_connections = sum(map(number_of_friends, users))

In [39]:
# Friend count for the user at index 3 of `users`.
print(number_of_friends(users[3]))

3


In [40]:
# Sum of every user's friend count. Each friendship is stored under both of
# its members, so it contributes 2 to this total.
print(f'Total connections: {total_connections}')

Total connections: 24


In [41]:
# Average friend count per user: summed friend counts over the number of users.
num_users = len(users)
avg_connections = total_connections / num_users  # true division: result is a float
print(f"Average connections: {avg_connections}")

Average connections: 2.4


In [42]:
# (user_id, friend_count) pairs ordered from most to fewest friends.
# The sort is stable, so users with equal counts stay in their original order.
num_friends_by_id = sorted(
    ((user["id"], number_of_friends(user)) for user in users),
    key=lambda pair: pair[1],
    reverse=True,
)

In [43]:
print('Each pair is (user_id, num_friends):')
# Bare last expression, so the notebook displays the list itself.
num_friends_by_id

Each pair is (user_id, num_friends):


[(1, 3),
 (2, 3),
 (3, 3),
 (5, 3),
 (8, 3),
 (0, 2),
 (4, 2),
 (6, 2),
 (7, 2),
 (9, 1)]

In [44]:
def foaf_ids_bad(user):
    """Return the ids of every friend of each of the user's friends.

    "foaf" is short for "friend of a friend". Nothing is filtered or
    de-duplicated: the user's own id and the ids of direct friends can
    appear, and an id is repeated once for every friend it is reached through.
    """
    foaf_ids = []
    for friend_id in friendships[user['id']]:
        foaf_ids.extend(friendships[friend_id])
    return foaf_ids

In [45]:
# Unfiltered friend-of-a-friend ids for the first user.
foaf_ids_bad(users[0])

[0, 2, 3, 0, 1, 3]

In [46]:
# Friend lists of user ids 0, 1 and 2, one list per output line.
print(*(friendships[user_id] for user_id in (0, 1, 2)), sep="\n")

[1, 2]
[0, 2, 3]
[0, 1, 3]


In [47]:
from collections import Counter

In [48]:
def friends_of_friends(user):
    """Count how many of the user's friends lead to each friend-of-a-friend.

    Args:
        user: A dict whose "id" value is a key of the module-level
            ``friendships`` dict.

    Returns:
        A Counter mapping each friend-of-a-friend id to the number of the
        user's friends whose friend list contains that id. The user's own id
        and the ids of the user's direct friends are excluded.

    Raises:
        KeyError: If the user's id (or one of the listed friend ids) is not
            a key of ``friendships``.
    """
    user_id = user['id']
    friend_ids = friendships[user_id]
    # Built once up front: a set lookup per candidate instead of re-scanning
    # the user's friend list for every candidate id.
    excluded_ids = set(friend_ids)
    excluded_ids.add(user_id)
    return Counter(
        foaf_id
        for friend_id in friend_ids
        for foaf_id in friendships[friend_id]
        if foaf_id not in excluded_ids
    )

In [49]:
# Friend-of-a-friend counts for the users at index 0 and 3, one per line.
print(*(friends_of_friends(users[index]) for index in (0, 3)), sep="\n")

Counter({3: 2})
Counter({0: 2, 5: 1})


In [50]:
# Flat list of (user_id, interest) pairs, written grouped by user and
# flattened in order.
interests = [
    (user_id, interest)
    for user_id, user_interests in [
        (0, ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm",
             "Cassandra"]),
        (1, ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"]),
        (2, ["Python", "scikit-learn", "scipy", "numpy", "statsmodels",
             "pandas"]),
        (3, ["R", "Python", "statistics", "regression", "probability"]),
        (4, ["machine learning", "regression", "decision trees", "libsvm"]),
        (5, ["Python", "R", "Java", "C++", "Haskell",
             "programming languages"]),
        (6, ["statistics", "probability", "mathematics", "theory"]),
        (7, ["machine learning", "scikit-learn", "Mahout",
             "neural networks"]),
        (8, ["neural networks", "deep learning", "Big Data",
             "artificial intelligence"]),
        (9, ["Hadoop", "Java", "MapReduce", "Big Data"]),
    ]
    for interest in user_interests
]

In [51]:
def data_scientists_who_like(target_interest):
    """Find the ids of all users who like the target interest.

    Scans the module-level ``interests`` list of (user_id, interest) pairs.

    Args:
        target_interest: The interest to look for; compared with ``==``, so
            the match is exact and case-sensitive for strings.

    Returns:
        A list of the matching user ids, in the order their pairs appear in
        ``interests``. Empty when nobody has the interest.
    """
    return [user_id
            for user_id, user_interest in interests
            if user_interest == target_interest]


# Backward-compatible alias: the function was first defined under this
# misspelled name ("scientistis"), and existing callers still use it.
data_scientistis_who_like = data_scientists_who_like

In [52]:
# User ids for three sample interests, one result list per output line.
print(
    *(data_scientistis_who_like(topic)
      for topic in ("libsvm", "machine learning", "Big Data")),
    sep="\n",
)

[4]
[4, 7]
[0, 8, 9]


In [53]:
from collections import defaultdict

In [54]:
# Keys are interests, values are lists of the user_ids with that interest.
user_ids_by_interest = defaultdict(list)

In [55]:
# Index the (user_id, interest) pairs by interest.
# NOTE: this appends unconditionally; running the loop again without first
# resetting user_ids_by_interest lists every user id a second time.
for user_id, interest in interests:
    user_ids_by_interest[interest].append(user_id)

In [56]:
# Keys are user_ids, values are lists of the interests for that user_id.
# The dict is recreated immediately before the loop, so running these
# statements again rebuilds it from scratch.
interests_by_user_id = defaultdict(list)

for user_id, interest in interests:
    interests_by_user_id[user_id].append(interest)

##### Find who has the most interests in common with a given user

In [57]:
def most_common_interests_with(user):
    """Count, for every other user, the interests shared with ``user``.

    Walks the user's interests (``interests_by_user_id``) and, for each one,
    the ids of everyone who has it (``user_ids_by_interest``). Returns a
    Counter keyed by user id; the given user's own id is left out.
    """
    shared_counts = Counter()
    for interest in interests_by_user_id[user["id"]]:
        for other_id in user_ids_by_interest[interest]:
            if other_id != user["id"]:
                shared_counts[other_id] += 1
    return shared_counts

In [59]:
# Shared-interest counts for the users at index 0 and 1, one per line.
print(*(most_common_interests_with(users[index]) for index in (0, 1)), sep="\n")

Counter({9: 3, 1: 2, 8: 1, 5: 1})
Counter({0: 2})


## Salaries and experience

In [61]:
# (salary, tenure) pairs, one per row.
salaries_and_tenures = [
    (83000, 8.7),
    (88000, 8.1),
    (48000, 0.7),
    (76000, 6),
    (69000, 6.5),
    (76000, 7.5),
    (60000, 2.5),
    (83000, 10),
    (48000, 1.9),
    (63000, 4.2),
]

In [62]:
# Keys are tenures (the second element of each pair); values are lists of the
# salaries recorded for that tenure.
salary_by_tenure = defaultdict(list)

for salary, tenure in salaries_and_tenures:
    salary_by_tenure[tenure].append(salary)

In [63]:
# Map each tenure to the mean of the salaries recorded for it.
average_salary_by_tenure = dict(
    (tenure, sum(tenure_salaries) / len(tenure_salaries))
    for tenure, tenure_salaries in salary_by_tenure.items()
)

In [69]:
# Display the tenure -> average salary mapping as built (not sorted).
average_salary_by_tenure

{8.7: 83000.0,
 8.1: 88000.0,
 0.7: 48000.0,
 6: 76000.0,
 6.5: 69000.0,
 7.5: 76000.0,
 2.5: 60000.0,
 10: 83000.0,
 1.9: 48000.0,
 4.2: 63000.0}

In [66]:
from collections import OrderedDict
# Same averages, re-keyed in ascending tenure order: sorting the
# (tenure, average) pairs compares the tenure first.
dict_average_salary_by_tenure = OrderedDict(
    sorted(average_salary_by_tenure.items())
)
print(dict_average_salary_by_tenure)

OrderedDict([(0.7, 48000.0), (1.9, 48000.0), (2.5, 60000.0), (4.2, 63000.0), (6, 76000.0), (6.5, 69000.0), (7.5, 76000.0), (8.1, 88000.0), (8.7, 83000.0), (10, 83000.0)])


In [71]:
# Better solution: a plain list of (tenure, average) pairs ordered by tenure,
# with no extra container type needed.
sorted(
    average_salary_by_tenure.items(),
    key=lambda tenure_and_average: tenure_and_average[0],
)

[(0.7, 48000.0),
 (1.9, 48000.0),
 (2.5, 60000.0),
 (4.2, 63000.0),
 (6, 76000.0),
 (6.5, 69000.0),
 (7.5, 76000.0),
 (8.1, 88000.0),
 (8.7, 83000.0),
 (10, 83000.0)]

## Bucket the tenures

In [72]:
def tenure_bucket(tenure):
    """Return the label of the bucket that ``tenure`` falls into.

    Values below 2 map to "less than two", values from 2 up to (but not
    including) 5 map to "between two and five", and everything else maps to
    "more than five" (this includes exactly 5).
    """
    if tenure < 2:
        return "less than two"
    if tenure < 5:
        return "between two and five"
    return "more than five"

In [74]:
# Keys are tenure buckets, values are lists of the salaries in that bucket.
# The dict is recreated immediately before the loop, so running these
# statements again rebuilds it from scratch.
salary_by_tenure_bucket = defaultdict(list)

for salary, tenure in salaries_and_tenures:
    bucket = tenure_bucket(tenure)  # label string returned by tenure_bucket()
    salary_by_tenure_bucket[bucket].append(salary)

Compute the average salary for each tenure bucket.

In [76]:
# Keys are tenure buckets, values are the mean salary within that bucket.
# The comprehension variable is called `bucket` rather than reusing the name
# of the tenure_bucket() function.
average_salary_by_bucket = {
    bucket: sum(bucket_salaries) / len(bucket_salaries)
    for bucket, bucket_salaries in salary_by_tenure_bucket.items()
}

In [77]:
# Display the bucket -> average salary mapping as built (not sorted).
average_salary_by_bucket

{'more than five': 79166.66666666667,
 'less than two': 48000.0,
 'between two and five': 61500.0}

In [78]:
# Sorted by the bucket label string, i.e. alphabetically by label text
# rather than by the tenure range each label stands for.
sorted(average_salary_by_bucket.items(), key=lambda x:x[0])

[('between two and five', 61500.0),
 ('less than two', 48000.0),
 ('more than five', 79166.66666666667)]