# Chapter 1 - Introduction: Motivating Hypothetical - DataSciencester

In [37]:
import numpy as np
import pandas as pd
from collections import Counter, defaultdict

In [2]:
# Toy user table: one dict per user with an integer 'id' and a 'name'.
# Each user's 'id' equals its position in this list, which is what lets
# the friendship-building cell look users up as users[i].
users = [
    {'id':0, 'name': 'Hero'},
    {'id':1, 'name': 'Dunn'}, 
    {'id':2, 'name': 'Sue'}, 
    {'id':3, 'name': 'Chi'},
    {'id':4, 'name': 'Thor'},
    {'id':5, 'name': 'Clive'},
    {'id':6, 'name': 'Hicks'},
    {'id':7, 'name': 'Devin'},
    {'id':8, 'name': 'Kate'},
    {'id':9, 'name': 'Klein'}
]

In [3]:
# Friendship pairs given as (user id, user id); each pair is listed once.
friendships = [
    (0,1), (0,2), (1,2), (1,3), (2,3), (3,4), (4,5), (5,6), (5,7), (6,8), (7,8), (8,9)
]

We want to add a list of friends to each user.

In [4]:
# Start every user off with their own, separate empty friends list.
for user in users:
    user.update(friends=[])

In [5]:
# Friendship is mutual: record each pair on both users' friends lists.
for i, j in friendships:
    user_i, user_j = users[i], users[j]
    user_i['friends'].append(user_j)  # e.g. user 0 gains user 1 as a friend
    user_j['friends'].append(user_i)  # ...and user 1 gains user 0

What's the average number of connections per user?
- count the number of connections for each user
- sum those counts across all users
- divide that total by the number of users

In [6]:
# helper: how many connections does a single user have?

def number_of_friends(user):
    """Return the length of the given user's 'friends' list."""
    friends = user['friends']
    return len(friends)

In [7]:
# count how many connections there are across all users
# (use the built-in sum: passing a generator to np.sum is deprecated in NumPy
# and only works because it falls back to the built-in anyway)
total_connections = sum(number_of_friends(user) for user in users)

In [8]:
total_connections

24

In [9]:
# to find the average, divide total_connections by the number of total users
total_connections/len(users)

2.4

Find the most connected people - who has the largest number of friends?
- for each user, count the number of friends
- store that in a tuple (user id, number_of_friends)
- sort by number_of_friends in descending order

In [10]:
# One (user id, friend count) tuple per user, in the same order as `users`.
num_friends_by_id = [
    (person['id'], number_of_friends(person))
    for person in users
]
num_friends_by_id

[(0, 2),
 (1, 3),
 (2, 3),
 (3, 3),
 (4, 2),
 (5, 3),
 (6, 2),
 (7, 2),
 (8, 3),
 (9, 1)]

In [11]:
# Most-connected first; the sort is stable, so ties keep their original (id) order.
sorted(
    num_friends_by_id,
    key=lambda id_and_count: id_and_count[1],
    reverse=True,
)

[(1, 3),
 (2, 3),
 (3, 3),
 (5, 3),
 (8, 3),
 (0, 2),
 (4, 2),
 (6, 2),
 (7, 2),
 (9, 1)]

### Data Scientists You May Know
- Consider suggesting friends of immediate friends (e.g. recommend friends that are 2 degrees removed from the user)
- For a given user...
- Iterate over the user's list of friends
- For each friend in that list of friends...
- Grab that friend's list of friends
- Return the set of each friend's friends (to remove duplication)

In [12]:
# pseudocode

# foaf_ids = []

# for user in users:
#     for friend in user['friends']:
#         for foaf in friend['friends']:
#             foaf_ids.append(foaf['id'])

In [13]:
def friend_of_a_friend(user):
    '''
    Return the ids of the friends of each of `user`'s friends
    (foaf stands for friend of a friend).

    The result is a flat list in traversal order. It is deliberately naive:
    it can contain `user`'s own id, and the same id more than once.
    '''
    foaf_ids = []
    for friend in user['friends']:
        for friend_of_friend in friend['friends']:
            foaf_ids.append(friend_of_friend['id'])
    return foaf_ids

In [14]:
friend_of_a_friend(users[0])

[0, 2, 3, 0, 1, 3]

The issues with this function are the following:
1. it includes the user id that was passed in (e.g. 0)
2. it includes duplicate friends (3 appears twice)


Let's instead try to count how many mutual friends a user has with another user:
- check that the two users are in fact different (write a function for this)
- check that the two users are not already friends (write a function for this)
- find each user's friends of friends (write a function for this)


Note: we need to determine which users are **not** already friends with the given user, because anyone who is already a friend (and the user themself) must be excluded from the *foaf* results.

In [15]:
def not_same_user(user, other_user):
    """Return True when the two user dicts carry different 'id' values."""
    user_id, other_id = user['id'], other_user['id']
    return user_id != other_id

In [16]:
not_same_user(users[0], users[1])

True

In [17]:
# all(iterable)
# Return True if all elements of the iterable are true (or if the iterable is empty). Equivalent to:

# def all(iterable):
#     for element in iterable:
#         if not element:
#             return False
#     return True

In [18]:
def not_friends(user, other_user):
    """Return True when nobody in `user`'s 'friends' list has `other_user`'s id.

    A user with an empty 'friends' list is "not friends" with everyone.
    """
    for friend in user['friends']:
        if not not_same_user(friend, other_user):
            # other_user appears among user's friends
            return False
    return True

In [19]:
def my_not_friends(user, other_user):
    """Return True when `other_user` is not in `user`'s 'friends' list.

    Friends are matched by their 'id'. A user with no friends is
    "not friends" with everyone, so the result is True in that case.

    The previous version compared the two users' friend lists with each
    other, which answers a different question (is one list contained in the
    other?) and only agreed with `not_friends` by coincidence on some inputs.
    """
    for friend in user['friends']:  # look through the user's listed friends
        if friend['id'] == other_user['id']:  # other_user is one of them...
            return False  # ...so the two users ARE friends
    return True  # other_user never appeared: the two users are not friends

In [20]:
not_friends(users[0], users[1])

False

In [21]:
my_not_friends(users[0], users[1])

False

In [22]:
not_friends(users[0], users[3])

True

In [23]:
my_not_friends(users[0], users[3])

True

In [24]:
def friends_of_friend(user):
    """Count mutual friends between `user` and each friend-of-a-friend.

    A friend-of-a-friend is counted only if it is neither `user` nor someone
    already in `user`'s friends. Returns a Counter mapping that person's id to
    the number of `user`'s friends who list them as a friend.
    """
    mutual_counts = Counter()
    for friend in user['friends']:
        for candidate in friend['friends']:
            if not_same_user(user, candidate) and not_friends(user, candidate):
                mutual_counts[candidate['id']] += 1
    return mutual_counts

In [25]:
friends_of_friend(users[3])

Counter({0: 2, 5: 1})

As a data scientist, you might enjoy meeting users with similar interests.
Below is a list of (user_id, interest) pairs:

In [26]:
# (user_id, interest) pairs: a user id appears once for every interest listed
# for that user, and the same interest string can appear under several ids.
interests = [
    (0, "Hadoop"), (0, "Big Data"), (0, "HBase"), (0, "Java"),
    (0, "Spark"), (0, "Storm"), (0, "Cassandra"),
    (1, "NoSQL"), (1, "MongoDB"), (1, "Cassandra"), (1, "HBase"),
    (1, "Postgres"), (2, "Python"), (2, "scikit-learn"), (2, "scipy"),
    (2, "numpy"), (2, "statsmodels"), (2, "pandas"), (3, "R"), (3, "Python"),
    (3, "statistics"), (3, "regression"), (3, "probability"),
    (4, "machine learning"), (4, "regression"), (4, "decision trees"),
    (4, "libsvm"), (5, "Python"), (5, "R"), (5, "Java"), (5, "C++"),
    (5, "Haskell"), (5, "programming languages"), (6, "statistics"),
    (6, "probability"), (6, "mathematics"), (6, "theory"),
    (7, "machine learning"), (7, "scikit-learn"), (7, "Mahout"),
    (7, "neural networks"), (8, "neural networks"), (8, "deep learning"),
    (8, "Big Data"), (8, "artificial intelligence"), (9, "Hadoop"),
    (9, "Java"), (9, "MapReduce"), (9, "Big Data")
]

Write a function that finds users with the same interests:

In [28]:
def my_data_scientists_who_like(target_interest):
    """Return the sorted, de-duplicated ids of users who list `target_interest`.

    Reads the notebook-level `interests` list of (user_id, interest) pairs.
    Matching is an exact string comparison. An interest that nobody lists
    yields an empty list.
    """
    # A set makes a repeated (user_id, interest) pair count only once.
    matching_ids = {
        user_id
        for user_id, interest in interests
        if interest == target_interest
    }
    # Set iteration order is not guaranteed, so sort for a stable result.
    return sorted(matching_ids)

In [34]:
my_data_scientists_who_like('probability')

[3, 6]

In [35]:
def data_scientists_who_like(target_interest):
    """Return the ids of users whose listed interest equals `target_interest`.

    Ids come back in the order their pairs appear in the notebook-level
    `interests` list; nothing is de-duplicated.
    """
    matches = []
    for user_id, interest in interests:
        if interest == target_interest:
            matches.append(user_id)
    return matches

In [36]:
data_scientists_who_like('probability')

[3, 6]

An alternative way of answering the **same** question "Find users with the same interests" is by using `defaultdict`

In [39]:
# index: interest -> list of the user_ids that list it (in order of appearance)
user_ids_by_interest = defaultdict(list)

for user_id, user_interest in interests:
    ids_for_interest = user_ids_by_interest[user_interest]  # empty list on first sight
    ids_for_interest.append(user_id)

In [40]:
user_ids_by_interest

defaultdict(list,
            {'Hadoop': [0, 9],
             'Big Data': [0, 8, 9],
             'HBase': [0, 1],
             'Java': [0, 5, 9],
             'Spark': [0],
             'Storm': [0],
             'Cassandra': [0, 1],
             'NoSQL': [1],
             'MongoDB': [1],
             'Postgres': [1],
             'Python': [2, 3, 5],
             'scikit-learn': [2, 7],
             'scipy': [2],
             'numpy': [2],
             'statsmodels': [2],
             'pandas': [2],
             'R': [3, 5],
             'statistics': [3, 6],
             'regression': [3, 4],
             'probability': [3, 6],
             'machine learning': [4, 7],
             'decision trees': [4],
             'libsvm': [4],
             'C++': [5],
             'Haskell': [5],
             'programming languages': [5],
             'mathematics': [6],
             'theory': [6],
             'Mahout': [7],
             'neural networks': [7, 8],
             'deep learning': 

In [58]:
user_ids_by_interest['neural networks']

[7, 8]

In [41]:
# alternatively, we can make the keys the user_ids and the values the interests
interests_by_user_id = defaultdict(list)

for user_id, user_interest in interests:
    interests_by_user_id[user_id].append(user_interest)

In [42]:
interests_by_user_id

defaultdict(list,
            {0: ['Hadoop',
              'Big Data',
              'HBase',
              'Java',
              'Spark',
              'Storm',
              'Cassandra'],
             1: ['NoSQL', 'MongoDB', 'Cassandra', 'HBase', 'Postgres'],
             2: ['Python',
              'scikit-learn',
              'scipy',
              'numpy',
              'statsmodels',
              'pandas'],
             3: ['R', 'Python', 'statistics', 'regression', 'probability'],
             4: ['machine learning', 'regression', 'decision trees', 'libsvm'],
             5: ['Python',
              'R',
              'Java',
              'C++',
              'Haskell',
              'programming languages'],
             6: ['statistics', 'probability', 'mathematics', 'theory'],
             7: ['machine learning',
              'scikit-learn',
              'Mahout',
              'neural networks'],
             8: ['neural networks',
              'deep learning',
       

In [78]:
interests_by_user_id[users[0]['id']]

['Hadoop', 'Big Data', 'HBase', 'Java', 'Spark', 'Storm', 'Cassandra']

Now find **who** has the most interests in common with a given user

In [88]:
def most_common_interest_with(user):
    """Count how many interests every other user shares with `user`.

    Walks `user`'s interests via `interests_by_user_id`, and for each one the
    ids in `user_ids_by_interest`, skipping `user`'s own id. Returns a Counter
    mapping other user id -> number of shared interests.
    """
    shared_counts = Counter()
    for interest in interests_by_user_id[user['id']]:
        for interested_user_id in user_ids_by_interest[interest]:
            if interested_user_id != user['id']:
                shared_counts[interested_user_id] += 1
    return shared_counts

In [89]:
most_common_interest_with(users[0])

Counter({9: 3, 8: 1, 1: 2, 5: 1})

### Salaries & Experience


In [90]:
# (salary, tenure) pairs; tenure is measured in years.
# Note the order: salary first, tenure second.
salaries_and_tenures = [
    (83000, 8.7), (88000, 8.1),
    (48000, 0.7), (76000, 6),
    (69000, 6.5), (76000, 7.5),
    (60000, 2.5), (83000, 10),
    (48000, 1.9), (63000, 4.2)
]

- Examine the average salary for each tenure.
- Create a dictionary whose keys are the tenures (in years) and whose values are the lists of salaries for each tenure.
- Also code the alternative, where the keys are the salaries and the values are the lists of tenures.

In [91]:
# group salaries under their exact tenure value: tenure -> [salary, ...]
salaries_by_tenure = defaultdict(list)

for salary, ten in salaries_and_tenures:
    salaries_at_tenure = salaries_by_tenure[ten]  # empty list on first sight
    salaries_at_tenure.append(salary)

salaries_by_tenure

defaultdict(list,
            {8.7: [83000],
             8.1: [88000],
             0.7: [48000],
             6: [76000],
             6.5: [69000],
             7.5: [76000],
             2.5: [60000],
             10: [83000],
             1.9: [48000],
             4.2: [63000]})

In [92]:
# the reverse grouping: salary -> [tenure, ...]
tenures_by_salary = defaultdict(list)

for salary, ten in salaries_and_tenures:
    tenures_at_salary = tenures_by_salary[salary]  # empty list on first sight
    tenures_at_salary.append(ten)

tenures_by_salary

defaultdict(list,
            {83000: [8.7, 10],
             88000: [8.1],
             48000: [0.7, 1.9],
             76000: [6, 7.5],
             69000: [6.5],
             60000: [2.5],
             63000: [4.2]})

- Let's create buckets for the tenures, since there are so many unique tenures:
- Then group together the salaries corresponding to each bucket
- and finally, compute the average salary for each group

In [94]:
def my_tenure_buckets(tenure):
    """Map a tenure to one of three bucket labels.

    Below 2 -> "less than two years"; from 2 up to and including 5 ->
    "between two and five years"; anything else -> "greater than five years".
    """
    if tenure < 2.0:
        return "less than two years"
    if tenure <= 5:
        return "between two and five years"
    return "greater than five years"

In [97]:
salaries_by_tenure.keys()

dict_keys([8.7, 8.1, 0.7, 6, 6.5, 7.5, 2.5, 10, 1.9, 4.2])

In [99]:
# Show which bucket each distinct tenure falls into (iterating a dict yields its keys).
for tenure in salaries_by_tenure:
    bucket_label = my_tenure_buckets(tenure)
    print(bucket_label)

greater than five years
greater than five years
less than two years
greater than five years
greater than five years
greater than five years
between two and five years
greater than five years
less than two years
between two and five years


In [107]:
# regroup the salaries by tenure bucket: bucket label -> [salary, ...]
salaries_by_tenure_bucket = defaultdict(list)

for tenure, salaries in salaries_by_tenure.items():
    # extend with the whole list rather than append(salaries[0]): when several
    # entries share the same tenure, every one of their salaries must be kept,
    # not just the first
    salaries_by_tenure_bucket[my_tenure_buckets(tenure)].extend(salaries)

salaries_by_tenure_bucket

defaultdict(list,
            {'greater than five years': [83000,
              88000,
              76000,
              69000,
              76000,
              83000],
             'less than two years': [48000, 48000],
             'between two and five years': [60000, 63000]})

In [108]:
# average salary per tenure bucket: bucket label -> mean of that bucket's salaries
avg_salary_by_tenure_bucket = defaultdict(float)

for bucket, salary in salaries_by_tenure_bucket.items():
    # `salary` is the bucket's whole list of salaries, so average it directly
    avg_salary_by_tenure_bucket[bucket] = np.mean(salary)

avg_salary_by_tenure_bucket

defaultdict(float,
            {'greater than five years': 79166.666666666672,
             'less than two years': 48000.0,
             'between two and five years': 61500.0})

### Paid Accounts