# Preparation for Facebook Technical Questions

Here are my notes for preparing for the technical portion of the interview. To simulate interview conditions, I wrote all code in markdown mode only converting the cells to executable after I felt comfortable they were correct.

* [Mock Questions from E-mail](#email)
* [Leetcode Facebook Prep](#leetcode)

## Exercises <a id='exercises'></a>

### Mock Question from E-mail <a id='email'></a>

An attendance log for every student in a school district ```attendance_events```:

| date | student_id | attendance |
|:----:|:----------:|:----------:|
|      |            |            |

A summary table with demographics for each student in the district ```all_students```: 

|student_id | school_id | grade_level | date_of_birth | hometown |
|-----------|-----------|-------------|---------------|----------|

Using this data, you could answer questions like the following:

* What percent of students attend school on their birthday?
* Which grade level had the largest drop in attendance between yesterday and today?

In [214]:
################################################################################
################################################################################
# Functions used to generate mock data simulating the tables given above.      #
################################################################################
################################################################################

import pandas as pd
import numpy as np

n_students = 1000
n_days = 10
start_date = '2017-09-01'
end_date = '2018-06-15'

################################################################################
# Attendance Table                                                             #
################################################################################
def _make_attendance_dates(n_students, start_date, end_date):
    dr = pd.date_range(start_date, end_date)
    dates = []
    for day in dr:
        dates.extend([day] * n_students)
    return dates

def _make_student_ids(n_students, n_days=None):
    student_ids = [xx for xx in range(100, 100 + n_students)]
    if n_days is not None:
        student_ids = student_ids * n_days
    return student_ids

def _make_attendance(n_students, n_days):
    attendance = []
    for _ in range(0, n_days):
        prob = np.random.rand()
        this_attendance = list(np.random.choice(2, n_students, p=[prob, 1.-prob]))
        attendance.extend(this_attendance)
    return attendance

def build_attendance(n_students, start_date, end_date):
    columns = ['date', 'student_id', 'attendance']
    n_days = len(pd.date_range(start_date, end_date))
    
    dates       = _make_attendance_dates(n_students, start_date, end_date)
    student_ids = _make_student_ids(n_students, n_days)
    attendance  = _make_attendance(n_students, n_days)
    data = [xx for xx in zip(dates, student_ids, attendance)]
    
    df = pd.DataFrame(data=data, columns=columns)
    return df

################################################################################
# District All Students Table                                                  #
################################################################################
def _make_school_ids(n_students):
    schools = ['South River High School',
               'New Brunswick High School',
               'East Brunswick High School',
               'Edison High School']
    return list(np.random.choice(schools, n_students))

def _make_grade_levels(n_students):
    grades = ['Freshman', 'Sophomore', 'Junior', 'Senior']
    return list(np.random.choice(grades, n_students))
    
def _make_DOBs(grade_levels):
    birth_years = {
        'Freshman': 2005,
        'Sophomore': 2004,
        'Junior': 2003,
        'Senior': 2002
    }
    years = [birth_years[xx] for xx in grade_levels]
    months = list(np.random.choice(np.arange(1,13), len(grade_levels)))
    days = list(np.random.choice(np.arange(1,29), len(grade_levels)))
    DOBs = pd.to_datetime(['{}-{}-{}'.format(*dd) for dd in zip(months, days, years)])
    return DOBs

def _make_hometowns(school_ids):
    hometowns = [school.split('High School')[0].strip() for school in school_ids]
    return hometowns
    

def build_all_students(n_students):
    student_ids  = _make_student_ids(n_students)
    school_ids   = _make_school_ids(n_students)
    grade_levels = _make_grade_levels(n_students)
    DOBs         = _make_DOBs(grade_levels)
    hometowns    = _make_hometowns(school_ids)
    
    columns = ['student_id', 'school_id', 'grade_level', 'date_of_birth', 'hometown']
    data = [xx for xx in zip(student_ids, school_ids, grade_levels, DOBs, hometowns)]
    df = pd.DataFrame(data=data, columns=columns)
    return df
    

attendance = build_attendance(n_students, start_date, end_date)
all_students = build_all_students(n_students=n_students)

In [216]:
attendance.tail()

Unnamed: 0,date,student_id,attendance
287995,2018-06-15,1095,1
287996,2018-06-15,1096,1
287997,2018-06-15,1097,1
287998,2018-06-15,1098,1
287999,2018-06-15,1099,0


In [217]:
all_students.head()

Unnamed: 0,student_id,school_id,grade_level,date_of_birth,hometown
0,100,South River High School,Sophomore,2004-07-12,South River
1,101,East Brunswick High School,Freshman,2005-09-09,East Brunswick
2,102,East Brunswick High School,Junior,2003-05-19,East Brunswick
3,103,Edison High School,Senior,2002-06-28,Edison
4,104,South River High School,Junior,2003-07-05,South River


#### What percent of students attend school on their birthday?

In [218]:
all_attendance = pd.merge(
    left=attendance,
    right=all_students,
    how='inner',
    on='student_id'
)

attended = (all_attendance['attendance'] == 1)

attendance_month = all_attendance['date'].apply(lambda x: x.month)
attendance_day = all_attendance['date'].apply(lambda x: x.day)
birthday_month = all_attendance['date_of_birth'].apply(lambda x: x.month)
birthday_day = all_attendance['date_of_birth'].apply(lambda x: x.day)

on_birthday = (attendance_month == birthday_month) & (attendance_day == birthday_day)

attended_on_birthday = all_attendance[attended & on_birthday]

num_attended_on_birthday = float(attended_on_birthday.shape[0])
num_total = float(all_students.shape[0])

pct_attended_on_birthday = (num_attended_on_birthday / num_total) * 100.
pct_attended_on_birthday

42.1

**Which grade level had the largest drop in attendance between yesterday and today?**

In [200]:
date_attendance = all_attendance[all_attendance['date'] == today]

In [236]:
today = pd.to_datetime('2017-09-04')
yesterday = pd.to_datetime('2017-09-03')

def get_attendance(date):
    date_attendance = all_attendance[all_attendance['date'] == date]
    date_attendance = date_attendance[['grade_level', 'attendance']]
    date_attendance = date_attendance[date_attendance['attendance'] == 1]
    num_attended = date_attendance.groupby('grade_level').sum()
    return num_attended

today_attendance = get_attendance(today)
yesterday_attendance = get_attendance(yesterday)

drop_attendance = today_attendance - yesterday_attendance
drop_attendance.idxmax()['attendance']

'Sophomore'

## LeetCode <a id='leetcode'></a>

### Second/N-th Highest Salary

<div class="question-description__3U1T"><div><p>Write a SQL query to get the second highest salary from the <code>Employee</code> table.</p>

<pre>+----+--------+
| Id | Salary |
+----+--------+
| 1  | 100    |
| 2  | 200    |
| 3  | 300    |
+----+--------+
</pre>

<p>For example, given the above Employee table, the query should return <code>200</code> as the second highest salary. If there is no second highest salary, then the query should return <code>null</code>.</p>

<pre>+---------------------+
| SecondHighestSalary |
+---------------------+
| 200                 |
+---------------------+
</pre>
</div></div>

In [248]:
import pandas as pd
import numpy as np

num_employees = 10

def build_employee_table(num_employees):
    salaries = np.random.choice(np.arange(100, 1000, 100), 10)
    Employee = pd.DataFrame(salaries, columns=['salary'])
    return Employee

Employee = build_employee_table(num_employees)

def get_nth_highest_salary(employee_table, n):
    # Note: 
    #   * can use pd.unique() or Series.unique()
    #   * for pd.unique() can pass it any array like object: Series, np.array, list
    salaries = employee_table['salary'].unique()
    
    # Note: np.sort() returns a sorted copy
    #   ndarray.sort() sorts in place
    #   can't specify ascending/descneding
    salaries_sort = np.sort(salaries)
    return salaries_sort, salaries_sort[-n]
    
print(Employee)
print(get_nth_highest_salary(Employee, 2))

   salary
0     300
1     300
2     100
3     200
4     400
5     600
6     800
7     900
8     900
9     400
(array([100, 200, 300, 400, 600, 800, 900]), 800)


### Customers Who Never Order

<div class="question-detail"><div class="question-description__3U1T"><div><p>Suppose that a website contains two tables, the <code>Customers</code> table and the <code>Orders</code> table. Write a SQL query to find all customers who never order anything.</p>

<p>Table: <code>Customers</code>.</p>

<pre>+----+-------+
| Id | Name  |
+----+-------+
| 1  | Joe   |
| 2  | Henry |
| 3  | Sam   |
| 4  | Max   |
+----+-------+
</pre>

<p>Table: <code>Orders</code>.</p>

<pre>+----+------------+
| Id | CustomerId |
+----+------------+
| 1  | 3          |
| 2  | 1          |
+----+------------+
</pre>

<p>Using the above tables as example, return the following:</p>

<pre>+-----------+
| Customers |
+-----------+
| Henry     |
| Max       |
+-----------+
</pre>
</div></div></div>

In [267]:
import pandas as pd
import numpy as np

# Notes:
#   * Set index name after construction
Customers = pd.DataFrame(
    data=['Joe', 'Henry', 'Sam', 'Max'],
    index=np.arange(1,5),
    columns=['Name']
)
Customers.index.name = 'Id'

Orders = pd.DataFrame(
    data=[3, 1],
    index=np.arange(1,3),
    columns=['CustomerId']
)

Customer_Orders = pd.merge(
    left=Orders,
    right=Customers,
    how='outer',
    left_on=['CustomerId'],
    right_on=['Id']
)

# To get nan's use pd.isnull()
never_ordered = Customer_Orders[pd.isnull(Customer_Orders['CustomerId'])]['Name']
never_ordered = Customer_Orders[Customer_Orders['CustomerId'].isna()]['Name']
never_ordered

2    Henry
3      Max
Name: Name, dtype: object

### Friend Requests I: Overall Acceptance Rate

<div class="question-area"><div class="question-detail"><div class="question-description__3U1T"><div>In social network like Facebook or Twitter, people send friend requests and accept others’ requests as well. Now given two tables as below:<p></p>

Table: <code>friend_request</code>
<pre>| sender_id | send_to_id |request_date|
|-----------|------------|------------|
| 1         | 2          | 2016_06-01 |
| 1         | 3          | 2016_06-01 |
| 1         | 4          | 2016_06-01 |
| 2         | 3          | 2016_06-02 |
| 3         | 4          | 2016-06-09 |
</pre><p></p>

Table: <code>request_accepted</code>
<pre>| requester_id | accepter_id |accept_date |
|--------------|-------------|------------|
| 1            | 2           | 2016_06-03 |
| 1            | 3           | 2016-06-08 |
| 2            | 3           | 2016-06-08 |
| 3            | 4           | 2016-06-09 |
| 3            | 4           | 2016-06-10 |
</pre><p></p>

Write a query to find the overall acceptance rate of requests rounded to 2 decimals, which is the number of acceptance divide the number of requests.<p></p>

For the sample data above, your query should return the following result.<p></p>
<pre>|accept_rate|
|-----------|
|       0.80|
</pre><p></p>

<b>Note:</b>
<li>The accepted requests are not necessarily from the table <code>friend_request</code>. In this case, you just need to simply count the total accepted requests (no matter whether they are in the original requests), and divide it by the number of requests to get the acceptance rate.</li>
<li>It is possible that a sender sends multiple requests to the same receiver, and a request could be accepted more than once. In this case, the ‘duplicated’ requests or acceptances are only counted once.</li>
<li>If there is no requests at all, you should return 0.00 as the accept_rate. </li>
<p></p>

<b>Explanation:</b> There are 4 unique accepted requests, and there are 5 requests in total. So the rate is 0.80.<p></p>

<b>Follow-up:</b><br>
<li>Can you write a query to return the accept rate but for every month?</li>
<li>How about the cumulative accept rate for every day?</li></div></div></div></div>

In [298]:
def build_friend_request():
    friend_request = {
        'sender_id':  [1, 1, 1, 2, 3],
        'send_to_id': [2, 3, 4, 3, 4],
        'request_date': pd.to_datetime([
            '2016-06-01',
            '2016-06-01',
            '2016-06-01',
            '2016-06-02',
            '2016-06-09'
        ])
    }
    return pd.DataFrame(friend_request)
    
def build_request_accepted():
    request_accepted = {
        'requester_id':  [1, 1, 2, 3, 3],
        'accepter_id': [2, 3, 3, 4, 4],
        'accept_date': pd.to_datetime([
            '2016-06-03',
            '2016-06-08',
            '2016-06-08',
            '2016-06-09',
            '2016-06-10'
        ])
    }
    return pd.DataFrame(request_accepted)
    
friend_request = build_friend_request()
friend_request_unique = friend_request.drop_duplicates(subset=['sender_id', 'send_to_id'])

request_accepted = build_request_accepted()
request_accepted_unique = request_accepted.drop_duplicates(subset=['requester_id', 'accepter_id'])

acceptance_rate = float(request_accepted_unique.shape[0]) / friend_request_unique.shape[0]

acceptance_rate    

0.8

In [360]:
# Follow-Ups

def build_friend_request():
    friend_request = {
        'sender_id':  [1, 1, 1, 2, 3, 5, 6, 5],
        'send_to_id': [2, 3, 4, 3, 4, 4, 8, 6],
        'request_date': pd.to_datetime([
            '2016-06-01',
            '2016-06-01',
            '2016-06-01',
            '2016-06-02',
            '2016-06-09',
            '2016-07-02',
            '2016-07-03',
            '2016-07-10'
        ])
    }
    return pd.DataFrame(friend_request)
    
def build_request_accepted():
    request_accepted = {
        'requester_id':  [1, 1, 2, 3, 3, 8, 7, 6],
        'accepter_id':   [2, 3, 3, 4, 4, 9, 5, 5],
        'accept_date': pd.to_datetime([
            '2016-06-03',
            '2016-06-08',
            '2016-06-08',
            '2016-06-09',
            '2016-06-10',
            '2016-07-10',
            '2016-07-10',
            '2016-08-10'
        ])
    }
    return pd.DataFrame(request_accepted)
    
fr = build_friend_request()
fr_unique = fr.drop_duplicates(subset=['sender_id', 'send_to_id'])

ra = build_request_accepted()
ra_unique = ra.drop_duplicates(subset=['requester_id', 'accepter_id'])

fr_unique.index = fr_unique.pop('request_date')
ra_unique.index = ra_unique.pop('accept_date')

# Group By Month & Day

month_key = lambda x: x.month
day_key = lambda x: x.day

# Month
fr_month_counts = fr_unique.groupby(by=[month_key]).count().pop('sender_id')
fr_month_counts.name = 'fr_month_counts'
fr_month_counts = pd.DataFrame(fr_month_counts.apply(float))

ra_month_counts = ra_unique.groupby(by=[month_key]).count().pop('accepter_id')
ra_month_counts.name = 'ra_month_counts'
ra_month_counts = pd.DataFrame(ra_month_counts.apply(float))

accept_rate_month = pd.merge(
    left=ra_month_counts,
    right=fr_month_counts,
    how='outer',
    left_index=True,
    right_index=True
)

accept_rate_month['acceptance_rate'] = accept_rate_month.apply(lambda x: x['ra_month_counts'] / x['fr_month_counts'], axis=1)
accept_rate_month = accept_rate_month.fillna(0.0)

# Day
fr_month_day_counts = fr_unique.groupby(by=[month_key, day_key]).count().pop('sender_id')
fr_month_day_counts.name = 'fr_month_day_counts'
fr_month_day_counts = pd.DataFrame(fr_month_day_counts.apply(float))

ra_month_day_counts = ra_unique.groupby(by=[month_key, day_key]).count().pop('accepter_id')
ra_month_day_counts.name = 'ra_month_day_counts'
ra_month_day_counts = pd.DataFrame(ra_month_day_counts.apply(float))
print(ra_month_day_counts)
print(fr_month_day_counts)

accept_rate_month_day = pd.merge(
    left=ra_month_day_counts,
    right=fr_month_day_counts,
    how='outer',
    left_index=True,
    right_index=True
)


ar_md_dict = {}
months = []
days = []
ar_list = []
for row in accept_rate_month_day.iterrows():
    this_month = row[0][0]
    this_day = row[0][1]
    this_fr = row[1].fr_month_day_counts
    this_ra = row[1].ra_month_day_counts
    this_accept_rate = this_ra / this_fr
    months.append(this_month)
    days.append(this_day)
    ar_list.append(this_accept_rate)
    
ar_dict = {
    'month': months,
    'day': days,
    'accept_rate': ar_list
}

accept_rate_month_day = pd.DataFrame(ar_dict).fillna(0)
accept_rate_month_day
    
    

      ra_month_day_counts
6 3                   1.0
  8                   2.0
  9                   1.0
7 10                  2.0
8 10                  1.0
      fr_month_day_counts
6 1                   3.0
  2                   1.0
  9                   1.0
7 2                   1.0
  3                   1.0
  10                  1.0


Unnamed: 0,accept_rate,day,month
0,0.0,1,6
1,0.0,2,6
2,0.0,3,6
3,0.0,8,6
4,1.0,9,6
5,0.0,2,7
6,0.0,3,7
7,2.0,10,7
8,0.0,10,8


In [380]:
# Should have done it this way
accept_rate_month_day = pd.merge(
    left=ra_month_day_counts,
    right=fr_month_day_counts,
    how='outer',
    left_index=True,
    right_index=True
)

def get_md_arr(idx):
    s = accept_rate_month_day.loc[idx]
    return s.ra_month_day_counts / s.fr_month_day_counts
accept_rate_month_day.apply(lambda x: get_md_arr(x.name), axis=1).fillna(0.0)



6  1     0.0
   2     0.0
   3     0.0
   8     0.0
   9     1.0
7  2     0.0
   3     0.0
   10    2.0
8  10    0.0
dtype: float64