# Merge 

In [1]:
# Show the value of every top-level expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd

In [3]:
# Render floating-point values with exactly two decimal places
pd.set_option('display.float_format', '{:.2f}'.format)

Load the survey data

In [23]:
# Read the cleaned survey responses; the first CSV column becomes the row index
survey_csv = 'cleaned_survey.csv'
df = pd.read_csv(survey_csv, index_col=0)

In [25]:
# Preview the first five survey responses
df.head(n=5)

Unnamed: 0,Job,Program,ProgSkills,C,CPP,CS,Java,Python,JS,R,...,Tableau,Regression,Classification,Clustering,Bach_0to1,Bach_1to3,Bach_3to5,Bach_5Plus,Languages,Expert
0,0.0,MSIS,4,1,1,0.0,1,1.0,1.0,0.0,...,0,1.0,4,4,0,1,0,0,6.0,1
1,0.5,MSIS,3,1,1,0.0,1,0.0,0.0,0.0,...,0,0.0,2,2,0,0,0,1,4.0,1
2,0.0,MSIS,3,0,0,0.0,1,1.0,0.0,0.0,...,0,1.0,3,3,0,0,1,0,3.0,1
3,0.0,MSIS,3,1,0,0.0,1,1.0,0.0,1.0,...,0,1.0,2,3,0,0,0,1,5.0,1
4,0.0,MSIS,3,1,0,0.0,1,1.0,0.0,0.0,...,0,0.0,1,1,0,0,1,0,4.0,1


In [27]:
# Number of rows (survey responses) in df
df.shape[0]

61

Let us assume that we also have a dataframe <i>df_programs</i>, which contains the units required to complete the graduate programs at our business school.

In [29]:
# Build the program lookup table with two columns: 'Program' and 'Units_required'
program_names = [
    'MSIS',
    'MBA',
    'Master of Finance',
    'Supply Chain Mgmt & Analytics',
    'Master of Hacking',
]
units_required = [51, 70, 48, 49, 100]
df_programs = pd.DataFrame({'Program': program_names, 'Units_required': units_required})

Note that Master of Hacking (unfortunately) does not actually exist... 

In [31]:
# Display the program lookup table
df_programs

Unnamed: 0,Program,Units_required
0,MSIS,51
1,MBA,70
2,Master of Finance,48
3,Supply Chain Mgmt & Analytics,49
4,Master of Hacking,100


In [33]:
# Distinct Program values reported by the survey respondents in df
df['Program'].unique()

array(['MSIS', 'Supply Chain Mgmt & Analytics', 'MBA', 'Faculty!',
       'Business Man', 'Master of Finance'], dtype=object)

In [35]:
# Distinct Program values listed in the df_programs lookup table
df_programs['Program'].unique()

array(['MSIS', 'MBA', 'Master of Finance',
       'Supply Chain Mgmt & Analytics', 'Master of Hacking'], dtype=object)

In [37]:
# Row count of df again, for comparison with the merge results below
df.shape[0]

61

In [39]:
# Row count of df_programs
df_programs.shape[0]

5

In [43]:
# Check the rows whose Program value appears in df but not in df_programs.
# The unmatched programs are computed with isin() rather than hard-coded,
# so the cell stays correct if either table changes.
df[~df.Program.isin(df_programs.Program)]

Unnamed: 0,Job,Program,ProgSkills,C,CPP,CS,Java,Python,JS,R,...,Tableau,Regression,Classification,Clustering,Bach_0to1,Bach_1to3,Bach_3to5,Bach_5Plus,Languages,Expert
16,1.0,Faculty!,3,1,0,0.0,0,0.0,0.0,0.0,...,0,1.0,5,5,0,0,0,1,3.0,1
31,1.0,Business Man,1,0,0,,0,0.0,1.0,1.0,...,0,1.0,2,3,0,0,0,1,2.0,0


In [45]:
# Check the rows whose Program value appears in df_programs but not in df.
# The unmatched programs are computed with isin() rather than hard-coded,
# so the cell stays correct if either table changes.
df_programs[~df_programs.Program.isin(df.Program)]

Unnamed: 0,Program,Units_required
4,Master of Hacking,100


## Merge on columns

A Merge operation ("join" in relational DBs) consists of joining the columns of two tables based on the equality of one or more columns. For example, we can add to <i>df</i> a column <i>Units_required</i>, which reports the units required by the program in which each student is enrolled.

### INNER MERGE (default)

Compact formulation: the merge will be performed on the columns with the same name in both tables. Merging <i>df</i> with <i>df_programs</i> will perform the merge on the column <i>Program</i>, because that is the only column with the same name.

Use these options to expand the column and row display (passing `None` instead of a number removes the limit entirely):

pd.set_option('display.max_columns', 50)  and pd.set_option('display.max_rows', 100)

In [47]:
# Lift pandas' display limits so that every column and every row is rendered
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, None)

In [49]:
# Inner merge (the default) of df and df_programs on their shared column name(s)
pd.merge(df, df_programs)

Unnamed: 0,Job,Program,ProgSkills,C,CPP,CS,Java,Python,JS,R,SQL,SAS,Excel,Tableau,Regression,Classification,Clustering,Bach_0to1,Bach_1to3,Bach_3to5,Bach_5Plus,Languages,Expert,Units_required
0,0.0,MSIS,4,1,1,0.0,1,1.0,1.0,0.0,1.0,0.0,1,0,1.0,4,4,0,1,0,0,6.0,1,51
1,0.5,MSIS,3,1,1,0.0,1,0.0,0.0,0.0,1.0,0.0,1,0,0.0,2,2,0,0,0,1,4.0,1,51
2,0.0,MSIS,3,0,0,0.0,1,1.0,0.0,0.0,1.0,0.0,1,0,1.0,3,3,0,0,1,0,3.0,1,51
3,0.0,MSIS,3,1,0,0.0,1,1.0,0.0,1.0,1.0,0.0,1,0,1.0,2,3,0,0,0,1,5.0,1,51
4,0.0,MSIS,3,1,0,0.0,1,1.0,0.0,0.0,1.0,0.0,1,0,0.0,1,1,0,0,1,0,4.0,1,51
5,1.0,Supply Chain Mgmt & Analytics,1,0,0,0.0,0,0.0,0.0,0.0,1.0,0.0,1,0,1.0,1,1,0,1,0,0,1.0,0,49
6,0.0,MSIS,3,1,1,0.0,1,0.0,0.0,0.0,1.0,0.0,1,0,0.0,2,2,0,1,0,0,4.0,1,51
7,0.0,MSIS,2,1,0,0.0,1,0.0,0.0,0.0,1.0,0.0,1,1,0.0,2,2,1,0,0,0,3.0,1,51
8,1.0,MBA,1,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,1,0,1.0,1,1,0,0,0,1,0.0,0,70
9,0.5,MSIS,3,1,0,0.0,1,1.0,0.0,0.0,1.0,0.0,1,1,1.0,2,1,0,0,0,1,4.0,1,51


Or we can specify the names of the columns with <i>left_on</i> (the column or list of columns on the "left" table) and <i>right_on</i> (the column or list of columns on the "right" table)

In [51]:
# Same inner merge, naming the key column explicitly on each side
pd.merge(
    df,
    df_programs,
    left_on='Program',
    right_on='Program',
)

Unnamed: 0,Job,Program,ProgSkills,C,CPP,CS,Java,Python,JS,R,SQL,SAS,Excel,Tableau,Regression,Classification,Clustering,Bach_0to1,Bach_1to3,Bach_3to5,Bach_5Plus,Languages,Expert,Units_required
0,0.0,MSIS,4,1,1,0.0,1,1.0,1.0,0.0,1.0,0.0,1,0,1.0,4,4,0,1,0,0,6.0,1,51
1,0.5,MSIS,3,1,1,0.0,1,0.0,0.0,0.0,1.0,0.0,1,0,0.0,2,2,0,0,0,1,4.0,1,51
2,0.0,MSIS,3,0,0,0.0,1,1.0,0.0,0.0,1.0,0.0,1,0,1.0,3,3,0,0,1,0,3.0,1,51
3,0.0,MSIS,3,1,0,0.0,1,1.0,0.0,1.0,1.0,0.0,1,0,1.0,2,3,0,0,0,1,5.0,1,51
4,0.0,MSIS,3,1,0,0.0,1,1.0,0.0,0.0,1.0,0.0,1,0,0.0,1,1,0,0,1,0,4.0,1,51
5,1.0,Supply Chain Mgmt & Analytics,1,0,0,0.0,0,0.0,0.0,0.0,1.0,0.0,1,0,1.0,1,1,0,1,0,0,1.0,0,49
6,0.0,MSIS,3,1,1,0.0,1,0.0,0.0,0.0,1.0,0.0,1,0,0.0,2,2,0,1,0,0,4.0,1,51
7,0.0,MSIS,2,1,0,0.0,1,0.0,0.0,0.0,1.0,0.0,1,1,0.0,2,2,1,0,0,0,3.0,1,51
8,1.0,MBA,1,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,1,0,1.0,1,1,0,0,0,1,0.0,0,70
9,0.5,MSIS,3,1,0,0.0,1,1.0,0.0,0.0,1.0,0.0,1,1,1.0,2,1,0,0,0,1,4.0,1,51


In [53]:
# Keep the inner-merge result for the cells below
df_inner = pd.merge(df, df_programs, left_on='Program', right_on='Program')

In [55]:
# Row count of the inner-merge result
df_inner.shape[0]

59

In [57]:
# Restrict the display to three columns: 'Program', 'ProgSkills', 'Units_required'
inner_cols = ['Program', 'ProgSkills', 'Units_required']
df_inner.loc[:, inner_cols]

Unnamed: 0,Program,ProgSkills,Units_required
0,MSIS,4,51
1,MSIS,3,51
2,MSIS,3,51
3,MSIS,3,51
4,MSIS,3,51
5,Supply Chain Mgmt & Analytics,1,49
6,MSIS,3,51
7,MSIS,2,51
8,MBA,1,70
9,MSIS,3,51


### LEFT MERGE

This is the equivalent of the left outer join in relational DBs. If a row on the left table finds no match, it will still appear in the result and the missing values will be filled with NAs.

In [61]:
# Left merge: keep every row of df, matched or not
pd.merge(df, df_programs, how='left')

Unnamed: 0,Job,Program,ProgSkills,C,CPP,CS,Java,Python,JS,R,SQL,SAS,Excel,Tableau,Regression,Classification,Clustering,Bach_0to1,Bach_1to3,Bach_3to5,Bach_5Plus,Languages,Expert,Units_required
0,0.0,MSIS,4,1,1,0.0,1,1.0,1.0,0.0,1.0,0.0,1,0,1.0,4,4,0,1,0,0,6.0,1,51.0
1,0.5,MSIS,3,1,1,0.0,1,0.0,0.0,0.0,1.0,0.0,1,0,0.0,2,2,0,0,0,1,4.0,1,51.0
2,0.0,MSIS,3,0,0,0.0,1,1.0,0.0,0.0,1.0,0.0,1,0,1.0,3,3,0,0,1,0,3.0,1,51.0
3,0.0,MSIS,3,1,0,0.0,1,1.0,0.0,1.0,1.0,0.0,1,0,1.0,2,3,0,0,0,1,5.0,1,51.0
4,0.0,MSIS,3,1,0,0.0,1,1.0,0.0,0.0,1.0,0.0,1,0,0.0,1,1,0,0,1,0,4.0,1,51.0
5,1.0,Supply Chain Mgmt & Analytics,1,0,0,0.0,0,0.0,0.0,0.0,1.0,0.0,1,0,1.0,1,1,0,1,0,0,1.0,0,49.0
6,0.0,MSIS,3,1,1,0.0,1,0.0,0.0,0.0,1.0,0.0,1,0,0.0,2,2,0,1,0,0,4.0,1,51.0
7,0.0,MSIS,2,1,0,0.0,1,0.0,0.0,0.0,1.0,0.0,1,1,0.0,2,2,1,0,0,0,3.0,1,51.0
8,1.0,MBA,1,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,1,0,1.0,1,1,0,0,0,1,0.0,0,70.0
9,0.5,MSIS,3,1,0,0.0,1,1.0,0.0,0.0,1.0,0.0,1,1,1.0,2,1,0,0,0,1,4.0,1,51.0


In [63]:
# Row count of the left-merge result
pd.merge(df, df_programs, how='left').shape[0]

61

### RIGHT MERGE

In [65]:
# Right merge: keep every row of df_programs, matched or not
pd.merge(df, df_programs, how='right')

Unnamed: 0,Job,Program,ProgSkills,C,CPP,CS,Java,Python,JS,R,SQL,SAS,Excel,Tableau,Regression,Classification,Clustering,Bach_0to1,Bach_1to3,Bach_3to5,Bach_5Plus,Languages,Expert,Units_required
0,0.0,MSIS,4.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,4.0,4.0,0.0,1.0,0.0,0.0,6.0,1.0,51
1,0.5,MSIS,3.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,1.0,4.0,1.0,51
2,0.0,MSIS,3.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,3.0,3.0,0.0,0.0,1.0,0.0,3.0,1.0,51
3,0.0,MSIS,3.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,2.0,3.0,0.0,0.0,0.0,1.0,5.0,1.0,51
4,0.0,MSIS,3.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,4.0,1.0,51
5,0.0,MSIS,3.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,2.0,2.0,0.0,1.0,0.0,0.0,4.0,1.0,51
6,0.0,MSIS,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,2.0,2.0,1.0,0.0,0.0,0.0,3.0,1.0,51
7,0.5,MSIS,3.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,2.0,1.0,0.0,0.0,0.0,1.0,4.0,1.0,51
8,0.5,MSIS,4.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,5.0,1.0,51
9,0.0,MSIS,5.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,4.0,1.0,51


In [67]:
# Right merge again, showing only a handful of columns
right_cols = ['Program', 'ProgSkills', 'Languages', 'Expert', 'Units_required']
pd.merge(df, df_programs, how='right')[right_cols]

Unnamed: 0,Program,ProgSkills,Languages,Expert,Units_required
0,MSIS,4.0,6.0,1.0,51
1,MSIS,3.0,4.0,1.0,51
2,MSIS,3.0,3.0,1.0,51
3,MSIS,3.0,5.0,1.0,51
4,MSIS,3.0,4.0,1.0,51
5,MSIS,3.0,4.0,1.0,51
6,MSIS,2.0,3.0,1.0,51
7,MSIS,3.0,4.0,1.0,51
8,MSIS,4.0,5.0,1.0,51
9,MSIS,5.0,4.0,1.0,51


In [69]:
# Row count of the right-merge result
pd.merge(df, df_programs, how='right').shape[0]

60

### OUTER MERGE

In [71]:
# Outer merge: keep the rows of both df and df_programs, matched or not
pd.merge(df, df_programs, how='outer')

Unnamed: 0,Job,Program,ProgSkills,C,CPP,CS,Java,Python,JS,R,SQL,SAS,Excel,Tableau,Regression,Classification,Clustering,Bach_0to1,Bach_1to3,Bach_3to5,Bach_5Plus,Languages,Expert,Units_required
0,1.0,Business Man,1.0,0.0,0.0,,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,2.0,3.0,0.0,0.0,0.0,1.0,2.0,0.0,
1,1.0,Faculty!,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,5.0,5.0,0.0,0.0,0.0,1.0,3.0,1.0,
2,1.0,MBA,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,70.0
3,0.5,MBA,4.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,2.0,2.0,0.0,0.0,0.0,1.0,5.0,1.0,70.0
4,1.0,MBA,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,70.0
5,1.0,MBA,5.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,2.0,2.0,0.0,0.0,0.0,1.0,5.0,1.0,70.0
6,1.0,MBA,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,70.0
7,1.0,MBA,4.0,0.0,0.0,0.0,0.0,1.0,,1.0,1.0,,1.0,0.0,1.0,4.0,4.0,0.0,0.0,0.0,1.0,3.0,1.0,70.0
8,0.5,MBA,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,2.0,2.0,0.0,1.0,0.0,0.0,1.0,0.0,70.0
9,0.0,MBA,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,2.0,0.0,0.0,1.0,0.0,1.0,0.0,70.0


In [73]:
# Row count of the outer-merge result
pd.merge(df, df_programs, how='outer').shape[0]

62

## Merge on Indices

Let's create a new DataFrame, called <i>df_program_i</i>, which is a copy of <i>df_programs</i> but with <i>Program</i> as the index instead of a column.

In [75]:
# Move the Program column of df_programs into the index of a new frame
df_program_i = df_programs.set_index(keys='Program')

In [79]:
# Display the Program-indexed lookup table
df_program_i

Unnamed: 0_level_0,Units_required
Program,Unnamed: 1_level_1
MSIS,51
MBA,70
Master of Finance,48
Supply Chain Mgmt & Analytics,49
Master of Hacking,100


To merge <i>df</i> (left table) with <i>df_program_i</i> (right table), we need to specify that we use the index on the right table (<b>right_index = True</b>).

In [83]:
# Match df's Program column against the index of the right dataframe, df_program_i
pd.merge(
    df,
    df_program_i,
    left_on='Program',
    right_index=True,
).head()

Unnamed: 0,Job,Program,ProgSkills,C,CPP,CS,Java,Python,JS,R,SQL,SAS,Excel,Tableau,Regression,Classification,Clustering,Bach_0to1,Bach_1to3,Bach_3to5,Bach_5Plus,Languages,Expert,Units_required
0,0.0,MSIS,4,1,1,0.0,1,1.0,1.0,0.0,1.0,0.0,1,0,1.0,4,4,0,1,0,0,6.0,1,51
1,0.5,MSIS,3,1,1,0.0,1,0.0,0.0,0.0,1.0,0.0,1,0,0.0,2,2,0,0,0,1,4.0,1,51
2,0.0,MSIS,3,0,0,0.0,1,1.0,0.0,0.0,1.0,0.0,1,0,1.0,3,3,0,0,1,0,3.0,1,51
3,0.0,MSIS,3,1,0,0.0,1,1.0,0.0,1.0,1.0,0.0,1,0,1.0,2,3,0,0,0,1,5.0,1,51
4,0.0,MSIS,3,1,0,0.0,1,1.0,0.0,0.0,1.0,0.0,1,0,0.0,1,1,0,0,1,0,4.0,1,51


In [85]:
# Row count of the merge on df_program_i's index
pd.merge(df, df_program_i, left_on='Program', right_index=True).shape[0]

59

## Problems

For each programming skills level, find the average number of units to be completed by students with that programming skill level


In [87]:
# Inner merge: only respondents whose Program is listed in df_programs are kept
df_complete = pd.merge(df, df_programs, left_on='Program', right_on='Program')
df_complete

Unnamed: 0,Job,Program,ProgSkills,C,CPP,CS,Java,Python,JS,R,SQL,SAS,Excel,Tableau,Regression,Classification,Clustering,Bach_0to1,Bach_1to3,Bach_3to5,Bach_5Plus,Languages,Expert,Units_required
0,0.0,MSIS,4,1,1,0.0,1,1.0,1.0,0.0,1.0,0.0,1,0,1.0,4,4,0,1,0,0,6.0,1,51
1,0.5,MSIS,3,1,1,0.0,1,0.0,0.0,0.0,1.0,0.0,1,0,0.0,2,2,0,0,0,1,4.0,1,51
2,0.0,MSIS,3,0,0,0.0,1,1.0,0.0,0.0,1.0,0.0,1,0,1.0,3,3,0,0,1,0,3.0,1,51
3,0.0,MSIS,3,1,0,0.0,1,1.0,0.0,1.0,1.0,0.0,1,0,1.0,2,3,0,0,0,1,5.0,1,51
4,0.0,MSIS,3,1,0,0.0,1,1.0,0.0,0.0,1.0,0.0,1,0,0.0,1,1,0,0,1,0,4.0,1,51
5,1.0,Supply Chain Mgmt & Analytics,1,0,0,0.0,0,0.0,0.0,0.0,1.0,0.0,1,0,1.0,1,1,0,1,0,0,1.0,0,49
6,0.0,MSIS,3,1,1,0.0,1,0.0,0.0,0.0,1.0,0.0,1,0,0.0,2,2,0,1,0,0,4.0,1,51
7,0.0,MSIS,2,1,0,0.0,1,0.0,0.0,0.0,1.0,0.0,1,1,0.0,2,2,1,0,0,0,3.0,1,51
8,1.0,MBA,1,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,1,0,1.0,1,1,0,0,0,1,0.0,0,70
9,0.5,MSIS,3,1,0,0.0,1,1.0,0.0,0.0,1.0,0.0,1,1,1.0,2,1,0,0,0,1,4.0,1,51


In [89]:
# Average 'Units_required' within each 'ProgSkills' level
units_required = df_complete['Units_required']
units_required.groupby(df_complete['ProgSkills']).mean()

ProgSkills
1   63.33
2   57.73
3   54.39
4   53.92
5   60.50
Name: Units_required, dtype: float64

In [91]:
# Display the program lookup table again for reference
df_programs

Unnamed: 0,Program,Units_required
0,MSIS,51
1,MBA,70
2,Master of Finance,48
3,Supply Chain Mgmt & Analytics,49
4,Master of Hacking,100


In [93]:
# Respondents in df whose ProgSkills value is 5
is_level_5 = df['ProgSkills'] == 5
df.loc[is_level_5]

Unnamed: 0,Job,Program,ProgSkills,C,CPP,CS,Java,Python,JS,R,SQL,SAS,Excel,Tableau,Regression,Classification,Clustering,Bach_0to1,Bach_1to3,Bach_3to5,Bach_5Plus,Languages,Expert
14,1.0,MBA,5,1,1,0.0,1,0.0,1.0,0.0,1.0,0.0,1,0,1.0,2,2,0,0,0,1,5.0,1
19,0.0,MSIS,5,1,0,0.0,1,1.0,0.0,0.0,1.0,0.0,1,1,0.0,1,1,0,0,1,0,4.0,1


For each existing program (i.e., for each Program in df_programs), find the units required to complete it and the number of students belonging to that program that responded to the survey. 

In [97]:
# Right merge keeps every program in df_programs, even one with no respondents.
# Counting the non-null 'C' values per program gives the number of respondents
# (0 for a program that only appears in df_programs).
(
    pd.merge(df, df_programs, how='right')
    .groupby('Program')
    .agg(
        Units_required=('Units_required', 'mean'),
        df_student=('C', 'count'),
    )
)

Unnamed: 0_level_0,Units_required,df_student
Program,Unnamed: 1_level_1,Unnamed: 2_level_1
MBA,70.0,16
MSIS,51.0,40
Master of Finance,48.0,1
Master of Hacking,100.0,0
Supply Chain Mgmt & Analytics,49.0,2


In [99]:
# Right merge: one row per respondent in a listed program, plus listed programs with no respondents
df_RealProgram = pd.merge(df, df_programs, how='right')

In [101]:
# Display the right-merge result
df_RealProgram

Unnamed: 0,Job,Program,ProgSkills,C,CPP,CS,Java,Python,JS,R,SQL,SAS,Excel,Tableau,Regression,Classification,Clustering,Bach_0to1,Bach_1to3,Bach_3to5,Bach_5Plus,Languages,Expert,Units_required
0,0.0,MSIS,4.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,4.0,4.0,0.0,1.0,0.0,0.0,6.0,1.0,51
1,0.5,MSIS,3.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,1.0,4.0,1.0,51
2,0.0,MSIS,3.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,3.0,3.0,0.0,0.0,1.0,0.0,3.0,1.0,51
3,0.0,MSIS,3.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,2.0,3.0,0.0,0.0,0.0,1.0,5.0,1.0,51
4,0.0,MSIS,3.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,4.0,1.0,51
5,0.0,MSIS,3.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,2.0,2.0,0.0,1.0,0.0,0.0,4.0,1.0,51
6,0.0,MSIS,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,2.0,2.0,1.0,0.0,0.0,0.0,3.0,1.0,51
7,0.5,MSIS,3.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,2.0,1.0,0.0,0.0,0.0,1.0,4.0,1.0,51
8,0.5,MSIS,4.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,5.0,1.0,51
9,0.0,MSIS,5.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,4.0,1.0,51


For each person in df, compute the number of weekly hours they are working (coursework + job), assuming that:
<ul>
<li>each required unit of coursework is 0.25 hours a week of work
<li>Job=0 is 0 hours a week of work
<li>Job=0.5 is 20 hours a week of work
<li>Job=1 is 40 hours a week of work
</ul>

In [103]:
# The question covers every person in df, so use a left join to keep all of df's observations
df_left = pd.merge(df, df_programs, how='left')

In [105]:
# Weekly coursework hours: each required unit counts as 0.25 hours per week
df_left['hrs_coursework'] = df_left['Units_required'].mul(0.25)

In [111]:
# Generate weekly hrs_job column based on the conditions:
#   Job == 1   -> 40 hours, Job == 0.5 -> 20 hours, anything else -> 0 hours.
# Job is read from df_left itself rather than from df: the merge gives df_left a
# fresh 0..n-1 index, and assigning a Series built from df would align on df's
# original index labels, which is only correct if those labels happen to match.
df_left['hrs_job'] = df_left['Job'].apply(lambda x: 40 if x==1.0 else 20 if x==0.5 else 0)

In [113]:
# Total weekly hours = job hours + coursework hours
# (stays NaN where hrs_coursework is NaN)
df_left['hrs_total'] = df_left['hrs_job'].add(df_left['hrs_coursework'])

In [115]:
# Show the three computed hour columns side by side
hour_cols = ['hrs_coursework', 'hrs_job', 'hrs_total']
df_left.loc[:, hour_cols]

Unnamed: 0,hrs_coursework,hrs_job,hrs_total
0,12.75,0,12.75
1,12.75,20,32.75
2,12.75,0,12.75
3,12.75,0,12.75
4,12.75,0,12.75
5,12.25,40,52.25
6,12.75,0,12.75
7,12.75,0,12.75
8,17.5,40,57.5
9,12.75,20,32.75
