In [1]:
import pandas as pd
import numpy as np


In [2]:
#Data sources
eng_url     = 'https://andybek.com/pandas-eng'
state_url   = 'https://andybek.com/pandas-state'
party_url   = 'https://andybek.com/pandas-party'
liberal_url = 'https://andybek.com/pandas-liberal'
ivies_url   = 'https://andybek.com/pandas-ivies'

In [3]:
eng = pd.read_csv(eng_url)

In [4]:
state = pd.read_csv(state_url)

In [5]:
party = pd.read_csv(party_url)

In [12]:
liberal = pd.read_csv(liberal_url)

In [7]:
ivies = pd.read_csv(ivies_url)

In [14]:
ivies

Unnamed: 0,School Name,School Type,Starting Median Salary,Mid-Career Median Salary
0,Dartmouth College,Ivy League,"$58,000.00","$134,000.00"
1,Princeton University,Ivy League,"$66,500.00","$131,000.00"
2,Yale University,Ivy League,"$59,100.00","$126,000.00"
3,Harvard University,Ivy League,"$63,400.00","$124,000.00"
4,University of Pennsylvania,Ivy League,"$60,900.00","$120,000.00"
5,Cornell University,Ivy League,"$60,300.00","$110,000.00"
6,Brown University,Ivy League,"$56,200.00","$109,000.00"
7,Columbia University,Ivy League,"$59,400.00","$107,000.00"


# **Concatenating DataFrames**

In [17]:
dfs = [eng, state, party, liberal, ivies]
for df in dfs:
  print(df.shape)

(19, 4)
(175, 4)
(20, 4)
(47, 4)
(8, 4)


In [23]:
pd.concat(dfs)[pd.concat(dfs).duplicated(subset=['School Name'],keep='first')]

Unnamed: 0,School Name,School Type,Starting Median Salary,Mid-Career Median Salary
0,University of Illinois at Urbana-Champaign (UIUC),Party,"$52,900.00","$96,100.00"
1,"University of Maryland, College Park",Party,"$52,000.00","$95,000.00"
2,"University of California, Santa Barbara (UCSB)",Party,"$50,500.00","$95,000.00"
3,University of Texas (UT) - Austin,Party,"$49,700.00","$93,900.00"
4,State University of New York (SUNY) at Albany,Party,"$44,500.00","$92,200.00"
5,University of Florida (UF),Party,"$47,100.00","$87,900.00"
6,Louisiana State University (LSU),Party,"$46,900.00","$87,800.00"
7,University of Georgia (UGA),Party,"$44,100.00","$86,000.00"
8,Pennsylvania State University (PSU),Party,"$49,900.00","$85,700.00"
9,Arizona State University (ASU),Party,"$47,400.00","$84,100.00"


In [25]:
schools = pd.concat(dfs).drop_duplicates(subset=['School Name'],keep='first')

# Handling Duplicate Indices

In [29]:
#Handling duplicate indices which are ignored during concatenation method
schools.reset_index(drop=True,inplace=True)

In [30]:
schools.index.duplicated().sum()

np.int64(0)

In [31]:
#Using Multi-indexing
new_df = pd.concat(dfs, keys=['eng','state','party','liberal','ivies'])
new_df

Unnamed: 0,Unnamed: 1,School Name,School Type,Starting Median Salary,Mid-Career Median Salary
eng,0,Massachusetts Institute of Technology (MIT),Engineering,"$72,200.00","$126,000.00"
eng,1,California Institute of Technology (CIT),Engineering,"$75,500.00","$123,000.00"
eng,2,Harvey Mudd College,Engineering,"$71,800.00","$122,000.00"
eng,3,"Polytechnic University of New York, Brooklyn",Engineering,"$62,400.00","$114,000.00"
eng,4,Cooper Union,Engineering,"$62,200.00","$114,000.00"
...,...,...,...,...,...
ivies,3,Harvard University,Ivy League,"$63,400.00","$124,000.00"
ivies,4,University of Pennsylvania,Ivy League,"$60,900.00","$120,000.00"
ivies,5,Cornell University,Ivy League,"$60,300.00","$110,000.00"
ivies,6,Brown University,Ivy League,"$56,200.00","$109,000.00"


In [39]:
new_df.loc['ivies',3].loc['School Type']

'Ivy League'

### **Skill Chanllenge # 1**

In [47]:
#Concatenate liberal and state dataframes into a new dataframe and find the count of unique school names.
dfa = pd.concat([liberal,state])
dfa.loc[:,'School Name'].nunique()

222

In [50]:
dfa.loc[:,'Starting Median Salary'].replace(r'\$|,','',regex=True).astype(float).mean()

np.float64(44469.36936936937)

In [58]:
liberal.loc[liberal.loc[:,'Mid-Career Median Salary'].replace(r'\$|,','',regex=True).astype(float).nlargest(3).index]

Unnamed: 0,School Name,School Type,Starting Median Salary,Mid-Career Median Salary
0,Bucknell University,Liberal Arts,"$54,100.00","$110,000.00"
1,Colgate University,Liberal Arts,"$52,800.00","$108,000.00"
2,Amherst College,Liberal Arts,"$54,500.00","$107,000.00"


In [65]:
liberal.loc[liberal.loc[:,'Mid-Career Median Salary'].replace(r'\$|,','',regex=True).astype(float).sort_values(ascending=False).index]

Unnamed: 0,School Name,School Type,Starting Median Salary,Mid-Career Median Salary
0,Bucknell University,Liberal Arts,"$54,100.00","$110,000.00"
1,Colgate University,Liberal Arts,"$52,800.00","$108,000.00"
2,Amherst College,Liberal Arts,"$54,500.00","$107,000.00"
3,Lafayette College,Liberal Arts,"$53,900.00","$107,000.00"
4,Bowdoin College,Liberal Arts,"$48,100.00","$107,000.00"
5,College of the Holy Cross,Liberal Arts,"$50,200.00","$106,000.00"
6,Occidental College,Liberal Arts,"$51,900.00","$105,000.00"
7,Washington and Lee University,Liberal Arts,"$53,600.00","$104,000.00"
8,Swarthmore College,Liberal Arts,"$49,700.00","$104,000.00"
9,Davidson College,Liberal Arts,"$46,100.00","$104,000.00"


## **Merging of Datasets**

In [67]:
#regional information of schools
regions_url = 'https://andybek.com/pandas-regions'
regions = pd.read_csv(regions_url)

In [68]:
regions

Unnamed: 0,School Name,Region
0,Massachusetts Institute of Technology (MIT),Northeastern
1,California Institute of Technology (CIT),California
2,Harvey Mudd College,California
3,"Polytechnic University of New York, Brooklyn",Northeastern
4,Cooper Union,Northeastern
...,...,...
264,Austin Peay State University,Southern
265,Pittsburg State University,Midwestern
266,Southern Utah University,Western
267,Montana State University - Billings,Western


In [69]:
#Merge schools and regions dataframes
pd.merge(schools,regions,on='School Name')

Unnamed: 0,School Name,School Type,Starting Median Salary,Mid-Career Median Salary,Region
0,Massachusetts Institute of Technology (MIT),Engineering,"$72,200.00","$126,000.00",Northeastern
1,California Institute of Technology (CIT),Engineering,"$75,500.00","$123,000.00",California
2,Harvey Mudd College,Engineering,"$71,800.00","$122,000.00",California
3,"Polytechnic University of New York, Brooklyn",Engineering,"$62,400.00","$114,000.00",Northeastern
4,Cooper Union,Engineering,"$62,200.00","$114,000.00",Northeastern
...,...,...,...,...,...
264,Harvard University,Ivy League,"$63,400.00","$124,000.00",Northeastern
265,University of Pennsylvania,Ivy League,"$60,900.00","$120,000.00",Northeastern
266,Cornell University,Ivy League,"$60,300.00","$110,000.00",Northeastern
267,Brown University,Ivy League,"$56,200.00","$109,000.00",Northeastern


In [71]:
#Merging when the column names are different
income_url = 'https://andybek.com/pandas-mid'
income = pd.read_csv(income_url)

In [72]:
income

Unnamed: 0,school_name,Mid-Career 10th Percentile Salary,Mid-Career 25th Percentile Salary,Mid-Career 75th Percentile Salary,Mid-Career 90th Percentile Salary
0,Massachusetts Institute of Technology (MIT),"$76,800.00","$99,200.00","$168,000.00","$220,000.00"
1,California Institute of Technology (CIT),,"$104,000.00","$161,000.00",
2,Harvey Mudd College,,"$96,000.00","$180,000.00",
3,"Polytechnic University of New York, Brooklyn","$66,800.00","$94,300.00","$143,000.00","$190,000.00"
4,Cooper Union,,"$80,200.00","$142,000.00",
...,...,...,...,...,...
264,Austin Peay State University,"$32,200.00","$40,500.00","$73,900.00","$96,200.00"
265,Pittsburg State University,"$25,600.00","$46,000.00","$84,600.00","$117,000.00"
266,Southern Utah University,"$30,700.00","$39,700.00","$78,400.00","$116,000.00"
267,Montana State University - Billings,"$22,600.00","$31,800.00","$78,500.00","$98,900.00"


In [75]:
#Merge between income and schools dataframes when the column names are different
pd.merge(schools,income,left_on='School Name',right_on='school_name').drop('school_name',axis=1)

Unnamed: 0,School Name,School Type,Starting Median Salary,Mid-Career Median Salary,Mid-Career 10th Percentile Salary,Mid-Career 25th Percentile Salary,Mid-Career 75th Percentile Salary,Mid-Career 90th Percentile Salary
0,Massachusetts Institute of Technology (MIT),Engineering,"$72,200.00","$126,000.00","$76,800.00","$99,200.00","$168,000.00","$220,000.00"
1,California Institute of Technology (CIT),Engineering,"$75,500.00","$123,000.00",,"$104,000.00","$161,000.00",
2,Harvey Mudd College,Engineering,"$71,800.00","$122,000.00",,"$96,000.00","$180,000.00",
3,"Polytechnic University of New York, Brooklyn",Engineering,"$62,400.00","$114,000.00","$66,800.00","$94,300.00","$143,000.00","$190,000.00"
4,Cooper Union,Engineering,"$62,200.00","$114,000.00",,"$80,200.00","$142,000.00",
...,...,...,...,...,...,...,...,...
264,Harvard University,Ivy League,"$63,400.00","$124,000.00","$54,800.00","$86,200.00","$179,000.00","$288,000.00"
265,University of Pennsylvania,Ivy League,"$60,900.00","$120,000.00","$55,900.00","$79,200.00","$192,000.00","$282,000.00"
266,Cornell University,Ivy League,"$60,300.00","$110,000.00","$56,800.00","$79,800.00","$160,000.00","$210,000.00"
267,Brown University,Ivy League,"$56,200.00","$109,000.00","$55,400.00","$74,400.00","$159,000.00","$228,000.00"


### **Joining of DataFrames using Merge method**

In [76]:
#inner join
pd.merge(ivies,regions,how='inner')

Unnamed: 0,School Name,School Type,Starting Median Salary,Mid-Career Median Salary,Region
0,Dartmouth College,Ivy League,"$58,000.00","$134,000.00",Northeastern
1,Princeton University,Ivy League,"$66,500.00","$131,000.00",Northeastern
2,Yale University,Ivy League,"$59,100.00","$126,000.00",Northeastern
3,Harvard University,Ivy League,"$63,400.00","$124,000.00",Northeastern
4,University of Pennsylvania,Ivy League,"$60,900.00","$120,000.00",Northeastern
5,Cornell University,Ivy League,"$60,300.00","$110,000.00",Northeastern
6,Brown University,Ivy League,"$56,200.00","$109,000.00",Northeastern
7,Columbia University,Ivy League,"$59,400.00","$107,000.00",Northeastern


In [77]:
#outer join
pd.merge(ivies,regions,how='outer')

Unnamed: 0,School Name,School Type,Starting Median Salary,Mid-Career Median Salary,Region
0,Amherst College,,,,Northeastern
1,Appalachian State University,,,,Southern
2,Arizona State University (ASU),,,,Western
3,Arizona State University (ASU),,,,Western
4,Arkansas State University (ASU),,,,Southern
...,...,...,...,...,...
264,Whitman College,,,,Western
265,Williams College,,,,Northeastern
266,Wittenberg University,,,,Midwestern
267,Worcester Polytechnic Institute (WPI),,,,Northeastern


In [78]:
#left outer join
pd.merge(ivies,regions,how='left')

Unnamed: 0,School Name,School Type,Starting Median Salary,Mid-Career Median Salary,Region
0,Dartmouth College,Ivy League,"$58,000.00","$134,000.00",Northeastern
1,Princeton University,Ivy League,"$66,500.00","$131,000.00",Northeastern
2,Yale University,Ivy League,"$59,100.00","$126,000.00",Northeastern
3,Harvard University,Ivy League,"$63,400.00","$124,000.00",Northeastern
4,University of Pennsylvania,Ivy League,"$60,900.00","$120,000.00",Northeastern
5,Cornell University,Ivy League,"$60,300.00","$110,000.00",Northeastern
6,Brown University,Ivy League,"$56,200.00","$109,000.00",Northeastern
7,Columbia University,Ivy League,"$59,400.00","$107,000.00",Northeastern


In [79]:
#right outer join
pd.merge(ivies,regions,how='right')

Unnamed: 0,School Name,School Type,Starting Median Salary,Mid-Career Median Salary,Region
0,Massachusetts Institute of Technology (MIT),,,,Northeastern
1,California Institute of Technology (CIT),,,,California
2,Harvey Mudd College,,,,California
3,"Polytechnic University of New York, Brooklyn",,,,Northeastern
4,Cooper Union,,,,Northeastern
...,...,...,...,...,...
264,Austin Peay State University,,,,Southern
265,Pittsburg State University,,,,Midwestern
266,Southern Utah University,,,,Western
267,Montana State University - Billings,,,,Western


In [93]:
#Many-to-Many Relationships between DataFrames
survey = pd.DataFrame(
    {
        'School Type':['Ivy League','Ivy League','Engineering','Engineering'],
        'Prestige':['High','Good','Good','High'],
        'Respondent':[1,2,3,4]
    }
)

In [94]:
survey = pd.concat(
    [
        survey,
        pd.DataFrame(
            {
                'School Type':['Ivy League'],
                'Prestige':['Very High'],
                'Respondent':[5]
            }
        )
    ],
    ignore_index=True
)

In [95]:
survey

Unnamed: 0,School Type,Prestige,Respondent
0,Ivy League,High,1
1,Ivy League,Good,2
2,Engineering,Good,3
3,Engineering,High,4
4,Ivy League,Very High,5


In [102]:
pd.merge(ivies,survey)

Unnamed: 0,School Name,School Type,Starting Median Salary,Mid-Career Median Salary,Prestige,Respondent
0,Dartmouth College,Ivy League,"$58,000.00","$134,000.00",High,1
1,Dartmouth College,Ivy League,"$58,000.00","$134,000.00",Good,2
2,Dartmouth College,Ivy League,"$58,000.00","$134,000.00",Very High,5
3,Princeton University,Ivy League,"$66,500.00","$131,000.00",High,1
4,Princeton University,Ivy League,"$66,500.00","$131,000.00",Good,2
5,Princeton University,Ivy League,"$66,500.00","$131,000.00",Very High,5
6,Yale University,Ivy League,"$59,100.00","$126,000.00",High,1
7,Yale University,Ivy League,"$59,100.00","$126,000.00",Good,2
8,Yale University,Ivy League,"$59,100.00","$126,000.00",Very High,5
9,Harvard University,Ivy League,"$63,400.00","$124,000.00",High,1


In [103]:
pd.merge(ivies,survey)['School Name'].nunique()

8

In [105]:
#Merging by Index
iv1 = ivies.set_index('School Name')
region1 = regions.set_index('School Name')

In [107]:
iv1.index

Index(['Dartmouth College', 'Princeton University', 'Yale University',
       'Harvard University', 'University of Pennsylvania',
       'Cornell University', 'Brown University', 'Columbia University'],
      dtype='object', name='School Name')

In [108]:
pd.merge(iv1,region1,left_index=True,right_index=True)

Unnamed: 0_level_0,School Type,Starting Median Salary,Mid-Career Median Salary,Region
School Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Dartmouth College,Ivy League,"$58,000.00","$134,000.00",Northeastern
Princeton University,Ivy League,"$66,500.00","$131,000.00",Northeastern
Yale University,Ivy League,"$59,100.00","$126,000.00",Northeastern
Harvard University,Ivy League,"$63,400.00","$124,000.00",Northeastern
University of Pennsylvania,Ivy League,"$60,900.00","$120,000.00",Northeastern
Cornell University,Ivy League,"$60,300.00","$110,000.00",Northeastern
Brown University,Ivy League,"$56,200.00","$109,000.00",Northeastern
Columbia University,Ivy League,"$59,400.00","$107,000.00",Northeastern


In [111]:
# Merging by Index with column
pd.merge(iv1,regions,left_index=True,right_on='School Name')

Unnamed: 0,School Type,Starting Median Salary,Mid-Career Median Salary,School Name,Region
86,Ivy League,"$58,000.00","$134,000.00",Dartmouth College,Northeastern
87,Ivy League,"$66,500.00","$131,000.00",Princeton University,Northeastern
88,Ivy League,"$59,100.00","$126,000.00",Yale University,Northeastern
89,Ivy League,"$63,400.00","$124,000.00",Harvard University,Northeastern
90,Ivy League,"$60,900.00","$120,000.00",University of Pennsylvania,Northeastern
91,Ivy League,"$60,300.00","$110,000.00",Cornell University,Northeastern
92,Ivy League,"$56,200.00","$109,000.00",Brown University,Northeastern
93,Ivy League,"$59,400.00","$107,000.00",Columbia University,Northeastern


## **Join Method**

In [125]:
ivies.join(schools,lsuffix='_ivy')

Unnamed: 0,School Name_ivy,School Type_ivy,Starting Median Salary_ivy,Mid-Career Median Salary_ivy,School Name,School Type,Starting Median Salary,Mid-Career Median Salary
0,Dartmouth College,Ivy League,"$58,000.00","$134,000.00",Massachusetts Institute of Technology (MIT),Engineering,"$72,200.00","$126,000.00"
1,Princeton University,Ivy League,"$66,500.00","$131,000.00",California Institute of Technology (CIT),Engineering,"$75,500.00","$123,000.00"
2,Yale University,Ivy League,"$59,100.00","$126,000.00",Harvey Mudd College,Engineering,"$71,800.00","$122,000.00"
3,Harvard University,Ivy League,"$63,400.00","$124,000.00","Polytechnic University of New York, Brooklyn",Engineering,"$62,400.00","$114,000.00"
4,University of Pennsylvania,Ivy League,"$60,900.00","$120,000.00",Cooper Union,Engineering,"$62,200.00","$114,000.00"
5,Cornell University,Ivy League,"$60,300.00","$110,000.00",Worcester Polytechnic Institute (WPI),Engineering,"$61,000.00","$114,000.00"
6,Brown University,Ivy League,"$56,200.00","$109,000.00",Carnegie Mellon University (CMU),Engineering,"$61,800.00","$111,000.00"
7,Columbia University,Ivy League,"$59,400.00","$107,000.00",Rensselaer Polytechnic Institute (RPI),Engineering,"$61,100.00","$110,000.00"


In [117]:
ivies

Unnamed: 0,School Name,School Type,Starting Median Salary,Mid-Career Median Salary
0,Dartmouth College,Ivy League,"$58,000.00","$134,000.00"
1,Princeton University,Ivy League,"$66,500.00","$131,000.00"
2,Yale University,Ivy League,"$59,100.00","$126,000.00"
3,Harvard University,Ivy League,"$63,400.00","$124,000.00"
4,University of Pennsylvania,Ivy League,"$60,900.00","$120,000.00"
5,Cornell University,Ivy League,"$60,300.00","$110,000.00"
6,Brown University,Ivy League,"$56,200.00","$109,000.00"
7,Columbia University,Ivy League,"$59,400.00","$107,000.00"


In [114]:
regions

Unnamed: 0,School Name,Region
0,Massachusetts Institute of Technology (MIT),Northeastern
1,California Institute of Technology (CIT),California
2,Harvey Mudd College,California
3,"Polytechnic University of New York, Brooklyn",Northeastern
4,Cooper Union,Northeastern
...,...,...
264,Austin Peay State University,Southern
265,Pittsburg State University,Midwestern
266,Southern Utah University,Western
267,Montana State University - Billings,Western


# Skill Challenge # 2

In [126]:
#Merge liberal arts school with regions and assign the resluting dataframe to dfm. What region has highest number of liberal arts school?

In [128]:
dfm = pd.merge(liberal,regions)

In [131]:
dfm['Region'].value_counts()

Unnamed: 0_level_0,count
Region,Unnamed: 1_level_1
Northeastern,25
Midwestern,8
Western,7
Southern,5
California,3


In [132]:
#Set school_name as the index of income dataframe.

In [133]:
income.set_index('school_name',inplace=True)

In [135]:
#Merge the dfm and income dataframes

In [143]:
dfm_merge = pd.merge(dfm,income,left_on='School Name',right_index=True)

In [146]:
dfm_merge[dfm_merge.duplicated(subset=['School Name'],keep='first')]

Unnamed: 0,School Name,School Type,Starting Median Salary,Mid-Career Median Salary,Region,Mid-Career 10th Percentile Salary,Mid-Career 25th Percentile Salary,Mid-Career 75th Percentile Salary,Mid-Career 90th Percentile Salary
28,Randolph-Macon College,Liberal Arts,"$42,600.00","$83,600.00",Southern,,"$54,100.00","$123,000.00",
29,Randolph-Macon College,Liberal Arts,"$42,600.00","$83,600.00",Southern,,"$54,100.00","$123,000.00",
29,Randolph-Macon College,Liberal Arts,"$42,600.00","$83,600.00",Southern,,"$54,100.00","$123,000.00",
