# Dataset Creation (national_university_rankings.csv and majors.csv)

**Sources:**
- https://data.world/education/university-rankings-2017
- https://github.com/fivethirtyeight/data/tree/master/college-majors

In [59]:
# imports
import csv
import pandas as pd
import numpy as np

In [60]:
# Load the university rankings CSV (decoded as ISO-8859-1) into a DataFrame,
# then relabel its columns with the names used throughout this notebook.
uni_df = pd.read_csv(
    'national_university_rankings.csv',
    encoding='ISO-8859-1',
)
uni_df.columns = [
    'University',
    'Location',
    'Rank',
    'Description',
    'Tuition and fees',
    'In-state',
    'Undergrad Enrollment',
]
uni_df

Unnamed: 0,University,Location,Rank,Description,Tuition and fees,In-state,Undergrad Enrollment
0,Princeton University,"Princeton, NJ",1,"Princeton, the fourth-oldest college in the Un...","$45,320",,5402
1,Harvard University,"Cambridge, MA",2,"Harvard is located in Cambridge, Massachusetts...","$47,074",,6699
2,University of Chicago,"Chicago, IL",3,"The University of Chicago, situated in Chicago...","$52,491",,5844
3,Yale University,"New Haven, CT",3,"Yale University, located in New Haven, Connect...","$49,480",,5532
4,Columbia University,"New York, NY",5,"Columbia University, located in Manhattan's Mo...","$55,056",,6102
...,...,...,...,...,...,...,...
226,University of Massachusetts--Dartmouth,"North Dartmouth, MA",220,"Located about 60 miles south of Boston, the Un...","$19,270","$12,588",7295
227,University of Missouri--St. Louis,"St. Louis, MO",220,Undergraduates at University of Missouri--St. ...,"$26,277","$10,065",13569
228,University of North Carolina--Greensboro,"Greensboro, NC",220,University of North Carolina--Greensboro is lo...,"$21,595","$6,733",15951
229,University of Southern Mississippi,"Hattiesburg, MS",220,The University of Southern Mississippi has two...,"$16,094","$7,224",11840


In [61]:
# Swap every '--' for ', ' in the string cells of the frame, e.g.
# 'University of Missouri--St. Louis' -> 'University of Missouri, St. Louis'.
uni_df = uni_df.replace(to_replace=r'--', value=', ', regex=True)
uni_df

Unnamed: 0,University,Location,Rank,Description,Tuition and fees,In-state,Undergrad Enrollment
0,Princeton University,"Princeton, NJ",1,"Princeton, the fourth-oldest college in the Un...","$45,320",,5402
1,Harvard University,"Cambridge, MA",2,"Harvard is located in Cambridge, Massachusetts...","$47,074",,6699
2,University of Chicago,"Chicago, IL",3,"The University of Chicago, situated in Chicago...","$52,491",,5844
3,Yale University,"New Haven, CT",3,"Yale University, located in New Haven, Connect...","$49,480",,5532
4,Columbia University,"New York, NY",5,"Columbia University, located in Manhattan's Mo...","$55,056",,6102
...,...,...,...,...,...,...,...
226,"University of Massachusetts, Dartmouth","North Dartmouth, MA",220,"Located about 60 miles south of Boston, the Un...","$19,270","$12,588",7295
227,"University of Missouri, St. Louis","St. Louis, MO",220,"Undergraduates at University of Missouri, St. ...","$26,277","$10,065",13569
228,"University of North Carolina, Greensboro","Greensboro, NC",220,"University of North Carolina, Greensboro is lo...","$21,595","$6,733",15951
229,University of Southern Mississippi,"Hattiesburg, MS",220,The University of Southern Mississippi has two...,"$16,094","$7,224",11840


In [62]:
# Pull out the first column (the university names) and confirm that no name
# appears more than once.
universities = uni_df[uni_df.columns[0]]
universities.is_unique

True

In [63]:
# Load the majors CSV (decoded as ISO-8859-1) into a DataFrame and relabel
# its three columns with human-readable names.
majors_df = pd.read_csv(
    'majors.csv',
    encoding='ISO-8859-1',
)
majors_df.columns = [
    'Major Code',
    'Major',
    'Major Category',
]
majors_df

Unnamed: 0,Major Code,Major,Major Category
0,1100,GENERAL AGRICULTURE,Agriculture & Natural Resources
1,1101,AGRICULTURE PRODUCTION AND MANAGEMENT,Agriculture & Natural Resources
2,1102,AGRICULTURAL ECONOMICS,Agriculture & Natural Resources
3,1103,ANIMAL SCIENCES,Agriculture & Natural Resources
4,1104,FOOD SCIENCE,Agriculture & Natural Resources
...,...,...,...
169,5504,GEOGRAPHY,Social Science
170,5505,INTERNATIONAL RELATIONS,Social Science
171,5506,POLITICAL SCIENCE AND GOVERNMENT,Social Science
172,5507,SOCIOLOGY,Social Science


In [64]:
# Take the first two columns (major code and major name) and verify that
# neither of them contains duplicate values.
major_codes, majors = majors_df.iloc[:, 0], majors_df.iloc[:, 1]

(major_codes.is_unique, majors.is_unique)

(True, True)

In [65]:
# Normalise the Major column to sentence case: first character upper-case,
# everything else lower-case. `str.capitalize()` already lower-cases the
# remainder of each string, so no separate `str.lower()` pass is needed.
majors_df['Major'] = majors_df['Major'].str.capitalize()

# Re-bind `majors` to the freshly assigned column. A Series extracted before
# the assignment above is not guaranteed to reflect the new values (it does
# not under pandas Copy-on-Write), so later cells that use `majors` would
# otherwise risk working with the original upper-case names.
majors = majors_df['Major']
majors_df

Unnamed: 0,Major Code,Major,Major Category
0,1100,General agriculture,Agriculture & Natural Resources
1,1101,Agriculture production and management,Agriculture & Natural Resources
2,1102,Agricultural economics,Agriculture & Natural Resources
3,1103,Animal sciences,Agriculture & Natural Resources
4,1104,Food science,Agriculture & Natural Resources
...,...,...,...
169,5504,Geography,Social Science
170,5505,International relations,Social Science
171,5506,Political science and government,Social Science
172,5507,Sociology,Social Science


In [66]:
# Load the students dataset from students.csv into a DataFrame
students_df = pd.read_csv(filepath_or_buffer='students.csv')
students_df

Unnamed: 0,Student ID,First Name,Last Name,Email,Gender,Major,University
0,10613065,Inga,Zeeba,inga.zeeba@gmail.com,Female,international studies,University of Wollongong
1,72356025,Kaja,Mandler,kaja.mandler@gmail.com,Female,finance,Western Sydney University
2,15140208,Doro,Georas,doro.georas@gmail.com,Female,computer science,La Trobe University
3,63401082,Merry,Bennie,merry.bennie@gmail.com,Female,business information systems,Macquarie University
4,91040330,Luci,Bord,luci.bord@gmail.com,Male,software development,Charles Sturt University
...,...,...,...,...,...,...,...
9995,95259310,Kimberley,Jalbert,kimberley.jalbert@gmail.com,Male,ecometrics,University of Wollongong
9996,94143698,Aigneis,Rogerio,aigneis.rogerio@gmail.com,Female,accounting,Western Sydney University
9997,81242115,Hermione,Agle,hermione.agle@gmail.com,Female,gender studies,Charles Sturt University
9998,16376229,Melodie,Thunell,melodie.thunell@gmail.com,Female,game design,Macquarie University


In [67]:
# Show the university names as a plain Python list
universities.to_list()

['Princeton University',
 'Harvard University',
 'University of Chicago',
 'Yale University',
 'Columbia University',
 'Stanford University',
 'Massachusetts Institute of Technology',
 'Duke University',
 'University of Pennsylvania',
 'Johns Hopkins University',
 'Dartmouth College',
 'California Institute of Technology',
 'Northwestern University',
 'Brown University',
 'Cornell University',
 'Rice University',
 'University of Notre Dame',
 'Vanderbilt University',
 'Washington University in St. Louis',
 'Emory University',
 'Georgetown University',
 'University of California, Berkeley',
 'University of Southern California',
 'Carnegie Mellon University',
 'University of California, Los Angeles',
 'University of Virginia',
 'Tufts University',
 'University of Michigan, Ann Arbor',
 'Wake Forest University',
 'University of North Carolina, Chapel Hill',
 'Boston College',
 'College of William & Mary',
 'University of Rochester',
 'Brandeis University',
 'Georgia Institute of Technolog

In [68]:
# List the major names as a plain Python list.
# Read the column straight from majors_df rather than through the `majors`
# Series captured before the Major column was re-cased: that earlier Series is
# not guaranteed to see the reassigned values (it does not under pandas
# Copy-on-Write), which would make this list show the original upper-case names.
majors_df['Major'].tolist()

['General agriculture',
 'Agriculture production and management',
 'Agricultural economics',
 'Animal sciences',
 'Food science',
 'Plant science and agronomy',
 'Soil science',
 'Miscellaneous agriculture',
 'Forestry',
 'Natural resources management',
 'Fine arts',
 'Drama and theater arts',
 'Music',
 'Visual and performing arts',
 'Commercial art and graphic design',
 'Film video and photographic arts',
 'Studio arts',
 'Miscellaneous fine arts',
 'Environmental science',
 'Biology',
 'Biochemical sciences',
 'Botany',
 'Molecular biology',
 'Ecology',
 'Genetics',
 'Microbiology',
 'Pharmacology',
 'Physiology',
 'Zoology',
 'Neuroscience',
 'Miscellaneous biology',
 'Cognitive science and biopsychology',
 'General business',
 'Accounting',
 'Actuarial science',
 'Business management and administration',
 'Operations logistics and e-commerce',
 'Business economics',
 'Marketing and marketing research',
 'Finance',
 'Human resources and personnel management',
 'International busine

In [69]:
# Overwrite the Major and University columns of students_df with values drawn
# uniformly at random (with replacement) from the cleaned majors and
# universities, one draw per student row.
#
# - A dedicated, seeded RandomState makes the assignment reproducible:
#   re-running this cell (or the whole notebook) yields the same students table.
# - The pools are read from the DataFrame columns directly instead of from
#   Series objects captured in earlier cells, so they always reflect the
#   cleaned values (capitalised majors, '--' replaced in university names).
rng = np.random.RandomState(42)
n_students = len(students_df)
students_df['Major'] = rng.choice(majors_df['Major'], size=n_students)
students_df['University'] = rng.choice(uni_df['University'], size=n_students)
students_df

Unnamed: 0,Student ID,First Name,Last Name,Email,Gender,Major,University
0,10613065,Inga,Zeeba,inga.zeeba@gmail.com,Female,Architectural engineering,University of Vermont
1,72356025,Kaja,Mandler,kaja.mandler@gmail.com,Female,Teacher education: multiple levels,Union University
2,15140208,Doro,Georas,doro.georas@gmail.com,Female,Miscellaneous biology,University of Alabama
3,63401082,Merry,Bennie,merry.bennie@gmail.com,Female,Liberal arts,University of Pennsylvania
4,91040330,Luci,Bord,luci.bord@gmail.com,Male,Business economics,College of William & Mary
...,...,...,...,...,...,...,...
9995,95259310,Kimberley,Jalbert,kimberley.jalbert@gmail.com,Male,Engineering mechanics physics and science,St. John Fisher College
9996,94143698,Aigneis,Rogerio,aigneis.rogerio@gmail.com,Female,Engineering technologies,University of Louisville
9997,81242115,Hermione,Agle,hermione.agle@gmail.com,Female,Forestry,Widener University
9998,16376229,Melodie,Thunell,melodie.thunell@gmail.com,Female,Human services and community organization,"University of North Carolina, Chapel Hill"


In [71]:
# Write each DataFrame to its own UTF-8 CSV file, leaving out the index column.
# NOTE(review): the rankings are written to 'national_university_ranking.csv'
# (singular), not the 'national_university_rankings.csv' file that was read
# earlier -- confirm which name downstream consumers expect.
for frame, csv_path in (
    (uni_df, 'national_university_ranking.csv'),
    (majors_df, 'majors.csv'),
    (students_df, 'top_students.csv'),
):
    frame.to_csv(csv_path, encoding='utf-8', index=False)