In [1]:
import pandas as pd
import string

In [2]:
# Show the complete text of every cell when a DataFrame is displayed.
# pandas truncates long strings by default, which would cut off the
# lengthy course descriptions.
pd.options.display.max_colwidth = None

In [9]:
# Load the cleaned data that was previously saved as a pickle file.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
path = r"C:\Users\Mandy\Documents\Data science projects\Furman catalog\cleaned_data.pkl"
df = pd.read_pickle(filepath_or_buffer=path)

In [10]:
# Preview the first two rows of the loaded frame.
df.head(n=2)

Unnamed: 0,Course,Name,Description,Original Description
0,ANT 101,Introduction to Anthropology,"[introduction, anthropology, human, culture, past, present, comparative, subsistence, regime, economics, stratification, political, organization, marriage, kinship, culture, religion, social, cultural, change]","Introduction to anthropology - the study of human cultures in the past and present. Topics include the comparative study of subsistence regimes and economics, stratification and political organization, marriage and kinship, culture, religion, and social and cultural change. 4 credits."
1,ANT 105,World Prehistory,"[introduction, physical, anthropology, archaeology, human, evolution, human, physical, variation, peopling, world, origin, food, production, comparison, complex, society, around, world]","An introduction to physical anthropology and archaeology. Topics include human evolution, human physical variation, the peopling of the world, the origins of food production and a comparison of complex societies around the world."


In [11]:
# Work on an independent (deep) copy so that splitting the data up by
# subject leaves the original df untouched.
subject_df = df.copy(deep=True)

In [12]:
# Split the "Course" column on its first space into two parts
# (e.g. "ANT 101" -> "ANT" and "101") and expose them as new leading columns.
split_course = subject_df["Course"].str.split(" ", n=1, expand=True)

# DataFrame.insert raises ValueError when the column already exists, so only
# insert the columns that are missing. This keeps the cell safe to re-run.
if "Subject" not in subject_df.columns:
    subject_df.insert(0, "Subject", split_course[0])
if "Course Number" not in subject_df.columns:
    subject_df.insert(1, "Course Number", split_course[1])
subject_df.head(2)

Unnamed: 0,Subject,Course Number,Course,Name,Description,Original Description
0,ANT,101,ANT 101,Introduction to Anthropology,"[introduction, anthropology, human, culture, past, present, comparative, subsistence, regime, economics, stratification, political, organization, marriage, kinship, culture, religion, social, cultural, change]","Introduction to anthropology - the study of human cultures in the past and present. Topics include the comparative study of subsistence regimes and economics, stratification and political organization, marriage and kinship, culture, religion, and social and cultural change. 4 credits."
1,ANT,105,ANT 105,World Prehistory,"[introduction, physical, anthropology, archaeology, human, evolution, human, physical, variation, peopling, world, origin, food, production, comparison, complex, society, around, world]","An introduction to physical anthropology and archaeology. Topics include human evolution, human physical variation, the peopling of the world, the origins of food production and a comparison of complex societies around the world."


In [13]:
# Drop the original "Course" column.
# Reassign instead of mutating with inplace=True, and ignore a column that is
# already gone, so re-running this cell does not raise a KeyError.
subject_df = subject_df.drop(columns=["Course"], errors="ignore")
subject_df.head(2)

Unnamed: 0,Subject,Course Number,Name,Description,Original Description
0,ANT,101,Introduction to Anthropology,"[introduction, anthropology, human, culture, past, present, comparative, subsistence, regime, economics, stratification, political, organization, marriage, kinship, culture, religion, social, cultural, change]","Introduction to anthropology - the study of human cultures in the past and present. Topics include the comparative study of subsistence regimes and economics, stratification and political organization, marriage and kinship, culture, religion, and social and cultural change. 4 credits."
1,ANT,105,World Prehistory,"[introduction, physical, anthropology, archaeology, human, evolution, human, physical, variation, peopling, world, origin, food, production, comparison, complex, society, around, world]","An introduction to physical anthropology and archaeology. Topics include human evolution, human physical variation, the peopling of the world, the origins of food production and a comparison of complex societies around the world."


In [14]:
# Drop the "Original Description" column, keeping only the tokenized
# "Description" column.
# Reassign instead of mutating with inplace=True, and ignore a column that is
# already gone, so re-running this cell does not raise a KeyError.
subject_df = subject_df.drop(columns=["Original Description"], errors="ignore")
subject_df.head(1)

Unnamed: 0,Subject,Course Number,Name,Description
0,ANT,101,Introduction to Anthropology,"[introduction, anthropology, human, culture, past, present, comparative, subsistence, regime, economics, stratification, political, organization, marriage, kinship, culture, religion, social, cultural, change]"


In [15]:
# Collapse the frame to one row per subject. Each subject's "Description"
# becomes a list containing the description list of every course in it
# (i.e. a list of lists at this stage).
subject_df = (
    subject_df
    .groupby("Subject")["Description"]
    .agg(list)
    .reset_index()
)
subject_df.head(2)

Unnamed: 0,Subject,Description
0,ACC,"[[theory, practice, accounting, applied, corporate, form, business, organization, analysis, business, transaction, valuation, asset, liability, determination, income, preparation, interpretation, financial, statement], [use, accounting, information, management, planning, control, decision, making, business, enterprise, production, decision, activity, based, costing, budgeting, standard, capital, investment, decision], [depth, financial, accounting, standard, related, presenting, income, statement, balance, sheet, business, enterprise, liability, current, asset, revenue, recognition, time, value, money, concept, begin, understand, significant, judgment, involved, application, accounting, standard, also, recognize, importance, ethic, accounting, decision, making, significant, service, learning, project, required, component], [depth, financial, accounting, standard, related, presenting, enterprise, balance, sheet, income, statement, statement, cash, flow, investment, derivative, long, term, asset, liability, ethical, decision, making, context, accounting, decision, making], [collection, utilization, cost, data, management, short, term, planning, control, purpose, cost, volume, profit, relationship, product, costing, method, flexible, budget, standard, cost, variance, cost, allocation], [intensive, conceptual, applied, introduction, auditing, assurance, society, emphasis, knowledge, assurance, service, well, skill, attitude, required, success, accounting, profession, focus, financial, statement, audit, well, assurance, service], [current, federal, income, tax, law, pertaining, individual, partnership, corporation, focus, compliance, matter, supplementary, tax, planning, research, fundamental], [examining, analyzing, system, process, accounting, information, using, transaction, cycle, manual, automated, environment, focus, design, development, implememntation, system, relevant, process, control, technology], [intermediate, level, international, accounting, 
overview, area, international, accounting, focusing, accounting, issue, encountered, multinational, company, engaged, international, trade, invested, foreign, operation, accounting, issue, unique, multinational, corporation, respect, foreign, operation, various, functional, area, accounting, country, world], [depth, financial, accounting, standard, related, presenting, enterprise, balance, sheet, income, statement, statement, cash, flow, liability, shareholder, equity, share, based, compensation, accounting, change, error, significant, service, learning, project, required, component], [selected, field, accounting, covered], [major, may, pursue, independent, project, sooperation, member, department, type, project, vary, interest, individual], [individualized, internship, approved, designated, faculty, sponsor, develop, objective, internship, experience, read, relevant, literature, prepare, written, assignment, submit, reflective, summary, internship, activity], [use, accounting, information, management, planning, control, decision, making, business, enterprise, production, decision, activity, based, costing, budgeting, standard, capital, investment, decision], [intermediate, level, financial, reporting, state, local, government, coverage, accounting, type, non, business, entity, governmental, nonprofit, auditing, cover, fund, accounting, concept, practice, well, government, wide, financial, reporting, relationship, two], [advanced, survey, fraud, examination, forensic, accounting, examining, risk, assessment, internal, control, ethic, corporate, governance, legal, criminal, justice, system, legislation, global, issue, lecture, research, discussion]]"
1,ADC,"[[capstone, experience, african, american, diaspora, culture, intedisciplinary, minor, varied]]"


In [16]:
def flatten_once(nested):
    """Return a single flat list with the items of every sub-list in `nested`, in order."""
    flat = []
    for sublist in nested:
        flat.extend(sublist)
    return flat


# Each subject currently holds a list of per-course lists; merge them so
# every subject has one flat list of words.
subject_df["Description"] = subject_df["Description"].apply(flatten_once)
subject_df.head(2)

Unnamed: 0,Subject,Description
0,ACC,"[theory, practice, accounting, applied, corporate, form, business, organization, analysis, business, transaction, valuation, asset, liability, determination, income, preparation, interpretation, financial, statement, use, accounting, information, management, planning, control, decision, making, business, enterprise, production, decision, activity, based, costing, budgeting, standard, capital, investment, decision, depth, financial, accounting, standard, related, presenting, income, statement, balance, sheet, business, enterprise, liability, current, asset, revenue, recognition, time, value, money, concept, begin, understand, significant, judgment, involved, application, accounting, standard, also, recognize, importance, ethic, accounting, decision, making, significant, service, learning, project, required, component, depth, financial, accounting, standard, related, presenting, enterprise, balance, sheet, income, statement, statement, cash, flow, investment, derivative, long, term, ...]"
1,ADC,"[capstone, experience, african, american, diaspora, culture, intedisciplinary, minor, varied]"


In [17]:
# Load the Excel sheet that pairs each subject abbreviation with its full
# name, since not all abbreviations are obvious.
path2 = r"C:\Users\Mandy\Documents\Data science projects\Furman catalog\subject_names.xlsx"
subject_names = pd.read_excel(io=path2)
subject_names.head(n=5)

Unnamed: 0,Abbreviation,Full name
0,ACC,Accounting
1,ADC,African American and Diaspora Studies
2,ANT,Anthropology
3,ART,Art
4,AST,Asian Studies


In [18]:
# Build a lookup dictionary: subject abbreviation (key) -> full name (value).
# Indexing by "Abbreviation" and selecting the "Full name" column gives a
# Series whose to_dict() is exactly that mapping.
subject_dict = subject_names.set_index("Abbreviation")["Full name"].to_dict()
subject_dict

{'ACC': 'Accounting',
 'ADC': 'African American and Diaspora Studies',
 'ANT': 'Anthropology',
 'ART': 'Art',
 'AST': 'Asian Studies',
 'AS': 'Asian Studies',
 'BIO': 'Biology',
 'BUS': 'Business Administration',
 'CHM': 'Chemistry',
 'CHN': 'Chinese',
 'CLS': 'Classics',
 'COM': 'Communication Studies',
 'CEM': 'Community Engaged Medicine',
 'CSC': 'Computer Science',
 'DSC': 'Data Science',
 'DAN': 'Dance',
 'EES': 'Earth and Environmental Sciences',
 'ECN': 'Economics',
 'EDU': 'Education',
 'EDCI': 'Curriculum and Instruction in Education',
 'EDEC': 'Early Childhood Education',
 'EDEX': 'Exceptionalities in Education',
 'EDEP': 'Extended Program in Education',
 'EDFD': 'Educational Foundations',
 'EDRD': 'Literacy Education',
 'EDMT': 'Master of Arts in Teaching',
 'EDSL': 'School and Educational Leadership',
 'EDSP': 'Special Topics in Education',
 'EDOL': 'Teaching English to Speakers of Other Languages',
 'ENG': 'English',
 'EST': 'Environmental Studies',
 'FST': 'Film Studies',

In [19]:
# Keep the subject abbreviation in its own leading column, then replace
# "Subject" with the full subject name looked up in subject_dict.
# DataFrame.insert raises ValueError if "Abbreviation" already exists, so
# guard it; the mapping below always reads from "Abbreviation", which makes
# re-running this cell produce the same result instead of crashing.
if "Abbreviation" not in subject_df.columns:
    subject_df.insert(0, "Abbreviation", subject_df["Subject"])
# Abbreviations that are missing from subject_dict map to NaN.
subject_df["Subject"] = subject_df["Abbreviation"].map(subject_dict)
subject_df.head()

Unnamed: 0,Abbreviation,Subject,Description
0,ACC,Accounting,"[theory, practice, accounting, applied, corporate, form, business, organization, analysis, business, transaction, valuation, asset, liability, determination, income, preparation, interpretation, financial, statement, use, accounting, information, management, planning, control, decision, making, business, enterprise, production, decision, activity, based, costing, budgeting, standard, capital, investment, decision, depth, financial, accounting, standard, related, presenting, income, statement, balance, sheet, business, enterprise, liability, current, asset, revenue, recognition, time, value, money, concept, begin, understand, significant, judgment, involved, application, accounting, standard, also, recognize, importance, ethic, accounting, decision, making, significant, service, learning, project, required, component, depth, financial, accounting, standard, related, presenting, enterprise, balance, sheet, income, statement, statement, cash, flow, investment, derivative, long, term, ...]"
1,ADC,African American and Diaspora Studies,"[capstone, experience, african, american, diaspora, culture, intedisciplinary, minor, varied]"
2,ANT,Anthropology,"[introduction, anthropology, human, culture, past, present, comparative, subsistence, regime, economics, stratification, political, organization, marriage, kinship, culture, religion, social, cultural, change, introduction, physical, anthropology, archaeology, human, evolution, human, physical, variation, peopling, world, origin, food, production, comparison, complex, society, around, world, social, anthropological, survey, diverse, people, contemporary, sub, saharan, africa, ethnographic, survey, global, cultural, diversity, case, africa, america, asia, europe, middle, east, may, daily, life, sociocultural, environmental, change, adaptation, migration, politics, work, religion, gender, family, art, among, foraging, agricultural, industrial, society, survey, various, facet, modern, japanese, society, culture, family, community, ethnicity, life, cycle, education, gender, religion, work, popular, culture, well, cultural, aspect, ...]"
3,ARB,Arabic,"[introduction, sound, system, grammatical, structure, necessary, develop, listening, speaking, reading, writing, skill, arabic, continued, development, listening, speaking, reading, writing, skill, arabic]"
4,ART,Art,"[introductory, studio, providing, depth, hand, experience, non, art, major, variety, two, dimensional, three, dimensional, medium, technique, fundamental, two, dimensional, design, explored, lecture, class, project, integration, graphic, design, problem, solving, tool, conceptual, development, formal, element, design, introduced, type, image, creative, software, explore, design, element, principle, traditional, medium, software, exercise, element, principle, three, dimensional, design, lecture, project, explore, concept, working, three, dimensional, format, theory, color, painting, principle, technique, exploration, color, relationship, based, johannes, itten, seven, area, chromatic, contrast, theory, used, specific, exercise, painting, direct, observation, implement, defined, color, theory, concept, drawing, element, art, composition, extensive, exercise, direct, observation, one, two, point, perspective, ...]"


In [20]:
# Put each subject's words in alphabetical order; sorted() returns a new
# list, so repeated words end up next to each other.
subject_df["Description"] = subject_df["Description"].apply(sorted)
subject_df.head(n=2)

Unnamed: 0,Abbreviation,Subject,Description
0,ACC,Accounting,"[accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, activity, activity, activity, advanced, allocation, also, analysis, analyzing, application, applied, applied, approved, area, area, assessment, asset, asset, asset, assignment, assurance, assurance, assurance, attitude, audit, auditing, auditing, automated, balance, balance, balance, based, based, based, begin, budget, budgeting, budgeting, business, business, business, business, business, business, capital, capital, cash, cash, change, collection, company, compensation, compliance, component, component, concept, concept, conceptual, context, control, control, control, control, control, corporate, corporate, corporation, corporation, cost, cost, cost, cost, costing, costing, costing, country, cover, coverage, covered, criminal, ...]"
1,ADC,African American and Diaspora Studies,"[african, american, capstone, culture, diaspora, experience, intedisciplinary, minor, varied]"


In [21]:
# Persist the per-subject frame to disk as a pickle file.
path = r"C:\Users\Mandy\Documents\Data science projects\Furman catalog\subject_df.pkl"
subject_df.to_pickle(path=path)