## Processing pandas function walkthrough

In [472]:
# %load extract_skills
import pandas as pd
import spacy
from spacy.matcher import Matcher
import json

def extract_skills(description):
    """
    Runs an openAI API query to return a list of digital skills from a given course description

    :param description: a string detailing what a bootcamp course offers
    :type description: string
    :return: list of digital skills included in the course description
    :rtype: list
    """

    if type(description) != str:
        raise Exception("Input must be str")

    nlp = spacy.load("en_core_web_md")

    doc = nlp(description)

    matcher = Matcher(nlp.vocab)
    with open('../processor_course_report/skill_deduper/skills_dict.json', 'r') as f:
        skill_dict = json.load(f)

    skills_list = []

    for skill in skill_dict:
        skills_list.extend(skill_dict[skill])

    pattern_list = []
    proper_nouns = []
    nouns = []

    for skill in skills_list:
        tokens = skill.split()
        proper_nouns.append([{"LOWER": token, "POS": "PROPN"} for token in tokens])
        nouns.append([{"LOWER": token, "POS": "NOUN"} for token in tokens])

    pattern_list.extend(proper_nouns)
    pattern_list.extend(nouns)

    matcher.add("SKILL", pattern_list, greedy="LONGEST")
    matches = matcher(doc)
    matches.sort(key = lambda x: x[1])

    skills = [doc[match[1]:match[2]].text for match in matches]

    return skills

### 1. Read json file into DataFrame

- The file contains a shortened raw data example

In [473]:
unprocessed_dataframe = pd.read_json("./example_data/raw_course_data.json")

### 2. Explode provider_courses column to create rows for each course

- `reset_index` creates unique id for each new row

In [474]:
exploded_courses = unprocessed_dataframe.explode('provider_courses').reset_index()

exploded_courses.head()

Unnamed: 0,index,provider_name,provider_locations,provider_tracks,provider_courses,meta
0,0,Codez Academy,[Wales],[Front End Developer],"{'course_name': 'Digital Roots Scheme', 'cours...",{'target_url': 'https://www.coursereport.com/s...
1,0,Codez Academy,[Wales],[Front End Developer],{'course_name': 'Foundation Course in Front-en...,{'target_url': 'https://www.coursereport.com/s...
2,1,School of Code,"[Birmingham, Liverpool, London, Manchester]",[Full Stack Developer],"{'course_name': 'School of Code Bootcamp', 'co...",{'target_url': 'https://www.coursereport.com/s...
3,2,Coders Lab,"[Amsterdam, Brussels, Bucharest, Edinburgh, Gl...","[Full Stack Developer, Data Science, Front End...","{'course_name': 'Automation Tester', 'course_s...",{'target_url': 'https://www.coursereport.com/s...
4,2,Coders Lab,"[Amsterdam, Brussels, Bucharest, Edinburgh, Gl...","[Full Stack Developer, Data Science, Front End...","{'course_name': 'Java Developer', 'course_skil...",{'target_url': 'https://www.coursereport.com/s...


### 3. Create new DataFrame based on provider_courses column keys

In [475]:
normalised_courses = pd.json_normalize(exploded_courses.provider_courses)

normalised_courses.head()

Unnamed: 0,course_name,course_skills,course_locations,course_description
0,Digital Roots Scheme,"[HTML, JavaScript, CSS]",Wales,"Students will learn three main languages: CSS,..."
1,Foundation Course in Front-end Development,"[HTML, CSS, JavaScript]",Wales,Students will create two websites (a 1-page we...
2,School of Code Bootcamp,"[C#, JavaScript, CSS, Design, Express.js, Fron...",Birmingham,Learn Full-Stack Web Development on our 16 wee...
3,Automation Tester,"[Java, Quality Assurance Testing]","Warsaw, Online, Kraków, Edinburgh, Glasgow, Li...",Do you think that if you have yet no clue abou...
4,Java Developer,"[Java, JavaScript, jQuery, MySQL, SQL, Git, Gi...","Warsaw, Vienna, Kraków, Online, Edinburgh, Gla...","During the course, you will learn Java for Int..."


## 3a. Process course_descriptions




In [476]:

def consolidate_desc_into_skills(row):
    extracted_skills = extract_skills(str(row['course_description']))
    
    existing_skills = row['course_skills']

    row['course_skills'] = list(set(existing_skills + extracted_skills))

    return row

normalised_courses.apply(lambda x: consolidate_desc_into_skills(x), axis=1)
normalised_courses.drop('course_description', axis=1)


[[{'LOWER': 'agile', 'POS': 'PROPN'}], [{'LOWER': 'agile', 'POS': 'PROPN'}, {'LOWER': 'development', 'POS': 'PROPN'}], [{'LOWER': 'agile', 'POS': 'PROPN'}, {'LOWER': 'methodology', 'POS': 'PROPN'}], [{'LOWER': 'agile', 'POS': 'PROPN'}, {'LOWER': 'software', 'POS': 'PROPN'}, {'LOWER': 'development', 'POS': 'PROPN'}], [{'LOWER': 'amazon', 'POS': 'PROPN'}, {'LOWER': 'ecs', 'POS': 'PROPN'}], [{'LOWER': 'ecs', 'POS': 'PROPN'}], [{'LOWER': 'angular', 'POS': 'PROPN'}], [{'LOWER': 'angular.js', 'POS': 'PROPN'}], [{'LOWER': 'angularjs', 'POS': 'PROPN'}], [{'LOWER': 'ansible', 'POS': 'PROPN'}], [{'LOWER': 'apigee', 'POS': 'PROPN'}], [{'LOWER': 'asana', 'POS': 'PROPN'}], [{'LOWER': 'assembly', 'POS': 'PROPN'}], [{'LOWER': 'atom', 'POS': 'PROPN'}], [{'LOWER': 'asp.net', 'POS': 'PROPN'}], [{'LOWER': 'amazon', 'POS': 'PROPN'}, {'LOWER': 'web', 'POS': 'PROPN'}, {'LOWER': 'services', 'POS': 'PROPN'}], [{'LOWER': 'aws', 'POS': 'PROPN'}], [{'LOWER': 'azure', 'POS': 'PROPN'}], [{'LOWER': 'microsoft', 'PO

Unnamed: 0,course_name,course_skills,course_locations
0,Digital Roots Scheme,"[CSS, JavaScript, HTML]",Wales
1,Foundation Course in Front-end Development,"[CSS, JavaScript, HTML]",Wales
2,School of Code Bootcamp,"[MongoDB, CSS, Node.js, Git, Data Structures, ...",Birmingham
3,Automation Tester,"[Quality Assurance Testing, Java]","Warsaw, Online, Kraków, Edinburgh, Glasgow, Li..."
4,Java Developer,"[Git, jQuery, Java, JavaScript, SQL, HTML, MyS...","Warsaw, Vienna, Kraków, Online, Edinburgh, Gla..."
5,JavaScript Developer,"[MongoDB, CSS, Node.js, jQuery, JavaScript, HT...","Warsaw, Jakarta, Madrid, Kraków, Online, Vienn..."
6,Manual Tester,"[CSS, Software Testing, Scrum, Agile, HTML, Li...","Warsaw, Jakarta, Kraków, Online, Vienna, Edinb..."
7,Python Developer,"[CSS, Django, Git, jQuery, JavaScript, HTML, M...","Warsaw, Jakarta, Kraków, Online, Edinburgh, Gl..."
8,ENTRY-LEVEL CLOUD ENGINEER,"[MongoDB, CSS, Node.js, Express.js, Scrum, jQu...",Online
9,ENTRY-LEVEL SOFTWARE ENGINEER,"[Cloud Computing, Java, JavaScript, HTML, Fron...",Online


### 4. Combine normalised provider_courses DataFrame with original DataFrame and drop unnecessary columns

- We're removing `provider_locations` as `course_locations` is also provided.

In [477]:
concat_dataframe_with_courses = pd.concat([exploded_courses, normalised_courses], axis=1).drop(['provider_courses', 'provider_locations', 'provider_tracks'], axis=1)

concat_dataframe_with_courses.head()

Unnamed: 0,index,provider_name,meta,course_name,course_skills,course_locations,course_description
0,0,Codez Academy,{'target_url': 'https://www.coursereport.com/s...,Digital Roots Scheme,"[CSS, JavaScript, HTML]",Wales,"Students will learn three main languages: CSS,..."
1,0,Codez Academy,{'target_url': 'https://www.coursereport.com/s...,Foundation Course in Front-end Development,"[CSS, JavaScript, HTML]",Wales,Students will create two websites (a 1-page we...
2,1,School of Code,{'target_url': 'https://www.coursereport.com/s...,School of Code Bootcamp,"[MongoDB, CSS, Node.js, Git, Data Structures, ...",Birmingham,Learn Full-Stack Web Development on our 16 wee...
3,2,Coders Lab,{'target_url': 'https://www.coursereport.com/s...,Automation Tester,"[Quality Assurance Testing, Java]","Warsaw, Online, Kraków, Edinburgh, Glasgow, Li...",Do you think that if you have yet no clue abou...
4,2,Coders Lab,{'target_url': 'https://www.coursereport.com/s...,Java Developer,"[Git, jQuery, Java, JavaScript, SQL, HTML, MyS...","Warsaw, Vienna, Kraków, Online, Edinburgh, Gla...","During the course, you will learn Java for Int..."


### 5. Explode course_skills column to add row per skill

- `reset_index` again for unique row ids and then drop unnecessary index columns.

In [478]:
exploded_skills = concat_dataframe_with_courses.explode('course_skills').reset_index().drop(['index', 'level_0'], axis=1)

exploded_skills.head()

Unnamed: 0,provider_name,meta,course_name,course_skills,course_locations,course_description
0,Codez Academy,{'target_url': 'https://www.coursereport.com/s...,Digital Roots Scheme,CSS,Wales,"Students will learn three main languages: CSS,..."
1,Codez Academy,{'target_url': 'https://www.coursereport.com/s...,Digital Roots Scheme,JavaScript,Wales,"Students will learn three main languages: CSS,..."
2,Codez Academy,{'target_url': 'https://www.coursereport.com/s...,Digital Roots Scheme,HTML,Wales,"Students will learn three main languages: CSS,..."
3,Codez Academy,{'target_url': 'https://www.coursereport.com/s...,Foundation Course in Front-end Development,CSS,Wales,Students will create two websites (a 1-page we...
4,Codez Academy,{'target_url': 'https://www.coursereport.com/s...,Foundation Course in Front-end Development,JavaScript,Wales,Students will create two websites (a 1-page we...


### 6. Create new DataFrame based on meta column keys

In [479]:
normalised_meta = pd.json_normalize(exploded_skills.meta)

normalised_meta.head()

Unnamed: 0,target_url,timestamp
0,https://www.coursereport.com/schools/codez-aca...,2023-05-25 14:12:01.312105
1,https://www.coursereport.com/schools/codez-aca...,2023-05-25 14:12:01.312105
2,https://www.coursereport.com/schools/codez-aca...,2023-05-25 14:12:01.312105
3,https://www.coursereport.com/schools/codez-aca...,2023-05-25 14:12:01.312105
4,https://www.coursereport.com/schools/codez-aca...,2023-05-25 14:12:01.312105


### 7. Combine normalised meta DataFrame with original DataFrame and drop unnecessary column

In [480]:
concat_dataframe_with_meta = pd.concat([exploded_skills, normalised_meta], axis=1).drop('meta', axis=1)

concat_dataframe_with_meta.head()

Unnamed: 0,provider_name,course_name,course_skills,course_locations,course_description,target_url,timestamp
0,Codez Academy,Digital Roots Scheme,CSS,Wales,"Students will learn three main languages: CSS,...",https://www.coursereport.com/schools/codez-aca...,2023-05-25 14:12:01.312105
1,Codez Academy,Digital Roots Scheme,JavaScript,Wales,"Students will learn three main languages: CSS,...",https://www.coursereport.com/schools/codez-aca...,2023-05-25 14:12:01.312105
2,Codez Academy,Digital Roots Scheme,HTML,Wales,"Students will learn three main languages: CSS,...",https://www.coursereport.com/schools/codez-aca...,2023-05-25 14:12:01.312105
3,Codez Academy,Foundation Course in Front-end Development,CSS,Wales,Students will create two websites (a 1-page we...,https://www.coursereport.com/schools/codez-aca...,2023-05-25 14:12:01.312105
4,Codez Academy,Foundation Course in Front-end Development,JavaScript,Wales,Students will create two websites (a 1-page we...,https://www.coursereport.com/schools/codez-aca...,2023-05-25 14:12:01.312105


### 8. Map course_locations to list of strings and then explode for a row per location

- Note the block below reassigns itself so run the block above again before you rerun!

In [481]:
concat_dataframe_with_meta['course_locations'] = concat_dataframe_with_meta['course_locations'].map(lambda x: x.split(', '))

exploded_locations = concat_dataframe_with_meta.explode('course_locations')

course_locations = ['Online', 'Bath', 'Birmingham', 'Bristol', 'Buckinghamshire', 'Cambridge', 'Edinburgh', 'Glasgow', 'Leeds', 'Liverpool', 'London', 'Manchester', 'Sheffield', 'Wales', 'West Yorkshire']

exploded_locations_filtered = exploded_locations[exploded_locations['course_locations'].isin(course_locations)]

exploded_locations_filtered.loc[:,('course_country',)] = 'UK'

exploded_locations_filtered.head()

Unnamed: 0,provider_name,course_name,course_skills,course_locations,course_description,target_url,timestamp,course_country
0,Codez Academy,Digital Roots Scheme,CSS,Wales,"Students will learn three main languages: CSS,...",https://www.coursereport.com/schools/codez-aca...,2023-05-25 14:12:01.312105,UK
1,Codez Academy,Digital Roots Scheme,JavaScript,Wales,"Students will learn three main languages: CSS,...",https://www.coursereport.com/schools/codez-aca...,2023-05-25 14:12:01.312105,UK
2,Codez Academy,Digital Roots Scheme,HTML,Wales,"Students will learn three main languages: CSS,...",https://www.coursereport.com/schools/codez-aca...,2023-05-25 14:12:01.312105,UK
3,Codez Academy,Foundation Course in Front-end Development,CSS,Wales,Students will create two websites (a 1-page we...,https://www.coursereport.com/schools/codez-aca...,2023-05-25 14:12:01.312105,UK
4,Codez Academy,Foundation Course in Front-end Development,JavaScript,Wales,Students will create two websites (a 1-page we...,https://www.coursereport.com/schools/codez-aca...,2023-05-25 14:12:01.312105,UK


### 9. Create dataframe of all unique skills

In [482]:
skills_df = normalised_courses['course_skills'].explode('course_skills').drop_duplicates().reset_index().drop('index', axis=1)

skills_df.head()

Unnamed: 0,course_skills
0,CSS
1,JavaScript
2,HTML
3,MongoDB
4,Node.js


### You can output to a csv by running the block below:

In [483]:
exploded_locations_filtered.to_csv("./processed_data.csv")
skills_df.to_csv("./processed_skills_data.csv")