## Processing pandas function walkthrough

In [152]:
import pandas as pd
import os
import json

In [153]:
# %load extract_skills
import spacy
from negspacy.negation import Negex
from spacy import displacy


def extract_skills(description, inSkillsDict):
    """
    Runs an openAI API query to return a list of digital skills from a given course description

    :param description: a string detailing what a bootcamp course offers
    :type description: string
    :return: list of digital skills included in the course description
    :rtype: list
    """

    if type(description) != str:
        raise Exception("Input must be str")

    nlp = spacy.load("en_core_web_md")
    nlp.add_pipe("negex", after="ner", config={"ent_types": ["SKILL"]})
    ruler = nlp.add_pipe("entity_ruler", before="ner")

    patterns = []
    full_patterns = []

    for skill in inSkillsDict:
        patterns.extend(inSkillsDict[skill])

    words_list = [word.split() for word in patterns]

    for words in words_list:
        pattern = [{"LOWER": word.lower()} for word in words]
        full_patterns.append(pattern)

    final_patterns = [{"label": "SKILL", "pattern": pattern} for pattern in full_patterns]

    ruler.add_patterns(final_patterns)

    doc = nlp(description)

    words_to_remove = ["Unlike"]

    filtered_sents = []

    for sent in doc.sents:
        if not any(word in sent.text for word in words_to_remove):
            filtered_sents.append(sent.text)

    filtered_doc = nlp(" ".join(filtered_sents))

    result = []

    for ent in filtered_doc.ents:
        if ent.label_ == "SKILL" and ent._.negex == False:
            result.append(ent.text)


    return result




### 1. Define test skills dictionary

This is a shortened example of a dictionary of skills and synonyms to extract data from

In [154]:
test_dict = {"JavaScript": ["Javascript"], "Angular": ["Angular"], "Ruby on Rails": ["Ruby on Rails"], "React": ["react", "react.js", "reactjs"], "Python": ["Python"], "Django": ["Django"], "Express": ["Express"], "Node.js": ["Node.js"], "SQL": ["SQL"], "Excel": ["Excel"], "PowerBI": ["PowerBI"], "Tableau": ["Tableau"], "HTML": ["HTML"], "Devops": ["Devops"], "CSS": ["CSS"]}


In [155]:
test_str = 'JavaScript is no longer taught on this course, but Python is'

test_str2 = 'Unlike other courses that teach Python, we only teach JavaScript'

test_str3 = 'JavaScript is not taught on this course anymore, but Python is'

test_str4 = 'Students will create two websites (a 1-page website and a 5-page website) over the course of 12 weeks. Students will learn to code in HTML, CSS and Javascript. Students will experience the following: \r\n\r\n-Discover FTP (File Transfer Process) website servers\r\n-Develop link building skills\r\n-Learn the Bootstrap framework for responsive design\r\n-Learn how to font with Awesome icons\r\n-Learn how to use Photoshop\r\n-Learn how to implement contact forms. Unlike our Devops course, we do teach Python'

test_str5 = "Become a software engineer in 13 weeks at our coding bootcamps in Manchester, Leeds, Newcastle, Birmingham and remotely.\r\n\r\nUnlike our Data Engineering bootcamp where you focus specifically on the \"back-end\" of software, or our DevOps Engineering bootcamp that deals specifically with software development and IT operations, our coding bootcamp focuses on building websites and mobile phone apps.\r\n\r\nThe application process takes 2-3 weeks and we would advise you to apply sooner rather than later to give yourself plenty of time to work through the preparation materials.\r\n\r\nApplicants living in England can apply for DfE funding to cover the entire cost of the course. Get in touch to find out if you qualify."

extracted_skills = extract_skills(test_str5, test_dict)


### 2. Read json file into DataFrame

- The file contains a shortened raw data example

In [156]:
unprocessed_dataframe = pd.read_json("./example_data/full_course_data.json")

### 3. Explode provider_courses column to create rows for each course

- `reset_index` creates unique id for each new row

In [157]:
exploded_courses = unprocessed_dataframe.explode('provider_courses').reset_index()

exploded_courses.head()

Unnamed: 0,index,provider_name,provider_locations,provider_tracks,provider_courses,meta
0,0,{Pro}Coders,[West Yorkshire],[Full Stack Developer],"{'course_name': 'Full-Stack Web Development', ...",{'target_url': 'https://www.coursereport.com/s...
1,1,CodeClan,"[Edinburgh, Glasgow]","[Full Stack Developer, UX Design, Data Science...","{'course_name': 'Professional Data Analysis', ...",{'target_url': 'https://www.coursereport.com/s...
2,1,CodeClan,"[Edinburgh, Glasgow]","[Full Stack Developer, UX Design, Data Science...",{'course_name': 'Professional Software Develop...,{'target_url': 'https://www.coursereport.com/s...
3,2,Code Nation,"[Cambridge, Manchester]","[Cyber Security, Full Stack Developer, Mobile ...","{'course_name': 'Master: Coding', 'course_skil...",{'target_url': 'https://www.coursereport.com/s...
4,3,Coders Lab,"[Amsterdam, Brussels, Bucharest, Edinburgh, Gl...","[Full Stack Developer, Data Science, Front End...","{'course_name': 'Automation Tester', 'course_s...",{'target_url': 'https://www.coursereport.com/s...


### 4. Create new DataFrame based on provider_courses column keys

In [158]:
normalised_courses = pd.json_normalize(exploded_courses.provider_courses)

normalised_courses.head()

Unnamed: 0,course_name,course_skills,course_locations,course_description
0,Full-Stack Web Development,"[CSS, HTML, JavaScript, Ruby, Express.js, Fron...",West Yorkshire,"The course covers Ruby on Rails, HTML, CSS, SA..."
1,Professional Data Analysis,"[SQL, R, Python, Machine Learning, GitHub, Git...",Edinburgh,Transform and future-proof your career by gett...
2,Professional Software Development,"[REST, Python, Express.js, DevOps, Agile, CSS,...","Glasgow, Edinburgh","Our full-time, project-based coding course is ..."
3,Master: Coding,"[CSS, JavaScript, React.js, Agile, Express.js,...",Manchester,The ‘Master Coding’ bootcamp gives students th...
4,Automation Tester,"[Java, Quality Assurance Testing]","Warsaw, Online, Kraków, Edinburgh, Glasgow, Li...",Do you think that if you have yet no clue abou...


## 5. Process course_descriptions




In [159]:

def consolidate_desc_into_skills(row):
    extracted_skills = extract_skills(str(row['course_description']), test_dict)
    
    existing_skills = row['course_skills']

    row['course_skills'] = list(set(existing_skills + extracted_skills))

    return row

normalised_courses.apply(lambda x: consolidate_desc_into_skills(x), axis=1)

df = pd.concat([normalised_courses['course_skills'], normalised_courses['course_description']], axis=1)

df.to_csv('./example_extraction.csv')

normalised_courses.drop('course_description', axis=1)


Unnamed: 0,course_name,course_skills,course_locations
0,Full-Stack Web Development,"[Front End, CSS, ReactJS, Rails, Ruby on Rails...",West Yorkshire
1,Professional Data Analysis,"[Data Visualization, R, Data Science, GitHub, ...",Edinburgh
2,Professional Software Development,"[Node.js, Front End, CSS, JavaScript, User Exp...","Glasgow, Edinburgh"
3,Master: Coding,"[Front End, CSS, Xcode, Swift, MongoDB, jQuery...",Manchester
4,Automation Tester,"[Quality Assurance Testing, Java]","Warsaw, Online, Kraków, Edinburgh, Glasgow, Li..."
...,...,...,...
164,Full Stack Web Development (1-on-1 online),"[Front End, CSS, Ruby, SQL, HTML, JavaScript]",Online
165,Data Science (Part-time),"[Data Visualization, MySQL, Data Science, Data...",Online
166,Full Stack Development (Full-time),"[React, MySQL, Express, Git, GitHub, React.js,...","London, Barcelona, Online"
167,Full Stack Development (Full Time - in person),"[React, Front End, MySQL, Express, GitHub, Git...","Barcelona, London"


### 6. Combine normalised provider_courses DataFrame with original DataFrame and drop unnecessary columns

- We're removing `provider_locations` as `course_locations` is also provided.

In [160]:
concat_dataframe_with_courses = pd.concat([exploded_courses, normalised_courses], axis=1).drop(['provider_courses', 'provider_locations', 'provider_tracks'], axis=1)

concat_dataframe_with_courses.head()

Unnamed: 0,index,provider_name,meta,course_name,course_skills,course_locations,course_description
0,0,{Pro}Coders,{'target_url': 'https://www.coursereport.com/s...,Full-Stack Web Development,"[Front End, CSS, ReactJS, Rails, Ruby on Rails...",West Yorkshire,"The course covers Ruby on Rails, HTML, CSS, SA..."
1,1,CodeClan,{'target_url': 'https://www.coursereport.com/s...,Professional Data Analysis,"[Data Visualization, R, Data Science, GitHub, ...",Edinburgh,Transform and future-proof your career by gett...
2,1,CodeClan,{'target_url': 'https://www.coursereport.com/s...,Professional Software Development,"[Node.js, Front End, CSS, JavaScript, User Exp...","Glasgow, Edinburgh","Our full-time, project-based coding course is ..."
3,2,Code Nation,{'target_url': 'https://www.coursereport.com/s...,Master: Coding,"[Front End, CSS, Xcode, Swift, MongoDB, jQuery...",Manchester,The ‘Master Coding’ bootcamp gives students th...
4,3,Coders Lab,{'target_url': 'https://www.coursereport.com/s...,Automation Tester,"[Quality Assurance Testing, Java]","Warsaw, Online, Kraków, Edinburgh, Glasgow, Li...",Do you think that if you have yet no clue abou...


### 7. Explode course_skills column to add row per skill

- `reset_index` again for unique row ids and then drop unnecessary index columns.

In [161]:
exploded_skills = concat_dataframe_with_courses.explode('course_skills').reset_index().drop(['index', 'level_0'], axis=1)

exploded_skills.head()

Unnamed: 0,provider_name,meta,course_name,course_skills,course_locations,course_description
0,{Pro}Coders,{'target_url': 'https://www.coursereport.com/s...,Full-Stack Web Development,Front End,West Yorkshire,"The course covers Ruby on Rails, HTML, CSS, SA..."
1,{Pro}Coders,{'target_url': 'https://www.coursereport.com/s...,Full-Stack Web Development,CSS,West Yorkshire,"The course covers Ruby on Rails, HTML, CSS, SA..."
2,{Pro}Coders,{'target_url': 'https://www.coursereport.com/s...,Full-Stack Web Development,ReactJS,West Yorkshire,"The course covers Ruby on Rails, HTML, CSS, SA..."
3,{Pro}Coders,{'target_url': 'https://www.coursereport.com/s...,Full-Stack Web Development,Rails,West Yorkshire,"The course covers Ruby on Rails, HTML, CSS, SA..."
4,{Pro}Coders,{'target_url': 'https://www.coursereport.com/s...,Full-Stack Web Development,Ruby on Rails,West Yorkshire,"The course covers Ruby on Rails, HTML, CSS, SA..."


### 8. Create new DataFrame based on meta column keys

In [162]:
normalised_meta = pd.json_normalize(exploded_skills.meta)

normalised_meta.head()

Unnamed: 0,target_url,timestamp
0,https://www.coursereport.com/schools/pro-coders,2023-06-27 17:24:33.149871
1,https://www.coursereport.com/schools/pro-coders,2023-06-27 17:24:33.149871
2,https://www.coursereport.com/schools/pro-coders,2023-06-27 17:24:33.149871
3,https://www.coursereport.com/schools/pro-coders,2023-06-27 17:24:33.149871
4,https://www.coursereport.com/schools/pro-coders,2023-06-27 17:24:33.149871


### 9. Combine normalised meta DataFrame with original DataFrame and drop unnecessary column

In [163]:
concat_dataframe_with_meta = pd.concat([exploded_skills, normalised_meta], axis=1).drop('meta', axis=1)

concat_dataframe_with_meta.head()

Unnamed: 0,provider_name,course_name,course_skills,course_locations,course_description,target_url,timestamp
0,{Pro}Coders,Full-Stack Web Development,Front End,West Yorkshire,"The course covers Ruby on Rails, HTML, CSS, SA...",https://www.coursereport.com/schools/pro-coders,2023-06-27 17:24:33.149871
1,{Pro}Coders,Full-Stack Web Development,CSS,West Yorkshire,"The course covers Ruby on Rails, HTML, CSS, SA...",https://www.coursereport.com/schools/pro-coders,2023-06-27 17:24:33.149871
2,{Pro}Coders,Full-Stack Web Development,ReactJS,West Yorkshire,"The course covers Ruby on Rails, HTML, CSS, SA...",https://www.coursereport.com/schools/pro-coders,2023-06-27 17:24:33.149871
3,{Pro}Coders,Full-Stack Web Development,Rails,West Yorkshire,"The course covers Ruby on Rails, HTML, CSS, SA...",https://www.coursereport.com/schools/pro-coders,2023-06-27 17:24:33.149871
4,{Pro}Coders,Full-Stack Web Development,Ruby on Rails,West Yorkshire,"The course covers Ruby on Rails, HTML, CSS, SA...",https://www.coursereport.com/schools/pro-coders,2023-06-27 17:24:33.149871


### 10. Map course_locations to list of strings and then explode for a row per location

- Note the block below reassigns itself so run the block above again before you rerun!

In [164]:
concat_dataframe_with_meta['course_locations'] = concat_dataframe_with_meta['course_locations'].map(lambda x: x.split(', '))

exploded_locations = concat_dataframe_with_meta.explode('course_locations')

course_locations = ['Online', 'Bath', 'Birmingham', 'Bristol', 'Buckinghamshire', 'Cambridge', 'Edinburgh', 'Glasgow', 'Leeds', 'Liverpool', 'London', 'Manchester', 'Sheffield', 'Wales', 'West Yorkshire']

exploded_locations_filtered = exploded_locations[exploded_locations['course_locations'].isin(course_locations)]

exploded_locations_filtered.loc[:,('course_country',)] = 'UK'

exploded_locations_filtered.head()

Unnamed: 0,provider_name,course_name,course_skills,course_locations,course_description,target_url,timestamp,course_country
0,{Pro}Coders,Full-Stack Web Development,Front End,West Yorkshire,"The course covers Ruby on Rails, HTML, CSS, SA...",https://www.coursereport.com/schools/pro-coders,2023-06-27 17:24:33.149871,UK
1,{Pro}Coders,Full-Stack Web Development,CSS,West Yorkshire,"The course covers Ruby on Rails, HTML, CSS, SA...",https://www.coursereport.com/schools/pro-coders,2023-06-27 17:24:33.149871,UK
2,{Pro}Coders,Full-Stack Web Development,ReactJS,West Yorkshire,"The course covers Ruby on Rails, HTML, CSS, SA...",https://www.coursereport.com/schools/pro-coders,2023-06-27 17:24:33.149871,UK
3,{Pro}Coders,Full-Stack Web Development,Rails,West Yorkshire,"The course covers Ruby on Rails, HTML, CSS, SA...",https://www.coursereport.com/schools/pro-coders,2023-06-27 17:24:33.149871,UK
4,{Pro}Coders,Full-Stack Web Development,Ruby on Rails,West Yorkshire,"The course covers Ruby on Rails, HTML, CSS, SA...",https://www.coursereport.com/schools/pro-coders,2023-06-27 17:24:33.149871,UK


### 11. Create dataframe of all unique skills

In [165]:
skills_df = normalised_courses['course_skills'].explode('course_skills').drop_duplicates().reset_index().drop('index', axis=1)

skills_df.head()

Unnamed: 0,course_skills
0,Front End
1,CSS
2,ReactJS
3,Rails
4,Ruby on Rails


### You can output to a csv by running the block below:

In [166]:
exploded_locations_filtered.to_csv("./processed_data.csv")
skills_df.to_csv("./processed_skills_data.csv")