## Processing pandas function walkthrough

In [80]:
import pandas as pd
from app import find_time_commitment


### 1. Read json file into DataFrame

In [81]:
# Load the raw course JSON; the path is resolved relative to the notebook's working directory.
unprocessed_dataframe = pd.read_json(path_or_buf="../raw_course_data.json")

### 2. Explode provider_courses column to create rows for each course

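- `reset_index` gives every exploded row its own unique row label; the previous label (shared by all courses from the same provider) is kept in a new `index` column.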

In [82]:
# One row per entry of `provider_courses`. After the explode, rows from the same
# provider share a row label, so reset_index renumbers them and keeps the old
# label in an `index` column.
exploded_courses = (
    unprocessed_dataframe
    .explode(column='provider_courses')
    .reset_index(drop=False)
)

exploded_courses.head()

Unnamed: 0,index,provider_name,provider_locations,provider_tracks,provider_courses,meta
0,0,Codez Academy,[Wales],[Front End Developer],"{'course_name': 'Digital Roots Scheme', 'cours...",{'target_url': 'https://www.coursereport.com/s...
1,0,Codez Academy,[Wales],[Front End Developer],{'course_name': 'Foundation Course in Front-en...,{'target_url': 'https://www.coursereport.com/s...
2,1,School of Code,"[Birmingham, Liverpool, London, Manchester]",[Full Stack Developer],"{'course_name': 'School of Code Bootcamp', 'co...",{'target_url': 'https://www.coursereport.com/s...
3,2,Coders Lab,"[Amsterdam, Brussels, Bucharest, Edinburgh, Gl...","[Full Stack Developer, Data Science, Front End...","{'course_name': 'Automation Tester', 'course_s...",{'target_url': 'https://www.coursereport.com/s...
4,2,Coders Lab,"[Amsterdam, Brussels, Bucharest, Edinburgh, Gl...","[Full Stack Developer, Data Science, Front End...","{'course_name': 'Java Developer', 'course_skil...",{'target_url': 'https://www.coursereport.com/s...


### 3. Create new DataFrame based on provider_courses column keys

In [83]:
# Flatten each course dict into a row whose columns are the dict's keys.
normalised_courses = pd.json_normalize(data=exploded_courses['provider_courses'])

normalised_courses.head()

Unnamed: 0,course_name,course_skills,course_locations,course_description
0,Digital Roots Scheme,"[HTML, JavaScript, CSS]",Wales,"Students will learn three main languages: CSS,..."
1,Foundation Course in Front-end Development,"[HTML, CSS, JavaScript]",Wales,Students will create two websites (a 1-page we...
2,School of Code Bootcamp,"[C#, JavaScript, CSS, Design, Express.js, Fron...",Birmingham,Learn Full-Stack Web Development on our 16 wee...
3,Automation Tester,"[Java, Quality Assurance Testing]","Warsaw, Online, Kraków, Edinburgh, Glasgow, Li...",Do you think that if you have yet no clue abou...
4,Java Developer,"[Java, JavaScript, jQuery, MySQL, SQL, Git, Gi...","Warsaw, Vienna, Kraków, Online, Edinburgh, Gla...","During the course, you will learn Java for Int..."


### 4. Combine normalised provider_courses DataFrame with original DataFrame and drop unnecessary columns

- We're removing `provider_locations` as `course_locations` is also provided.

In [84]:
# Put the flattened course columns next to the provider columns (concat along
# the column axis matches rows by index label), then discard the raw
# `provider_courses` dicts and the redundant `provider_locations`.
concat_dataframe_with_courses = (
    pd.concat(objs=[exploded_courses, normalised_courses], axis='columns')
    .drop(columns=['provider_courses', 'provider_locations'])
)

concat_dataframe_with_courses.head()

Unnamed: 0,index,provider_name,provider_tracks,meta,course_name,course_skills,course_locations,course_description
0,0,Codez Academy,[Front End Developer],{'target_url': 'https://www.coursereport.com/s...,Digital Roots Scheme,"[HTML, JavaScript, CSS]",Wales,"Students will learn three main languages: CSS,..."
1,0,Codez Academy,[Front End Developer],{'target_url': 'https://www.coursereport.com/s...,Foundation Course in Front-end Development,"[HTML, CSS, JavaScript]",Wales,Students will create two websites (a 1-page we...
2,1,School of Code,[Full Stack Developer],{'target_url': 'https://www.coursereport.com/s...,School of Code Bootcamp,"[C#, JavaScript, CSS, Design, Express.js, Fron...",Birmingham,Learn Full-Stack Web Development on our 16 wee...
3,2,Coders Lab,"[Full Stack Developer, Data Science, Front End...",{'target_url': 'https://www.coursereport.com/s...,Automation Tester,"[Java, Quality Assurance Testing]","Warsaw, Online, Kraków, Edinburgh, Glasgow, Li...",Do you think that if you have yet no clue abou...
4,2,Coders Lab,"[Full Stack Developer, Data Science, Front End...",{'target_url': 'https://www.coursereport.com/s...,Java Developer,"[Java, JavaScript, jQuery, MySQL, SQL, Git, Gi...","Warsaw, Vienna, Kraków, Online, Edinburgh, Gla...","During the course, you will learn Java for Int..."


### 5. Add new column containing time commitment based on course_name column

- Uses `find_time_commitment` util found in `app.py`

In [85]:
# Call find_time_commitment once per row (axis='columns' passes each row as a
# Series) and store the results in a new `time` column.
concat_dataframe_with_courses['time'] = concat_dataframe_with_courses.apply(
    find_time_commitment,
    axis='columns',
)

concat_dataframe_with_courses.head()

Unnamed: 0,index,provider_name,provider_tracks,meta,course_name,course_skills,course_locations,course_description,time
0,0,Codez Academy,[Front End Developer],{'target_url': 'https://www.coursereport.com/s...,Digital Roots Scheme,"[HTML, JavaScript, CSS]",Wales,"Students will learn three main languages: CSS,...",
1,0,Codez Academy,[Front End Developer],{'target_url': 'https://www.coursereport.com/s...,Foundation Course in Front-end Development,"[HTML, CSS, JavaScript]",Wales,Students will create two websites (a 1-page we...,
2,1,School of Code,[Full Stack Developer],{'target_url': 'https://www.coursereport.com/s...,School of Code Bootcamp,"[C#, JavaScript, CSS, Design, Express.js, Fron...",Birmingham,Learn Full-Stack Web Development on our 16 wee...,
3,2,Coders Lab,"[Full Stack Developer, Data Science, Front End...",{'target_url': 'https://www.coursereport.com/s...,Automation Tester,"[Java, Quality Assurance Testing]","Warsaw, Online, Kraków, Edinburgh, Glasgow, Li...",Do you think that if you have yet no clue abou...,
4,2,Coders Lab,"[Full Stack Developer, Data Science, Front End...",{'target_url': 'https://www.coursereport.com/s...,Java Developer,"[Java, JavaScript, jQuery, MySQL, SQL, Git, Gi...","Warsaw, Vienna, Kraków, Online, Edinburgh, Gla...","During the course, you will learn Java for Int...",


### 6. Explode course_skills column to add row per skill

- `reset_index` again for unique row ids and then drop unnecessary index columns.

In [86]:
# One row per skill. reset_index(drop=True) renumbers the rows without adding
# another column; the `index` column left over from the first explode is no
# longer needed and is dropped.
exploded_skills = (
    concat_dataframe_with_courses
    .explode(column='course_skills')
    .reset_index(drop=True)
    .drop(columns=['index'])
)

exploded_skills.head()

Unnamed: 0,provider_name,provider_tracks,meta,course_name,course_skills,course_locations,course_description,time
0,Codez Academy,[Front End Developer],{'target_url': 'https://www.coursereport.com/s...,Digital Roots Scheme,HTML,Wales,"Students will learn three main languages: CSS,...",
1,Codez Academy,[Front End Developer],{'target_url': 'https://www.coursereport.com/s...,Digital Roots Scheme,JavaScript,Wales,"Students will learn three main languages: CSS,...",
2,Codez Academy,[Front End Developer],{'target_url': 'https://www.coursereport.com/s...,Digital Roots Scheme,CSS,Wales,"Students will learn three main languages: CSS,...",
3,Codez Academy,[Front End Developer],{'target_url': 'https://www.coursereport.com/s...,Foundation Course in Front-end Development,HTML,Wales,Students will create two websites (a 1-page we...,
4,Codez Academy,[Front End Developer],{'target_url': 'https://www.coursereport.com/s...,Foundation Course in Front-end Development,CSS,Wales,Students will create two websites (a 1-page we...,


### 7. Create new DataFrame based on meta column keys

In [87]:
# Flatten each `meta` dict into a row whose columns are the dict's keys.
normalised_meta = pd.json_normalize(data=exploded_skills['meta'])

normalised_meta.head()

Unnamed: 0,target_url,timestamp
0,https://www.coursereport.com/schools/codez-aca...,2023-05-25 14:12:01.312105
1,https://www.coursereport.com/schools/codez-aca...,2023-05-25 14:12:01.312105
2,https://www.coursereport.com/schools/codez-aca...,2023-05-25 14:12:01.312105
3,https://www.coursereport.com/schools/codez-aca...,2023-05-25 14:12:01.312105
4,https://www.coursereport.com/schools/codez-aca...,2023-05-25 14:12:01.312105


### 8. Combine normalised meta DataFrame with original DataFrame and drop unnecessary column

In [88]:
# Attach the flattened meta columns by index label, then drop the raw `meta` dicts.
concat_dataframe_with_meta = (
    pd.concat(objs=[exploded_skills, normalised_meta], axis='columns')
    .drop(columns='meta')
)

concat_dataframe_with_meta.head()

Unnamed: 0,provider_name,provider_tracks,course_name,course_skills,course_locations,course_description,time,target_url,timestamp
0,Codez Academy,[Front End Developer],Digital Roots Scheme,HTML,Wales,"Students will learn three main languages: CSS,...",,https://www.coursereport.com/schools/codez-aca...,2023-05-25 14:12:01.312105
1,Codez Academy,[Front End Developer],Digital Roots Scheme,JavaScript,Wales,"Students will learn three main languages: CSS,...",,https://www.coursereport.com/schools/codez-aca...,2023-05-25 14:12:01.312105
2,Codez Academy,[Front End Developer],Digital Roots Scheme,CSS,Wales,"Students will learn three main languages: CSS,...",,https://www.coursereport.com/schools/codez-aca...,2023-05-25 14:12:01.312105
3,Codez Academy,[Front End Developer],Foundation Course in Front-end Development,HTML,Wales,Students will create two websites (a 1-page we...,,https://www.coursereport.com/schools/codez-aca...,2023-05-25 14:12:01.312105
4,Codez Academy,[Front End Developer],Foundation Course in Front-end Development,CSS,Wales,Students will create two websites (a 1-page we...,,https://www.coursereport.com/schools/codez-aca...,2023-05-25 14:12:01.312105


### 9. Explode provider_tracks column to add a row per track

- `reset_index` again and remove unnecessary index column

In [89]:
# One row per track. reset_index(drop=True) renumbers the rows and discards the
# old labels instead of storing them in a column that would then be dropped.
exploded_tracks = (
    concat_dataframe_with_meta
    .explode(column='provider_tracks')
    .reset_index(drop=True)
)

exploded_tracks.head()

Unnamed: 0,provider_name,provider_tracks,course_name,course_skills,course_locations,course_description,time,target_url,timestamp
0,Codez Academy,Front End Developer,Digital Roots Scheme,HTML,Wales,"Students will learn three main languages: CSS,...",,https://www.coursereport.com/schools/codez-aca...,2023-05-25 14:12:01.312105
1,Codez Academy,Front End Developer,Digital Roots Scheme,JavaScript,Wales,"Students will learn three main languages: CSS,...",,https://www.coursereport.com/schools/codez-aca...,2023-05-25 14:12:01.312105
2,Codez Academy,Front End Developer,Digital Roots Scheme,CSS,Wales,"Students will learn three main languages: CSS,...",,https://www.coursereport.com/schools/codez-aca...,2023-05-25 14:12:01.312105
3,Codez Academy,Front End Developer,Foundation Course in Front-end Development,HTML,Wales,Students will create two websites (a 1-page we...,,https://www.coursereport.com/schools/codez-aca...,2023-05-25 14:12:01.312105
4,Codez Academy,Front End Developer,Foundation Course in Front-end Development,CSS,Wales,Students will create two websites (a 1-page we...,,https://www.coursereport.com/schools/codez-aca...,2023-05-25 14:12:01.312105


### 10. Map course_locations to list of strings then explode for a row per location

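- Note: the split in the cell below only works while `course_locations` still holds comma-separated strings. If re-running that cell raises an error because the column already holds lists, re-run the cell above first.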

In [90]:
# Split each comma-separated location string into a list, then explode to one
# row per location.
# `assign` returns a new frame, so `exploded_tracks` itself is left untouched
# and this cell gives the same result however many times it is run.
# `.str.split` passes missing values through instead of raising on them.
# The index is not reset here, so rows produced from the same source row share
# a row label.
exploded_locations = (
    exploded_tracks
    .assign(course_locations=exploded_tracks['course_locations'].str.split(', '))
    .explode('course_locations')
)

exploded_locations.head()

Unnamed: 0,provider_name,provider_tracks,course_name,course_skills,course_locations,course_description,time,target_url,timestamp
0,Codez Academy,Front End Developer,Digital Roots Scheme,HTML,Wales,"Students will learn three main languages: CSS,...",,https://www.coursereport.com/schools/codez-aca...,2023-05-25 14:12:01.312105
1,Codez Academy,Front End Developer,Digital Roots Scheme,JavaScript,Wales,"Students will learn three main languages: CSS,...",,https://www.coursereport.com/schools/codez-aca...,2023-05-25 14:12:01.312105
2,Codez Academy,Front End Developer,Digital Roots Scheme,CSS,Wales,"Students will learn three main languages: CSS,...",,https://www.coursereport.com/schools/codez-aca...,2023-05-25 14:12:01.312105
3,Codez Academy,Front End Developer,Foundation Course in Front-end Development,HTML,Wales,Students will create two websites (a 1-page we...,,https://www.coursereport.com/schools/codez-aca...,2023-05-25 14:12:01.312105
4,Codez Academy,Front End Developer,Foundation Course in Front-end Development,CSS,Wales,Students will create two websites (a 1-page we...,,https://www.coursereport.com/schools/codez-aca...,2023-05-25 14:12:01.312105


### You can output to a csv by running the block below:

In [91]:
# Write the fully exploded frame to disk; with to_csv's defaults the row labels
# are written as the first, unnamed column.
exploded_locations.to_csv(path_or_buf="./processed_data.csv")