# Grab data

In [1]:
import pandas as pd
import lib.cleaners as c
import numpy as np

# Load the resume dataset; lines=True tells pandas the file is JSON Lines
# (one JSON object per line).
dataset = pd.read_json('dataset/resume_output_Cook-canada.json', lines=True)
print('Dataset (maybe) duplicated:', len(dataset))

# Keep a single row per resume id (the raw file contains repeated ids).
dataset = dataset.drop_duplicates(subset=['id'])
print('Dataset non-duplicated:', len(dataset))

Dataset (maybe) duplicated: 9977
Dataset non-duplicated: 9821


# Expanding education data

Expand education data into multiple rows

In [2]:
# Expand the 'schools' column so that every school entry gets its own row
# (placed in a new 'school' column), and keep only the resume id next to it.
edu_df = c.expand_to_multi_rows(dataset, 'schools', 'school')[['id', 'school']]
edu_df.head(10)

Unnamed: 0,id,school
0,54a3bd53808e6764,"{'degree': '', 'school_name': 'Queen Elizabeth..."
1,54a3bd53808e6764,"{'degree': '', 'school_name': 'Kpu', 'start_da..."
2,f8dc5fa9fc39c387,{'degree': 'High School Diploma in Technical T...
3,0e84aa0202047520,"{'degree': 'Certificate in French Immersion', ..."
4,67b00f0985c8948a,{'degree': 'in Mechanics and German Language S...
5,89c76f9c1242c552,{'degree': 'Professional cooks lvl 1 in Culina...
6,89c76f9c1242c552,"{'degree': 'High school diploma', 'school_nam..."
7,84ebc2bebda72365,"{'degree': 'High School Diploma', 'school_name..."
8,017658f6f804297c,{'degree': 'apprenticehip in the field of cook...
9,b61c261f28ab69be,"{'degree': 'High school', 'school_name': 'Sril..."


Make dataframe out of the education dictionary

In [5]:
# Turn the per-row dicts stored in 'school' into a DataFrame of their own.
# Per the output further down, the resulting columns are
# degree / school_name / start_date / end_date.
# NOTE(review): the later index-based merge assumes the helper keeps edu_df's
# row index — confirm in lib.cleaners.
degree_df = c.create_new_df_dict_col(edu_df, 'school')

Replace empty strings with NaN, then drop the rows whose degree is NaN

In [6]:
# Treat empty strings as missing values first, so that rows with a blank
# degree are removed together with the ones that were already NaN.
degree_df = (
    degree_df
    .replace('', np.nan)
    .dropna(subset=['degree'])
)
degree_df.head(10)

Unnamed: 0,degree,school_name,start_date,end_date
2,High School Diploma in Technical Theater,BISHOP CARROLL HIGH SCHOOL,September 2016,Present
3,Certificate in French Immersion,Ecole Connaught Community School,January 2011,
4,in Mechanics and German Language Studies,Hassi Messaoud High School,,
5,Professional cooks lvl 1 in Culinary arts,Collage,May 2013,April 2014
6,High school diploma,Fleetwood park secondary,September 2007,June 2012
7,High School Diploma,Hamilton High School,May 1975,May 1979
8,apprenticehip in the field of cooking in culin...,vancouver island university,June 1989,May 1991
9,High school,Srilanka,,
10,Master's degree in Mechanical engineering (Tec...,Vysoká škola báňská - Technická univerzita Ost...,September 2013,June 2018
11,High school or equivalent,,,


Simplify the degree data (note: the helper modifies the given DataFrame in place)

In [7]:
# The helper works in place on degree_df (no return value is used here).
# Judging by the output, it adds a 'degree_year_time' column derived from
# start_date / end_date; it is NaN when either date is missing or is not a
# concrete date (e.g. 'Present').
# NOTE(review): the values look like a duration in years — confirm in lib.cleaners.
c.simplify_education_information_(degree_df, 'start_date', 'end_date')
# Keep only the two columns needed downstream.
simplified_degree_df = degree_df[['degree', 'degree_year_time']]
simplified_degree_df.head(10)

Unnamed: 0,degree,degree_year_time
2,High School Diploma in Technical Theater,
3,Certificate in French Immersion,
4,in Mechanics and German Language Studies,
5,Professional cooks lvl 1 in Culinary arts,0.917199
6,High school diploma,4.750269
7,High School Diploma,4.000082
8,apprenticehip in the field of cooking in culin...,1.913797
9,High school,
10,Master's degree in Mechanical engineering (Tec...,4.747531
11,High school or equivalent,


Join the simplified degree data back onto the resume IDs (rows are matched by index)

In [12]:
# Reduce the expanded education frame to the resume id only.
edu_df = edu_df[['id']]
# Index-on-index inner join: rows dropped from simplified_degree_df (those
# without a degree) disappear from the result as well.
expanded_resume_degree = edu_df.join(simplified_degree_df, how='inner')
expanded_resume_degree.head(10)

Unnamed: 0,id,degree,degree_year_time
2,f8dc5fa9fc39c387,High School Diploma in Technical Theater,
3,0e84aa0202047520,Certificate in French Immersion,
4,67b00f0985c8948a,in Mechanics and German Language Studies,
5,89c76f9c1242c552,Professional cooks lvl 1 in Culinary arts,0.917199
6,89c76f9c1242c552,High school diploma,4.750269
7,84ebc2bebda72365,High School Diploma,4.000082
8,017658f6f804297c,apprenticehip in the field of cooking in culin...,1.913797
9,b61c261f28ab69be,High school,
10,708b408f8988a22e,Master's degree in Mechanical engineering (Tec...,4.747531
11,6f73b4d1ab8579ad,High school or equivalent,


# Expanding skills data

Expand skills data to multiple rows

In [8]:
# One row per entry of the 'skills' list; the entry itself is stored in the
# new 'skill_dict' column. All original columns are still present at this
# point (see the output), they are trimmed down to 'id' later.
skills_df = c.expand_to_multi_rows(dataset, 'skills', 'skill_dict')
skills_df.head(10)

Unnamed: 0,additional,id,jobs,schools,skills,summary,skill_dict
0,"[Skills, -Good writing and reading abilities.,...",0e84aa0202047520,"[{'title': 'Line Cook and Prep Cook', 'company...","[{'degree': 'Certificate in French Immersion',...","[{'skill': 'COOKING', 'experience': '4 years'}...","[Well organized, focused individual who strive...","{'skill': 'COOKING', 'experience': '4 years'}"
1,"[Skills, -Good writing and reading abilities.,...",0e84aa0202047520,"[{'title': 'Line Cook and Prep Cook', 'company...","[{'degree': 'Certificate in French Immersion',...","[{'skill': 'COOKING', 'experience': '4 years'}...","[Well organized, focused individual who strive...","{'skill': 'HAND TOOLS', 'experience': 'Less th..."
2,"[Skills, -Good writing and reading abilities.,...",0e84aa0202047520,"[{'title': 'Line Cook and Prep Cook', 'company...","[{'degree': 'Certificate in French Immersion',...","[{'skill': 'COOKING', 'experience': '4 years'}...","[Well organized, focused individual who strive...","{'skill': 'WHMIS', 'experience': 'Less than 1 ..."
3,[Currently looking for full time job (prefere ...,708b408f8988a22e,"[{'title': 'Dishwasher/Line Cook', 'company': ...",[{'degree': 'Master's degree in Mechanical eng...,"[{'skill': 'Microsoft Office', 'experience': '...",[],"{'skill': 'Microsoft Office', 'experience': '9..."
4,[Currently looking for full time job (prefere ...,708b408f8988a22e,"[{'title': 'Dishwasher/Line Cook', 'company': ...",[{'degree': 'Master's degree in Mechanical eng...,"[{'skill': 'Microsoft Office', 'experience': '...",[],"{'skill': 'Inventor', 'experience': '8 years'}"
5,[Currently looking for full time job (prefere ...,708b408f8988a22e,"[{'title': 'Dishwasher/Line Cook', 'company': ...",[{'degree': 'Master's degree in Mechanical eng...,"[{'skill': 'Microsoft Office', 'experience': '...",[],"{'skill': 'Cad', 'experience': '8 years'}"
6,[Currently looking for full time job (prefere ...,708b408f8988a22e,"[{'title': 'Dishwasher/Line Cook', 'company': ...",[{'degree': 'Master's degree in Mechanical eng...,"[{'skill': 'Microsoft Office', 'experience': '...",[],"{'skill': 'Creo Parametric', 'experience': '2 ..."
7,"[Skills, ● High energy attitude, ● Sales exper...",c7675c0b793f34a0,"[{'title': 'Manager', 'company': 'Pizza Hut', ...","[{'degree': 'Certificate', 'school_name': 'Geo...","[{'skill': 'CUSTOMER RELATIONS', 'experience':...",[Looking for part-time position. Goal focused ...,"{'skill': 'CUSTOMER RELATIONS', 'experience': ..."
8,"[Skills, ● High energy attitude, ● Sales exper...",c7675c0b793f34a0,"[{'title': 'Manager', 'company': 'Pizza Hut', ...","[{'degree': 'Certificate', 'school_name': 'Geo...","[{'skill': 'CUSTOMER RELATIONS', 'experience':...",[Looking for part-time position. Goal focused ...,"{'skill': 'CUSTOMER SUPPORT', 'experience': 'L..."
9,"[Skills, ● High energy attitude, ● Sales exper...",c7675c0b793f34a0,"[{'title': 'Manager', 'company': 'Pizza Hut', ...","[{'degree': 'Certificate', 'school_name': 'Geo...","[{'skill': 'CUSTOMER RELATIONS', 'experience':...",[Looking for part-time position. Goal focused ...,"{'skill': 'PROBLEM SOLVING', 'experience': 'Le..."


Create a DF using the dictionary of skills

In [9]:
# Turn the per-row dicts in 'skill_dict' into a DataFrame of their own;
# per the output, its columns are 'skill' and 'experience'.
expanded_skill_df = c.create_new_df_dict_col(skills_df, 'skill_dict')
expanded_skill_df.head(10)

Unnamed: 0,skill,experience
0,COOKING,4 years
1,HAND TOOLS,Less than 1 year
2,WHMIS,Less than 1 year
3,Microsoft Office,9 years
4,Inventor,8 years
5,Cad,8 years
6,Creo Parametric,2 years
7,CUSTOMER RELATIONS,Less than 1 year
8,CUSTOMER SUPPORT,Less than 1 year
9,PROBLEM SOLVING,Less than 1 year


Simplify skills information

In [10]:
# The helper works in place on expanded_skill_df (no return value is used).
# Per the output, the 'experience' text becomes a plain number of years,
# e.g. '4 years' -> 4 and 'Less than 1 year' -> 1.
c.simplify_skills_information_(expanded_skill_df, 'experience')
expanded_skill_df.head(10)

Unnamed: 0,skill,experience
0,COOKING,4
1,HAND TOOLS,1
2,WHMIS,1
3,Microsoft Office,9
4,Inventor,8
5,Cad,8
6,Creo Parametric,2
7,CUSTOMER RELATIONS,1
8,CUSTOMER SUPPORT,1
9,PROBLEM SOLVING,1


Join the simplified skills data back onto the resume IDs (rows are matched by index)

In [17]:
# Reduce the expanded skills frame to the resume id only.
skills_df = skills_df[['id']]
# Index-on-index inner join attaches each skill row to its resume id.
expanded_resume_skills = skills_df.join(expanded_skill_df, how='inner')
expanded_resume_skills.head(10)

Unnamed: 0,id,skill,experience
0,0e84aa0202047520,COOKING,4
1,0e84aa0202047520,HAND TOOLS,1
2,0e84aa0202047520,WHMIS,1
3,708b408f8988a22e,Microsoft Office,9
4,708b408f8988a22e,Inventor,8
5,708b408f8988a22e,Cad,8
6,708b408f8988a22e,Creo Parametric,2
7,c7675c0b793f34a0,CUSTOMER RELATIONS,1
8,c7675c0b793f34a0,CUSTOMER SUPPORT,1
9,c7675c0b793f34a0,PROBLEM SOLVING,1


# Clean summary data

In [14]:
# Widen the column display so the word lists are not cut off too early.
pd.options.display.max_colwidth = 200
cleaned_summary = c.clean_array_of_sentences(dataset, 'summary')
cleaned_summary.head(10)

0                                                                                                                               [obtain, parttime, job, atmosphere, use, improve, customer, service, skills]
1                                                                                                                                                                                                         []
2    [well, organized, focused, individual, strives, enhance, workplace, enjoy, contributing, participating, kind, healthy, work, environment, prepared, offer, support, needed, adaptable, alert, attent...
3                                                                                                                                                                                        [driving, position]
4                                                                                                                                                                                   

# Clean additional data

In [16]:
# Widen the column display so the word lists are not cut off too early.
pd.options.display.max_colwidth = 200
# Clean the 'additional' column with clean_punctuation=False; the displayed
# result still contains punctuation tokens such as ',' and '.'.
cleaned_additional = c.clean_array_of_sentences(dataset, 'additional', clean_punctuation=False)
cleaned_additional.head(50)

0     [skills, &, abilities, :, , young, ,, energetic, hard, working, ., , responsible, punctual, ., , able, work, independently, team, player, ., , adaptable, willing, learn, new, concepts, ., , excell...
1     [skills, , reliability, , leadership, management, , research, information, gathering, skills, , listening, skills, , creativity, innovation, , ability, manage, organize, information, , sense, resp...
2     [skills, good, writing, reading, abilities, ., able, speak, write, french, english, ., familiar, electronics, computer, equipment, ., proficient, computer, internet, programs, ., able, focus, well...
3     [, experienced, handson, worker, ;, construction, experience, ,, driving, , responsible, worker, , polite, , accurate, ;, pay, attention, detail, , good, physical, condition, eager, work, , good, ...
4                                                                                                                                          [currently, looking, part, time, work