In [6]:
import pandas as pd
import flair
from bpemb import BPEmb
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cdist
import regex as re

In [7]:
full_files_assignment = [f"data/assignment/{x}/assignment.txt" for x in os.listdir('data/assignment')]
full_files_data_assignment = [open(x).read() for x in full_files_assignment if not 'annot' in x and not '.DS' in x]

resume = [f"data/resume_clean/{x}" for x in os.listdir('data/resume_clean')]
resume_data = [open(x).read() for x in resume]

In [8]:
multi_space = re.compile("[_ \t]+")
non_ascii = re.compile("[^\x00-\x7FåäöÅÄÖ\s\n\p{Punct}]+")
punct = re.compile('([\p{Punct}\n])+')

In [13]:
def clean(text):
    t = non_ascii.sub('', text)
    t = punct.sub(' \g<1> ', t)
    t = multi_space.sub(' ', t)
    t = t.replace('\n', '.')
    t = t.lower()
    return t

In [14]:
clean_resumes = [clean(x) for x in resume_data]
clean_assignments = [clean(x) for x in full_files_data_assignment]

In [21]:
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.50, min_df=10)
vectorizer.fit(clean_assignments+clean_resumes)

TfidfVectorizer(max_df=0.5, min_df=10, stop_words='english')

In [22]:
x_resume = vectorizer.transform(clean_resumes)
x_assignment = vectorizer.transform(clean_assignments)

In [43]:
dists = cdist(x_resume.toarray(), x_assignment.toarray(), metric='cosine')
dists

array([[0.91253857, 0.81692922, 0.78064722, 0.78243857, 0.75278977,
        0.74745432, 0.94397702, 0.60061513, 0.79309386, 0.883031  ,
        0.78878295, 0.86744509, 0.79679015, 0.91259097, 0.84386954,
        0.72058578, 0.83538597, 0.92048386, 0.80897347, 0.85940277,
        0.79182331, 0.86460573, 0.71856869, 0.86911413, 0.79415437,
        0.70816466, 0.9241859 , 0.86855011, 0.93828158, 0.83011643,
        0.83466988, 0.97328222, 0.95852379, 0.92741138, 0.95310396,
        0.94430003, 0.92529487, 0.85722348, 0.80082875, 0.9009491 ,
        0.83391757, 0.746303  , 0.82201206, 0.89253107, 0.68976253,
        0.75467125, 0.8875436 , 0.9655355 , 0.96516792, 0.84226515,
        0.77252719, 0.76388872, 0.87218436, 0.780797  , 0.8015489 ,
        0.73939492, 0.81459954],
       [0.97823218, 0.82593314, 0.8175453 , 0.94118334, 0.91398597,
        0.93542266, 0.84232742, 0.93644611, 0.94668739, 0.89949417,
        0.98059278, 0.95404016, 0.79321939, 0.86410081, 0.80423656,
        0.82714

In [50]:
dists.argmin(axis=-1)

array([ 7, 50, 23])

In [52]:
print(dists[0][7])
print(dists[1][50])
print(dists[2][23])

0.60061513075041
0.6667388359427548
0.5603049151105187


In [48]:
for x in dists.argmin(axis=1):
    print(full_files_data_assignment[x])
    print("========")

Embedded SW Engineer, Reference 23189
Apply here
Assignment Description and Requirements
Requirements:
We believe you are a passionate embedded software engineer with at least a few years of relevant work experience. You have a university degree in Software Engineering or equivalent.
You are known as a team player with an open mindset.
You have knowledge of and relevant experience with:
- Embedded systems
- Linux
- C/C++
- Script languages
- Hardware near debugging (schematics, logic analyzer, board-bringup).
- Linux kernel is meritorious
- Git, Gerrit, Jenkins
- Jira, Confluence
- Agile software development process
- QNX is meritorious

Your Role:
As an Embedded Software Engineer, you will be a member of a team responsible for the low-level modules within our platform organisation developing the infotainment head unit system.
The system is an in-vehicle Linux based system running native Android or virtualized inside QNX hypervisor.
You will work with low-level software modules, board 

In [45]:
print(resume)

['data/resume_clean/2.txt', 'data/resume_clean/3.txt', 'data/resume_clean/1.txt']


In [34]:
import numpy as np

In [38]:
type(x_resume)

scipy.sparse.csr.csr_matrix

In [98]:
df = pd.read_csv('data/UpdatedResumeDataSet.csv')
df.head()

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


In [99]:
df['Category'].unique()

array(['Data Science', 'HR', 'Advocate', 'Arts', 'Web Designing',
       'Mechanical Engineer', 'Sales', 'Health and fitness',
       'Civil Engineer', 'Java Developer', 'Business Analyst',
       'SAP Developer', 'Automation Testing', 'Electrical Engineering',
       'Operations Manager', 'Python Developer', 'DevOps Engineer',
       'Network Security Engineer', 'PMO', 'Database', 'Hadoop',
       'ETL Developer', 'DotNet Developer', 'Blockchain', 'Testing'],
      dtype=object)

In [100]:
df['clean_resume'] = df['Resume'].apply(lambda x: clean(x))
df = df.drop_duplicates()

In [101]:
df.head()

Unnamed: 0,Category,Resume,clean_resume
0,Data Science,Skills * Programming Languages: Python (pandas...,skills * programming languages : python ( pand...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,education details \r . may 2013 to may 2017 b ...
2,Data Science,"Areas of Interest Deep Learning, Control Syste...","areas of interest deep learning , control syst..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,skills r python sap hana tableau sap hana sql ...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab...","education details \r . mca ymcaust , faridabad..."


In [102]:
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.50, min_df=10)
vectorizer.fit(clean_assignments+df['clean_resume'].tolist())

TfidfVectorizer(max_df=0.5, min_df=10, stop_words='english')

In [103]:
x_resume = vectorizer.transform(df['clean_resume'])
x_assignment = vectorizer.transform(clean_assignments)

In [104]:
dists = cdist(x_assignment.toarray(), x_resume.toarray(), metric='cosine')

In [105]:
dists

array([[0.93587529, 1.        , 0.96837396, ..., 0.97380711, 0.95561402,
        0.97647996],
       [0.91600525, 0.95404871, 0.89847006, ..., 0.92588916, 0.8193951 ,
        0.9326045 ],
       [0.92854102, 0.97358773, 0.9004353 , ..., 0.75503221, 0.94911954,
        0.76656755],
       ...,
       [0.94203378, 0.98961109, 0.93910386, ..., 0.96256575, 0.9503871 ,
        0.98497885],
       [0.94126643, 0.95637787, 0.93777025, ..., 1.        , 0.96448887,
        0.9636615 ],
       [0.91978724, 0.98416443, 0.87631496, ..., 0.90501216, 0.86000759,
        0.92576459]])

In [106]:
closest_dists = dists.argpartition(5, axis=1)[:,:5]

In [108]:
closest_dists[5]

array([ 96,  94,  97,  98, 159])

In [110]:
df.iloc[[96,  94,  97,  98, 159]]

Unnamed: 0,Category,Resume,clean_resume
483,Electrical Engineering,â¢ Achievement oriented with people managemen...,achievement oriented with people management s...
481,Electrical Engineering,Skills: 1) MC Office 2) AutoCAD 2016 3) Introd...,skills : 1 ) mc office 2 ) autocad 2016 3 ) in...
484,Electrical Engineering,Education Details \r\nJuly 2016 to May 2019 BE...,education details \r . july 2016 to may 2019 b...
485,Electrical Engineering,Education Details \r\nJanuary 2012 to January ...,education details \r . january 2012 to january...
892,Testing,â¢ Good logical and analytical skills â¢ Pos...,good logical and analytical skills positive a...


In [111]:
full_files_data_assignment[5]

'System design engineer: Electric & Electronic system powering management, Reference 22658\nApply here\nAssignment Description and Requirements\nWe are looking for a Systems Engineer for Electric & Electronic system powering management.\nThe Electronic Embedded System department is responsible for the overall electronic system used commonly in all machines. This requires the system to be modular, scalable, flexible, etc.\nYour role in the project will be as Systems Engineer with focus on Electric system. You will together with the project team, investigate, develop and document the system functionality required to meet the needs for different implementations of EE system in VCE machines. The work will include finding solutions to secure that the EE powering management system supports regular 12V/24V system services, connectivity services, electromobility services and autonomous services\nDrive the work from an operational and short-term perspective. Participate in specification impleme

In [115]:
from joblib import dump, load

In [116]:
dump(vectorizer, "tfidf_on_updated_resumes.joblib")

['tfidf_on_updated_resumes.joblib']

In [120]:
df.to_csv('clean_resume.csv')

In [134]:
data = [(x.split('\n')[0], '\n'.join(x.split('\n')[1:])) for x in full_files_data_assignment]
df_assignemnts = pd.DataFrame(data, columns=['title', 'text'])
df_assignemnts['clean_text'] = df_assignemnts['text'].apply(lambda x: clean(x))

In [135]:
df_assignemnts.head()

Unnamed: 0,title,text,clean_text
0,AD System designer / System Architect - Automo...,Ansök här\nUppdragsbeskrivning och krav\nWe ar...,ansök här . uppdragsbeskrivning och krav . we ...
1,"AD/ADAS HIL Engineer - Automotive, Referens 21803",Ansök här\nUppdragsbeskrivning och krav\nWe ar...,ansök här . uppdragsbeskrivning och krav . we ...
2,"SW engineer with extensive Test Automation, In...",Apply here\nAssignment Description and Require...,apply here . assignment description and requir...
3,"SW Component Owner, Referens 21757",Ansök här\nUppdragsbeskrivning och krav\nAs a ...,ansök här . uppdragsbeskrivning och krav . as ...
4,Function and system design for Autonomous Driv...,Ansök här\nUppdragsbeskrivning och krav\nWe ar...,ansök här . uppdragsbeskrivning och krav . we ...


In [136]:
df_assignemnts.to_csv('clean_assignments.csv')