# KNN Model

In [8]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.neighbors import NearestNeighbors

## Jobs data

In [75]:
jobs = pd.read_csv('../data/job_postings.csv')
jobs = jobs.drop(columns=['date_added', 'organization', 'skills_len', 'job_type'])
jobs.fillna('', inplace=True)
jobs['text'] = jobs['job_description'] + ' ' + jobs['skills']

In [76]:
jobs

Unnamed: 0,job_description,job_title,location,skills,text
0,an EDI Analyst with experience please read on...,Analyst,Northeast United States,edi trustedlink as van,an EDI Analyst with experience please read on...
1,Informatica ETL DeveloperSt Petersburg FL Only...,Developer,Southern United States,etl informatica b data exchange netezza oracle...,Informatica ETL DeveloperSt Petersburg FL Only...
2,This nationally recognized Microsoft Gold Part...,Manager,Western United States,microsoft dynamics ax project manager - toront...,This nationally recognized Microsoft Gold Part...
3,a .NET Developer with experience please read ...,Developer,Northeast United States,c asp.net sql javascript mvc,a .NET Developer with experience please read ...
4,Hatstand a global financial consultancy is see...,Developer,Northeast United States,java linux unix sdlc; multi-threaded or concur...,Hatstand a global financial consultancy is see...
...,...,...,...,...,...
16431,JPMorgan Chase & Co. (NYSE: JPM) is a leadin...,Developer,Northeast United States,.net architecture developer development git ht...,JPMorgan Chase & Co. (NYSE: JPM) is a leadin...
16432,Seeking Jr. Systems Administrators with experi...,Administrator,Midwest United States,jr. linux administrator,Seeking Jr. Systems Administrators with experi...
16433,a Senior Lead Devops Engineer with a desired ...,Developer,Midwest United States,amazon web services linux bash ruby python agile,a Senior Lead Devops Engineer with a desired ...
16434,Headquartered in downtown San Francisco CA we ...,Developer,Western United States,javascript react.js golang startup ror iot ana...,Headquartered in downtown San Francisco CA we ...


## User data

In [4]:
def gather_profile_data(file_path):
    profile_data = pd.read_csv(file_path)
    profile_data['text'] = profile_data['Titles'] + ' ' \
                            + profile_data['Skills'] + ' ' \
                            + profile_data['Summary'] + ' ' \
                            + profile_data['Education']
    return profile_data

In [5]:
# Reading in Zach's linkedin profile data.
profile_data_zach = gather_profile_data('../data/linkedin/test-output/Zach_LinkedInData_12-16-2020.csv')
profile_data_zach

Unnamed: 0,Name,Titles,Skills,Summary,Education,Certifications,text
0,Zachary Brown,"Data Science Fellow, Python Developer, Health ...","Data Analysis, Python (Programming Language), ...",I bridge the gap between data and climate poli...,"Data Science Intensive, Bachelor's of Science",Microsoft Certified: Azure Data Scientist Asso...,"Data Science Fellow, Python Developer, Health ..."


In [175]:
# Reading in Nolan's linkedin profile data.
profile_data_nolan = gather_profile_data('../data/linkedin/test-output/Nolan_LinkedInData_12-16-2020.csv')
profile_data_nolan

Unnamed: 0,Name,Titles,Skills,Summary,Education,text
0,Nolan Arendt,"Data Science Fellow, Painter","Data Science, Python, Data Analysis, Data Mana...",An innovative Data Scientist who is passionate...,"Bachelor's degree, Software Boot Camp Certificate","Data Science Fellow, Painter Data Science, Pyt..."


In [178]:
# Reading in Albert's linkedin profile data.
profile_data_albert = gather_profile_data('../data/linkedin/test-output/Albert_LinkedInData_12-16-2020.csv')
profile_data_albert

Unnamed: 0,Name,Titles,Skills,Summary,Education,Projects,Certifications,text
0,Albert Frantz,"Data Science Fellow, Assistant Teacher, Classr...","R, Python, Data Analysis, Econometrics, Projec...",I am a detail-oriented data scientist that use...,"Bachelor of Arts - BA, nan, nan",Identifying the Relationship Between Bike Lane...,Tableau 2020 A-Z: Hands-On Tableau Training fo...,"Data Science Fellow, Assistant Teacher, Classr..."


## Make recommendations

In [210]:
def get_recommendations(vectorizer, user_data):
    # Fit transform on text data
    tfidf_jobtext = tfidf_vectorizer.fit_transform(jobs['text'])

    # Transforming user profile text
    user_tfidf = tfidf_vectorizer.transform(user_data['text'])

    # Calculating KNN similarity between users profile and job text (top 100 most similar jobs)
    n_neighbors=75
    KNN = NearestNeighbors(n_neighbors=n_neighbors, p=2, metric='cosine', algorithm = 'brute')
    KNN.fit(tfidf_jobtext)
    NNs = KNN.kneighbors(user_tfidf, return_distance=True)

    # Finding indexs for n_neighbors most similar jobs
    index = list(NNs[1][0][1:]) # indexs for top jobs
    final_jobs = jobs.loc[index] 
    
    # Create a dataframe using our job title counts(top 10)
    pos_df = pd.DataFrame(final_jobs['job_title'].value_counts()[:10])
    
    # Renaming job_title
    pos_df['Job Count'] = pos_df['job_title']
    pos_df.drop(columns='job_title', inplace= True)
    
    # Creating column for percent of jobs matched
    pos_df['Job Match %'] = pos_df['Job Count']/n_neighbors
    
    return pos_df

## Specific Recommendations

In [227]:
# Instantiating Tfidfvectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2))

In [228]:
nolans_recommendations = get_recommendations(tfidf_vectorizer, profile_data_nolan)

In [229]:
nolans_recommendations

Unnamed: 0,Job Count,Job Match %
Data Position,26,0.346667
Developer,17,0.226667
Engineer,12,0.16
Architect,9,0.12
Analyst,5,0.066667
Consulting,3,0.04
Director,1,0.013333
Manager,1,0.013333


In [230]:
zachs_recommendations = get_recommendations(tfidf_vectorizer, profile_data_zach)

In [231]:
zachs_recommendations

Unnamed: 0,Job Count,Job Match %
Data Position,31,0.413333
Analyst,15,0.2
Engineer,9,0.12
Architect,8,0.106667
Developer,7,0.093333
Consulting,1,0.013333
Programmer,1,0.013333
Director,1,0.013333
Manager,1,0.013333


In [232]:
alberts_recommendations = get_recommendations(tfidf_vectorizer, profile_data_albert)

In [233]:
alberts_recommendations

Unnamed: 0,Job Count,Job Match %
Data Position,27,0.36
Architect,13,0.173333
Analyst,12,0.16
Engineer,9,0.12
Developer,9,0.12
Manager,2,0.026667
Director,1,0.013333
Support,1,0.013333
