# Simple Classification Model with SVM
## Purpose
Build a classification mode using svm which can be used to predict the job title based on the skill set

## Steps
1. Normalize title and skill fields
2. Vectorize skill
3. Label encode titles
4. Create SVM model and train the model
5. Make prediction: Vectorize skillset and use it as an input to the model, inverse transform the result into human readable text

## Method Used
* Support Vector Machine  
* Cross Validation

## Conclusion
* Prediction works as expected
* Cross validation score isn't good enough since the job titles in the data are not normalized enough

In [1]:
import json
import re
import unidecode
import numpy as np
import pandas as pd
from collections import defaultdict
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_validate
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer, LabelEncoder
from tqdm import tqdm
import nltk
from matplotlib import pyplot as plt
import seaborn as sns
tqdm.pandas()
from nltk import word_tokenize
from nltk.corpus import stopwords

In [2]:
train = pd.read_csv('../00-data/jobdb.csv')

In [3]:
train['title_simp']=''
train.head()

Unnamed: 0,title,company,location,description,salary,link,skills,education,title_simp
0,Computer Vision Scientist/Engineer,"Expedition Technology, Inc.","Dulles, VA",Computer Vision Scientist/Engineer\r\r\r\n\r\r...,,,"r,python,c/c++,julia,c++,pytorch","bs,phd,ms",
1,Staff Computer Vision Scientist,WeWork,"Palo Alto, CA",Research Scientist/Manager:\r\r\r\nComputer Vi...,,https://www.glassdoor.com/partner/jobListing.h...,"r,python,c++","phd,ms",
2,"Data Scientist, Deep Learning/Computer Vision",Yummly,"Redwood City, CA",Born from the belief that great things come to...,$124K-$170K (Glassdoor est.),https://www.glassdoor.com/partner/jobListing.h...,"python,r",ms,
3,Computer Vision Research Engineer,"Expedition Technology, Inc.","Dulles, VA",Computer Vision Research Engineer\r\r\r\n\r\r\...,,,"python,r","bachelor,ms",
4,Senior Software Engineer for OFP Development F...,Decisive Analytics,"Dayton, OH",Overview\r\r\r\n\r\r\r\nDECISIVE ANALYTICS Cor...,$53K-$94K (Glassdoor est.),https://www.glassdoor.com/partner/jobListing.h...,r,"bs,ms",


In [1]:
#get the skills and concatenate using space
def preprocess(ingredients):
    words = []
    for word in str(ingredients).split(','):
        if len(word) > 0: words.append(word)
    return ' '.join(words)

#lower case the description and remove stopwords
def tokenize_title(description):
    tokens = word_tokenize(description)
    stopset = set(stopwords.words('english'))
    tokens = [w.lower() for w in tokens if not w in stopset]
    text = ' '.join(nltk.Text(tokens))
    return text

In [5]:
train['x'] = train['skills'].progress_apply(preprocess)

100%|████████████████████████████████████████████████████████████████████████████| 677/677 [00:00<00:00, 225557.53it/s]


In [6]:
train['title_simp']=train['title'].apply(tokenize_title)
train.head()

Unnamed: 0,title,company,location,description,salary,link,skills,education,title_simp,x
0,Computer Vision Scientist/Engineer,"Expedition Technology, Inc.","Dulles, VA",Computer Vision Scientist/Engineer\r\r\r\n\r\r...,,,"r,python,c/c++,julia,c++,pytorch","bs,phd,ms",computer vision scientist/engineer,r python c/c++ julia c++ pytorch
1,Staff Computer Vision Scientist,WeWork,"Palo Alto, CA",Research Scientist/Manager:\r\r\r\nComputer Vi...,,https://www.glassdoor.com/partner/jobListing.h...,"r,python,c++","phd,ms",staff computer vision scientist,r python c++
2,"Data Scientist, Deep Learning/Computer Vision",Yummly,"Redwood City, CA",Born from the belief that great things come to...,$124K-$170K (Glassdoor est.),https://www.glassdoor.com/partner/jobListing.h...,"python,r",ms,"data scientist , deep learning/computer vision",python r
3,Computer Vision Research Engineer,"Expedition Technology, Inc.","Dulles, VA",Computer Vision Research Engineer\r\r\r\n\r\r\...,,,"python,r","bachelor,ms",computer vision research engineer,python r
4,Senior Software Engineer for OFP Development F...,Decisive Analytics,"Dayton, OH",Overview\r\r\r\n\r\r\r\nDECISIVE ANALYTICS Cor...,$53K-$94K (Glassdoor est.),https://www.glassdoor.com/partner/jobListing.h...,r,"bs,ms",senior software engineer ofp development f-15 ...,r


In [7]:
#Verctorize the skills as predictors
vectorizer = make_pipeline(
    TfidfVectorizer(sublinear_tf=True),
    FunctionTransformer(lambda x: x.astype('float16'), validate=False)
)

x_train = vectorizer.fit_transform(train['x'].values)
x_train.sort_indices()

In [8]:
#label encode the title as target
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train['title_simp'].values)
dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

{'-computer vision research scientist': 0,
 'ai bootcamp experienced machine learning engineers data scientists': 1,
 'ai research scientist - computer vision': 2,
 'applied artificial intelligence machine learning-scientist/engineer ( patuxent river , md orlando , florida )': 3,
 'applied research scientist , computer vision': 4,
 'applied research scientist - machine learning': 5,
 'applied scientist , machine learning': 6,
 'applied scientist - artificial intelligence': 7,
 'applied scientist - computer vision': 8,
 'c++ computer vision software engineer': 9,
 'c++ software engineer - advanced visualization': 10,
 'camera software - imaging computer vision research engineer': 11,
 'co-op software engineer - machine learning ( summer & fall 2019 )': 12,
 'computer scientist ( machine learning ) , vision systems group': 13,
 'computer scientist , vision & learning': 14,
 'computer scientist / neural networks , computer vision ( active ts/sci )': 15,
 'computer scientist/vision enginee

In [15]:
#Initialize model with rbf kernel
classifier = SVC(C=250, # penalty parameter
                kernel='rbf', # kernel type, rbf working fine here
                gamma=1, # kernel coefficient
                coef0=1, # change to 1 from default value of 0.0
                shrinking=True, # using shrinking heuristics
                tol=0.001, # stopping criterion tolerance
                cache_size=500, # 500 MB cache size  
                max_iter=-1, # no limit, let it run
                random_state=None)
#model = OneVsRestClassifier(classifier, n_jobs=4)

In [16]:
#Cross validation
scores = cross_validate(classifier, x_train, y_train, cv=3)
scores['test_score'].mean()



0.35696601380197884

In [48]:
classifier.fit(x_train, y_train)

SVC(C=250, cache_size=500, class_weight=None, coef0=1,
  decision_function_shape='ovr', degree=3, gamma=1.4, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [49]:
y_pred = label_encoder.inverse_transform(classifier.predict(x_train))
y_true = label_encoder.inverse_transform(y_train)

print(f'accuracy score on train data: {accuracy_score(y_true, y_pred)}')

def report2dict(cr):
    rows = []
    for row in cr.split("\n"):
        parsed_row = [x for x in row.split("  ") if len(x) > 0]
        if len(parsed_row) > 0: rows.append(parsed_row)
    measures = rows[0]
    classes = defaultdict(dict)
    for row in rows[1:]:
        class_label = row[0]
        for j, m in enumerate(measures):
            classes[class_label][m.strip()] = float(row[j + 1].strip())
    return classes
report = classification_report(y_true, y_pred)
pd.DataFrame(report2dict(report)).T

accuracy score on train data: 0.4194977843426883


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Unnamed: 0,f1-score,precision,recall,support
-computer vision research scientist,0.00,0.00,0.00,2.0
ai bootcamp experienced machine learning engineers data scientists,0.00,0.00,0.00,1.0
ai research scientist - computer vision,0.00,0.00,0.00,2.0
"applied artificial intelligence machine learning-scientist/engineer ( patuxent river , md orlando , florida )",0.00,0.00,0.00,1.0
"applied research scientist , computer vision",0.00,0.00,0.00,1.0
applied research scientist - machine learning,0.00,0.00,0.00,1.0
"applied scientist , machine learning",1.00,1.00,1.00,1.0
applied scientist - artificial intelligence,0.00,0.00,0.00,1.0
applied scientist - computer vision,0.00,0.00,0.00,2.0
c++ computer vision software engineer,0.00,0.00,0.00,2.0


In [50]:
test=pd.read_csv('test.csv')
test.head

<bound method NDFrame.head of    title      skills
0    NaN        java
1    NaN      python
2    NaN  python,c++>

In [51]:
test['x'] = train['skills'].progress_apply(preprocess)



  0%|                                                                                          | 0/677 [00:00<?, ?it/s]

100%|████████████████████████████████████████████████████████████████████████████| 677/677 [00:00<00:00, 339333.63it/s]

In [52]:
x_test = vectorizer.transform(test['x'].values)

In [53]:
y_pred = label_encoder.inverse_transform(classifier.predict(x_test))

In [58]:
test['x'].head

<bound method NDFrame.head of 0    r python c/c++ julia c++ pytorch
1                        r python c++
2                            python r
Name: x, dtype: object>

In [57]:
print(y_pred)

['machine learning engineer' 'computer vision engineer'
 'computer vision engineer']
