In [105]:
import pandas as ps
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.metrics import pairwise

In [148]:
positions = ps.read_csv('slack/data/vl_onet_nomralized.csv', encoding='utf-8')

In [117]:
positions.shape

(61345, 4)

In [73]:
positions.head(5)

Unnamed: 0,SOC_Code,Title,Alternate_Title,Source
0,11-1011.00,Chief Executives,Aeronautics Commission Director,8
1,11-1011.00,Chief Executives,Agricultural Services Director,8
2,11-1011.00,Chief Executives,Alcohol and Drug Abuse Assistance Program Admi...,8
3,11-1011.00,Chief Executives,Arts and Humanities Council Director,8
4,11-1011.00,Chief Executives,Bakery Manager,8


In [106]:
def preprocessInput(inputPos):
    output = ''
    stop_words = set(stopwords.words('english'))
    outputPos = re.sub('[^a-zA-Z0-9]', ' ', inputPos)
    #collapase whitespaces, and lower case
    outputPos = re.sub(r'\W+', ' ', outputPos).lower()
        
    outputPosList = outputPos.split(' ')
        
    for w in outputPosList:
        if w not in stop_words:
            output = output + w + ' '
    
    output = output.rstrip()
    
    return output

In [107]:
def ngrams(string, n=3):
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]

In [76]:
#test
#preprocessInput('Aeronautics Commission Director')
ngrams(preprocessInput('Aeronautics Commission Director'))

['aer',
 'ero',
 'ron',
 'ona',
 'nau',
 'aut',
 'uti',
 'tic',
 'ics',
 'cs ',
 's c',
 ' co',
 'com',
 'omm',
 'mmi',
 'mis',
 'iss',
 'ssi',
 'sio',
 'ion',
 'on ',
 'n d',
 ' di',
 'dir',
 'ire',
 'rec',
 'ect',
 'cto',
 'tor']

In [77]:
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix = vectorizer.fit_transform(positions['Alternate_Title'])

In [78]:
print(tf_idf_matrix[0])

  (0, 930)	0.2538928373626772
  (0, 4690)	0.2496309399736001
  (0, 7309)	0.1797359733701588
  (0, 6675)	0.17598157738619133
  (0, 6247)	0.30119541051660403
  (0, 3802)	0.27398453671218725
  (0, 8115)	0.2031204909473183
  (0, 7802)	0.15188182591903876
  (0, 5334)	0.18649672115279176
  (0, 4146)	0.18903739930676516
  (0, 7415)	0.18365502318156346
  (0, 70)	0.12461080288201144
  (0, 1374)	0.15889713156406035
  (0, 6664)	0.18546539862255665
  (0, 6101)	0.23478288080537368
  (0, 6091)	0.20778766722790035
  (0, 5535)	0.21850338573012396
  (0, 7599)	0.15330910098219244
  (0, 7547)	0.17940614594979037
  (0, 5471)	0.1116332354601592
  (0, 6670)	0.12083035975646189
  (0, 6164)	0.20331439940355003
  (0, 82)	0.15186159101040733
  (0, 1515)	0.1591741019168768
  (0, 5507)	0.13962953644915976
  (0, 7182)	0.15410085091347564
  (0, 4498)	0.1231384038759003
  (0, 4157)	0.1274325936336584
  (0, 7847)	0.08509889734028017


In [146]:
#add user input in TF-IDF matrix
def addUserInput(userInput, positionList):
    positionListAdded = positionList
    
    positionListAdded.loc[len(positionListAdded)] = userInput
    
    vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
    tf_idf_matrix = vectorizer.fit_transform(positionListAdded)
    positionListAdded['index_col'] = positionListAdded.index
    return tf_idf_matrix,positionListAdded

In [149]:
positionList = ps.DataFrame(positions['Alternate_Title'])
print(positionList.shape)
tf_idf_matrix,positionListAdded = addUserInput('Aeronautics Commission Director', positionList)
print(tf_idf_matrix.shape)
print(positionListAdded.shape)

(61345, 1)
(1, 13)
(61346, 2)


In [150]:
positionListAdded.head(5)

Unnamed: 0,Alternate_Title,index_col
0,Aeronautics Commission Director,0
1,Agricultural Services Director,1
2,Alcohol and Drug Abuse Assistance Program Admi...,2
3,Arts and Humanities Council Director,3
4,Bakery Manager,4


In [130]:
cos_sim = pairwise.cosine_similarity(tf_idf_matrix[61345], tf_idf_matrix)

In [155]:
cos_sim.shape

(1, 61346)

In [156]:
print(cos_sim[0][1])

0.14635218718590962


In [157]:
type(cos_sim)

numpy.ndarray

In [161]:
cos_sim_df = ps.DataFrame({'cos_sim':cos_sim[0,:]})

In [162]:
cos_sim_df.shape

(61346, 1)

In [163]:
cos_sim_df.head(5)

Unnamed: 0,cos_sim
0,1.0
1,0.146352
2,0.021808
3,0.16856
4,0.0


In [164]:
result = ps.concat([positionListAdded, cos_sim_df], axis=1)

In [165]:
result.shape

(61346, 3)

In [166]:
result.head(5)

Unnamed: 0,Alternate_Title,index_col,cos_sim
0,Aeronautics Commission Director,0,1.0
1,Agricultural Services Director,1,0.146352
2,Alcohol and Drug Abuse Assistance Program Admi...,2,0.021808
3,Arts and Humanities Council Director,3,0.16856
4,Bakery Manager,4,0.0


In [167]:
result_sorted = result.sort_values(by=['cos_sim'], ascending=False)

In [168]:
result_sorted.head(5)

Unnamed: 0,Alternate_Title,index_col,cos_sim
0,Aeronautics Commission Director,0,1.0
61345,Aeronautics Commission Director,61345,1.0
12411,Aeronautics Teacher,12411,0.614716
58190,Aeronautical Inspector,58190,0.563063
6301,Aeronautical Engineer,6301,0.522373


In [172]:
result_sorted.loc[result_sorted['cos_sim'] < 1].head(1)['Alternate_Title']

12411    Aeronautics Teacher
Name: Alternate_Title, dtype: object