In [7]:
import pandas as pd
import re

In [3]:
resume_df = pd.read_csv('UpdatedResumeDataSet.csv')
resume_df.head

<bound method NDFrame.head of          Category                                             Resume
0    Data Science  Skills * Programming Languages: Python (pandas...
1    Data Science  Education Details \nMay 2013 to May 2017 B.E  ...
2    Data Science  Areas of Interest Deep Learning, Control Syste...
3    Data Science  Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4    Data Science  Education Details \n MCA   YMCAUST,  Faridabad...
..            ...                                                ...
957       Testing  Computer Skills: â¢ Proficient in MS office (...
958       Testing  â Willingness to accept the challenges. â ...
959       Testing  PERSONAL SKILLS â¢ Quick learner, â¢ Eagerne...
960       Testing  COMPUTER SKILLS & SOFTWARE KNOWLEDGE MS-Power ...
961       Testing  Skill Set OS Windows XP/7/8/8.1/10 Database MY...

[962 rows x 2 columns]>

In [24]:
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import split_on_space
from gensim.parsing.preprocessing import strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_non_alphanum
from gensim.parsing.preprocessing import strip_punctuation


def clean_strings(data):
    data = remove_stopwords(data)
    data = strip_multiple_whitespaces(data)
    data = strip_non_alphanum(data)
    data = strip_punctuation(data)
    data = split_on_space(data.lower())
    return data

In [17]:
x = "Better late than never, but the never late"
y = re.sub(r'\s+', '_', x)
y

'Better_late_than_never,_but_the_never_late'

In [33]:
def read_and_clean_data(filename):
    resume_df = pd.read_csv(filename)
    column_names = ["Category", "Resume"]
    # all_data = {}
    categories = []
    resumes = []
    for i, row in resume_df.iterrows():
        category = row[column_names[0]].lower()
        category = re.sub(r'\s+','_',category)
        resume = row[column_names[1]]
        resume = clean_strings(resume)
        categories.append(category)
        resumes.append(resume)
        # print(category)
        # print(resume)
        # break
        # if category in all_data.keys():
        #     all_data[category].append(resume)
        # else:
        #     all_data[category] = [resume]
    return categories, resumes

In [None]:
from collections import Counter
# Counter(all_data.keys())
categories, resumes = read_and_clean_data('UpdatedResumeDataSet.csv') 
print(resumes[0])

In [38]:
from sklearn.model_selection import train_test_split

# train test split
def split_training_set(jds, labels, test_size=0.3, random_seed=42):
    X_train, X_test, y_train, y_test = train_test_split(jds, labels, test_size=test_size, random_state=random_seed, stratify=labels)
    print("Training set label counts: {}".format(Counter(y_train)))
    print("Test set     label counts: {}".format(Counter(y_test)))
    return X_train, X_test, y_train, y_test

In [39]:
X_train, X_test, y_train, y_test = split_training_set(resumes, categories)


Training set label counts: Counter({'java_developer': 59, 'testing': 49, 'devops_engineer': 38, 'python_developer': 34, 'hr': 31, 'web_designing': 31, 'hadoop': 29, 'sales': 28, 'etl_developer': 28, 'data_science': 28, 'blockchain': 28, 'operations_manager': 28, 'mechanical_engineer': 28, 'arts': 25, 'database': 23, 'health_and_fitness': 21, 'electrical_engineering': 21, 'pmo': 21, 'business_analyst': 20, 'dotnet_developer': 20, 'automation_testing': 18, 'sap_developer': 17, 'network_security_engineer': 17, 'civil_engineer': 17, 'advocate': 14})
Test set     label counts: Counter({'java_developer': 25, 'testing': 21, 'devops_engineer': 17, 'python_developer': 14, 'web_designing': 14, 'hr': 13, 'hadoop': 13, 'mechanical_engineer': 12, 'data_science': 12, 'blockchain': 12, 'etl_developer': 12, 'sales': 12, 'operations_manager': 12, 'arts': 11, 'database': 10, 'pmo': 9, 'electrical_engineering': 9, 'health_and_fitness': 9, 'business_analyst': 8, 'dotnet_developer': 8, 'network_security_en

In [40]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

def tag_document(list_of_list_of_words):
    for i, list_of_words in enumerate(list_of_list_of_words):
        yield TaggedDocument(list_of_words, [i])

In [41]:
def train_model(data):
    
    # tagged_data = [TaggedDocument(words = word_tokenize(_d.lower()), tags = [str(i)]) for i, _d in enumerate(data)]

    tagged_data = list(tag_document(data))
    # initialize model
    model = Doc2Vec(vector_size=50, min_count=10, epochs=50)
    # vocabulary building
    model.build_vocab(tagged_data)
   
    print(type(tagged_data))
    model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
    model_name = 'doc2vec_resumes_trained_model'
    model.save(model_name)
    print("Model saved")
    return model_name

In [42]:
# train model
model_name = train_model(X_train)


<class 'list'>
Model saved


In [43]:
# get model
# model _2 is trained with larger dataset
baseline_model = Doc2Vec.load(model_name)

In [44]:

training_data = list(tag_document(X_train))

In [48]:
ranks = []
second_ranks = []
for doc_id in range(len(training_data)):
    inferred_vector = baseline_model.infer_vector(training_data[doc_id].words)
    sims = baseline_model.dv.most_similar([inferred_vector], topn=len(baseline_model.dv))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])

counter = Counter(ranks)
print(counter)

Counter({0: 161, 1: 151, 2: 133, 3: 84, 4: 61, 5: 41, 6: 16, 8: 11, 7: 6, 12: 4, 9: 2, 10: 2, 11: 1})


In [49]:
print('Document ({}): «{}»\n'.format(doc_id, ' '.join(training_data[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % baseline_model)
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(training_data[sims[index][0]].words)))

Document (672): «computer skills â proficient ms office word basic excel power point strength â hard working loyalty creativity â self motivated responsible initiative â good people management skill positive attitude â knowledge windows internet education details bachelor electrical engineering electrical engineering nashik maharashtra guru gobind singh college engineering research centre diploma electrical engineering nashik maharashtra s m e s polytechnic college testing engineer skill details excel exprience 6 months ms office exprience 6 months word exprience 6 monthscompany details company description department testing responsibilities â to check acb vcb circuit breaker â following test conducted circuit breaker drawing 1 to check breaker timing 2 to check contact resistance contact resistance meter crm 3 to check breaker insulation resistance ir 4 to check breaker rack rack properly not 5 to check closing tripping operation work properly not â to check following test conducted m

In [50]:
cat_dict_test_clean = {cat: [] for cat in y_test}
# print(cat_dict_test_clean)
for i, v in enumerate(X_test):
    cat_dict_test_clean[y_test[i]].append(v)

In [51]:
metadata = {}
infered_vector_test = {} # contains, categor-wise, inferred doc vecs for each document in the test set
for cat, docs in cat_dict_test_clean.items():
    infered_vector_test[cat] = [baseline_model.infer_vector(doc) for doc in list(docs)]
    metadata[cat] = len(infered_vector_test[cat])
print(infered_vector_test['sales'][0])
print(metadata)

[-0.13070385 -1.949679   -0.44304395 -1.211238   -0.844497   -0.22970463
 -0.33146343  3.1134596  -2.2058005   1.0508901   0.5033362   0.42310044
 -0.3836528   1.159766    0.5759378   0.43354654  1.8294128   1.161793
 -0.6728141  -0.25289503  0.6305693   0.40217894  0.31130546  0.43573642
  0.18783663  0.88018817 -1.0197024  -0.40984538 -0.05040639  0.31810322
  1.249868   -0.25906286  0.89540213  1.3869416   0.7958478   0.17205253
 -0.08033868 -0.9815042  -0.335269    2.3027966   0.64444304  1.361534
 -2.0009284  -2.0399024   1.0825485  -0.9149722   0.34574512 -1.8197381
  0.4148944   0.7025619 ]
{'python_developer': 14, 'business_analyst': 8, 'hr': 13, 'web_designing': 14, 'devops_engineer': 17, 'mechanical_engineer': 12, 'pmo': 9, 'database': 10, 'data_science': 12, 'dotnet_developer': 8, 'java_developer': 25, 'hadoop': 13, 'network_security_engineer': 8, 'advocate': 6, 'sap_developer': 7, 'civil_engineer': 7, 'electrical_engineering': 9, 'arts': 11, 'automation_testing': 8, 'blockc

In [52]:
import csv

def write_to_csv(input, output_file, delimeter='\t'):
    with open(output_file, "w") as f:
        writer = csv.writer(f, delimiter=delimeter)
        writer.writerows(input)
        
veclist_metadata = []
veclist = []

for cat in cat_dict_test_clean.keys():
    for tag in [cat]*metadata[cat]:
        veclist_metadata.append([tag])
    for vec in infered_vector_test[cat]:
        veclist.append(list(vec))
        
write_to_csv(veclist, "doc2vec_hf_resumes_test_vectors.csv")
write_to_csv(veclist_metadata, "doc2vec_hf_resumes_test_vectors_metadata.csv")