<h3>Imports</h3>

In [1]:
import logging
import re #for data preprocessing and string match

import gensim 
import nltk
import numpy
import pandas as pd
import spacy
from fuzzywuzzy import process, fuzz #for fuzzy string matching
from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import common_texts, get_tmpfile



<h3> Data Munging </h3>

In [2]:
# Read the cleaned feedback CSV as raw text lines (one list item per line,
# trailing newlines included); the file is closed when the block exits.
with open('feedback-clean.csv', 'r') as feedback:
    mylist = feedback.readlines()

In [3]:
def to_string(text):
    """Concatenate the items of ``text`` into one string, separated by single spaces.

    ``text`` is any iterable of strings (a plain string is joined character
    by character). Returns the joined string; an empty iterable gives ``''``.
    """
    separator = ' '
    return separator.join(text)

In [4]:
def clean_file(text):
    """Replace each run of characters other than ASCII letters and apostrophes with one space.

    Digits, punctuation and whitespace (including newlines) are all part of
    such runs, so the result contains only letters, apostrophes and spaces.
    Returns the cleaned string.
    """
    unwanted_run = re.compile("[^A-Za-z']+")
    return unwanted_run.sub(' ', text)

In [5]:
# Join all lines read from the feedback file into one space-separated string.
text = to_string(mylist)

In [6]:
# Replace every run of characters other than letters and apostrophes with a single space.
text = clean_file(text)

In [7]:
# Removing non-English words: keep a token when its lower-cased form is in
# NLTK's English word list; tokens that are not purely alphabetic are kept as-is.
words = set(nltk.corpus.words.words())
sent = text
text_en = " ".join(
    filter(
        lambda tok: not tok.isalpha() or tok.lower() in words,
        nltk.wordpunct_tokenize(sent),
    )
)

In [8]:
# Split the filtered text into sentences, then each sentence into word tokens.
# NOTE(review): clean_file has already replaced every character except letters
# and apostrophes, so sentence-ending punctuation is gone by this point --
# confirm that sent_tokenize still yields more than one sentence.
sentences = list(map(nltk.word_tokenize, nltk.sent_tokenize(text_en)))

### Word Embedding -> GloVe + Word2Vec

#### Downloading pre-trained GloVe embeddings for better accuracy 

In [None]:
!wget 'http://nlp.stanford.edu/data/glove.840B.300d.zip'

In [None]:
!unzip 'glove.840B.300d.zip'

#### Saving the GloVe embeddings in Gensim KeyedVectors format 

In [13]:
# Convert the GloVe text file into the word2vec text format, writing the
# result to gensim_glove_vectors.txt for loading in the next step.
glove2word2vec(
    glove_input_file="glove.840B.300d.txt",
    word2vec_output_file="gensim_glove_vectors.txt",
)

2020-09-01 10:45:08,516 : INFO : converting 2196017 vectors from glove.840B.300d.txt to gensim_glove_vectors.txt


(2196017, 300)

In [29]:
# Load the converted vectors (plain-text word2vec format, hence binary=False)
# through gensim's KeyedVectors loader.
glove_model = KeyedVectors.load_word2vec_format(
    "gensim_glove_vectors.txt",
    binary=False,
)

2020-09-01 11:01:47,029 : INFO : loading projection weights from gensim_glove_vectors.txt
2020-09-01 11:11:17,732 : INFO : duplicate words detected, shrinking matrix size from 2196017 to 2196016
2020-09-01 11:11:17,733 : INFO : loaded (2196016, 300) matrix from gensim_glove_vectors.txt


<h3> Skills </h3>

In [9]:
# Load the skills table from skills-en.csv into a DataFrame.
df = pd.read_csv('skills-en.csv')

In [53]:
# A list of skills: the non-null values of the 'name' column.
skills = list(df['name'].dropna())

In [None]:
# Load train.csv into a DataFrame.
# NOTE(review): `train` is not referenced by any other cell visible in this
# notebook -- confirm whether it was meant to feed the training step.
train = pd.read_csv('train.csv')

### Training on custom corpus 

In [11]:
# Configure the root logger to emit timestamped records at INFO level and
# above, in the "time : level : message" layout.
# `logging` is imported with the other modules in the imports cell at the top
# of the notebook; gensim was already imported there, so it is not re-imported here.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)
 

In [None]:
# Training a model on our own corpus + a Kaggle employee review dataset.
# NOTE(review): only `sentences` is passed to this call; the `train` DataFrame
# read from train.csv is not used here -- confirm the Kaggle data is included.
# NOTE(review): glove_model was created with KeyedVectors.load_word2vec_format;
# verify that this object exposes train() in the installed gensim version.
glove_model.train(sentences, total_examples = len(sentences), epochs = 100)

In [62]:
# Example sentence used to demonstrate the skill matching below.
sample = 'This guy is good at wordpress'

In [65]:
# Extract the word tokens from the sample and capitalise each one (first
# character upper-case, the rest lower-case) before comparing with skill names.
words = [token.capitalize() for token in re.findall(r'\w+', sample)]

In [66]:
#string matching :
# Report every word of the sample that exactly equals a known skill, together
# with its nearest neighbours in the embedding space.
skill_set = set(skills)  # constant-time lookup instead of scanning the whole skill list per word
for word in words:
    if word not in skill_set:
        continue
    print(f'Skill found : {word}\n')
    # most_similar raises KeyError for a word with no vector, so check membership first.
    if word in glove_model:
        print(f'Related to this skill :\n   {glove_model.most_similar(word)}\n')
    else:
        print(f'Related to this skill :\n   (no embedding available for {word!r})\n')

Skill found : Wordpress

Related to this skill :
   [('WordPress', 0.9231514930725098), ('wordpress', 0.7973741888999939), ('Joomla', 0.7858673930168152), ('Plugin', 0.6961660981178284), ('Drupal', 0.6871516704559326), ('WordPress.com', 0.6838994026184082), ('Blogger', 0.6580688953399658), ('WP', 0.6452423334121704), ('Plugins', 0.6435956954956055), ('Magento', 0.641252875328064)]

