In [793]:
import pandas as pd
import numpy as np
import spacy
import math
import os
import time
import gensim
import sys
import subprocess
import json
import tika
import re
import docx
from spacy.language import Language
from tika import parser
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Function to load Default spacy nlp model and starting the tika server

In [794]:
def loadDefaultNLP(is_big: bool = True):
    """
    Function to load the default SpaCy nlp model into nlp
    :Input is_big: if True, uses a large vocab set, else a small one
    :returns: nlp: a SpaCy nlp model
    """

    @Language.component("segment_on_newline")
    def segment_on_newline(doc):
        for token in doc[:-1]:
            if token.text.endswith("\n"):
                doc[token.i + 1].is_sent_start = True
        return doc

    if is_big:
        nlp = spacy.load("en_core_web_lg")
    else:
        nlp = spacy.load("en_core_web_sm")

    Language.component("segment_on_newline",func=segment_on_newline)
    nlp.add_pipe("segment_on_newline",before="parser")
    return nlp

In [795]:
nlp = loadDefaultNLP(False)
tika.initVM()
from tika import parser
print(nlp)

<spacy.lang.en.English object at 0x000001CD048FF550>


# Get PDF/Doc names from the input directories where resumes are stored

In [796]:
def findDocumentsRecursive(base_dir):
    """
    Recursively get all documents from `base_dir`
    :Input base_dir: base directory of documents
    :returns out: a list of full file names of the documents
    """
    out = []

    # check if base_dir is a proper dir
    if not os.path.isdir(base_dir):
        return None

    for d in os.listdir(base_dir):
        full_path = os.path.join(base_dir, d)
        if os.path.isdir(full_path):
            out.extend(findDocumentsRecursive(full_path))
        else:
            for end in (".pdf", ".docx"):
                if full_path.endswith(end):
                    out.append(full_path)
                    
    #print(out)
    return out


In [797]:
pdfs = findDocumentsRecursive(r"D:\CV\original cv")

In [798]:
len(pdfs)

113

In [799]:
skills = []
with open(r"D:\Projects\Untitled Folder\skills.txt.txt",'r') as fin:
    skills.append(fin.readline().split(','))

#print(skills)

In [800]:
def getDocxText(filename):
    """
    Get the text from a docx file
    :param filename: docx file
    :returns fullText: text of file
    """
    doc = docx.Document(filename)
    fullText = []
    for para in doc.paragraphs:
        txt = para.text
        fullText.append(txt)
    return "\n".join(fullText)


def getPDFText(filename, parser):
    """
    Get the text from a pdf file
    :param filename: pdf file
    :param parser: pdf parser
    :returns fullText: text of file
    """
    raw = parser.from_file(filename)
    #print("Raw is ", raw)
    #print("Content is ", raw["content"])
    #print("Metadata is ", raw["metadata"])
    new_text = raw["content"]
    if "title" in raw["metadata"]:
        title = raw["metadata"]["title"]
        new_text = new_text.replace(title, "")
        
    return new_text


In [801]:
def loadDocumentIntoSpacy(f, parser, spacy_nlp):
    """
    Convert file into spacy Document
    :param f: filename
    :param parser: pdf_parser
    :param spacy_nlp: nlp model
    :returns nlp_doc: nlp doc
    :returns new_text: text of file
    """
    if f.endswith(".pdf"):
        new_text = getPDFText(f, parser)
    elif f.endswith(".docx"):
        new_text = getDocxText(f)
    else:
        return None, None
    
    new_text = re.sub("\n{3,}", "\n", new_text)  #Replaces 3 empty lines with 1
    new_text = str(bytes(new_text, "utf-8").replace(b"\xe2\x80\x93", b""), "utf-8")      #Replaces - in the text with ""
    
    #print(spacy_nlp(new_text))
    #print("\n",new_text)
    # convert to spacy doc
    return spacy_nlp(new_text),new_text


In [802]:
def getAllTokensAndChunks(doc):
    """
    Converts a spacy doc into tokens and chunks
    :Input doc: a SpaCy doc
    :returns: seen_chunks_words: set of strings seen
    :returns: all_tokens_chunks: set of all tokens and chunks found
    """
    # used to test duplicate words/chunks
    seen_chunks_words = set()
    # collect all words/chunks
    all_tokens_chunks = set()
    # generate all 1-gram tokens
    for token in doc:
        w = token.lemma_.lower()
        if (w not in seen_chunks_words):
            all_tokens_chunks.add(token)
            seen_chunks_words.add(w)

    # generate all n-gram tokens
    #print("\n Doc noun is ",doc.noun_chunks)
    for chunk in doc.noun_chunks:
        c = chunk.lemma_.lower()
        #print("\n Smaller chunk is", c)
        if (
            len(chunk) > 1
            and (c not in seen_chunks_words)
        ):
            all_tokens_chunks.add(chunk)
            seen_chunks_words.add(c)

    return seen_chunks_words, all_tokens_chunks


# Creating data from the fetched details

In [803]:
print("Getting resume tokens and chunks...")
pdf_data = defaultdict(list)
for p in pdfs:
    # convert to spacy doc
    doc, _ = loadDocumentIntoSpacy(p, parser, nlp)
    seen_chunks_words, all_tokens_chunks = getAllTokensAndChunks(doc)
    pdf_data["Category"].append(os.path.basename(os.path.dirname(p)))
    seen_chunks_words.update(all_tokens_chunks)
    pdf_data["Resume"].append(list(seen_chunks_words))
    print(".",end ="")
       
pdf_df = pd.DataFrame(data=pdf_data)
pdf_df["Resume"] =  [' '.join(map(str, l)) for l in pdf_df["Resume"]]
print("\n Data Populated")


Getting resume tokens and chunks...
.................................................................................................................
 Data Populated


In [807]:
pdf_df.to_csv(r'D:\Projects\Untitled Folder\our_data.csv',index=False)

In [808]:
more_data = pd.read_csv(r"D:\Projects\Untitled Folder\UpdatedResumeDataSet.csv")

In [809]:
df = pdf_df.append(more_data, ignore_index=True)

In [810]:
df["Resume"][67]

''

In [811]:
df = df.dropna().reset_index()

In [812]:
for i in range(len(df)):
    review = re.sub('http[s]?://\S+', '', str(df["Resume"][i]))
    review = review.lower()
    review = re.sub('[^a-zA-Z&.]', ' ', review)
    df["Resume"][i] = review

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Resume"][i] = review


In [813]:
skills = []
with open("skills.txt.txt",'r') as f:
   # perform file operations
    for word in f.readline().strip().split(','):
        word=re.sub('[^a-zA-Z]', ' ', str(word))
        skills.append(word)
        
skills_updated=[]
for word in skills:
    new_words=word.split()
    skills_updated.append(new_words)

print('************************************* Skill reading done *************************************************************')

final_words=[]
df["Review"] = ''
for i in range(len(df)):
    for word in skills_updated:
        concat_word = ""
        for j in word:
            if j in df["Resume"][i]:
                concat_word = concat_word + j  
                
            #print("Concat word is ",concat_word)   
        if concat_word != "":
            final_words.append(concat_word)
            df["Review"][i] = df["Review"][i]+" "+concat_word

with open('intermediate.txt', 'w') as f:
    for item in final_words:
        f.write("%s\n" % item)



************************************* Skill reading done *************************************************************


UFuncTypeError: ufunc 'add' did not contain a loop with signature matching types (dtype('float64'), dtype('<U1')) -> None

In [780]:
df = df[["Category","Resume","Review"]]

In [781]:
df_bkp = df.copy()

In [782]:
for i in range(len(df)):
    df["Review"][i]=' '.join(map(str,[i for i in list(set(df['Review'][i].split(' '))) if len(i) > 2]))

In [783]:
df.head(5)

Unnamed: 0,Category,Resume,Review
0,Data Engineer,data medical linkedin months month institu...,data tech cprogramminglanguage api machinelear...
1,Data Engineer,data an linkedin scientist adept university...,data tech manage managementexperience microsof...
2,Data Engineer,data to linkedin fruitwala a data engineer up ...,googleapi data lean center api shell services ...
3,Data Engineer,my education linkedin analysis implement boo...,data flow elastic analysis python eve start an...
4,Data Engineer,data desktop linkedin records month visualiz...,data you microsoft python and desktop desk exp...


In [784]:
df_bkp.to_csv(r'D:\Projects\Untitled Folder\resume_data.csv',index=False)
df.to_csv(r'D:\Projects\Untitled Folder\no_duplicates_data.csv',index=False)

In [785]:
df=df[["Category","Review"]]

In [None]:
df["Review"].isna().sum()

In [None]:
df.dropna().reset_index()

In [786]:

df['Rating'] = 0
for i in range(len(df)):
    count = 0
    text = df["Review"][i].split()
    for word in range(len(text)):
        if text[word] in skills:
            count+=1
    cal = (count/len(skills)) * 200
    if (cal) > 5:
        rating = 5
    elif cal < 1:
        rating = 1
    else:
        rating = cal
    df["Rating"][i] = rating

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Rating"][i] = rating


In [787]:
df.head(5)

Unnamed: 0,Category,Review,Rating
0,Data Engineer,data tech cprogramminglanguage api machinelear...,1
1,Data Engineer,data tech manage managementexperience microsof...,1
2,Data Engineer,googleapi data lean center api shell services ...,1
3,Data Engineer,data flow elastic analysis python eve start an...,4
4,Data Engineer,data you microsoft python and desktop desk exp...,2


In [752]:
df["Rating"].value_counts()

5    323
1    321
3    205
2    133
4     93
Name: Rating, dtype: int64

In [753]:
df.to_csv(r'D:\Projects\Untitled Folder\final_data.csv',index=False)

In [754]:
df = pd.read_csv(r'D:\Projects\Untitled Folder\final_data.csv')

In [755]:
df.head()

Unnamed: 0,Category,Review,Rating
0,Data Engineer,data tech cprogramminglanguage api machinelear...,1
1,Data Engineer,data tech manage managementexperience microsof...,1
2,Data Engineer,googleapi data lean center api shell services ...,1
3,Data Engineer,data flow elastic analysis python eve start an...,4
4,Data Engineer,data you microsoft python and desktop desk exp...,2
