In [2]:
#loading the dataset and analysis the field of the dataset

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from sklearn import metrics
from sklearn.metrics import accuracy_score

resumeDataSet = pd.read_csv('UpdatedResumeDataSet.csv', encoding = 'utf-8')
resumeDataSet.head()

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


In [4]:
#Idendtifying all the resume categories in the dataset

print ("Displaying the distinct categories of resume -")
print (resumeDataSet['Category'].unique())

Displaying the distinct categories of resume -
['Data Science' 'HR' 'Advocate' 'Arts' 'Web Designing'
 'Mechanical Engineer' 'Sales' 'Health and fitness' 'Civil Engineer'
 'Java Developer' 'Business Analyst' 'SAP Developer' 'Automation Testing'
 'Electrical Engineering' 'Operations Manager' 'Python Developer'
 'DevOps Engineer' 'Network Security Engineer' 'PMO' 'Database' 'Hadoop'
 'ETL Developer' 'DotNet Developer' 'Blockchain' 'Testing']


In [3]:
#Displaying the distinct categories of resume and the number of records belonging to each category -

print ("Displaying the distinct categories of resume and the number of records belonging to each category -")
print (resumeDataSet['Category'].value_counts())

Displaying the distinct categories of resume and the number of records belonging to each category -
Java Developer               84
Testing                      70
DevOps Engineer              55
Python Developer             48
Web Designing                45
HR                           44
Hadoop                       42
Mechanical Engineer          40
ETL Developer                40
Blockchain                   40
Data Science                 40
Operations Manager           40
Sales                        40
Arts                         36
Database                     33
Health and fitness           30
Electrical Engineering       30
PMO                          30
Business Analyst             28
DotNet Developer             28
Automation Testing           26
Network Security Engineer    25
SAP Developer                24
Civil Engineer               24
Advocate                     20
Name: Category, dtype: int64


In [4]:
#pre processing function to clean reusme removing URL,HASH TAGS, PUNCTUATIONS,etc

import re
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

def cleanResume(resumeText):
      
    lemmatizer = WordNetLemmatizer()
    stemming = PorterStemmer()
    resumeText = resumeText.lower()
    resumeText = re.sub('http\S+\s*', ' ', resumeText)  # remove URLs
    resumeText = re.sub('RT|cc', ' ', resumeText)  # remove RT and cc
    resumeText = re.sub('#\S+', '', resumeText)  # remove hashtags
    resumeText = re.sub('@\S+', '  ', resumeText)  # remove mentions
    resumeText = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', resumeText)  # remove punctuations
    resumeText = re.sub(r'[^\x00-\x7f]',r' ', resumeText)
    resumeText = re.sub(r'[0-9]',' ', resumeText) 
    resumeText = re.sub('\s+', ' ', resumeText)  # remove extra whitespace
    word_tokens = word_tokenize(resumeText) 
    resumeText = [w for w in word_tokens if not w in stopwords.words("english")]
    resumeText = ' '.join(map(str, resumeText))
    resumeText = stemming.stem(resumeText)
    resumeText = lemmatizer.lemmatize(resumeText)
    return resumeText
  


In [36]:
#Pre Processing the resumes
print("Pre processing of resume in progress.......")
resumeDataSet['cleaned_resume'] = resumeDataSet.Resume.apply(lambda x: cleanResume(x))
print("Pre-processing DONE !!!!")

Pre processing of resume in progress.......
Pre-processing DONE !!!!


In [34]:
#Making tf idf vectors and also splitting the data to train and test

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
import pickle

requiredText = resumeDataSet['cleaned_resume'].values
requiredTarget = resumeDataSet['Category'].values

word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    stop_words='english',
    max_features=1500)
word_vectorizer.fit(requiredText)
WordFeatures = word_vectorizer.transform(requiredText)
with open('tf_idf', 'wb') as tfidf:
  pickle.dump(word_vectorizer, tfidf)


print ("Feature completed .....")

X_train,X_test,y_train,y_test = train_test_split(WordFeatures,requiredTarget,random_state=0, test_size=0.2)
print(X_train.shape)
print(X_test.shape)

Feature completed .....
(769, 1500)
(193, 1500)


In [35]:
# Training the SVM classifier model
# Printing the classification report and accuracy

from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train, y_train)
y_pred = svclassifier.predict(X_test)
print(classification_report(y_test,y_pred))
print("Accuracy:",svclassifier.score(X_test, y_test))

                           precision    recall  f1-score   support

                 Advocate       1.00      1.00      1.00         3
                     Arts       1.00      1.00      1.00         3
       Automation Testing       1.00      1.00      1.00         5
               Blockchain       1.00      1.00      1.00         9
         Business Analyst       1.00      1.00      1.00         6
           Civil Engineer       1.00      1.00      1.00         5
             Data Science       1.00      1.00      1.00         9
                 Database       1.00      1.00      1.00         7
          DevOps Engineer       1.00      1.00      1.00        11
         DotNet Developer       1.00      1.00      1.00         9
            ETL Developer       1.00      1.00      1.00         8
   Electrical Engineering       1.00      1.00      1.00         9
                       HR       1.00      1.00      1.00         5
                   Hadoop       1.00      1.00      1.00     

In [28]:
#Saving the trained model in the file so can be used in the other program to cateogrise the cv in computer and finding relevant cv basis of job description

import joblib

joblib.dump(svclassifier,'svm_trained_model')

['svm_trained_model']

In [29]:
# Training the Logistic Regression model
# Printing the classification report and accuracy

from sklearn.linear_model import LogisticRegression

Logistic_model = LogisticRegression()
Logistic_model = Logistic_model.fit(X_train,y_train)
y_preds = Logistic_model.predict(X_test)
print(classification_report(y_test,y_pred))
print("Accuracy:",Logistic_model.score(X_test,y_test))


                           precision    recall  f1-score   support

                 Advocate       1.00      1.00      1.00         3
                     Arts       1.00      1.00      1.00         3
       Automation Testing       1.00      1.00      1.00         5
               Blockchain       1.00      1.00      1.00         9
         Business Analyst       1.00      1.00      1.00         6
           Civil Engineer       1.00      1.00      1.00         5
             Data Science       1.00      1.00      1.00         9
                 Database       1.00      1.00      1.00         7
          DevOps Engineer       1.00      1.00      1.00        11
         DotNet Developer       1.00      1.00      1.00         9
            ETL Developer       1.00      1.00      1.00         8
   Electrical Engineering       1.00      1.00      1.00         9
                       HR       1.00      1.00      1.00         5
                   Hadoop       1.00      1.00      1.00     

In [31]:
#Saving the trained model in the file so can be used in the other program to cateogrise the cv in computer and finding relevant cv basis of job description

import joblib

joblib.dump(Logistic_model,'Logistic_trained_model')


['Logistic_trained_model']