## Resume Screening with Python

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import re
warnings.filterwarnings('ignore')
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
from pandas.plotting import scatter_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

In [None]:
# Load the resume CSV into a DataFrame and add an empty 'cleaned_resume'
# text column, then preview the first ten rows.
dataset_path = "UpdatedResumeDataSet.csv"
resumeDataset = pd.read_csv(dataset_path)
resumeDataset['cleaned_resume'] = ''
resumeDataset.head(10)

In [None]:
# List every distinct value found in the 'Category' column, under a header line.
distinct_categories = resumeDataset['Category'].unique()
print("Categories of the resume - ", distinct_categories, sep="\n")

In [None]:
# Show how many rows fall under each 'Category' value, under a header line.
records_per_category = resumeDataset['Category'].value_counts()
print("Categories of resume and the number of records belongs to - ", records_per_category, sep="\n")

## Visualize the number of resumes in each Category

In [None]:
# Open a tall 15x35-inch figure for the per-category count plot.
plt.figure(figsize=(15, 35))
# Rotate the x-axis tick labels by 90 degrees.
plt.xticks(rotation=90)
# Count the rows of resumeDataset for each 'Category' value, with the
# categories placed along the y-axis.
sns.countplot(y="Category", data=resumeDataset)

## Visualize the distribution of the dataset

In [None]:
from matplotlib.gridspec import GridSpec
# Count resumes per category. value_counts() orders its result by descending
# frequency, whereas unique() returns values in order of first appearance, so
# the labels must come from the counts' own index to stay aligned with the
# wedge sizes passed to plt.pie().
targetCounts = resumeDataset['Category'].value_counts()
targetLabels = targetCounts.index

plt.figure(1, figsize=(25, 25))
the_grid = GridSpec(2, 2)

# Three colours sampled evenly from the 'coolwarm' colormap; pie() cycles
# through them when there are more wedges than colours.
cmap = plt.get_cmap('coolwarm')
colors = cmap(np.linspace(0, 1, 3))

# Draw the pie in the top-right cell of the 2x2 grid with a 1:1 aspect ratio.
plt.subplot(the_grid[0, 1], aspect=1, title='CATEGORY DISTRIBUTION')
source_pie = plt.pie(targetCounts, labels=targetLabels, autopct='%1.1f%%', shadow=True, colors=colors)
plt.show()

In [None]:
import re
import nltk
import string
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from wordcloud import WordCloud
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

# Fetch the NLTK data this cell relies on: the stop-word list, plus the Punkt
# tokenizer models that word_tokenize() loads. Newer NLTK releases package
# those models as 'punkt_tab' and older ones as 'punkt', so request both; an
# unknown resource name is reported by nltk.download() without raising.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

def cleanResume(resumeText):
    """Return *resumeText* with URLs, tags, punctuation and non-ASCII removed.

    The steps run in order:

    1. URLs (anything starting with ``http``) become a space.
    2. The standalone tokens ``RT`` and ``cc`` become a space. Word
       boundaries are required so that words merely containing ``cc``
       (e.g. "accounting", "success") are left intact.
    3. Hashtags (``#word``) and mentions (``@word``) become a space.
    4. Each listed punctuation character becomes a space.
    5. Non-ASCII characters are dropped.
    6. Every run of whitespace collapses to a single space.

    Args:
        resumeText: The raw resume text (a ``str``).

    Returns:
        The cleaned text. Leading/trailing single spaces are not stripped.
    """
    # Raw strings keep regex escapes such as \S from being parsed as
    # (invalid) Python string escapes.
    resumeText = re.sub(r'http\S+\s*', ' ', resumeText)
    resumeText = re.sub(r'\b(?:RT|cc)\b', ' ', resumeText)
    resumeText = re.sub(r'#\S+', ' ', resumeText)
    resumeText = re.sub(r'@\S+', ' ', resumeText)
    resumeText = re.sub('[%s]' % re.escape("""!""#@&'()*+,-./;:<=>?@^_{}|~"""), ' ', resumeText)
    resumeText = re.sub(r'[^\x00-\x7f]', r'', resumeText)
    resumeText = re.sub(r'\s+', ' ', resumeText)
    return resumeText

# Clean every resume, then pool the cleaned text into one corpus string.
resumeDataset['cleaned_resume'] = resumeDataset['Resume'].apply(cleanResume)
cleanedSentences = ' '.join(resumeDataset['cleaned_resume'])

# English stop words plus the quote tokens that word_tokenize emits.
oneSetofStopWords = set(stopwords.words('english') + ['``', "''"])
requirementWords = word_tokenize(cleanedSentences)

# NLTK's English stop-word list is lowercase, so compare the lowercased token;
# otherwise capitalised stop words ("The", "In", "And") slip through and
# dominate the frequency table. The kept tokens retain their original case.
filteredWords = [
    word
    for word in requirementWords
    if word.lower() not in oneSetofStopWords and word not in string.punctuation
]

# Report the 50 most frequent remaining tokens.
wordfreqdist = FreqDist(filteredWords)
mostcommon = wordfreqdist.most_common(50)
print(mostcommon)

# Render a word cloud of the whole cleaned corpus.
wc = WordCloud().generate(cleanedSentences)
plt.figure(figsize=(10, 10))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the textual labels in each listed column with integer class ids,
# using a single shared LabelEncoder instance.
le = LabelEncoder()
var_mod = ['Category']
for column_name in var_mod:
    resumeDataset[column_name] = le.fit_transform(resumeDataset[column_name])

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

requiredText = resumeDataset['cleaned_resume'].values
requiredTarget = resumeDataset['Category'].values

# Split the raw text BEFORE fitting the vectorizer, so the vocabulary and IDF
# weights are learned from the training resumes only and no information from
# the held-out set leaks into the features. The split depends only on the
# number of samples and random_state, so the same rows land in train/test as
# when the pre-computed feature matrix was split.
text_train, text_test, y_train, y_test = train_test_split(
    requiredText, requiredTarget, random_state=0, test_size=0.2
)

word_vectorizer = TfidfVectorizer(sublinear_tf=True, stop_words='english', max_features=1500)
word_vectorizer.fit(text_train)
x_train = word_vectorizer.transform(text_train)
x_test = word_vectorizer.transform(text_test)

# Features for the full corpus, kept under the original name for any later
# cell that reads it.
WordFeatures = word_vectorizer.transform(requiredText)

print("Feature completed.............")

print(x_train.shape)
print(x_test.shape)

In [None]:
# Train a one-vs-rest wrapper around k-nearest-neighbours (default settings)
# on the training features, then predict the held-out set.
clf = OneVsRestClassifier(KNeighborsClassifier())
clf.fit(x_train, y_train)
prediction = clf.predict(x_test)

# Mean accuracy on both splits (message typo "ontesting" corrected).
train_accuracy = clf.score(x_train, y_train)
test_accuracy = clf.score(x_test, y_test)
print(f'Accuracy of the classifier on training set: {train_accuracy:.2f}')
print(f'Accuracy of the classifier on testing set: {test_accuracy:.2f}')

# Per-class precision / recall / F1 for the held-out predictions.
report = metrics.classification_report(y_test, prediction)
print(f"\n Classification Report for Classifier {clf}:\n{report}\n")