# IMPORTING PACKAGES AND LOADING DATA

In [2]:
#Import the required packages

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O
import seaborn as sns # visualization
import matplotlib.pyplot as plt
import string
import re

'''Features'''
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.feature_selection import chi2
import nltk #importing the natural language toolkit
from nltk.corpus import stopwords

'''Classifiers'''
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

'''Metrics/Evaluation'''
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from wordcloud import WordCloud # Generating word cloud as per the frequency of occurrence.

'''Plotting'''
from sklearn.pipeline import Pipeline
from matplotlib.gridspec import GridSpec

import warnings
warnings.filterwarnings('ignore')  # "error", "ignore", "always", "default", "module" or "once"


In [3]:
# Read the resume dataset from the Kaggle input directory and preview it.
RESUME_CSV_PATH = '/kaggle/input/resumedata/UpdatedResumeDataSet.csv'
data = pd.read_csv(RESUME_CSV_PATH)
data.head()

# EXPLORATORY DATA ANALYSIS

In [4]:
# Distinct job roles present in the dataset.
print(pd.unique(data['Category']))

In [5]:
# Number of resumes filed under each role.
print(data.Category.value_counts())

In [6]:
# Total number of resumes in the dataset.
print(data['Category'].size)

In [7]:
# Horizontal bar chart of the number of resumes per category.
# The former bare plt.xticks() call only read the current ticks and had no effect,
# so it is dropped; the figure now carries a title and ends with plt.show() so the
# Axes repr is not echoed as cell output.
fig, ax = plt.subplots(figsize=(12, 12))
sns.countplot(data=data, y='Category', palette="icefire", ax=ax)
ax.set_title('Number of resumes per category')
ax.set_xlabel('Number of resumes')
plt.show()

In [8]:
# Pie chart of the share of resumes in each category.
Counts = data['Category'].value_counts()
# NOTE: unique() lists categories in order of first appearance, which is not the
# frequency order of `Counts`. It is kept only because later cells read `Labels`;
# it must not be used to label `Counts`.
Labels = data['Category'].unique()

plt.figure(1, figsize=(25,25))
grid = GridSpec(2,2)

cmap = plt.get_cmap('spring')
# One colour per slice instead of a 3-colour list recycled across all slices.
color = [cmap(i) for i in np.linspace(0, 1, len(Counts))]
plt.subplot(grid[0,1], aspect = 1, title='CATEGORY DISTRIBUTION')

# Slice names come from the index of `Counts`, so every name sits on its own count.
source_pie = plt.pie(Counts, labels = Counts.index, autopct = '%1.2f%%', colors = color)
plt.show()

In [9]:
# Line plot of the number of resumes per category (most frequent first).
Counts = data['Category'].value_counts()
# Kept for later cells; in order of first appearance, so NOT aligned with `Counts`.
Labels = data['Category'].unique()

plt.figure(1, figsize=(25,25))

# Plot each count against its own category name (the index of `Counts`).
plt.plot(Counts.index, Counts.values)
plt.xticks(rotation=90)
plt.xlabel('Category')
plt.ylabel('Number of resumes')
plt.title('Resumes per category')
plt.show()

# TEXT PREPROCESSING

In [10]:
# Characters that CleanResume strips from the text.
print(f"{string.punctuation}")

In [11]:
def CleanResume(new):
    """Normalise raw resume text before vectorisation.

    Steps, in order: lower-case, strip ASCII punctuation, drop tokens that
    start with ``http`` (URL remnants), drop the standalone tokens ``rt`` and
    ``cc``, replace non-ASCII characters with a space, and collapse runs of
    whitespace into a single space.

    Args:
        new: Raw resume text (``str``).

    Returns:
        The cleaned text (``str``). Leading/trailing single spaces may remain.
    """
    new = new.lower()
    new = new.translate(str.maketrans('', '', string.punctuation))
    new = re.sub(r'http\S+\s*', ' ', new)
    # Match whole tokens only. The text is already lower-cased, so the old
    # pattern 'RT|cc' could never hit 'RT' and cut 'cc' out of ordinary words
    # such as "account" or "success".
    new = re.sub(r'\b(?:rt|cc)\b', ' ', new)
    new = re.sub(r'[^\x00-\x7f]', ' ', new)
    new = re.sub(r'\s+', ' ', new)
    return new

# Store a cleaned copy of every resume next to the raw text.
data['Clean_Resume'] = data['Resume'].apply(CleanResume)

In [12]:
# Spot-check one cleaned resume (row label 6).
print(data.loc[6, 'Clean_Resume'])

In [13]:
# Corpus size: space-separated tokens summed over all cleaned resumes.
print("Total Number of words in the Cleaned Data is :",
      data['Clean_Resume'].str.split(' ').str.len().sum())

In [14]:
#Avg word count by category
def word_count(text):
    """Return the number of whitespace-separated words in ``text``.

    ``text`` is converted with ``str()`` first, so non-string values are
    counted by their string form. Splitting on any whitespace run (instead of
    a single space) means empty strings count as 0 and leading, trailing or
    repeated spaces no longer produce phantom words.
    """
    return len(str(text).split())

data['word_count'] = data['Clean_Resume'].apply(word_count)
# Select the numeric column before averaging: a bare groupby().mean() also tries
# to average the text columns, which raises TypeError on pandas >= 2.0.
avg_wc = data.groupby('Category')['word_count'].mean().reset_index()
avg_wc[['Category','word_count']]

In [15]:
# Most frequent non-stopword tokens across all cleaned resumes.
s_w = stopwords.words('english')
punctuation = string.punctuation
stop_set = set(s_w)  # set membership for the per-token filter below

totalWords = []
for sentence in data['Clean_Resume']:
    totalWords.extend(
        word for word in nltk.word_tokenize(sentence)
        if word not in stop_set and word not in punctuation
    )

# Join with a space so the last word of one resume does not fuse with the first
# word of the next (and to avoid repeated string concatenation in the loop).
cleaned_sentences = " ".join(data['Clean_Resume'])

wordDist = nltk.FreqDist(totalWords)
mostCommon = wordDist.most_common(50)
print(mostCommon)

In [16]:
# Word cloud built from the concatenated cleaned resumes.
w_c = WordCloud(colormap='hsv_r').generate(cleaned_sentences)
fig, ax = plt.subplots(figsize=(15, 15))
ax.imshow(w_c, interpolation='bilinear')
ax.axis("off")
plt.show()

In [17]:
# Encode every category name as an integer class index.
le = LabelEncoder().fit(data['Category'])
data['Category_Index'] = le.transform(data['Category'])

In [18]:
# Seeded so the preview shows the same rows on every run.
data.sample(5, random_state=0)

In [19]:
# One row per (category name, encoded index) pair, first occurrence kept.
category_unique = data.drop_duplicates(subset=['Category', 'Category_Index'])[['Category', 'Category_Index']]

In [20]:
# Lookup tables between category names and their encoded indices.
category_to_index = {name: idx for name, idx in category_unique.values}
index_to_category = {idx: name for name, idx in category_unique.values}

**Splitting the data into train and test sets**
* The original data was divided into features (X) and target (y), which were then split into train (80%) and test (20%) sets. Thus, the algorithms are trained on one set of data and evaluated on a completely different set that they have not seen before.

In [21]:
# TF-IDF turns each cleaned resume into a weighted term vector that the
# classifiers take as input.

text = data['Clean_Resume'].values.tolist()
new_text = np.array(text)
target = data['Category_Index'].values

# Split the raw text BEFORE fitting the vectorizer, so the vocabulary and IDF
# weights are learned from the training resumes only. Fitting on the whole
# corpus first leaks test-set statistics into the features.
text_train, text_test, y_train, y_test = train_test_split(text, target, random_state=0, test_size = 0.2)

vect = TfidfVectorizer(sublinear_tf = True, min_df=5, ngram_range=(1, 2), stop_words = 'english', max_features = 1500)
x_train = vect.fit_transform(text_train)
x_test = vect.transform(text_test)

# Whole corpus in the same feature space; used by the chi2 term exploration.
WordFeatures = vect.transform(text)

print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)


In [22]:
# Report the N terms most associated (chi-squared) with each resume category.
N = 2
feature_names_all = np.array(vect.get_feature_names_out())
for Category, Category_Index in sorted(category_to_index.items()):
    # One-vs-rest scores for this category; argsort is ascending, so the
    # highest-scoring terms end up at the tail of the arrays.
    features_chi2 = chi2(WordFeatures, target == Category_Index)
    indices = np.argsort(features_chi2[0])
    feature_names = feature_names_all[indices]
    unigrams = [term for term in feature_names if term.count(' ') == 0]
    bigrams = [term for term in feature_names if term.count(' ') == 1]
    top_unigrams = ', '.join(unigrams[-N:])
    top_bigrams = ', '.join(bigrams[-N:])
    print("==> %s:" %(Category))
    print("    Most Correlated Unigrams are: %s" %(top_unigrams))
    print("    Most Correlated Bigrams are: %s" %(top_bigrams))

# PRELIMINARY MODEL EVALUATION FOR DIFFERENT TYPES OF MODELS USING DEFAULT PARAMETERS

**Multi-Class Classification models**
> The classification models evaluated are:

> * K Nearest Neighbor
> * Dummy Classifier
> * Linear Support Vector Classifier
> * Stochastic Gradient Descent
> * Random Forest
> * Decision Tree
> * AdaBoost
> * Gradient Boost
> * Multinomial Naive Bayes Classifier


In [23]:
# Candidate classifiers with default hyper-parameters. A fixed seed is passed
# wherever the estimator accepts one so that reruns give the same scores.
model_dict = {'K Nearest Neighbor': KNeighborsClassifier(),
              'Dummy' : DummyClassifier(random_state=3),
              'Linear Support Vector' : LinearSVC(random_state=3),
              # 'log' was renamed to 'log_loss' in scikit-learn 1.1 and the old
              # spelling was removed in 1.3.
              'Stochastic Gradient Descent' : SGDClassifier(random_state=3, loss='log_loss'),
              'Random Forest': RandomForestClassifier(random_state=3),
              'Decision Tree': DecisionTreeClassifier(random_state=3),
              'AdaBoost': AdaBoostClassifier(random_state=3),
              'Multinomial Naive Bayes' : MultinomialNB(),
              'GradientBoost' : GradientBoostingClassifier(random_state=3)}

def model_score_data(model_dict):
    """Fit every model on the training split and score it on the test split.

    Reads the notebook-level ``x_train``, ``y_train``, ``x_test`` and
    ``y_test``. Each estimator in ``model_dict`` is fitted in place.

    Args:
        model_dict: Mapping of display name -> scikit-learn classifier.

    Returns:
        DataFrame with one row per model and the columns ``model_name``,
        ``accuracy_score``, ``precision_score``, ``recall_score`` and
        ``f1_score`` (precision, recall and F1 are macro-averaged), sorted by
        ``f1_score`` in descending order. The score columns are numeric. An
        empty ``model_dict`` yields an empty frame with the same columns.
    """
    columns = ['model_name', 'accuracy_score', 'precision_score', 'recall_score', 'f1_score']
    rows = []
    for name, estimator in model_dict.items():
        estimator.fit(x_train, y_train)
        y_pred = estimator.predict(x_test)
        rows.append({
            'model_name': name,
            'accuracy_score': accuracy_score(y_test, y_pred),
            'precision_score': precision_score(y_test, y_pred, average='macro'),
            'recall_score': recall_score(y_test, y_pred, average='macro'),
            'f1_score': f1_score(y_test, y_pred, average='macro'),
        })
    # Build and sort the table once, after the loop. It used to be rebuilt on
    # every iteration, and an empty dict left the result variable unbound.
    model_comparison_data = pd.DataFrame(rows, columns=columns)
    return model_comparison_data.sort_values(by='f1_score', ascending=False)

# Comparison table, best macro-F1 first.
model_scores = model_score_data(model_dict)
print(model_scores)

In [24]:
# Accuracy per model. There is exactly one accuracy value per model, so a bar
# chart is used; a box plot of a single observation collapses to a line.
scores = model_score_data(model_dict)
# Make sure the plotted column is numeric rather than object dtype.
scores['accuracy_score'] = scores['accuracy_score'].astype(float)

plt.figure(figsize=(22,16))
sns.barplot(x='model_name', y='accuracy_score',
            data=scores,
            color='orange')
plt.title("ACCURACY COMPARISON", size=16);

In [25]:
# Accuracy per model as a line plot.
# Score the models once and read both axes from the same table. Calling
# model_score_data() separately for x and y retrained every model twice and
# could pair names and accuracies from two differently ordered results.
scores = model_score_data(model_dict)
x = scores.model_name
y = scores.accuracy_score.astype(float)

plt.figure(figsize=(22,16))
plt.xlabel("Model Name",size=12)
plt.ylabel("Accuracy",size=12)
plt.title("ACCURACY COMPARISON")
plt.plot(x,y, marker = 'D',color = 'r')
plt.show()

# **EVALUATING DIFFERENT MODELS - RESULT VISUALIZATION** 

# K-NEAREST NEIGHBORS CLASSIFIER

In [26]:
# Fit a K-Nearest Neighbors classifier with default settings and predict the
# categories of the test split.
model = KNeighborsClassifier().fit(x_train, y_train)
prediction = model.predict(x_test)
#print(prediction)

In [27]:
# Accuracy and per-class report for the KNN model.

print("Training Set Accuracy [KNN] : ", model.score(x_train, y_train))
print("Test Set Accuracy [KNN] : ", model.score(x_test, y_test))
# Report rows follow the encoded class index, so their names must come from
# le.classes_ (index i -> le.classes_[i]). `Labels` is in order of first
# appearance and would attach the wrong names to the rows.
print("\nClassification Report\n", classification_report(
    y_test, prediction,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_))

In [28]:
# Predicting the Class (Result)
def profilePredict(resume, classifier=None):
    """Predict the job category of a resume with the fitted KNN model.

    Args:
        resume: One resume as a ``str``, or a list of resume strings (only
            the first prediction is returned, as before).
        classifier: Optional fitted estimator to use instead of the
            notebook-level ``model``.

    Returns:
        The predicted category name.
    """
    if classifier is None:
        classifier = model
    # Wrap a lone string so it is treated as one document.
    documents = [resume] if isinstance(resume, str) else list(resume)
    # Clean the input exactly like the training text before vectorising.
    r = vect.transform([CleanResume(doc) for doc in documents])
    num = classifier.predict(r)[0]
    return le.classes_[num]

In [29]:
# Example: classify a short free-text profile.
resume = [
    "Research enthusiast in advance multirotor and fixed wing autonomous algorithms, "
    "machine learning, and deep learning. Skilled in Python, and everyday pushing to learn more."
]

profilePredict(resume)

In [30]:
# Confusion matrix for the KNN predictions.
class_ids = np.arange(len(le.classes_))
conf = confusion_matrix(y_test, prediction, labels=class_ids)

plt.figure(figsize=(25,20))
# Rows/columns of the matrix follow the encoded class index, so the tick names
# come from le.classes_; category_unique is in order of first appearance and
# would mislabel the axes. fmt='d' prints the counts as plain integers.
sns.heatmap(conf, annot=True, fmt='d',
            xticklabels=le.classes_,
            yticklabels=le.classes_)

plt.title('Confusion Matrix', size =16)
plt.ylabel('Actual Values')
plt.xlabel('Predicted Values')
plt.show()

**FINDING THE OPTIMAL K-VALUE BY PLOTTING THE ERROR RATE AGAINST K**

In [31]:
 # Test-set error rate of KNN for k = 1..49.
 error_rate = []
 k_values = range(1, 50)
 for i in k_values:
     knn = KNeighborsClassifier(n_neighbors=i)
     # Do not rebind the notebook-level `model` here: the KNN report and
     # prediction cells read it, and it used to end up as the k=49 classifier.
     knn.fit(x_train, y_train)
     pred_i = knn.predict(x_test)
     error_rate.append(np.mean(pred_i != y_test))
 plt.figure(figsize=(16,12))
 plt.plot(k_values, error_rate, linestyle = 'dotted', marker = 'o',color = 'b')
 plt.xlabel('K Value')
 plt.ylabel('Error Rate')
 plt.title('K Value Vs Error Rate')
 plt.show()

# MULTINOMIAL NAIVE BAYES ALGORITHM

In [32]:
# Fit Multinomial Naive Bayes on the TF-IDF features and predict the test split.
clf = MultinomialNB().fit(x_train, y_train)
pred_MNB = clf.predict(x_test)

In [33]:
# Accuracy and per-class report for Multinomial Naive Bayes.
print("Training Set Accuracy [MNB] : ", clf.score(x_train, y_train))
print("Test Set Accuracy [MNB] : ", clf.score(x_test, y_test))
# Row names must follow the encoded class index (le.classes_); `Labels` is in
# order of first appearance and would attach the wrong names to the rows.
print("\nClassification Report\n", classification_report(
    y_test, pred_MNB,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_))

In [34]:
def profilePredict(resume, classifier=None):
    """Predict the job category of a resume with the fitted Naive Bayes model.

    Args:
        resume: One resume as a ``str``, or a list of resume strings (only
            the first prediction is returned, as before).
        classifier: Optional fitted estimator to use instead of the
            notebook-level ``clf``.

    Returns:
        The predicted category name.
    """
    if classifier is None:
        classifier = clf
    # Wrap a lone string so it is treated as one document.
    documents = [resume] if isinstance(resume, str) else list(resume)
    # Clean the input exactly like the training text before vectorising.
    r = vect.transform([CleanResume(doc) for doc in documents])
    num = classifier.predict(r)[0]
    return le.classes_[num]

resume = ["Research enthusiast in advance multirotor and fixed wing autonomous algorithms, machine learning, and deep learning. Skilled in Python, and everyday pushing to learn more."]
profilePredict(resume)

In [35]:
# Confusion matrix for the Multinomial Naive Bayes predictions.
class_ids = np.arange(len(le.classes_))
conf = confusion_matrix(y_test, pred_MNB, labels=class_ids)

plt.figure(figsize=(25,20))
# Tick names come from le.classes_ because the matrix is ordered by encoded
# class index; category_unique is in order of first appearance.
sns.heatmap(conf, annot=True, fmt='d',
            xticklabels=le.classes_,
            yticklabels=le.classes_)

plt.title('Confusion Matrix', size =16)
plt.ylabel('Actual Values')
plt.xlabel('Predicted Values')
plt.show()

# DECISION TREE CLASSIFIER ALGORITHM

In [36]:
# Fit a decision tree; seeded so the fitted tree and its scores are the same on
# every run.
new_model = DecisionTreeClassifier(random_state=3)
new_model.fit(x_train, y_train)
pred_DTC = new_model.predict(x_test)

In [37]:
# Accuracy and per-class report for the decision tree (the printed tag used to
# say "Random Forest").
print("Training Set metrics Accuracy [Decision Tree] : ", new_model.score(x_train, y_train))
print("Test Set Accuracy [Decision Tree] : ", new_model.score(x_test, y_test))
# Row names must follow the encoded class index (le.classes_); `Labels` is in
# order of first appearance and would attach the wrong names to the rows.
print("\nClassification Report\n", classification_report(
    y_test, pred_DTC,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_))

In [38]:
def profilePredict(resume, classifier=None):
    """Predict the job category of a resume with the fitted decision tree.

    Args:
        resume: One resume as a ``str``, or a list of resume strings (only
            the first prediction is returned, as before).
        classifier: Optional fitted estimator to use instead of the
            notebook-level ``new_model``.

    Returns:
        The predicted category name.
    """
    if classifier is None:
        classifier = new_model
    # Wrap a lone string so it is treated as one document.
    documents = [resume] if isinstance(resume, str) else list(resume)
    # Clean the input exactly like the training text before vectorising.
    r = vect.transform([CleanResume(doc) for doc in documents])
    num = classifier.predict(r)[0]
    return le.classes_[num]

resume = ["Love to invest in various crypto currencies and have knack of various programming languages required for blockchain development"]
profilePredict(resume)

In [39]:
# Confusion matrix for the decision tree predictions.
class_ids = np.arange(len(le.classes_))
conf = confusion_matrix(y_test, pred_DTC, labels=class_ids)

plt.figure(figsize=(25,20))
# Tick names come from le.classes_ because the matrix is ordered by encoded
# class index; category_unique is in order of first appearance.
sns.heatmap(conf, annot=True, fmt='d',
            xticklabels=le.classes_,
            yticklabels=le.classes_)

plt.title('Confusion Matrix', size =16)
plt.ylabel('Actual Values')
plt.xlabel('Predicted Values')
plt.show()

# STOCHASTIC GRADIENT DESCENT 

In [40]:
# Linear model trained with stochastic gradient descent (hinge loss).
# x_train / x_test are already TF-IDF features produced by `vect`, so the
# pipeline no longer runs a TfidfTransformer on top of them (that applied the
# IDF weighting a second time). `sgd` stays a Pipeline so .fit/.predict/.score
# are used as before.
sgd = Pipeline([('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None))])

sgd.fit(x_train, y_train)
pred_sgd = sgd.predict(x_test)

In [41]:
# Accuracy and per-class report for the SGD-trained classifier (tagged to match
# this section instead of "LSVM").
print("Training Set metrics Accuracy [SGD] : ", sgd.score(x_train, y_train))
print("Test Set Accuracy [SGD] : ", sgd.score(x_test, y_test))
# Row names must follow the encoded class index (le.classes_); `Labels` is in
# order of first appearance and would attach the wrong names to the rows.
print("\nClassification Report\n", classification_report(
    y_test, pred_sgd,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_))

In [42]:
def profilePredict(resume, classifier=None):
    """Predict the job category of a resume with the fitted SGD pipeline.

    Args:
        resume: One resume as a ``str``, or a list of resume strings (only
            the first prediction is returned, as before).
        classifier: Optional fitted estimator to use instead of the
            notebook-level ``sgd``.

    Returns:
        The predicted category name.
    """
    if classifier is None:
        classifier = sgd
    # Wrap a lone string so it is treated as one document.
    documents = [resume] if isinstance(resume, str) else list(resume)
    # Clean the input exactly like the training text before vectorising.
    r = vect.transform([CleanResume(doc) for doc in documents])
    num = classifier.predict(r)[0]
    return le.classes_[num]

resume = ["Love to invest in various crypto currencies and have knack of various programming languages required for blockchain development"]
profilePredict(resume)

In [43]:
# Confusion matrix for the SGD predictions.
class_ids = np.arange(len(le.classes_))
conf = confusion_matrix(y_test, pred_sgd, labels=class_ids)

plt.figure(figsize=(25,20))
# Tick names come from le.classes_ because the matrix is ordered by encoded
# class index; category_unique is in order of first appearance.
sns.heatmap(conf, annot=True, fmt='d',
            xticklabels=le.classes_,
            yticklabels=le.classes_)

plt.title('Confusion Matrix', size =16)
plt.ylabel('Actual Values')
plt.xlabel('Predicted Values')
plt.show()

# GRADIENT BOOSTING

In [44]:
# Gradient boosting classifier; seeded so repeated runs give the same model.
gradient_booster = GradientBoostingClassifier(learning_rate=0.1, random_state=3)
gradient_booster.get_params()

In [45]:
# Fit gradient boosting, predict the test split and report its scores.
gradient_booster.fit(x_train,y_train)
pred_GB = gradient_booster.predict(x_test)
print("Training Set metrics Accuracy [GB] : ", gradient_booster.score(x_train, y_train))
print("Test Set Accuracy [GB] : ", gradient_booster.score(x_test, y_test))
# Row names must follow the encoded class index (le.classes_); `Labels` is in
# order of first appearance and would attach the wrong names to the rows.
print("\nClassification Report\n", classification_report(
    y_test, pred_GB,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_))

In [46]:
def profilePredict(resume, classifier=None):
    """Predict the job category of a resume with the fitted gradient booster.

    Args:
        resume: One resume as a ``str``, or a list of resume strings (only
            the first prediction is returned, as before).
        classifier: Optional fitted estimator to use instead of the
            notebook-level ``gradient_booster``.

    Returns:
        The predicted category name.
    """
    if classifier is None:
        classifier = gradient_booster
    # Wrap a lone string so it is treated as one document.
    documents = [resume] if isinstance(resume, str) else list(resume)
    # Clean the input exactly like the training text before vectorising.
    r = vect.transform([CleanResume(doc) for doc in documents])
    num = classifier.predict(r)[0]
    return le.classes_[num]

resume = ["Love to invest in various crypto currencies and have knack of various programming languages required for blockchain development"]
profilePredict(resume)

In [47]:
# Confusion matrix for the gradient boosting predictions.
class_ids = np.arange(len(le.classes_))
conf = confusion_matrix(y_test, pred_GB, labels=class_ids)

plt.figure(figsize=(25,20))
# Tick names come from le.classes_ because the matrix is ordered by encoded
# class index; category_unique is in order of first appearance.
sns.heatmap(conf, annot=True, fmt='d',
            xticklabels=le.classes_,
            yticklabels=le.classes_)

plt.title('Confusion Matrix', size =16)
plt.ylabel('Actual Values')
plt.xlabel('Predicted Values')
plt.show()

# ADA_BOOST CLASSIFIER

In [48]:
# Fit an AdaBoost classifier (seeded for repeatable results) and predict the
# test split.
adb = AdaBoostClassifier(random_state=3)
adb.fit(x_train,y_train)
# Predict with `adb` itself; this used to call gradient_booster.predict, so the
# AdaBoost report and confusion matrix showed gradient boosting's predictions.
pred_ADB = adb.predict(x_test)

In [49]:
# Accuracy and per-class report for AdaBoost (the printed tag used to say "GB").
print("Training Set metrics Accuracy [AdaBoost] : ", adb.score(x_train, y_train))
print("Test Set Accuracy [AdaBoost] : ", adb.score(x_test, y_test))
# Row names must follow the encoded class index (le.classes_); `Labels` is in
# order of first appearance and would attach the wrong names to the rows.
print("\nClassification Report\n", classification_report(
    y_test, pred_ADB,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_))

In [50]:
def profilePredict(resume, classifier=None):
    """Predict the job category of a resume with the fitted AdaBoost model.

    Args:
        resume: One resume as a ``str``, or a list of resume strings (only
            the first prediction is returned, as before).
        classifier: Optional fitted estimator to use instead of the
            notebook-level ``adb``.

    Returns:
        The predicted category name.
    """
    if classifier is None:
        classifier = adb
    # Wrap a lone string so it is treated as one document.
    documents = [resume] if isinstance(resume, str) else list(resume)
    # Clean the input exactly like the training text before vectorising.
    r = vect.transform([CleanResume(doc) for doc in documents])
    num = classifier.predict(r)[0]
    return le.classes_[num]

resume = ["Love to invest in various crypto currencies and have knack of various programming languages required for blockchain development"]
profilePredict(resume)

In [51]:
# Confusion matrix for the predictions stored in `pred_ADB`.
class_ids = np.arange(len(le.classes_))
conf = confusion_matrix(y_test, pred_ADB, labels=class_ids)

plt.figure(figsize=(25,20))
# Tick names come from le.classes_ because the matrix is ordered by encoded
# class index; category_unique is in order of first appearance.
sns.heatmap(conf, annot=True, fmt='d',
            xticklabels=le.classes_,
            yticklabels=le.classes_)

plt.title('Confusion Matrix', size =16)
plt.ylabel('Actual Values')
plt.xlabel('Predicted Values')
plt.show()