In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Load the resume dataset from CSV and preview its first rows.
df = pd.read_csv('../input/resumedataset/resume_dataset.csv')
df.head()

In [3]:
# Number of resumes per job category.
df["Category"].value_counts()

In [4]:
# Print a summary of the frame (columns, non-null counts, dtypes).
df.info()

In [5]:
import seaborn as sns
# Horizontal count plot: one bar per category, bar length = number of resumes.
plt.figure(figsize=(8, 6))
plt.xticks(rotation=90)
sns.countplot(data=df, y="Category");

In [6]:
# Look at one raw resume (row label 10) before any cleaning is applied.
df.loc[10, "Resume"]

In [7]:
import re
def cleanResume(resumeText):
    """Normalise raw resume text for tokenisation / vectorisation.

    The substitutions run in a fixed order: URLs, standalone "RT"/"cc"
    tokens, hashtags, mentions, ASCII punctuation, non-ASCII characters and
    finally runs of whitespace.

    Parameters
    ----------
    resumeText : str
        Raw resume text.

    Returns
    -------
    str
        The cleaned text. Whitespace runs are collapsed to a single space;
        a single leading or trailing space may remain (the result is not
        stripped).
    """
    resumeText = re.sub(r'http\S+\s*', ' ', resumeText)  # remove URLs
    # Only drop "RT" / "cc" when they stand alone as tokens. Without the word
    # boundaries the pattern also cut them out of ordinary words, turning
    # "account" into "a ount" and "EXPERT" into "EXPE ".
    resumeText = re.sub(r'\b(?:RT|cc)\b', ' ', resumeText)
    resumeText = re.sub(r'#\S+', '', resumeText)  # remove hashtags
    resumeText = re.sub(r'@\S+', '  ', resumeText)  # remove mentions
    # Replace every ASCII punctuation character with a space.
    resumeText = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', resumeText)
    resumeText = re.sub(r'[^\x00-\x7f]', r' ', resumeText)  # remove non-ascii characters
    resumeText = re.sub(r'\s+', ' ', resumeText)  # collapse extra whitespace
    return resumeText

In [8]:
# Run the cleaner over every resume and keep the result in a new column,
# then show the first cleaned resume as a spot check.
df["Cleaned Resume"] = df["Resume"].apply(cleanResume)
df.loc[0, "Cleaned Resume"]

In [9]:
# Preview the first rows of the frame again.
df.head()

In [10]:
from string import punctuation
# Show the punctuation characters provided by the standard library's string module.
print(punctuation)

In [11]:
import nltk
#nltk.download()
import string
from nltk.corpus import stopwords
from nltk import word_tokenize

# Build the stop-word set once. The original rebuilt it (re-reading the NLTK
# corpus) for every single token, which dominated the cell's run time.
stop_words = set(stopwords.words('english'))

total_words = []
for sentence in df["Cleaned Resume"]:
    for word in word_tokenize(sentence):
        # The stop-word list is lower-case, so compare case-insensitively;
        # otherwise capitalised stop words such as "The" or "And" are kept.
        if word.lower() not in stop_words and word not in string.punctuation:
            total_words.append(word)

# Join with an explicit space: repeated `+=` is quadratic and could glue the
# last word of one resume onto the first word of the next.
paragraph = " ".join(df["Cleaned Resume"])


In [12]:
# Count how often each collected token occurs and list the ten most frequent.
freq_word = nltk.FreqDist(total_words)
freq_word.most_common(10)

In [13]:
from wordcloud import WordCloud
# Generate a word cloud from the combined resume text and render it as an image.
wc = WordCloud().generate(paragraph)
plt.figure(figsize=(15,15))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")  # hide the axes around the image
plt.show()

In [14]:
from sklearn.preprocessing import LabelEncoder

# Turn each category name into an integer class id stored in a new "Labels" column.
# `le` is kept so the ids can be mapped back to names later.
le = LabelEncoder()
df["Labels"] = le.fit_transform(df["Category"])

In [15]:
# Show five random rows; the fixed seed keeps the displayed rows the same
# on every Restart & Run All.
df.sample(5, random_state=0)

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

text = df["Cleaned Resume"].values
labels = df["Labels"].values

word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    stop_words='english',
    max_features=1500)

vec_df = word_vectorizer.fit_transform(text)
vec_df.shape

In [17]:
X_train,X_test,y_train,y_test = train_test_split(vec_df,labels,random_state=0, test_size=0.2)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

In [18]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier


# One-vs-rest wrapper around k-nearest-neighbours; fit() returns the fitted
# estimator, so construction and training are chained.
model = OneVsRestClassifier(KNeighborsClassifier()).fit(X_train, y_train)

# Report mean accuracy on both splits.
print('Accuracy of KNeighbors Classifier on training set: {:.2f}'.format(
    model.score(X_train, y_train)))
print('Accuracy of KNeighbors Classifier on test set: {:.2f}'.format(
    model.score(X_test, y_test)))

In [19]:
from sklearn.naive_bayes import MultinomialNB

# One-vs-rest multinomial naive Bayes on the TF-IDF features.
clf = OneVsRestClassifier(MultinomialNB())
clf.fit(X_train, y_train)

# Keep the test-set predictions around for later inspection.
prediction = clf.predict(X_test)

print('Accuracy of MultinomialNB Classifier on training set: {:.2f}'.format(
    clf.score(X_train, y_train)))
print('Accuracy of MultinomialNB Classifier on test set: {:.2f}'.format(
    clf.score(X_test, y_test)))

In [20]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# One-vs-rest decision trees. The seed is fixed because an unseeded
# DecisionTreeClassifier breaks ties between equally good splits at random,
# so the reported accuracies could differ from run to run.
# Note: this rebinds `model`, replacing whatever estimator it held before.
model = OneVsRestClassifier(DecisionTreeClassifier(random_state=0))

model.fit(X_train,y_train)
print('Accuracy of DecisionTree Classifier on training set: {:.2f}'.format(model.score(X_train, y_train)))
print('Accuracy of DecisionTree Classifier on test set: {:.2f}'.format(model.score(X_test, y_test)))