In [1]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import nltk
import re
from nltk.corpus import stopwords
from nltk.corpus import brown
from nltk import FreqDist
from nltk.corpus import stopwords
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.cross_validation import cross_val_score
import matplotlib.pyplot as plt

%matplotlib inline
plt.rcParams['figure.figsize'] = (15, 9)
plt.style.use('ggplot')

In [2]:
#Importing complete and labled job summary data
complete_data = pd.read_csv('complete_labeled_job_summary_data.csv',encoding = "ISO-8859-1")
complete_data = complete_data.drop('Unnamed: 0',axis=1)

#converting text data into lower case letters in order to process better
complete_data["job summary"]=complete_data["job summary"].str.lower()

In [3]:
#Creating a combined list of all the words in job summary for training data

#Creating empty list for iterations and other for words in job summary
iterator = []
total_words_list=[]

#Creating a list with length equal to number of rows in the dataframe
for i in range (0,len(complete_data.axes[0]),1):
    iterator.append(i)
    
#creating the list
for i in iterator:
    job_summary=complete_data.get_value(i,'job summary')
    words = re.split(r"\s", job_summary)
    total_words_list.append(words)

#Combining all the nested arrays to form one single array    
total_words_list = [y for x in total_words_list for y in x]


In [4]:
#Create a string from the array of the words for training data
total_string = ' '.join(w for w in total_words_list)


In [5]:
#Convert string to nltk.text format for training data
tokens = nltk.word_tokenize(total_string)      # Convert str to token
nltk_words = nltk.text.Text(tokens)            # convert tokens to nltk.text.Text()


In [6]:
#Removing stop words
#identifying stopwords in english
stopwords = nltk.corpus.stopwords.words('english')

#Remove the stop words from list of words for training data
content = [w for w in nltk_words if w not in stopwords]


In [7]:
#Finding out features out of content for training data
#Finding out frequency of words
freq_words = nltk.FreqDist(w for w in content)

#Finding features
num_features = 5000
word_features = list(freq_words)[:num_features]
print(word_features)
type(word_features)




list

In [8]:
#Converting strings in train data into nltk.text.Text format in order to use NLP
#Creating empty list for iterations
iterator = []

#Creating a list with length equal to number of rows in the dataframe
for i in range (0,len(complete_data.axes[0]),1):
    iterator.append(i)

#Converting and replacing the strings with nltk.text    
for i in iterator:
    job_summary=complete_data.get_value(i,'job summary')  # Pull just the job summary as a str
    tokens = nltk.word_tokenize(job_summary)      # Convert str to token
    text_obj = nltk.text.Text(tokens)             # convert tokens to nltk.text.Text()
    complete_data.set_value(i,'job summary',text_obj)     # Now all of the functions from lab 17 will work with this obj



In [9]:
#Creating a list of tuples which combine words in job summary and its label
for i in range (0,len(complete_data.axes[0]),1):
    iterator.append(i)
    
total_job_listings_and_labels = []
for i in iterator:
    job_listings_and_labels = tuple([list(complete_data.get_value(i,"job summary")),
                                    complete_data.get_value(i,"label")])
    total_job_listings_and_labels.append(job_listings_and_labels)


In [10]:
def document_features(document):
    # Note: checking whether a word occurs in a set is much faster 
    # than checking whether it occurs in a list     
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

featuresets = [(document_features(d), c) for (d,c) in total_job_listings_and_labels]

In [11]:
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [12]:
print(nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(100)

0.85
Most Informative Features
         contains(laugh) = True             arts : commer =    262.3 : 1.0
         contains(rehab) = True           sports : scienc =    138.3 : 1.0
           contains(asl) = True           servic : scienc =    127.5 : 1.0
contains(reconciliations) = True           commer : arts   =    116.0 : 1.0
        contains(pepper) = True           servic : sports =     99.7 : 1.0
       contains(seating) = True           servic : commer =     85.2 : 1.0
  contains(conservative) = True           servic : scienc =     83.3 : 1.0
           contains(ace) = True           sports : servic =     77.3 : 1.0
         contains(drink) = True           servic : scienc =     76.4 : 1.0
  contains(microbiology) = True           scienc : commer =     73.2 : 1.0
 contains(ã¢ââ¢assists) = True             arts : scienc =     72.1 : 1.0
     contains(athletics) = True           sports : scienc =     62.0 : 1.0
       contains(utility) = True           scienc : arts   =     53.2

In [13]:
job_index=99
print(complete_data.iloc[job_index][0])
print('Model Prediction: '+classifier.classify(test_set[job_index][0])+'\n')
for (k,v) in test_set[job_index][0].items():
    if v==True:
        print(k)

OMB Financial Manager--Accounting Division
Model Prediction: commerce

contains(financial)
contains(functions)
contains(reconciliation)
contains(operational)
contains(projects)
contains(punctual)
contains(certified)
contains(public)
contains(effectiveness)
contains(statements)
contains(provides)
contains(may)
contains(report)
contains(standards)
contains(supervises)
contains(compilation)


In [14]:
new_save = True
if new_save:
    import pickle
    f = open('nb_classifier.pickle', 'wb')
    pickle.dump(classifier, f)
    f.close()