In [1]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import nltk
import re
from nltk.corpus import brown
from nltk import FreqDist
from nltk.corpus import stopwords
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.cross_validation import cross_val_score
import matplotlib.pyplot as plt
from nltk.corpus import stopwords

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default figure size (width, height) used for the plots in this notebook.
plt.rcParams['figure.figsize'] = (15, 9)
# Apply the 'ggplot' style sheet. NOTE(review): the style is applied after the
# rcParams assignment above, so a figure.figsize set by the style sheet would
# override it -- presumably 'ggplot' does not set one; confirm.
plt.style.use('ggplot')

In [2]:
# Load the complete, labeled job-summary data set and discard the leftover
# 'Unnamed: 0' column (presumably an index written out by an earlier to_csv).
complete_data = pd.read_csv(
    'complete_labeled_job_summary_data.csv',
    encoding="ISO-8859-1",
).drop('Unnamed: 0', axis=1)

# Lower-case every job summary so that later text processing treats
# differently-cased spellings of a word as the same token.
complete_data["job summary"] = complete_data["job summary"].str.lower()

In [3]:
# Split the data 80/20 into train and test sets and reset both indexes so the
# rows are labeled 0..n-1 again (later cells look rows up by that label).
# The fixed random_state makes the split -- and every result derived from it --
# reproducible across "Restart Kernel -> Run All".
train, test = train_test_split(complete_data, test_size=0.2, random_state=42)
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

In [4]:
# Convert every job summary in the test set from a plain string into an
# nltk.text.Text object so that the NLTK text helpers can be used on it.
#
# Series.apply replaces the former row-by-row get_value/set_value loop: those
# accessors were deprecated in pandas 0.21 and removed in pandas 1.0, and the
# hand-built list of row numbers is no longer needed.
# Values that are already Text objects are passed through unchanged, so
# re-running this cell does not fail on already-converted rows.
test['job summary'] = test['job summary'].apply(
    lambda summary: summary
    if isinstance(summary, nltk.text.Text)
    else nltk.text.Text(nltk.word_tokenize(summary))  # str -> tokens -> Text
)

In [5]:
# Explore the training set: schema, summary statistics and class balance.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly -- wrapping it in print() only appended a stray "None" line.
train.info()
print('\n', train.describe())
print('\n', train['label'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7294 entries, 0 to 7293
Data columns (total 3 columns):
job title      7294 non-null object
job summary    7294 non-null object
label          7294 non-null object
dtypes: object(3)
memory usage: 85.5+ KB
None

        job title                                        job summary    label
count       7294                                               7294     7294
unique      4250                                               5271        5
top         Host  be a part of what's trending at today's chili'...  science
freq         264                                                107     1943

 science     1943
commerce    1835
arts        1387
services    1137
sports       992
Name: label, dtype: int64


In [6]:
# Create 0/1 indicator columns for the job categories.
# 'sports' gets no column of its own: it is the case where all four
# indicator columns are 0.
label_names = ('commerce', 'science', 'arts', 'services', 'sports')
for positive in label_names[:-1]:
    # Same mapping as a hand-written dict: 1 for the category itself, 0 for
    # each of the other four labels.
    train[positive] = train["label"].map(
        {name: int(name == positive) for name in label_names}
    )
train.describe()

Unnamed: 0,commerce,science,arts,services
count,7294.0,7294.0,7294.0,7294.0
mean,0.251577,0.266383,0.190156,0.155882
std,0.433949,0.442097,0.392451,0.362768
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0
75%,1.0,1.0,0.0,0.0
max,1.0,1.0,1.0,1.0


In [7]:
# Collect every whitespace-separated word of all training job summaries into
# one flat list.
#
# The column is iterated directly (no list of row numbers, no deprecated
# DataFrame.get_value), and list.extend appends in place. The former
# `total_words = total_words + all_words` copied the whole list for every row,
# which is quadratic in the size of the corpus.
# NOTE: the summaries must still be plain strings here, i.e. this cell has to
# run before the cell that converts the train summaries to nltk.text.Text.
total_words = []
for job_summary in train['job summary']:
    # Splitting on each single whitespace character yields empty strings for
    # runs of whitespace; that is kept as-is (the words are re-joined and
    # re-tokenized afterwards).
    total_words.extend(re.split(r"\s", job_summary))

In [None]:
# Join all the words into one string so it can be converted to nltk.text format.
# str.join accepts the list directly; no generator wrapper is needed.
total_string = ' '.join(total_words)
# Show only a short preview: echoing the whole corpus would flood the output.
total_string[:500]

In [None]:
# Tokenize the combined corpus string, then wrap the resulting token list in
# an nltk.text.Text object.
tokens = nltk.word_tokenize(text=total_string)
text_obj = nltk.text.Text(tokens=tokens)

In [None]:
# Filter the English stop words out of the corpus.
# NOTE: `stopwords` keeps its original name and list type for any cell that
# reads it, although it shadows the `stopwords` corpus imported at the top.
stopwords = nltk.corpus.stopwords.words('english')
# Membership is tested against a set: constant time per token, instead of
# scanning the whole stop-word list for every token in the corpus.
stopword_set = set(stopwords)
content = [w for w in text_obj if w not in stopword_set]

In [None]:
# Count how often each (lower-cased) word occurs in the filtered content.
content_words_freq = nltk.FreqDist(map(str.lower, content))

In [None]:
# Construct the list of the 4000 words appearing most frequently in the job listings.
num_features = 4000
# FreqDist iterates in insertion order (it is a Counter in NLTK 3), so slicing
# list(content_words_freq) returned the first 4000 distinct words seen, not
# the most frequent ones. most_common() returns (word, count) pairs sorted by
# descending count.
word_features = [word for word, _count in content_words_freq.most_common(num_features)]
# Preview only the top of the list instead of printing all 4000 entries.
print(word_features[:50])
type(word_features)

In [None]:
# Convert every job summary in the train set from a plain string into an
# nltk.text.Text object so that the NLTK text helpers can be used on it.
#
# Series.apply replaces the former row-by-row get_value/set_value loop: those
# accessors were deprecated in pandas 0.21 and removed in pandas 1.0, and the
# hand-built list of row numbers is no longer needed.
# Values that are already Text objects are passed through unchanged, so
# re-running this cell does not fail on already-converted rows.
train['job summary'] = train['job summary'].apply(
    lambda summary: summary
    if isinstance(summary, nltk.text.Text)
    else nltk.text.Text(nltk.word_tokenize(summary))  # str -> tokens -> Text
)

In [None]:
# Function reporting which of the word features appear in a job summary.
def document_features(document, vocabulary=None):
    """Return a binary bag-of-words vector for one document.

    Parameters
    ----------
    document : iterable of str
        The tokens of the document (for example an nltk.text.Text object or a
        plain list of words).
    vocabulary : sequence of str, optional
        The words to look for. When omitted, the notebook-level
        ``word_features`` list is used and the vector has ``num_features``
        entries, exactly as before this parameter existed.

    Returns
    -------
    numpy.ndarray
        Float array in which entry ``i`` is 1.0 if ``vocabulary[i]`` occurs in
        ``document`` and 0.0 otherwise.
    """
    if vocabulary is None:
        # Backward-compatible default: fall back to the notebook globals.
        vocabulary, length = word_features, num_features
    else:
        length = len(vocabulary)

    # A set gives constant-time membership tests for each vocabulary word.
    document_words = set(document)
    features = np.zeros(length)
    for i, word in enumerate(vocabulary):
        features[i] = (word in document_words)
    return features

In [None]:
# Build the feature vector for the first training job summary.
# .at replaces DataFrame.get_value (deprecated in pandas 0.21, removed in 1.0).
job_summary = train.at[0, 'job summary']
words_in_doc_2 = document_features(job_summary)
# NOTE(review): freq_count is not defined in any cell visible here -- confirm
# it is defined or imported before this cell runs; otherwise this line raises
# NameError on a fresh kernel.
freq_count( words_in_doc_2 )