In [1]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import nltk
import re
from nltk.corpus import stopwords
from nltk.corpus import brown
from nltk import FreqDist
from nltk.corpus import stopwords
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.cross_validation import cross_val_score
import matplotlib.pyplot as plt

%matplotlib inline
plt.rcParams['figure.figsize'] = (15, 9)
plt.style.use('ggplot')

In [2]:
#Importing complete and labled job summary data
complete_data = pd.read_csv('complete_labeled_job_summary_data.csv',encoding = "ISO-8859-1")
complete_data = complete_data.drop('Unnamed: 0',axis=1)

#converting text data into lower case letters in order to process better
complete_data["job summary"]=complete_data["job summary"].str.lower()

In [3]:
#Splitting the data into test and train set and resetting their index
train, test = train_test_split(complete_data, test_size = 0.2)
train=train.reset_index(drop=True)
test=test.reset_index(drop=True)

In [4]:
#exploring train set
print(train.info())
print('\n',train.describe())
print('\n',train['label'].value_counts())

#exploring test set
print('\n',test.info())
print('\n',test.describe())
print('\n',test['label'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7294 entries, 0 to 7293
Data columns (total 3 columns):
job title      7294 non-null object
job summary    7294 non-null object
label          7294 non-null object
dtypes: object(3)
memory usage: 85.5+ KB
None

        job title                                        job summary    label
count       7294                                               7294     7294
unique      4255                                               5285        5
top         Host  be a part of what's trending at today's chili'...  science
freq         269                                                106     1929

 science     1929
commerce    1829
arts        1436
services    1122
sports       978
Name: label, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1824 entries, 0 to 1823
Data columns (total 3 columns):
job title      1824 non-null object
job summary    1824 non-null object
label          1824 non-null object
dtypes: object(3)
memory u

In [5]:
#Creating dummy variables for catagories
train['label_num'] = train["label"].map({'commerce':0, 'science':1, 'arts':2, 'services':3, 'sports':4})
test['label_num'] = test["label"].map({'commerce':0, 'science':1, 'arts':2, 'services':3, 'sports':4})

In [6]:
#Creating a combined list of all the words in job summary for training data

#Creating empty list for iterations and other for words in job summary
iterator = []
total_words_list=[]

#Creating a list with length equal to number of rows in the dataframe
for i in range (0,len(train.axes[0]),1):
    iterator.append(i)
    
#creating the list
for i in iterator:
    job_summary=train.get_value(i,'job summary')
    words = re.split(r"\s", job_summary)
    total_words_list.append(words)

#Combining all the nested arrays to form one single array    
total_words_list = [y for x in total_words_list for y in x]

#Creating a combined list of all the words in job summary for test data

#Creating empty list for iterations and other for words in job summary
iterator = []
total_words_list_test=[]

#Creating a list with length equal to number of rows in the dataframe
for i in range (0,len(test.axes[0]),1):
    iterator.append(i)
    
#creating the list
for i in iterator:
    job_summary=test.get_value(i,'job summary')
    words_test = re.split(r"\s", job_summary)
    total_words_list_test.append(words)

#Combining all the nested arrays to form one single array    
total_words_list_test = [y for x in total_words_list_test for y in x]

In [7]:
#Create a string from the array of the words for training data
total_string = ' '.join(w for w in total_words_list)

#Create a string from the array of the words for test data
total_string_test = ' '.join(w for w in total_words_list_test)

In [8]:
#Convert string to nltk.text format for training data
tokens = nltk.word_tokenize(total_string)      # Convert str to token
nltk_words = nltk.text.Text(tokens)            # convert tokens to nltk.text.Text()

#Convert string to nltk.text format for test data
tokens_test = nltk.word_tokenize(total_string_test)      # Convert str to token
nltk_words_test = nltk.text.Text(tokens_test)            # convert tokens to nltk.text.Text()

In [9]:
#Removing stop words
#identifying stopwords in english
stopwords = nltk.corpus.stopwords.words('english')

#Remove the stop words from list of words for training data
content = [w for w in nltk_words if w not in stopwords]

#Remove the stop words from list of words for test data
content_test = [w for w in nltk_words_test if w not in stopwords]

In [10]:
#Finding out features out of content for training data
#Finding out frequency of words
freq_words = nltk.FreqDist(w for w in content)

#Finding features
num_features = 20000
word_features = list(freq_words)[:num_features]
print(word_features)
type(word_features)

#Finding out features out of content for test data
#Finding out frequency of words
freq_words_test = nltk.FreqDist(w for w in content_test)

#Finding features
num_features_test = 4000
word_features_test = list(freq_words_test)[:num_features_test]
print(word_features_test)
type(word_features_test)

['operates', 'organization', 'operate', 'keep', 'acls', 'accounts', 'independent', '100', 'administer', 'calculator', 'patients', 'employment', 'operator', '-direct', 'devices', '%', 'recruitment', 'equal', 'campus', 'desk', 'american', 'disability', 'investment', 'bls', 'hazards', 'areas', 'regard', 'help', 'qualified', 'laws', '-spiritual', ':', 'plans', 'functions', 'protestant', 'existence', 'effectively', '.', 'consistent', 'handicap', 'extend', 'attitude', 'join', 'excellent', 'origin', 'balance', 'providers', 'affecting', 'offers', 'applicants', 'search', 'graduate', 'florida', 'job', 'localize', 'radio-sensitive', 'stoop', 'plan', 'technology', '-employee', 'national', 'see', 'committed', ')', 'ability', 'together', 'prepare', 'local', 'co-workers', 'standards', 'kids', 'wellness', 'drives', 'diagnosis', 'sufficient', 'ingenuity', 'diagnostic', 'dynamic', 'workplace', 'based', 'medical', 'directly', 'methods', '-retirement', 'hear', 'monitor', 'valuable', 'lift', 'laser', 'cult

list

In [11]:
#Defining function for getting features out of texts for train data
def document_features(document):
    document_words = set(document)
    features = np.zeros(num_features)
    for i,word in enumerate(word_features):
        features[i] = (word in document_words)
    return features

#Defining function for getting features out of texts for test data
def document_features_test(document):
    document_words_test = set(document)
    features_test = np.zeros(num_features_test)
    for i,word in enumerate(word_features_test):
        features_test[i] = (word in document_words_test)
    return features_test

In [12]:
#Converting strings in train data into nltk.text.Text format in order to use NLP
#Creating empty list for iterations
iterator = []

#Creating a list with length equal to number of rows in the dataframe
for i in range (0,len(train.axes[0]),1):
    iterator.append(i)

#Converting and replacing the strings with nltk.text    
for i in iterator:
    job_summary=train.get_value(i,'job summary')  # Pull just the job summary as a str
    tokens = nltk.word_tokenize(job_summary)      # Convert str to token
    text_obj = nltk.text.Text(tokens)             # convert tokens to nltk.text.Text()
    train.set_value(i,'job summary',text_obj)     # Now all of the functions from lab 17 will work with this obj

#Converting strings in test data into nltk.text.Text format in order to use NLP
#Creating empty list for iterations
iterator = []

#Creating a list with length equal to number of rows in the dataframe
for i in range (0,len(test.axes[0]),1):
    iterator.append(i)

#Converting and replacing the strings with nltk.text    
for i in iterator:
    job_summary=test.get_value(i,'job summary')  # Pull just the job summary as a str
    tokens = nltk.word_tokenize(job_summary)      # Convert str to token
    text_obj = nltk.text.Text(tokens)             # convert tokens to nltk.text.Text()
    test.set_value(i,'job summary',text_obj)     # Now all of the functions from lab 17 will work with this obj

In [13]:
#Creating set of features for train data
num_job_listing = len(train.axes[0])
X = np.zeros([num_job_listing,num_features])
for i in range(num_job_listing):
    job_summary=train.get_value(i,'job summary')
    X[i,:] = document_features(job_summary)

#Creating set of features for test data
num_job_listing_test = len(test.axes[0])
X_test = np.zeros([num_job_listing_test,num_features_test])
for i in range(num_job_listing_test):
    job_summary_test=test.get_value(i,'job summary')
    X_test[i,:] = document_features_test(job_summary_test)

In [14]:
#Creating a set of classes training data
y=train['label_num']
y=y.as_matrix(columns=None)

#Creating a set of classes test data
y_test=test['label_num']
y_test=y_test.as_matrix(columns=None)

In [15]:
#Useing k_NN to classify
k = 5
model = KNeighborsClassifier(n_neighbors=k)
scores = cross_val_score(model, X, y, cv=10)
print(scores)

MemoryError: 

In [None]:
y_pred=model.predict(X)