In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt
import re 

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.impute import SimpleImputer

import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import RegexpTokenizer
import spacy

In [60]:
data = pd.read_csv("final_merged_data.csv")
print(data.shape)
data = data.dropna(axis = 0, how = 'any')
print(data.shape)

(44401, 7)
(43371, 7)


In [61]:
data = data.sample(10000)

In [62]:
## Grab a corpus of text as a numpy array instead of series
text = data.questions_title.values.astype('U')

In [63]:
token = RegexpTokenizer(r'[a-z]+')

In [64]:
cv = CountVectorizer(lowercase=True,stop_words='english',ngram_range = (1,1), tokenizer = token.tokenize)
text_counts = cv.fit_transform(text)
text_counts = text_counts.toarray()
vocab = cv.get_feature_names()
text_counts = pd.DataFrame(text_counts, columns=vocab) ## yields pandas dataframe
text_counts.sample(3) 


Unnamed: 0,abilities,ability,absolute,abuse,academia,academics,academy,accent,acceptable,acceptance,...,york,yorker,youre,youth,youtube,youtuber,zone,zoo,zoologist,zoology
1571,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4498,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5339,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Hm, some weird words here and there that remains after preprocessing. 

In [65]:
tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
tv_matrix = tv.fit_transform(text)
tv_matrix = tv_matrix.toarray()
vocab = tv.get_feature_names()

## eliminate vocab beginning with numbers
#vocab = [w for w in vocab if re.search(r'[a-z]', w)]
tfDF = pd.DataFrame(np.round(tv_matrix, 2), columns=vocab) ## also yields pandas dataframe
tfDF.sample(3)

Unnamed: 0,abilities,ability,absolute,abuse,academia,academics,academy,accent,acceptable,acceptance,...,york,yorker,youre,youth,youtube,youtuber,zone,zoo,zoologist,zoology
9568,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6925,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1762,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Hm, more weird stuff that remains after preprocessing according to this method

In [66]:
temp = pd.merge(tfDF, data, left_index = True, right_index = True)

In [67]:
temp.shape
temp.sample(2)

Unnamed: 0,abilities,ability,absolute,abuse,academia,academics,academy,accent,acceptable,acceptance,...,zoo,zoologist,zoology,professionals_id,professionals_headline,professionals_industry,answers_question_id,questions_id,questions_title,questions_body
5376,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6774ee9e8f9247728dbbe896726e911a,"MSN, RN, WCC, OMS, ONC- Medical Surgical Clini...",Hospital & Health Care,12b51fa9cf464c73896ed0b17c8fba23,12b51fa9cf464c73896ed0b17c8fba23,path order school nurse,junior school children school day education sc...
6306,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,aa916ae133d04bfdb7cdf610f7e7a745,Product Manager,Management Consulting,fdbc63ea39a342bc9dbcecd9ada526d6,fdbc63ea39a342bc9dbcecd9ada526d6,difference business majoring marketing,business marketing dont college business marke...


In [68]:
X = tfDF
y = data['professionals_industry']


In [69]:
X_train,X_test,y_train,y_test=train_test_split(X, y, test_size=0.25, random_state=2)
#imp = SimpleImputer(missing_values=np.nan, strategy='median')
#X = imp.fit_transform(X)


In [70]:
knn = KNeighborsClassifier(n_neighbors=31, weights='distance')
knn.fit(X_train,y_train)
y_pred = knn.predict(X_test)

In [72]:

print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred, average=None))
print("Recall:",metrics.recall_score(y_test, y_pred, average='micro'))

Accuracy: 0.1152
Precision: [0.         0.         0.         0.21782178 0.         0.
 0.         0.         0.         0.3        0.         0.
 0.5        0.         0.         0.5        0.         0.
 0.         0.         0.16666667 0.09876543 0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.19047619 0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.04545455 0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.21428571
 0.         0.         0.         0.         0.         0.
 0.1344086  0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.      

Oi vey, this does not look good. Looks like we have to try something else!