# Extract Raw Attachments Into JSON

Format
```
[
    {
        "content": "Mr. Kunapot Pairat\nExpected Jobs : Environmental Assistant Manager\n",
        "type": "resume", 
        "name": "resume kunapot 1.docx"
    },
    {
        "content": "48/214 \u0e2d\u0e40\u0e04\u0e07\u0e08\u0e31\u0e04\u0e25\u0e19\u0e48\u0e2b\u0e30\n",
        "type": "resume", 
        "name": "\u0e32\u0e22\u0e08\u0e14\u0e2b\u0e21\u0e31\u0e21\u0e04\u0e23\u0e07\u0e2a.pdf"
    
    }
]
```

In [18]:
import os, json, subprocess
import textract

def readDocByFileName(filename):
    readtext = subprocess.Popen("antiword -m UTF-8.txt \"" + filename + "\"", shell=True, stdout=subprocess.PIPE).stdout.read()
    return readtext

path = '/notebooks/data/raw/'
folders = os.listdir(path)

attachments = []
for folder in folders:
    directory = path + folder
    for root, directories, files in os.walk(directory):
        for fileName in files:
            filePath = os.path.join(root, fileName)
            fileExtension = filePath.split('.')[-1]
            
            if fileExtension == 'pdf':
                try:
                    attachments.append({ "name": fileName, "type": folder, "content": textract.process(filePath, method='pdftotext') })
                except:
                    print 'Cannot read .PDF file.'
                    pass
            elif fileExtension == 'docx':
                attachments.append({ "name": fileName, "type": folder, "content": textract.process(filePath) })
            elif fileExtension == 'doc':
                attachments.append({ "name": fileName, "type": folder, "content": readDocByFileName(path + folder + '/' + fileName) })

with open('/notebooks/data/attachments.json', 'w') as outfile:
    json.dump(attachments, outfile, indent=4)


Cannot read .PDF file.
Cannot read .PDF file.
Cannot read .PDF file.
Cannot read .PDF file.
Cannot read .PDF file.
Cannot read .PDF file.
Cannot read .PDF file.
Cannot read .PDF file.
Cannot read .PDF file.
Cannot read .PDF file.
Cannot read .PDF file.
Cannot read .PDF file.
Cannot read .PDF file.
Cannot read .PDF file.
Cannot read .PDF file.
Cannot read .PDF file.
Cannot read .PDF file.
Cannot read .PDF file.
Cannot read .PDF file.
Cannot read .PDF file.
Cannot read .PDF file.
Cannot read .PDF file.
Cannot read .PDF file.
Cannot read .PDF file.
Cannot read .PDF file.
Cannot read .PDF file.
Cannot read .PDF file.
Cannot read .PDF file.
Cannot read .PDF file.
Cannot read .PDF file.
Cannot read .PDF file.
Cannot read .PDF file.
Cannot read .PDF file.
Cannot read .PDF file.
Cannot read .PDF file.


# Word Segmentation

In [21]:
import json
import tnthai.segment as tn

with open('/notebooks/data/attachments.json', 'r') as file:
    attachments = json.load(file)
    segmented = []

    for attachment in attachments:
        content = attachment['content'].encode('utf-8')
        term = tn.UnsafeSegment(content)
        segmented.append(term[1][0])

with open('/notebooks/data/segments.json', 'w') as outfile:
    json.dump(segmented, outfile, indent=4)

# Word Count

In [3]:
import json
from collections import Counter

wordList = []

with open('/notebooks/data/segments.json', 'r') as file:
    segments = json.load(file)
    for words in segments:
        for word in words:
            wordList.append(word)

counter = Counter(wordList)
print counter



# X Y

In [7]:
import json

with open('/notebooks/data/segments.json', 'r') as file:
    segments = json.load(file)

with open('/notebooks/data/attachments.json', 'r') as file:
    attachments = json.load(file)

x = []
y = []
for words in segments:
    x.append(" ".join(words))


for attachment in attachments:
    y.append(attachment["type"])

print len(x)
print len(y)

with open('/notebooks/data/x.json', 'w') as outfile:
    json.dump(x, outfile, indent=4)

with open('/notebooks/data/y.json', 'w') as outfile:
    json.dump(y, outfile, indent=4)

4969
4969


# Classification Models

In [None]:
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.metrics import confusion_matrix, recall_score, precision_score, classification_report, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.externals import joblib

with open('/notebooks/data/x.json', 'r') as file:
    x = json.load(file)

with open('/notebooks/data/y.json', 'r') as file:
    y = json.load(file)
    
vectorizer = TfidfVectorizer()
Xtrain = vectorizer.fit_transform(x).toarray()
Ytrain = np.asarray(y)

clftextfile = {
    'MultinomialNB': MultinomialNB(alpha=0.01),
    'GaussianNB': GaussianNB(),
    'BernoulliNB': BernoulliNB(alpha=0.01) ,
}

modelMultinomialNB = clftextfile["MultinomialNB"]
YmodelMultinomialNBPredict = modelMultinomialNB.fit(Xtrain, Ytrain).predict(Xtrain)
# joblib.dump(modelMultinomialNB.fit(Xtrain, Ytrain), 'modelMultinomialNB.pkl') 

# modelGaussianNB = clftextfile["GaussianNB"]
# # YmodelGaussianNBPredict = modelGaussianNB.fit(Xtrain, Ytrain).predict(Xtrain)
# joblib.dump(modelGaussianNB.fit(Xtrain, Ytrain), 'modelGaussianNB.pkl') 

# modelBernoulliNB = clftextfile["BernoulliNB"]
# # YmodelBernoulliNBPredict = modelBernoulliNB.fit(Xtrain, Ytrain).predict(Xtrain)
# joblib.dump(modelBernoulliNB.fit(Xtrain, Ytrain), 'modelBernoulliNB.pkl') 

print confusion_matrix(Ytrain, YmodelMultinomialNBPredict, labels=list(set(Ytrain)))
# print confusion_matrix(Ytrain, YmodelGaussianNBPredict, labels=list(set(Ytrain)))
# print confusion_matrix(Ytrain, YmodelBernoulliNBPredict, labels=list(set(Ytrain)))

print classification_report(Ytrain, YmodelMultinomialNBPredict)
# print classification_report(Ytrain, YmodelGaussianNBPredict)
# print classification_report(Ytrain, YmodelBernoulliNBPredict)

print cross_val_score(modelMultinomialNB, Xtrain, Ytrain, cv=10)
# print cross_val_score(modelGaussianNB, Xtrain, Ytrain, cv=10)
# print cross_val_score(modelBernoulliNB, Xtrain, Ytrain, cv=10)

[[1157    0  202    0    0   38    5    6   12]
 [   0  200    0    0    0    0    0    0    0]
 [  43    0 1660    0    0    2   11   10    1]
 [   0    0    1  808    0    0    0    0    0]
 [   0    0   27    0  305    0    0    0    0]
 [   2    0   14    0    0   74    0    0    0]
 [   0    0  123    0    0    0   40    0    0]
 [   0    0    4    0    0    2    0  160    0]
 [   0    0    2    0    0    0    0    0   60]]
                precision    recall  f1-score   support

            cv       0.91      0.96      0.94       166
         other       0.82      0.96      0.88      1727
        resume       0.96      0.81      0.88      1420
    transcript       0.71      0.25      0.37       163
ประวัติส่วนตัว       0.82      0.97      0.89        62
          ราคา       1.00      1.00      1.00       200
      สมัครงาน       0.64      0.82      0.72        90
      หลักสูตร       1.00      1.00      1.00       809
          อบรม       1.00      0.92      0.96       332

   av