In [1]:
import json
import nltk
import pprint
import requests

from azure_cfg import ta_key

#nltk.download()

In [2]:
# set up global variables
subscription_key = ta_key

text_analytics_base_url = "https://westcentralus.api.cognitive.microsoft.com/text/analytics/v2.0/"
language_api_url = text_analytics_base_url + "languages"
sentiment_api_url = text_analytics_base_url + "sentiment"
key_phrase_api_url = text_analytics_base_url + "keyPhrases"

image_dir = '../images/'
train_image_dir = '../images/train/'
test_image_dir = '../images/test/'

In [3]:
# evaluate text "documents" using the Microsoft Azure Cognitive Services Text Analytics API
def analyze_text(documents):
    headers   = {"Ocp-Apim-Subscription-Key": subscription_key}
    
    # query API for language analysis
    response  = requests.post(language_api_url, headers=headers, json=documents)
    languages = response.json()
    
    # query API for sentiment analysis
    response  = requests.post(sentiment_api_url, headers=headers, json=documents)
    sentiments = response.json()
    
    # query API for key phrases extraction
    response  = requests.post(key_phrase_api_url, headers=headers, json=documents)
    key_phrases = response.json()
    
    return (languages, sentiments, key_phrases)

In [4]:
# conduct POS tagging and return counts of POS types
def pos_tag_text(documents):
    tag_counts = []
    for document in documents['documents']:
        text = nltk.word_tokenize(document['text'])
        tags = nltk.pos_tag(text)
        frequencies = nltk.FreqDist(tag for (word, tag) in tags)

        tag_count = {
            'ADJ': 0,
            'ADP': 0,
            'ADV': 0,
            'CONJ': 0,
            'DET': 0,
            'NOUN': 0,
            'NUM': 0,
            'PRT': 0,
            'PRON': 0,
            'VERB': 0,
            '.': 0,
            'X': 0
        }

        for tag, count in frequencies.most_common():
            tag_count[tag] = count
            
        tag_counts.append(tag_count)
        
    return tag_counts

In [5]:
# get the question documents from the training set
def get_questions():
    labels = json.load(open('../images/train.json'))
    
    documents = []
    for index, document in enumerate(labels):
        documents.append({
            'id': index,
            'text': document['question']
        })
    
    return {'documents': documents}

In [6]:
documents = get_questions()
pprint.pprint(documents)

{'documents': [{'id': 0, 'text': "What's the name of this product?"},
               {'id': 1, 'text': 'Can you tell me what is in this can please?'},
               {'id': 2,
                'text': 'Is this enchilada sauce or is this tomatoes?  Thank '
                        'you.'},
               {'id': 3, 'text': 'What is the captcha on this screenshot?'},
               {'id': 4, 'text': 'What is this item?'},
               {'id': 5, 'text': "What's this?"},
               {'id': 6, 'text': 'What is in this bottle?'},
               {'id': 7, 'text': 'This item.'},
               {'id': 8, 'text': 'What color do these look?'},
               {'id': 9, 'text': 'Surface look clean? Thank you.'},
               {'id': 10, 'text': 'Is this.'},
               {'id': 11,
                'text': 'What is the sodium content of this can of food?'},
               {'id': 12, 'text': 'what is this?'},
               {'id': 13, 'text': 'What kind of drink is this?'},
               {'id': 

In [7]:
languages, sentiments, key_phrases = analyze_text(documents)
languages, sentiments, key_phrases = languages['documents'], sentiments['documents'], key_phrases['documents']

tag_counts = pos_tag_text(documents)

In [16]:
features = []
for i in range(len(documents['documents'])):
    analysis = {
        'language': languages[i]['detectedLanguages'][0]['iso6391Name'],
        'sentiment': sentiments[i]['score'],
        'numKeyPhrases': len(key_phrases[i]['keyPhrases'])
    }
    features.append({**analysis, **tag_counts[i]})

In [18]:
from pandas import DataFrame
from IPython.display import display

df = DataFrame(data=features)
display(df)

Unnamed: 0,",",.,:,ADJ,ADP,ADV,CC,CD,CONJ,DET,...,VBP,VBZ,VERB,WDT,WP,WRB,X,language,numKeyPhrases,sentiment
0,,1,,0,0,0,,,0,0,...,,1.0,0,,1.0,,0,en,1,0.863549
1,,1,,0,0,0,,,0,0,...,,1.0,0,,1.0,,0,en,0,0.764013
2,,2,,0,0,0,1.0,,0,0,...,,2.0,0,,,,0,en,2,0.984945
3,,1,,0,0,0,,,0,0,...,,1.0,0,,1.0,,0,en,2,0.268598
4,,1,,0,0,0,,,0,0,...,,1.0,0,,1.0,,0,en,1,0.500000
5,,1,,0,0,0,,,0,0,...,,1.0,0,,1.0,,0,en,0,0.839447
6,,1,,0,0,0,,,0,0,...,,1.0,0,,1.0,,0,en,1,0.194486
7,,1,,0,0,0,,,0,0,...,,,0,,,,0,en,1,0.754812
8,,1,,0,0,0,,,0,0,...,1.0,,0,,1.0,,0,en,1,0.898420
9,,2,,0,0,0,,,0,0,...,,,0,,,,0,en,1,0.985198
