In [1]:
import csv
import itertools
import json
import os
import urllib

import requests

In [6]:
# ibmTest.py
# 
# This file tests all 11 classifiers using the NLClassifier IBM Service
# previously created using ibmTrain.py
# 
# TODO: You must fill out all of the functions in this file following 
# 		the specifications exactly. DO NOT modify the headers of any
#		functions. Doing so will cause your program to fail the autotester.
#
#		You may use whatever libraries you like (as long as they are available
#		on CDF). You may find json, requests, or pycurl helpful.
#		You may also find it helpful to reuse some of your functions from ibmTrain.py.
#

In [7]:
def get_classifier_ids(username, password):
    # Fetches the ids of every classifier registered with an NLClassifier
    # service instance.
    #
    # Inputs:
    #   username - username for the NLClassifier to be used, as a string
    #
    #   password - password for the NLClassifier to be used, as a string
    #
    # Returns:
    #   a list of classifier ids as strings, in the order the service
    #   lists them
    #
    # Error Handling:
    #   Raises an Exception when the service answers with a non-OK status.
    #

    endpoint = "https://gateway.watsonplatform.net/natural-language-classifier/api/v1/classifiers"

    reply = requests.get(endpoint, data={}, auth=(username, password))
    if not reply.ok:
        raise Exception("Classifier call failed.")

    # The JSON body holds one entry per classifier under 'classifiers'.
    return [entry['classifier_id'] for entry in reply.json()['classifiers']]

In [8]:
# Sanity check: list the classifier ids known to the service.
# NOTE: `username` and `password` are only assigned in a later cell (In [19]),
# so that cell has to be executed before this one on a fresh kernel.
get_classifier_ids(username=username, password=password)

[u'c7fa4ax22-nlc-1269', u'c7fa4ax22-nlc-1270', u'c7fa49x23-nlc-1182']

In [9]:
def assert_all_classifiers_are_available(username, password, classifier_id_list):
    # Verifies that every classifier named in classifier_id_list reports the
    # status 'Available'.
    #
    # Inputs:
    #   username - username for the NLClassifier to be used, as a string
    #
    #   password - password for the NLClassifier to be used, as a string
    #
    #   classifier_id_list - a list of classifier ids as strings
    #
    # Returns:
    #   None
    #
    # Error Handling:
    #   Raises an Exception as soon as a status request answers with a non-OK
    #   status, or as soon as a classifier reports any status other than
    #   'Available'. Classifiers after the first failure are not queried.
    #

    base_url = "https://gateway.watsonplatform.net/natural-language-classifier/api/v1/classifiers/"

    for classifier_id in classifier_id_list:
        reply = requests.get(base_url + classifier_id, data={}, auth=(username, password))
        if not reply.ok:
            raise Exception("Classifier call failed.")

        status = reply.json()['status']
        if status == "Available":
            continue

        # Python has no separate notion of "throwing an error", so a plain
        # Exception carrying the offending id and status is raised.
        raise Exception("Classifier {} is {}, not Available".format(classifier_id, status))

In [10]:
# Check that every classifier currently registered with the service is ready.
# NOTE: `username` and `password` are only assigned in a later cell (In [19]).
registered_ids = get_classifier_ids(username, password)
assert_all_classifiers_are_available(username, password, registered_ids)

In [11]:
def classify_single_text(username, password, classifier_id, text):
    # Classifies a given text using a single classifier from an NLClassifier
    # service
    #
    # Inputs:
    #   username - username for the NLClassifier to be used, as a string
    #
    #   password - password for the NLClassifier to be used, as a string
    #
    #   classifier_id - a classifier id, as a string
    #
    #   text - the text to be classified, either a unicode string or an
    #       already UTF-8 encoded byte string (which is what csv.reader
    #       yields on Python 2)
    #       ex. "Oh, look a tweet!"
    #
    # Returns:
    #   A "classification". Aka:
    #   a dictionary containing the top_class and the confidences of all the possible classes
    #   Format example:
    #       {'top_class': 'class_name',
    #        'classes': [
    #                     {'class_name': 'myclass', 'confidence': 0.999} ,
    #                     {'class_name': 'myclass2', 'confidence': 0.001}
    #                   ]
    #       }
    #   The decoded JSON body of the response is returned as-is.
    #
    # Error Handling:
    #   Raises an Exception if the service answers with a non-OK status.
    #   Errors raised by requests itself (connection problems, a body that is
    #   not JSON) propagate unchanged.
    #

    url = ("https://gateway.watsonplatform.net/natural-language-classifier/api/v1/classifiers/"
           + classifier_id + "/classify")

    # Hand the text to requests as a query parameter instead of building the
    # query string manually: requests takes care of the UTF-8 and percent
    # encoding. The former `text.encode('utf8')` made Python 2 implicitly
    # ASCII-decode byte strings first, which failed on any non-ASCII tweet
    # read from a csv file.
    response = requests.get(url, params={"text": text}, auth=(username, password))
    if not response.ok:
        raise Exception("Classifier call failed.")

    return response.json()

In [12]:
# Classify one sample tweet with a single classifier.
# NOTE(review): the original note ended in ",0", presumably the tweet's class label -- confirm.
sample_tweet = "@dittebb pulled a muscle in the shoulder. I sure hope it gets better soon."
resp = classify_single_text(username, password, "c7fa4ax22-nlc-1270", sample_tweet)

In [13]:
# Display the raw classification returned for the sample tweet above.
resp

{u'classes': [{u'class_name': u'0', u'confidence': 0.995990028092351},
  {u'class_name': u'4', u'confidence': 0.004009971907648813}],
 u'classifier_id': u'c7fa4ax22-nlc-1270',
 u'text': u'@dittebb pulled a muscle in the shoulder. I sure hope it gets better soon.',
 u'top_class': u'0',
 u'url': u'https://gateway.watsonplatform.net/natural-language-classifier/api/v1/classifiers/c7fa4ax22-nlc-1270'}

In [14]:
def classify_all_texts(username, password, input_csv_name):
    # Classifies all texts in an input csv file using all classifiers for a given NLClassifier
    # service.
    #
    # Inputs:
    #       username - username for the NLClassifier to be used, as a string
    #
    #       password - password for the NLClassifier to be used, as a string
    #
    #       input_csv_name - full path and name of an input csv file in the
    #              6 column format of the input test/training files
    #              (column 0 is the integer class, column 5 is the text)
    #
    # Returns:
    #       A dictionary of lists of "classifications".
    #       Each dictionary key is the id of a classifier, as returned by
    #       get_classifier_ids.
    #       Each dictionary value is a list of "classifications", one for each
    #       text, in order of the records in the input file, in the same
    #       format as returned by classify_single_text.
    #       Format example:
    #              {'classifiername':
    #                      [
    #                              {'top_class': 'class_name',
    #                              'classes': [
    #                                        {'class_name': 'myclass', 'confidence': 0.999} ,
    #                                        {'class_name': 'myclass2', 'confidence': 0.001}
    #                                         ]
    #                              },
    #                              {'top_class': 'class_name',
    #                              ...
    #                              }
    #                      ]
    #              , 'classifiername2':
    #                      [
    #                      ...
    #                      ]
    #              ...
    #              }
    #
    # Error Handling:
    #       Raises ValueError if a record of the csv file does not have exactly
    #       6 columns or if its class column is not an integer.
    #       Exceptions from get_classifier_ids and classify_single_text
    #       (failed service calls) propagate unchanged.
    #

    id_list = get_classifier_ids(username, password)
    return_dict = {c_id: [] for c_id in id_list}

    with open(input_csv_name, "r") as _input:
        for record_number, row in enumerate(csv.reader(_input), 1):
            # Validate the record explicitly so a malformed file is reported
            # with its location instead of surfacing as a bare IndexError.
            if len(row) != 6:
                raise ValueError("{}: record {} has {} columns, expected 6".format(
                    input_csv_name, record_number, len(row)))
            try:
                int(row[0])
            except ValueError:
                raise ValueError("{}: record {} has a non-integer class label {!r}".format(
                    input_csv_name, record_number, row[0]))

            tweet = row[5].strip()
            for c_id in id_list:
                return_dict[c_id].append(
                    classify_single_text(username, password, c_id, tweet))

    return return_dict

In [15]:
def compute_accuracy_of_single_classifier(classifier_dict, input_csv_file_name):
    # Given a list of "classifications" for a given classifier, compute the accuracy of this
    # classifier according to the input csv file
    #
    # Inputs:
    #   classifier_dict - A list of "classifications". Aka:
    #       A list of dictionaries, one for each text, in order of lines in the
    #       input file. Each element is a dictionary containing the top_class
    #       and the confidences of all the possible classes (ie the same
    #       format as returned by classify_single_text)
    #       Format example:
    #           [
    #               {'top_class': 'class_name',
    #                'classes': [
    #                           {'class_name': 'myclass', 'confidence': 0.999} ,
    #                           {'class_name': 'myclass2', 'confidence': 0.001}
    #                           ]
    #               },
    #               {'top_class': 'class_name',
    #               ...
    #               }
    #           ]
    #
    #   input_csv_file_name - full path and name of an input csv file in the
    #       6 column format of the input test/training files; only column 0
    #       (the integer class) is read here
    #
    # Returns:
    #   The accuracy of the classifier, as a fraction between [0.0-1.0] (ie percentage/100):
    #   the number of texts whose top_class equals the class in the csv file,
    #   divided by the number of texts.
    #
    # Error Handling:
    #   Raises ValueError if the csv file and classifier_dict do not contain
    #   the same number of entries, if there are no entries at all, or if a
    #   class value is not an integer. An empty csv record raises IndexError.
    #

    with open(input_csv_file_name, "r") as _input:
        true_classes = [int(row[0]) for row in csv.reader(_input)]

    # A length mismatch means the classifications do not belong to this file;
    # computing a ratio anyway would silently use the wrong denominator.
    if len(true_classes) != len(classifier_dict):
        raise ValueError("{} has {} records but {} classifications were given".format(
            input_csv_file_name, len(true_classes), len(classifier_dict)))
    if not true_classes:
        raise ValueError("{} contains no records".format(input_csv_file_name))

    num_correct = sum(
        1 for tweet_class, classification in zip(true_classes, classifier_dict)
        if int(classification['top_class']) == tweet_class)

    return num_correct / float(len(true_classes))

In [18]:
def compute_average_confidence_of_single_classifier(classifier_dict, input_csv_file_name):
    # Given a list of "classifications" for a given classifier, compute the average
    # confidence of this classifier wrt the selected class, according to the input
    # csv file.
    #
    # Inputs:
    #   classifier_dict - A list of "classifications". Aka:
    #       A list of dictionaries, one for each text, in order of lines in the
    #       input file. Each element is a dictionary containing the top_class
    #       and the confidences of all the possible classes (ie the same
    #       format as returned by classify_single_text)
    #       Format example:
    #           [
    #               {'top_class': 'class_name',
    #                'classes': [
    #                           {'class_name': 'myclass', 'confidence': 0.999} ,
    #                           {'class_name': 'myclass2', 'confidence': 0.001}
    #                           ]
    #               },
    #               {'top_class': 'class_name',
    #               ...
    #               }
    #           ]
    #
    #   input_csv_file_name - full path and name of an input csv file in the
    #       6 column format of the input test/training files; only column 0
    #       (the integer class) is read here
    #
    # Returns:
    #   A two element list [incorrect_average, correct_average]: the average
    #   confidence of the selected class over the texts that were classified
    #   incorrectly and over the texts that were classified correctly.
    #   A group without any text has the average 0.0.
    #
    # Error Handling:
    #   Raises ValueError if the csv file and classifier_dict do not contain
    #   the same number of entries, if there are no entries at all, or if a
    #   class value is not an integer. An empty csv record raises IndexError.
    #

    with open(input_csv_file_name, "r") as _input:
        true_classes = [int(row[0]) for row in csv.reader(_input)]

    if len(true_classes) != len(classifier_dict):
        raise ValueError("{} has {} records but {} classifications were given".format(
            input_csv_file_name, len(true_classes), len(classifier_dict)))
    if not true_classes:
        raise ValueError("{} contains no records".format(input_csv_file_name))

    # Index 0 collects the incorrect guesses, index 1 the correct ones.
    confidence_sums = [0.0, 0.0]
    total_number_seen = [0, 0]

    for tweet_class, classification in zip(true_classes, classifier_dict):
        # The first entry of 'classes' is taken as the selected class.
        # NOTE(review): assumes the service lists classes by descending
        # confidence -- confirm against the service documentation.
        most_conf_class = classification["classes"][0]
        bucket = int(int(most_conf_class["class_name"]) == tweet_class)

        confidence_sums[bucket] += most_conf_class["confidence"]
        total_number_seen[bucket] += 1

    # A classifier that is always right (or always wrong) leaves one group
    # empty; report 0.0 for it instead of dividing by zero.
    return [total / float(count) if count else 0.0
            for total, count in zip(confidence_sums, total_number_seen)]

### Main

In [19]:
input_test_data = 'datasets/testdata.manualSUBSET.2009.06.14.csv'

# Service credentials are read from the environment so that they never end up
# in the notebook file. Set NLC_USERNAME and NLC_PASSWORD before starting the
# kernel; a missing variable raises KeyError here.
# NOTE(review): credentials that were previously stored in this cell in plain
# text should be considered leaked and be rotated.
username = os.environ["NLC_USERNAME"]
password = os.environ["NLC_PASSWORD"]

#STEP 1: Ensure all classifiers are ready for testing
# A failure is reported but does not stop the notebook.
try:
    assert_all_classifiers_are_available(username, password, get_classifier_ids(username, password))
except Exception as e:
    print("Error: {}".format(e))

In [21]:
# STEP 2: run every text of the test data through every classifier.
# On failure the error is printed and `classd_dict` is left unassigned.
try:
    classd_dict = classify_all_texts(username, password, input_test_data)
except Exception as e:
    print("Error: {}".format(e))

In [22]:
# 500
# First classification produced by classifier c7fa49x23-nlc-1182.
# NOTE(review): "500" presumably is the training-set size of this classifier -- confirm.
classd_dict['c7fa49x23-nlc-1182'][0]

{u'classes': [{u'class_name': u'0', u'confidence': 0.9918643770825553},
  {u'class_name': u'4', u'confidence': 0.008135622917444761}],
 u'classifier_id': u'c7fa49x23-nlc-1182',
 u'text': u'@stellargirl I loooooooovvvvvveee my Kindle2. Not that the DX is cool, but the 2 is fantastic in its own right.',
 u'top_class': u'0',
 u'url': u'https://gateway.watsonplatform.net/natural-language-classifier/api/v1/classifiers/c7fa49x23-nlc-1182'}

In [23]:
# 2500
# First classification produced by classifier c7fa4ax22-nlc-1269.
# NOTE(review): "2500" presumably is the training-set size of this classifier -- confirm.
classd_dict["c7fa4ax22-nlc-1269"][0]

{u'classes': [{u'class_name': u'0', u'confidence': 0.5941090107927917},
  {u'class_name': u'4', u'confidence': 0.4058909892072084}],
 u'classifier_id': u'c7fa4ax22-nlc-1269',
 u'text': u'@stellargirl I loooooooovvvvvveee my Kindle2. Not that the DX is cool, but the 2 is fantastic in its own right.',
 u'top_class': u'0',
 u'url': u'https://gateway.watsonplatform.net/natural-language-classifier/api/v1/classifiers/c7fa4ax22-nlc-1269'}

In [24]:
# 5000
# First classification produced by classifier c7fa4ax22-nlc-1270.
# NOTE(review): "5000" presumably is the training-set size of this classifier -- confirm.
classd_dict["c7fa4ax22-nlc-1270"][0]

{u'classes': [{u'class_name': u'0', u'confidence': 0.9695191215271693},
  {u'class_name': u'4', u'confidence': 0.030480878472830677}],
 u'classifier_id': u'c7fa4ax22-nlc-1270',
 u'text': u'@stellargirl I loooooooovvvvvveee my Kindle2. Not that the DX is cool, but the 2 is fantastic in its own right.',
 u'top_class': u'0',
 u'url': u'https://gateway.watsonplatform.net/natural-language-classifier/api/v1/classifiers/c7fa4ax22-nlc-1270'}

In [25]:
#STEP 3: Compute the accuracy for each classifier
#STEP 4: Compute the confidence of each class for each classifier
#
# The classifier ids are fetched from the service again and used as keys into
# classd_dict. A metric that raises is reported and skipped, so the two result
# lists may end up shorter than the list of classifiers.

per_classifier_acc = []
per_classifier_confidence = [] # one [incorrect, correct] average-confidence pair per classifier
for c_id in get_classifier_ids(username, password):
    per_classifier_list = classd_dict[c_id]
    
    print "Classifier: ", c_id

    try:
        acc = compute_accuracy_of_single_classifier(per_classifier_list, input_test_data)
    except Exception as e:
        print "Error: {}".format(e)
    else:
        # Only record and print the value when the computation succeeded.
        per_classifier_acc.append(acc)
        print "Accuracy: ", acc

    try:
        conf = compute_average_confidence_of_single_classifier(per_classifier_list, input_test_data)
    except Exception as e:
        print "Error: {}".format(e)
    else:
        per_classifier_confidence.append(conf)
        print "Confidence: ", conf

Classifier:  c7fa4ax22-nlc-1269
Accuracy:  0.701949860724
Confidence:  [0.8783337808603239, 0.94176920671881]
Classifier:  c7fa4ax22-nlc-1270
Accuracy:  0.701949860724
Confidence:  [0.8598635146732335, 0.9264322311912503]
Classifier:  c7fa49x23-nlc-1182
Accuracy:  0.66573816156
Confidence:  [0.8672577321084317, 0.9157234826881114]


In [147]:
# 2500, 5000, 500
# NOTE(review): the numbers above presumably label the returned ids, in order,
# with the training-set size of each classifier -- confirm.
get_classifier_ids(username, password)

[u'c7fa4ax22-nlc-1269', u'c7fa4ax22-nlc-1270', u'c7fa49x23-nlc-1182']

In [153]:
# Accuracies collected by the STEP 3 loop, one per classifier whose
# computation succeeded, in the order the classifiers were processed.
per_classifier_acc

[0.7019498607242339, 0.7019498607242339, 0.6657381615598886]

In [154]:
# Incorrect, correct
# Each entry is the pair returned by
# compute_average_confidence_of_single_classifier: the average confidence over
# the incorrectly classified texts, then over the correctly classified ones.
per_classifier_confidence

[[0.8783337808603239, 0.94176920671881],
 [0.8598635146732335, 0.9264322311912503],
 [0.8672577321084317, 0.9157234826881114]]

In [132]:
# Shorthand names for the classification lists of two classifiers.
# NOTE(review): the names suggest training-set sizes of 2500 and 5000 -- confirm.
c2500 = classd_dict['c7fa4ax22-nlc-1269']
c5000 = classd_dict['c7fa4ax22-nlc-1270']

In [142]:
# Debugging aid: walk through the test file and compare, record by record, the
# true class with the top_class chosen by the two classifiers selected above.
# c25_count / c50_count are running totals of correct guesses for c2500 / c5000
# (a True comparison adds 1).
c25_count = 0
c50_count = 0
with open('datasets/testdata.manualSUBSET.2009.06.14.csv', "r") as _input:
    reader = csv.reader(_input)
    # islice(reader, 0, None) yields every record; it skips nothing.
    for i, row in enumerate(itertools.islice(reader, 0, None)):
        c25_count += int(row[0]) == int(c2500[i]["top_class"])
        c50_count += int(row[0]) == int(c5000[i]["top_class"])
        # Columns printed: whether the last csv column equals the text echoed
        # for c5000, true class, c2500 guess, c5000 guess, and both totals.
        print row[-1] == c5000[i]['text'], row[0], c2500[i]["top_class"], c5000[i]["top_class"], c25_count, c50_count

True 4 0 0 0 0
True 4 4 4 1 1
True 4 4 4 2 2
True 4 0 0 2 2
True 4 0 0 2 2
True 4 4 4 3 3
True 0 0 4 4 3
True 4 4 4 5 4
True 4 4 4 6 5
True 4 4 4 7 6
True 0 0 0 8 7
True 4 0 4 8 8
True 4 4 4 9 9
True 0 0 4 10 9
True 4 0 0 10 9
True 0 0 4 11 9
True 4 4 4 12 10
True 0 0 0 13 11
True 4 0 0 13 11
True 4 4 4 14 12
True 4 4 4 15 13
True 4 4 0 16 13
True 4 4 4 17 14
True 4 0 4 17 15
True 4 4 4 18 16
True 4 0 4 18 17
True 4 4 4 19 18
True 4 4 4 20 19
True 4 4 4 21 20
True 4 4 4 22 21
True 0 4 0 22 22
True 0 4 4 22 22
True 0 0 4 23 22
True 0 0 0 24 23
True 0 0 0 25 24
True 4 4 4 26 25
True 4 4 0 27 25
True 4 4 4 28 26
True 4 4 4 29 27
True 0 0 0 30 28
True 0 0 0 31 29
True 0 4 4 31 29
True 0 4 4 31 29
True 0 0 0 32 30
True 0 4 4 32 30
True 4 0 4 32 31
True 0 4 4 32 31
True 4 4 4 33 32
True 4 4 4 34 33
True 4 4 0 35 33
True 0 0 0 36 34
True 4 4 4 37 35
True 4 4 4 38 36
True 0 0 4 39 36
True 0 0 0 40 37
True 4 4 4 41 38
True 4 4 4 42 39
True 4 4 4 43 40
True 0 0 0 44 41
True 4 0 0 44 41
True 4 4 

In [155]:
# Debugging aid: for every record of the test file print whether the first
# listed class of c2500 matches the true class, together with its confidence.
# NOTE(review): c25_conf and c50_conf are initialised but never updated, and
# class1 / conf1 are assigned but never used in this cell.
c25_conf = 0
c50_conf = 0
with open('datasets/testdata.manualSUBSET.2009.06.14.csv', "r") as _input:
    reader = csv.reader(_input)
    for i, row in enumerate(itertools.islice(reader, 0, None)):
        actual_class = int(row[0])
        
        # First listed class and its confidence.
        class0 = c2500[i]["classes"][0]["class_name"]
        conf0 = c2500[i]["classes"][0]["confidence"]
        
        # Second listed class and its confidence.
        class1 = c2500[i]["classes"][1]["class_name"]
        conf1 = c2500[i]["classes"][1]["confidence"]

        # Abandoned alternative that looked the selected class up by name:
        #selected_index = next(index for (index, d) in enumerate(conf_list) if d["class_name"] == top_class)
        #confidence_sums[tweet_class / 4] += conf_list[selected_index]["confidence"]
        
        print actual_class == int(class0), conf0

False 0.594109010793
True 0.833372883793
True 0.994297037853
False 0.925776787261
False 0.761682483917
True 0.964038764622
True 0.675257328087
True 0.994124253741
True 0.957353384661
True 0.994521193364
True 0.861516535924
False 0.782268259084
True 0.995863140786
True 0.631433517035
False 0.609723912169
True 0.986952520095
True 0.974833775911
True 0.962148141958
False 0.879201303981
True 0.962473580512
True 0.978490771472
True 0.789603688858
True 0.975537957902
False 0.995356939368
True 0.995692838113
False 0.969377728616
True 0.911339422317
True 0.826478933656
True 0.974684945777
True 0.954754465685
False 0.949260967649
False 0.973851763333
True 0.996336492761
True 0.993791642773
True 0.707637073163
True 0.9710868746
True 0.990851804916
True 0.96842419971
True 0.741974467497
True 0.995890526921
True 0.995673578881
False 0.972642985095
False 0.604265443469
True 0.702149115612
False 0.833599665794
False 0.967926334309
False 0.950811428738
True 0.973141113436
True 0.977797724388
True 0.8