In [4]:
from textblob import TextBlob
import json


def create_training(word_list, article_files, output_file):
    """Build a positive/negative sentence training set and save it as JSON.

    Each article is read as UTF-8 and split into sentences with TextBlob.
    A sentence is "positive" when it contains at least one entry of
    ``word_list`` and "negative" otherwise.  Every distinct sentence is
    stored once, in first-seen order.  Matching is case-sensitive.

    Args:
        word_list: Iterable of trigger terms.  A single-word entry is
            compared against the sentence's tokens.  An entry containing
            whitespace (for example ``'in line with'``) matches when its
            words appear as consecutive tokens of the sentence.
        article_files: Iterable of paths to UTF-8 encoded text files.
        output_file: Path of the JSON file to write.  It receives an object
            with the keys ``positive_training`` and ``negative_training``,
            each holding a list of sentence strings.

    Side effects:
        Writes ``output_file``, prints a line for every sentence that could
        not be processed, and prints ``Done!`` when finished.
    """
    # Separate single words from phrases: a phrase can never equal one token,
    # so it has to be matched against the joined token sequence instead.
    single_terms = set()
    phrase_terms = []
    for term in word_list:
        pieces = term.split()
        if len(pieces) > 1:
            # Pad with spaces so the phrase only matches on token boundaries.
            phrase_terms.append(' ' + ' '.join(pieces) + ' ')
        else:
            single_terms.add(term)

    positive_training = []
    negative_training = []
    # Sets mirror the lists so the duplicate check does not rescan the list
    # for every sentence.
    seen_positive = set()
    seen_negative = set()

    for article in article_files:
        # Read raw bytes and decode explicitly; the file is closed before the
        # (potentially slow) sentence processing starts.
        with open(article, 'rb') as article_file:
            article_text = article_file.read().decode('utf-8')

        for sentence in TextBlob(article_text).sentences:
            sentence_text = sentence.string

            try:
                tokens = [word.string for word in sentence.words]
                is_positive = any(token in single_terms for token in tokens)
                if not is_positive and phrase_terms:
                    padded_tokens = ' ' + ' '.join(tokens) + ' '
                    is_positive = any(
                        phrase in padded_tokens for phrase in phrase_terms)
            except Exception:
                # Best effort: report the sentence and keep going.  repr()
                # keeps the report itself from failing on non-ASCII text.
                print('Skipped sentence: {!r}'.format(sentence_text))
                continue

            if is_positive:
                bucket, seen = positive_training, seen_positive
            else:
                bucket, seen = negative_training, seen_negative

            if sentence_text not in seen:
                seen.add(sentence_text)
                bucket.append(sentence_text)

    with open(output_file, 'w') as training_file:
        training_output = {
            "positive_training": positive_training,
            "negative_training": negative_training
        }
        json.dump(training_output, training_file)

    print('Done!')

In [5]:
import json
from textblob import TextBlob
from textblob.classifiers import PositiveNaiveBayesClassifier

def train_positive_classifier(training_file_name, probability_value):
    """Train a PositiveNaiveBayesClassifier from a saved training file.

    Args:
        training_file_name: Path to a JSON file holding an object with the
            keys ``positive_training`` and ``negative_training``.
        probability_value: Value forwarded to the classifier as
            ``positive_prob_prior``.

    Returns:
        The classifier built from the two sentence lists.

    Raises:
        KeyError: If either expected key is missing from the file.
    """
    with open(training_file_name, 'r') as handle:
        samples = json.load(handle)

    # The positive list is the first argument and the negative list the second.
    return PositiveNaiveBayesClassifier(
        samples['positive_training'],
        samples['negative_training'],
        positive_prob_prior=probability_value
    )
    

In [30]:
# Article text files that supply the training sentences.
articles = ['article1.txt', 'article2.txt']

# Trigger words: a sentence containing any of them is filed as a positive
# example for the corresponding category.
increase_words = [
    'increase', 'increased', 'increases', 'increasing',
    'expand', 'expanded', 'expanding',
    'jumped', 'grew', 'grown', 'higher', 'growth', 'growing', 'rose',
    'boost', 'boosted', 'boosts', 'above', 'winning',
]
# 'disappointing' and 'dropping' were previously misspelled ('dissapointing',
# 'droppping') and therefore could never match correctly spelled article text.
decrease_words = [
    'lower', 'lowered', 'miss',
    'decrease', 'decreased', 'decreases',
    'shrunk', 'shrank', 'shrinking',
    'drop', 'dropped', 'dropping',
    'falling', 'below', 'fell', 'fall', 'fallen',
    'losing', 'disappointing',
    'plunge', 'plunges', 'plunging',
]
# NOTE(review): 'in line with' is a multi-word entry. Whether it can match
# depends on how create_training compares terms; a token-by-token comparison
# will never match it.
consistent_words = [
    'consistent', 'in line with', 'equal', 'about', 'similar', 'around', 'near',
]

# One training file per category.
increase_training_filename = 'increase_training.json'
decrease_training_filename = 'decrease_training.json'
consistent_training_filename = 'consistent_training.json'

create_training(increase_words, articles, increase_training_filename)
create_training(decrease_words, articles, decrease_training_filename)
create_training(consistent_words, articles, consistent_training_filename)

Done!
Done!
Done!


In [31]:
# Train one classifier per category, all with the same positive-class prior.
initial_positive_prior = 0.01

test_increase_classifier = train_positive_classifier(
    increase_training_filename, initial_positive_prior)
test_decrease_classifier = train_positive_classifier(
    decrease_training_filename, initial_positive_prior)
test_consistent_classifier = train_positive_classifier(
    consistent_training_filename, initial_positive_prior)

In [32]:
# Show each classifier's repr.
# A single parenthesised argument prints identically under the Python 2
# print statement.
print(repr(test_increase_classifier))
print(repr(test_decrease_classifier))
print(repr(test_consistent_classifier))

<PositiveNaiveBayesClassifier trained on 105 labeled and 588 unlabeled instances>
<PositiveNaiveBayesClassifier trained on 70 labeled and 623 unlabeled instances>
<PositiveNaiveBayesClassifier trained on 41 labeled and 652 unlabeled instances>


In [37]:
# Classify one hand-written sentence with each of the three classifiers.
test_string = 'revenue and earnings were unchanged'

print('Increase Classifier: {}'.format(
    test_increase_classifier.classify(test_string)))
print('Decrease Classifier: {}'.format(
    test_decrease_classifier.classify(test_string)))
print('Consistent Classifier: {}'.format(
    test_consistent_classifier.classify(test_string)))

Increase Classifier: False
Decrease Classifier: True
Consistent Classifier: False


In [40]:
def optimize_classifier(training_file, eval_file, weight_values=None):
    '''
    Report classifier accuracy on an evaluation set for a range of priors.

    For every candidate weight a classifier is trained from ``training_file``
    with that weight as its positive-class prior, then each evaluation item
    is classified and compared with its expected value.  One summary line is
    printed per weight, followed by the ids of the failed items when any
    failed item carries an id.

    Args:
        training_file: Path passed to ``train_positive_classifier``,
            e.g. 'training_data.json'.
        eval_file: Path to a JSON file containing a list of items shaped like
            {'text': 'sample test text', 'value': True}.  An optional 'id'
            key is used to identify the item in the failure report.
        weight_values: Optional iterable of priors to try.  Defaults to
            0.00, 0.01, ..., 0.29.

    Returns:
        None.  Results are printed only.
    '''
    with open(eval_file, 'r') as eval_data:
        test_set = json.load(eval_data)

    if weight_values is None:
        weight_values = [round(step * .01, 2) for step in range(30)]

    for weight_value in weight_values:

        test_classifier = train_positive_classifier(training_file, weight_value)
        error_count = 0
        correct_count = 0
        error_ids = []

        for test_item in test_set:

            if test_classifier.classify(test_item['text']) != test_item['value']:
                error_count += 1
                # Compare against None so that an id of 0 is still reported,
                # and stringify so that numeric ids can be joined below.
                item_id = test_item.get('id')
                if item_id is not None:
                    error_ids.append(str(item_id))
            else:
                correct_count += 1

        print('For Weight:{}  - Got {} Failures and {} Correct'.format(
            weight_value, error_count, correct_count))
        if error_ids:
            print('Failed on test ids {}'.format(', '.join(error_ids)))

In [42]:
# Sweep the prior for the "increase" classifier against its evaluation set.
optimize_classifier(
    training_file='increase_training.json',
    eval_file='increase_eval.json'
)

For Weight:0.0  - Got 5 Failures and 6 Correct
Failed on test ids 1, 3, 5, 6, 10
For Weight:0.01  - Got 4 Failures and 7 Correct
Failed on test ids 6, 7, 9, 10
For Weight:0.02  - Got 3 Failures and 8 Correct
Failed on test ids 6, 7, 9
For Weight:0.03  - Got 3 Failures and 8 Correct
Failed on test ids 6, 7, 9
For Weight:0.04  - Got 3 Failures and 8 Correct
Failed on test ids 6, 7, 9
For Weight:0.05  - Got 3 Failures and 8 Correct
Failed on test ids 6, 7, 9
For Weight:0.06  - Got 3 Failures and 8 Correct
Failed on test ids 6, 7, 9
For Weight:0.07  - Got 2 Failures and 9 Correct
Failed on test ids 7, 9
For Weight:0.08  - Got 3 Failures and 8 Correct
Failed on test ids 7, 8, 9
For Weight:0.09  - Got 3 Failures and 8 Correct
Failed on test ids 7, 8, 9
For Weight:0.1  - Got 3 Failures and 8 Correct
Failed on test ids 7, 8, 9
For Weight:0.11  - Got 3 Failures and 8 Correct
Failed on test ids 7, 8, 9
For Weight:0.12  - Got 3 Failures and 8 Correct
Failed on test ids 7, 8, 9
For Weight:0.13  - 