In [1]:
# Run this every time you open the spreadsheet
% load_ext autoreload
% autoreload 2
import lib


# Load the data and our rule-based classifier

In [2]:
# Read the labelled tweets.
# lib.read_data returns two lists of tweets, stored here as "tweets" and "test_tweets".
tweets, test_tweets = lib.read_data(
    train_path="../data/labeled-data-singlelabels-train.csv",
    test_path="../data/labeled-data-singlelabels-test.csv",
)



In [3]:
def classify_rb(tweet):
    """Assign a tweet to a category using simple keyword rules.

    The tweet is converted to a lower-case string, then the rules are tried in
    order; the first rule with a keyword appearing anywhere in the text wins.

    Inputs:
        tweet: the tweet to classify (it is converted with str() first)
    Returns:
        "Medical", "Energy", "Water" or "Food", or "None" if no keyword matches.
    """
    text = str(tweet).lower()  # lower-case so keyword matching ignores case

    # (category, keywords) pairs, listed in priority order
    rules = [
        ("Medical", ("medicine", "first aid")),
        ("Energy", ("power", "battery")),
        ("Water", ("water", "bottled")),
        ("Food", ("food", "perishable", "canned")),
    ]

    for category, keywords in rules:
        if any(keyword in text for keyword in keywords):
            return category

    # No rule fired
    return "None"


# Python refresher

Let's review some Python concepts before we write our evaluation code.

### Lists
In Python, a _list_ is an ordered collection of items. The items can be strings, numbers, booleans, or any other kind of Python object. 

You can create lists like this:
```
integer_list = [5, 6, 7, 8]
string_list = ['hello', 'world']
bool_list = [False, True, False, False, True]
```

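If you want a list of the numbers from 0 up to (but not including) 10, you can use the `range` function. In Python 3, `range` produces a range object rather than a list, so wrap it in `list()`:
```
upto10_list = list(range(10))
```
This gives you [0, 1, 2, 3, 4, 5, 6, 7, 8, 9].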

In [4]:
# Exercise 1(a).
# Create a list called "my_numbers" that contains the numbers from 0 to 6 (inclusive) and then print it
# The list is written out element by element here; Exercise 1(b) builds the same list with range().
my_numbers = [0, 1, 2, 3, 4, 5, 6]
print(my_numbers)


[0, 1, 2, 3, 4, 5, 6]


In [5]:
# Exercise 1(b).
# Now use the range() function to create "my_numbers", and print the result.
# It should match the previous cell.
# Hint: look carefully at the range(10) example above.
# range(7) stops just before 7, so the last number is 6; list() turns the range into a list.
my_numbers = list(range(7))
print(my_numbers)


[0, 1, 2, 3, 4, 5, 6]


### For loops

In Python, a _for loop_ allows you to iterate over a list.
```
shopping_list = ['bread', 'bananas', 'milk']

for item in shopping_list:
    print(item)
```

For example, the code above prints out the following output:

```
bread
bananas
milk
```

In [6]:
# Exercise 2.
# Loop over my_numbers with a for-loop and print the square of each number.
# Expected output, one number per line: 0, 1, 4, 9, 16, 25, 36
for number in my_numbers:
    square = number ** 2
    print(square)


0
1
4
9
16
25
36


In [7]:
# Exercise 3.
# Add up the squares of my_numbers with a for-loop.
# The total is stored in a variable called "sum_squares".
# Hint: sum_squares has to start at 0 before the loop begins.

#### YOUR CODE STARTS HERE ####
sum_squares = 0
for number in my_numbers:
    sum_squares = sum_squares + number ** 2

#### YOUR CODE ENDS HERE ####

# Self-check: report the total and compare it with the expected answer
print("Testing: sum_squares = %i" % sum_squares)
if sum_squares == 91:
    print("CORRECT")
else:
    print("INCORRECT")


Testing: sum_squares = 91
CORRECT


### Incrementing

If you have an integer variable e.g. `x=3` and you want to increase `x` by 1 (which is called _incrementing_), then you can write
```
x = x+1
```
or, in shorthand:
```
x += 1
```

This can be useful when you're using `x` to count something. For example:
```
ages = [7, 14, 23, 3, 10, 19]

num_adults = 0
for age in ages:
    if age >= 18:
        num_adults += 1

print(num_adults)
```
What should this code print out?

In [8]:
# Exercise 4.
# Count how many entries in the list of characters are Weasleys, and store the count in "num_weasleys".
# Use incrementation with the "x += 1" notation.

characters = [
    'Harry Potter', 'Ron Weasley', 'Albus Dumbledore', 'Ginny Weasley',
    'Percy Weasley', 'Hermione Granger', 'Fred Weasley', 'George Weasley',
]

#### YOUR CODE STARTS HERE ####

num_weasleys = 0
for name in characters:
    # A character counts as a Weasley when "Weasley" appears in their name
    if "Weasley" in name:
        num_weasleys += 1

#### YOUR CODE ENDS HERE ####

# Self-check: report the count and compare it with the expected answer
print("Testing: num_weasleys = %i" % num_weasleys)
if num_weasleys == 5:
    print("CORRECT")
else:
    print("INCORRECT")


Testing: num_weasleys = 5
CORRECT


### Testing for equality and inequality

Sometimes you want to check if two values are equal, perhaps using an `if` statement. 
To check for equality you need to use a _double_ equals sign `==`.
```
x = 5 
y = 8
if x == y:
    print("x and y are equal")
```
To check for *inequality*, i.e. if two things aren't equal, use `!=`.
```
x = 5 
y = 8
if x != y:
    print("x and y are NOT equal")
```

In [9]:
# Exercise 5.
# Count the cats in my list of pets using a for-loop, incrementation and equality testing.
# The count is stored in the variable "num_cats"

my_pets = [
    'cat', 'lizard', 'cat', 'dog', 'cat',
    'snake', 'dog', 'cat', 'dog', 'parrot',
]

#### YOUR CODE STARTS HERE ####

num_cats = 0
for pet in my_pets:
    # "==" tests whether this pet is exactly the string 'cat'
    if pet == 'cat':
        num_cats += 1

#### YOUR CODE ENDS HERE ####

# Self-check: report the count and compare it with the expected answer
print("Testing: num_cats = %i" % num_cats)
if num_cats == 4:
    print("CORRECT")
else:
    print("INCORRECT")


Testing: num_cats = 4
CORRECT


In [10]:
# Exercise 6.
# Count the pets that are neither cats nor dogs using a for-loop, incrementation and inequality testing.
# The count is stored in the variable "num_unusual".

#### YOUR CODE STARTS HERE ####

num_unusual = 0
for pet in my_pets:
    # A pet is unusual only if it passes both inequality tests
    if pet != 'cat':
        if pet != "dog":
            num_unusual += 1

#### YOUR CODE ENDS HERE ####

# Self-check: report the count and compare it with the expected answer
print("Testing: num_unusual = %i" % num_unusual)
if num_unusual == 3:
    print("CORRECT")
else:
    print("INCORRECT")


Testing: num_unusual = 3
CORRECT


# Measure the accuracy of your rule-based classifier

Complete the function below to calculate the Precision, Recall and F1 for a given category (e.g. Food)

In [11]:
def evaluate(predictions, c):
    """Compute the precision, recall and F1 for a single category c (e.g. Food).

    The category name and the three scores are printed; only F1 is returned.

    Inputs:
        predictions: a list of (tweet, predicted_category) pairs; each tweet's
            true label is read from its "category" attribute
        c: the category to score
    Returns:
        The F1 score (0.0 when there are no true positives).
    """

    # Tallies (kept as floats) of true positives, false positives and false negatives
    tp = fp = fn = 0.0

    for tweet, predicted_category in predictions:
        is_actual = tweet.category == c
        is_predicted = predicted_category == c

        if is_actual and is_predicted:
            tp += 1
        elif is_actual:
            fn += 1  # a tweet of category c that was predicted as something else
        elif is_predicted:
            fp += 1  # predicted as c, but the true category is different
        # pairs where neither label is c do not affect this category's scores

    if tp == 0:
        # Without any true positives the ratios below could divide by zero
        # (for instance precision + recall would be 0), so all scores are set to 0.
        precision = recall = f1 = 0.0
    else:
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        f1 = 2 * precision * recall / (precision + recall)

    # Report the category followed by its scores, then a blank separator line
    print(c)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1: ", f1)
    print("")

    return f1


# Pair every test tweet with the category the rule-based classifier assigns to it
predictions = []
for tweet in test_tweets:
    predictions.append((tweet, classify_rb(tweet)))

# Score each category in turn, keeping its F1 score.
# evaluate() also prints the scores, so the categories appear in this order.
food_f1, water_f1, energy_f1, medical_f1, none_f1 = [
    evaluate(predictions, category)
    for category in ("Food", "Water", "Energy", "Medical", "None")
]


Food
Precision:  0.7910447761194029
Recall:  0.8217054263565892
F1:  0.8060836501901141

Water
Precision:  0.6333333333333333
Recall:  0.95
F1:  0.7599999999999999

Energy
Precision:  0.6153846153846154
Recall:  0.4
F1:  0.4848484848484849

Medical
Precision:  1.0
Recall:  0.5384615384615384
F1:  0.7000000000000001

None
Precision:  0.5238095238095238
Recall:  0.5569620253164557
F1:  0.5398773006134969



Complete the cell below to calculate the average F1 score, which should be the average of the F1 scores for each category.

In [12]:
# Unweighted mean of the five per-category F1 scores: every category counts equally
f1_total = food_f1 + water_f1 + energy_f1 + medical_f1 + none_f1
average_f1 = f1_total / 5
print("Average F1: ", average_f1)


Average F1:  0.6581618871304192


## Look at the confusion matrix

* _Rows_ represent the _true category_ of the tweet
* _Columns_ represent the _predicted category_ from your classifier
* So numbers on the diagonal represent correct classifications, and off-diagonal numbers represent misclassifications

In [13]:
# Display the confusion matrix for the (tweet, predicted_category) pairs in "predictions"
lib.show_confusion_matrix(predictions)


Unnamed: 0,Energy,Food,Medical,None,Water
Energy,16,4,0,19,1
Food,2,106,0,16,5
Medical,2,0,7,4,0
,6,24,0,44,5
Water,0,0,0,1,19


In [14]:
# Display a table of the tweets in "predictions": text, true category and predicted category
lib.show_predictions(predictions)


Unnamed: 0,Text,True category,Predicted category
0,fox news says romney lost because he was n't in the news at all during the week of the hurricane ... . ? yeah . ok ... ..,,
1,just got baq in the huz frm outside in the weather.. mi kuzn dem made a weather forcast video abt sandy ... thts ish too funnii ...,,
2,"a mop a bucket socks cleaning supplies latex gloves toothbrush/toothpaste ob tampons , pads trash bags plastic cups and bowls ziplock bags paper towels toilet paper sweater , sweatshirt cooking oil rice rice cooker instant oatmeal granola bars instant breakfast cereals",Food,
3,frankenstorm wo n't stop the bean ! thx for staying open for the neighbors who need coffee and treat ! ( @ the bean ) http : //t.co/zw7oa0tq,Food,
4,"i want to help clean up , run up & down stairs bring medicine to elderly , etc . am young , able , and willing to put in time .",Medical,Medical
5,the costumes in # williamsburg are boring me . # sandy took something out of # ny ... # halloween is not the same this year ...,,
6,"i am at washington dc area ( norther va ) and is organizing the donation of baby food/daipes & wipes/pet food/blanket . like to know if there is a contact person in our area that can deliver the donations to the disaster area . we are planning to do `` a bag for a family '' . donations will be in tote bags donated by someone and each bag will contain different things ( we will mark pets/babies/blankets ) . please also let me know if there are already enough any of the items mentioned above , we can ask for different items ! many thanks ! ! ! ************",Food,Food
7,supply food,Food,Food
8,"i would like to organize a drive up here in the poughkeepsie area to help staten island . clothing , non-perishable food , baby supplies , and hygiene products .",Food,Food
9,my father just said he 's leaving me and my dog during the hurricane ... like what ...,,
