# Simple Machine Learning Example

In [None]:
import numpy as np
import pandas as pd

# Rotten Tomatoes Classifier
#### Rotten Tomatoes is a popular website for movie ratings. It provides an audience score, to which anybody can contribute by reviewing the movie, and a separate critic score. Each score is the fraction of reviewers who gave the movie a positive rating. For example, if a movie has 90 positive critic reviews, 10 negative critic reviews, 60 positive audience reviews, and 40 negative audience reviews, then the critic score would be 0.9 and the audience score would be 0.6. If a movie has a critic score above 0.6, then it is classified as "fresh". Our goal is to predict whether a movie will get a fresh critic score.

## Training Data
* Name: name of the movie
* Audience Score: Rotten Tomatoes audience score
* Audience Ratings: Number of audience reviewers contributing to the score
* Budget: budget for the movie
* Box Office: Domestic earnings + International earnings from the movie
* Genre: One of (action, thriller, horror, animation, adventure, musical, comedy, drama)
* Rating: One of (PG, PG-13, R)
* Is Fresh: What we are trying to predict

#### Training Data Table:

| name                                    | audience_score | audience_ratings | budget | box_office | genre     | rating | is_fresh  |
|-----------------------------------------|----------------|------------------|--------|------------|-----------|--------|-----------|
| Jurassic World : Fallen Kingdom         | 0.49           | 29,091           | 170M   | 1.309B     | action    | PG-13  | no        |
| Venom                                   | 0.81           | 38,684           | 100M   | 856M       | action    | PG-13  | no        |
| Fantastic Beasts: Crimes of Grindelwald | 0.56           | 14,038           | 200M   | 653M       | adventure | PG-13  | no        |
| The Meg                                 | 0.46           | 6,344            | 130M   | 530M       | thriller  | PG-13  | no        |
| The Nun                                 | 0.36           | 6,506            | 22M    | 365M       | horror    | R      | no        |
| Black Panther                           | 0.76           | 88,211           | 200M   | 1.346B     | action    | PG-13  | yes       |
| Incredibles 2                           | 0.85           | 17,343           | 200M   | 1.242B     | animation | PG     | yes       |
| Bohemian Rhapsody                       | 0.86           | 21,731           | 52M    | 903M       | drama     | PG-13  | yes       |
| A Star is Born                          | 0.80           | 18,862           | 36M    | 434M       | drama     | R      | yes       |
| A Quiet Place                           | 0.83           | 21,439           | 17M    | 340M       | horror    | PG-13  | yes       |

### Loading In Training Data

In [None]:
def get_data(filename):
    """Read a movie CSV and separate the features from the labels.

    Args:
        filename: path (or file-like object) handed straight to
            ``pd.read_csv``; the file must contain an ``is_fresh`` column.

    Returns:
        A ``(x, y)`` tuple where ``x`` is the DataFrame without the
        ``is_fresh`` column and ``y`` is that column as a plain Python list.
    """
    frame = pd.read_csv(filename)
    # Pull the labels out first, then hand back everything else as features.
    labels = frame['is_fresh'].values.tolist()
    features = frame.drop(columns=['is_fresh'])
    return features, labels

In [None]:
# train_x holds the feature columns as a DataFrame; train_y holds the
# matching 'is_fresh' labels as a list (see get_data).
(train_x, train_y) = get_data("movie_train.csv") # loading data from csv file
train_rows = len(train_x) # getting number of training examples

In [None]:
# Preview the features; get_data has already removed the 'is_fresh' column.
train_x.head(3) # print out first 3 entries in data frame

In [None]:
# train_y is a plain Python list, so ordinary list slicing applies.
train_y[0:3] # print out first 3 labels for training data

### How to Index into Pandas Dataframe

In [None]:
# Getting the 3rd row of data:
i = 2 # remember that python starts counting at 0
row_i = train_x.iloc[i] # .iloc selects a row by integer position
print("Row i:")
print(row_i)
print("******")
# Getting name and audience score for row_i (a single row is indexed by column name)
print("Audience score for %s is %s" % (row_i['name'], row_i['audience_score']))
print("******")
# One line for above: chain the row lookup and the column lookup
print("Audience score for %s is %s" % (train_x.iloc[i]['name'], train_x.iloc[i]['audience_score']))

### Defining a Performance Metric

In [None]:
def get_accuracy(predicted_is_fresh, actual_is_fresh):
    """Return the fraction of predictions that match the actual values.

    Args:
        predicted_is_fresh: sequence of size n with predictions.
        actual_is_fresh: sequence of size n with actual values.

    Returns:
        Accuracy score from 0 to 1 as a float. Returns 0.0 when both inputs
        are empty (there is nothing to get right), and the string
        "invalid_inputs" when the two inputs differ in length.
    """
    if len(predicted_is_fresh) != len(actual_is_fresh):
        return "invalid_inputs"
    total_entries = len(predicted_is_fresh)
    if total_entries == 0:
        # Avoid dividing by zero on empty input.
        return 0.0
    total_correct = sum(
        1
        for predicted, actual in zip(predicted_is_fresh, actual_is_fresh)
        if predicted == actual
    )
    # float() keeps this a true division on Python 2 as well.
    return float(total_correct) / total_entries

In [None]:
def get_true_positive_rate(predicted_is_fresh, actual_is_fresh):
    """Return the fraction of actual "yes" entries that were predicted "yes".

    Args:
        predicted_is_fresh: sequence of size n with predictions.
        actual_is_fresh: sequence of size n with actual values.

    Returns:
        True positive rate from 0 to 1 as a float. Returns 0.0 when no actual
        value is "yes" (the rate is undefined, so report no hits rather than
        dividing by zero), and the string "invalid_inputs" when the two
        inputs differ in length.
    """
    if len(predicted_is_fresh) != len(actual_is_fresh):
        return "invalid_inputs"
    total_positive = 0
    total_correct_positive = 0
    for predicted, actual in zip(predicted_is_fresh, actual_is_fresh):
        if actual == "yes":
            total_positive += 1
            if predicted == actual:
                total_correct_positive += 1
    if total_positive == 0:
        return 0.0
    return 1.0 * total_correct_positive / total_positive

In [None]:
def get_true_negative_rate(predicted_is_fresh, actual_is_fresh):
    """Return the fraction of actual "no" entries that were predicted "no".

    Args:
        predicted_is_fresh: sequence of size n with predictions.
        actual_is_fresh: sequence of size n with actual values.

    Returns:
        True negative rate from 0 to 1 as a float. Returns 0.0 when no actual
        value is "no" (the rate is undefined, so report no hits rather than
        dividing by zero), and the string "invalid_inputs" when the two
        inputs differ in length.
    """
    if len(predicted_is_fresh) != len(actual_is_fresh):
        return "invalid_inputs"
    total_negative = 0
    total_correct_negative = 0
    for predicted, actual in zip(predicted_is_fresh, actual_is_fresh):
        if actual == "no":
            total_negative += 1
            if predicted == actual:
                total_correct_negative += 1
    if total_negative == 0:
        return 0.0
    return 1.0 * total_correct_negative / total_negative

#### TODO: We want to test the functions above. What should go into the `accuracy` variable?

In [None]:
## test for function above
predicted_is_fresh = ['yes','yes','yes','no']
actual_is_fresh = ['yes','yes','no','no']
# 3 of the 4 predictions (positions 0, 1 and 3) match the actual labels.
accuracy = 0.75
assert get_accuracy(predicted_is_fresh, actual_is_fresh) == accuracy

In [None]:
# Both actual 'yes' entries (positions 0 and 1) are predicted 'yes': 2/2.
# NOTE(review): the variable name is misspelled ("positve"); left unchanged here.
true_positve_rate = 1
assert(get_true_positive_rate(predicted_is_fresh, actual_is_fresh) == true_positve_rate)

In [None]:
# Of the two actual 'no' entries (positions 2 and 3), only position 3 is
# predicted 'no', so the expected rate is 1/2.
true_negative_rate = 0.5
assert(get_true_negative_rate(predicted_is_fresh, actual_is_fresh) == true_negative_rate)

## Building the Classification Algorithm

In [None]:
### TODO: Make this better
def classify_row(row):
    """Predict "yes"/"no" freshness for one movie from its rating alone.

    Args:
        row: mapping-like object with a 'rating' entry (e.g. one DataFrame row).

    Returns:
        "no" when the rating is exactly 'PG-13', otherwise "yes".
    """
    is_pg13 = row['rating'] == 'PG-13'
    return "no" if is_pg13 else "yes"

In [None]:
def classify_all_data(data):
    """Classify every row of a DataFrame with ``classify_row``.

    Args:
        data: DataFrame whose rows are passed one at a time to ``classify_row``.

    Returns:
        List with one prediction per row, in row order.
    """
    row_count = len(data)
    # Rows are fetched by position so the result lines up with the label list.
    return [classify_row(data.iloc[position]) for position in range(row_count)]

In [None]:
# Classify every training row, then score the predictions against train_y
# with each of the three metrics defined above.
predictions_train = classify_all_data(train_x)
# Print predictions and labels on adjacent lines so they can be compared by eye.
print("predictions:  " + str(predictions_train))
print("actual values:" + str(train_y))
accuracy_train = get_accuracy(predictions_train,train_y)
print("Training Accuracy: %0.2f" % accuracy_train)
true_positive_rate_train = get_true_positive_rate(predictions_train,train_y)
print("True Positive Rate: %0.2f" % true_positive_rate_train)
true_negative_rate_train = get_true_negative_rate(predictions_train,train_y)
print("True Negative Rate: %0.2f" % true_negative_rate_train)

## Now Let's See Testing Accuracy

In [None]:
# Load the test examples with the same helper used for the training set.
(test_x, test_y) = get_data("movie_test.csv") # loading data from csv file

In [None]:
# Apply the same classifier to the test rows and report the same three
# metrics that were printed for the training data.
predictions_test = classify_all_data(test_x)
print("Testing Accuracy: %0.2f" % get_accuracy(predictions_test,test_y))
print("Testing True Positive Rate: %0.2f" % get_true_positive_rate(predictions_test,test_y))
print("Testing True Negative Rate: %0.2f" % get_true_negative_rate(predictions_test,test_y))