In [2]:
import csv
import sys

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Python standard library modules
import datetime

# Value main() passes to train_test_split as `test_size`, i.e. the share of
# the loaded rows that is held out for testing rather than training.
TEST_SIZE = 0.4


def main():
    """
    Load the shopping data, fit the nearest-neighbour model on a training
    split and print how well it predicts the held-out test split.
    """
    # The data file name is fixed here. The command-line form
    # ("python shopping.py data", reading sys.argv[1]) is disabled.
    evidence, labels = load_data("shopping.csv")

    # Hold out TEST_SIZE of the rows for testing
    split = train_test_split(evidence, labels, test_size=TEST_SIZE)
    train_evidence, test_evidence, train_labels, test_labels = split

    # Fit on the training rows, then predict the held-out rows
    classifier = train_model(train_evidence, train_labels)
    predicted = classifier.predict(test_evidence)
    sensitivity, specificity = evaluate(test_labels, predicted)

    # Tally hits and misses.
    # NOTE(review): `.sum()` requires the comparison to be elementwise, which
    # presumably holds because predict() returns an array - confirm.
    correct = (test_labels == predicted).sum()
    incorrect = (test_labels != predicted).sum()

    # Report
    print(f"Correct: {correct}")
    print(f"Incorrect: {incorrect}")
    print(f"True Positive Rate: {100 * sensitivity:.2f}%")
    print(f"True Negative Rate: {100 * specificity:.2f}%")


# Lower-cased three-letter month abbreviations, in calendar order. A fixed
# table keeps month parsing independent of the process locale.
_MONTH_NAMES = (
    "jan", "feb", "mar", "apr", "may", "jun",
    "jul", "aug", "sep", "oct", "nov", "dec",
)
_MONTH_INDEX = {name: index for index, name in enumerate(_MONTH_NAMES)}
# The full spelling "June" is accepted as well as the abbreviation.
_MONTH_INDEX["june"] = 5


def load_data(filename):
    """
    Load shopping data from a CSV file `filename` and convert into a list of
    evidence lists and a list of labels. Return a tuple (evidence, labels).

    The first row of the file is treated as a header and skipped. A file
    with no rows at all yields two empty lists.

    evidence is a list of lists, where each list contains the
    following values, in order:
        - Administrative, an integer
        - Administrative_Duration, a floating point number
        - Informational, an integer
        - Informational_Duration, a floating point number
        - ProductRelated, an integer
        - ProductRelated_Duration, a floating point number
        - BounceRates, a floating point number
        - ExitRates, a floating point number
        - PageValues, a floating point number
        - SpecialDay, a floating point number
        - Month, an index from 0 (January) to 11 (December)
        - OperatingSystems, an integer
        - Browser, an integer
        - Region, an integer
        - TrafficType, an integer
        - VisitorType, an integer 0 (not returning) or 1 (returning)
        - Weekend, an integer 0 (if false) or 1 (if true)
    labels is the corresponding list of labels, where each label
    is 1 if Revenue is true, and 0 otherwise.

    Month is matched case-insensitively against the three-letter
    abbreviations ("Jan" ... "Dec") plus the full spelling "June".
    VisitorType is 1 only for the exact text "Returning_Visitor".
    Weekend and Revenue are 0 only for the exact text "FALSE"; any other
    value counts as true.

    Raises ValueError if a month is not recognised or a numeric column
    cannot be converted.
    """
    evidence = []
    labels = []

    # newline="" lets the csv module do its own newline handling, as its
    # documentation recommends.
    with open(filename, newline="") as f:
        reader = csv.reader(f)
        # Skip the header row; the default avoids StopIteration on an
        # empty file.
        next(reader, None)

        for row in reader:
            try:
                month = _MONTH_INDEX[row[10].strip().lower()]
            except KeyError:
                # Report a bad month as ValueError, the same type the
                # numeric conversions below raise.
                raise ValueError(f"unrecognised month {row[10]!r}") from None

            visitor_type = 1 if row[15] == "Returning_Visitor" else 0
            weekend = 0 if row[16] == "FALSE" else 1
            revenue = 0 if row[17] == "FALSE" else 1

            evidence.append([
                int(row[0]), float(row[1]),
                int(row[2]), float(row[3]),
                int(row[4]), float(row[5]),
                float(row[6]), float(row[7]),
                float(row[8]), float(row[9]),
                month,
                int(row[11]), int(row[12]),
                int(row[13]), int(row[14]),
                visitor_type, weekend,
            ])
            labels.append(revenue)

    return (evidence, labels)

def train_model(evidence, labels):
    """
    Build a k-nearest-neighbour classifier that consults a single
    neighbour (k=1), fit it to `evidence` and `labels`, and return
    the fitted model.
    """
    # fit() hands back the estimator itself, so the fitted model can be
    # returned straight from the call chain.
    return KNeighborsClassifier(n_neighbors=1).fit(evidence, labels)

def evaluate(labels, predictions):
    """
    Given the actual labels and the predicted labels, return a tuple
    (sensitivity, specificity).

    Assume each label is either a 1 (positive) or 0 (negative).

    `labels` and `predictions` may be any iterables of equal length
    (lists, tuples, arrays, ...); they are paired up position by position.

    `sensitivity` is a floating-point value from 0 to 1
    representing the "true positive rate": the proportion of
    actual positive labels that were accurately identified.

    `specificity` is a floating-point value from 0 to 1
    representing the "true negative rate": the proportion of
    actual negative labels that were accurately identified.

    If there are no actual positives (or no actual negatives), the
    corresponding rate is undefined and is reported as 0.0 rather than
    raising ZeroDivisionError.

    Raises ValueError if the two inputs differ in length.
    """
    # Materialise both inputs so that objects without list methods
    # (arrays, iterators) are handled the same way as lists.
    actual = list(labels)
    predicted = list(predictions)
    if len(actual) != len(predicted):
        raise ValueError(
            f"labels and predictions differ in length: "
            f"{len(actual)} != {len(predicted)}"
        )

    total_positives = 0
    total_negatives = 0
    true_positives = 0
    true_negatives = 0
    for truth, guess in zip(actual, predicted):
        if truth == 1:
            total_positives += 1
            if guess == 1:
                true_positives += 1
        elif truth == 0:
            total_negatives += 1
            if guess == 0:
                true_negatives += 1

    # Guard each division: a class that never occurs has no defined rate.
    sensitivity = true_positives / total_positives if total_positives else 0.0
    specificity = true_negatives / total_negatives if total_negatives else 0.0

    return (sensitivity, specificity)


# Run the workflow only when this file is executed directly (its module
# name is then "__main__"), not when it is imported from elsewhere.
if __name__ == "__main__":
    main()

Correct: 4078
Incorrect: 854
True Positive Rate: 37.84%
True Negative Rate: 90.60%
