# In [4]:  (Jupyter notebook cell prompt)
import csv
import sys

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Fraction of the data set held out for testing; passed as `test_size`
# to train_test_split in main().
TEST_SIZE = 0.4


def main(filename="shopping.csv"):
    """
    Train a nearest-neighbor classifier on shopping data and report results.

    Loads the CSV file `filename` (defaults to "shopping.csv" so that calling
    main() with no arguments keeps working, e.g. from a notebook cell), holds
    out TEST_SIZE of the rows for testing, fits the model on the remainder,
    and prints the number of correct and incorrect predictions together with
    the true positive and true negative rates.
    """
    # Load data from spreadsheet and split into train and test sets
    evidence, labels = load_data(filename)
    X_train, X_test, y_train, y_test = train_test_split(
        evidence, labels, test_size=TEST_SIZE
    )

    # Train model and make predictions
    model = train_model(X_train, y_train)
    predictions = model.predict(X_test)
    sensitivity, specificity = evaluate(y_test, predictions)

    # Print results
    print(f"Correct: {(y_test == predictions).sum()}")
    print(f"Incorrect: {(y_test != predictions).sum()}")
    print(f"True Positive Rate: {100 * sensitivity:.2f}%")
    print(f"True Negative Rate: {100 * specificity:.2f}%")


def load_data(filename):
    """
    Load shopping data from a CSV file `filename` and convert into a list of
    evidence lists and a list of labels. Return a tuple (evidence, labels).

    evidence is a list of lists, where each list contains the
    following values, in order:
        - Administrative, an integer
        - Administrative_Duration, a floating point number
        - Informational, an integer
        - Informational_Duration, a floating point number
        - ProductRelated, an integer
        - ProductRelated_Duration, a floating point number
        - BounceRates, a floating point number
        - ExitRates, a floating point number
        - PageValues, a floating point number
        - SpecialDay, a floating point number
        - Month, an index from 0 (January) to 11 (December)
        - OperatingSystems, an integer
        - Browser, an integer
        - Region, an integer
        - TrafficType, an integer
        - VisitorType, an integer 0 (not returning) or 1 (returning)
        - Weekend, an integer 0 (if false) or 1 (if true)

    labels is the corresponding list of labels, where each label
    is 1 if Revenue is "TRUE", and 0 otherwise.

    Raises KeyError if an expected column is missing or a month name is not
    recognized, and ValueError if a numeric field cannot be parsed.
    """
    # Month abbreviations mapped to their zero-based index.
    months = {
        "Jan": 0, "Feb": 1, "Mar": 2, "Apr": 3, "May": 4, "Jun": 5,
        "Jul": 6, "Aug": 7, "Sep": 8, "Oct": 9, "Nov": 10, "Dec": 11,
    }

    evidence = []
    labels = []

    # newline="" hands newline handling to the csv module, as its
    # documentation requires for files passed to a reader.
    with open(filename, newline="") as f:
        for row in csv.DictReader(f):
            evidence.append([
                int(row["Administrative"]),
                float(row["Administrative_Duration"]),
                int(row["Informational"]),
                float(row["Informational_Duration"]),
                int(row["ProductRelated"]),
                float(row["ProductRelated_Duration"]),
                float(row["BounceRates"]),
                float(row["ExitRates"]),
                float(row["PageValues"]),
                float(row["SpecialDay"]),
                # Only the first three characters are used, so longer
                # spellings such as "June" resolve to the same index.
                months[row["Month"][:3]],
                int(row["OperatingSystems"]),
                int(row["Browser"]),
                int(row["Region"]),
                int(row["TrafficType"]),
                1 if row["VisitorType"] == "Returning_Visitor" else 0,
                1 if row["Weekend"] == "TRUE" else 0,
            ])
            labels.append(1 if row["Revenue"] == "TRUE" else 0)

    return (evidence, labels)


def train_model(evidence, labels):
    """
    Fit a k-nearest neighbor classifier (k=1) on the training data.

    `evidence` is a list of evidence lists and `labels` is the matching
    list of labels. The fitted classifier is returned.
    """
    classifier = KNeighborsClassifier(n_neighbors=1)

    # fit() returns the estimator itself, so the fitted model can be
    # returned directly.
    return classifier.fit(evidence, labels)


def evaluate(labels, predictions):
    """
    Given a list of actual labels and a list of predicted labels,
    return a tuple (sensitivity, specificity).

    Assume each label is either a 1 (positive) or 0 (negative).

    `sensitivity` is a floating-point value from 0 to 1
    representing the "true positive rate": the proportion of
    actual positive labels that were accurately identified.

    `specificity` is a floating-point value from 0 to 1
    representing the "true negative rate": the proportion of
    actual negative labels that were accurately identified.

    Labels and predictions are paired up position by position; if the two
    sequences differ in length, the extra items of the longer one are
    ignored.

    Raises ZeroDivisionError if there are no actual positive labels or no
    actual negative labels, since the corresponding rate is undefined.
    """
    pos_count = 0
    neg_count = 0
    true_pos_count = 0
    true_neg_count = 0

    # Walk actual/predicted pairs in lockstep so every comparison uses the
    # prediction that belongs to the label being counted.
    for actual, predicted in zip(labels, predictions):
        if actual:
            pos_count += 1
            if actual == predicted:
                true_pos_count += 1
        else:
            neg_count += 1
            if actual == predicted:
                true_neg_count += 1

    # A rate is undefined when its class never occurs in the actual labels.
    if pos_count == 0 or neg_count == 0:
        raise ZeroDivisionError("Given sets have invalid data!")

    sensitivity = true_pos_count / pos_count
    specificity = true_neg_count / neg_count
    return sensitivity, specificity


# Run main() only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()


# Output captured from one run:
# Correct: 4056
# Incorrect: 876
# True Positive Rate: 37.40%
# True Negative Rate: 90.53%
