# Assignment 2 - Part C: Trying alternative classifiers

This is a skeleton for trying alternative classifiers on the basketball dataset.

In [192]:
import csv

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier, ExtraTreesClassifier, BaggingClassifier 



| Method | Accuracy | Error rate | Notes |
| --- | --- | --- | --- |
| AdaBoostClassifier | 0.5962765527824893 | 0.4037234472175107 |
| Nearest Neighbors | 0.5440998303852678 | 0.45590016961473223 |
| Naive Bayes (Gaussian) | 0.5679266618205314 | 0.4320733381794685 |
| Random Forests(n_estimators=1000, max_features=5)| 0.5869073580486229 | 0.4130926419513771 |     
|GradientBoosting| 0.6015669170503191 | 0.39843308294968094 |
|GradientBoosting(learning_rate=0.001,n_estimators=10000)| 0.6024553751716339 | 0.39754462482836606 |
|VotingClassifier(estimators=[('AdaBoost', AdaBoostClassifier(n_estimators=100)), ('RandomForest', RandomForestClassifier(n_estimators=1000, max_features=5)), ('GradientBoosting', GradientBoostingClassifier(learning_rate=0.001,n_estimators=10000))], voting='hard')| 0.6016880704304983 | 0.3983119295695017 |
|GradientBoostingClassifier(max_depth=4)| 0.6005573055488248 | 0.3994426944511752 | Current best |
|GradientBoostingClassifier(learning_rate=0.1, min_samples_split=500, min_samples_leaf=50, max_depth=4, max_features='sqrt', subsample=0.8, random_state=8)| 0.60314191099265 | 0.39685808900735 |

As done in Practicum 6, we can define a data-loading function that returns the attribute values and the class labels for both the training set and the test set.

In [193]:
# Column names of the dataset, in file order; the last one (SHOT_RESULT) is the class label.
ATTRS = ["LOCATION", "W", "FINAL_MARGIN", "SHOT_NUMBER", "PERIOD", "GAME_CLOCK", "SHOT_CLOCK", "DRIBBLES", "TOUCH_TIME",
         "SHOT_DIST", "PTS_TYPE", "CLOSE_DEF_DIST", "SHOT_RESULT"]
# Number of attribute columns that precede the class label (derived, so it cannot drift from ATTRS).
ATTRS_WO_CLASS = len(ATTRS) - 1


def load_data(filename):
    """Load a labeled CSV file and split it into training and test portions.

    Rows that do not have exactly ``ATTRS_WO_CLASS + 1`` fields are skipped.
    Valid rows are counted from 1; every fifth valid row goes to the test
    portion and all the others go to the training portion.

    Args:
        filename: Path of the CSV file to read (comma-delimited).

    Returns:
        A tuple ``(train_x, train_y, test_x, test_y)``. The ``*_x`` items are
        lists of attribute lists (the first ``ATTRS_WO_CLASS`` fields of each
        row, as strings) and the ``*_y`` items are lists of class labels (the
        last field of each row, as a string).
    """
    train_x = []
    train_y = []
    test_x = []
    test_y = []
    # newline='' is how the csv module expects its input files to be opened.
    with open(filename, 'rt', newline='') as csvfile:
        csvreader = csv.reader(csvfile, delimiter=',')
        valid_rows = 0
        for row in csvreader:
            if len(row) != ATTRS_WO_CLASS + 1:
                continue  # malformed or blank line
            valid_rows += 1
            instance = row[:ATTRS_WO_CLASS]  # first ATTRS_WO_CLASS values are attributes
            label = row[ATTRS_WO_CLASS]  # (ATTRS_WO_CLASS + 1)th value is the class label
            if valid_rows % 5 == 0:  # test instance
                test_x.append(instance)
                test_y.append(label)
            else:  # train instance
                train_x.append(instance)
                train_y.append(label)

    return train_x, train_y, test_x, test_y



In [194]:
def load_test(filename):
    """Read every row of a comma-delimited CSV file.

    Unlike ``load_data``, no row is filtered out and no train/test split is
    made: each row is returned exactly as the csv reader produced it.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one list of string fields per row of the file.
    """
    # newline='' is how the csv module expects its input files to be opened.
    with open(filename, 'rt', newline='') as csvfile:
        return list(csv.reader(csvfile, delimiter=','))

And then we can use it to load the data.

In [195]:
# Read the labeled CSV; load_data sets every fifth valid row aside as a held-out test split.
train_x, train_y, test_x, test_y = load_data("data/basketball.train.csv")




In [196]:
# Read all rows of the separate test CSV; the final predictions are produced for these rows.
test_z = load_test("data/basketball.test.csv")


Scikit-learn requires all attribute values to be numeric. That is, we need to binarize all the non-numeric attribute values to obtain vectors: records containing only numbers. The `DictVectorizer` class provided by scikit-learn makes this easy.

In [197]:
from sklearn.feature_extraction import DictVectorizer

Note that `train_x` and `test_x` are each a list of lists.

We just need to obtain from each a list of dictionaries (as done in previous practica where each record was a dictionary).

In [198]:
# Positions in ATTRS whose values are kept as strings (LOCATION, W, PERIOD, PTS_TYPE);
# every other attribute value is converted to float.
categorical_positions = (0, 1, 4, 10)
# The class label is the last entry of ATTRS and is not part of the train_x records.
feature_names = ATTRS[:-1]

# One {attribute name: value} dictionary per training record.
dicts_train_x = [
    {
        name: (record[pos] if pos in categorical_positions else float(record[pos]))
        for pos, name in enumerate(feature_names)
    }
    for record in train_x
]

Finally, the `fit_transform` method of the vectorizer binarizes the non-numeric attributes in the list of dictionaries, and returns the vector we need.

In [199]:
# Learn the feature mapping from the training dictionaries and encode them in one step;
# .toarray() converts the vectorizer's result into a dense array.
vectorizer_train = DictVectorizer()
vec_train_x = vectorizer_train.fit_transform(dicts_train_x).toarray()

We do similarly for vectorizing `test_x`.

In [200]:
# Positions in ATTRS whose values are kept as strings (LOCATION, W, PERIOD, PTS_TYPE);
# every other attribute value is converted to float.
categorical_positions = (0, 1, 4, 10)
# The class label is the last entry of ATTRS and is not part of the test_x records.
feature_names = ATTRS[:-1]

# One {attribute name: value} dictionary per held-out test record.
dicts_test_x = [
    {
        name: (record[pos] if pos in categorical_positions else float(record[pos]))
        for pos, name in enumerate(feature_names)
    }
    for record in test_x
]
  


In [201]:
# Encode the held-out records with the vectorizer that was fitted on the training data.
# Fitting a second DictVectorizer on the test records would build its own feature
# vocabulary, so the columns could differ in number or order from vec_train_x and the
# classifiers would read the wrong features. transform() reuses the training columns
# and ignores categorical values that were never seen during fitting.
vectorizer_test = vectorizer_train  # name kept so existing references keep working
vec_test_x = vectorizer_train.transform(dicts_test_x).toarray()

In [202]:
# Positions in ATTRS whose values are kept as strings (LOCATION, W, PERIOD, PTS_TYPE);
# every other attribute value is converted to float.
categorical_positions = (0, 1, 4, 10)
# Only the attributes before the class label (the last entry of ATTRS) are read from each row.
feature_names = ATTRS[:-1]

# One {attribute name: value} dictionary per row of the separate test file.
dicts_test_z = [
    {
        name: (record[pos] if pos in categorical_positions else float(record[pos]))
        for pos, name in enumerate(feature_names)
    }
    for record in test_z
]
  

In [203]:
# Encode the submission records with the vectorizer that was fitted on the training data.
# Fitting a fresh DictVectorizer here would build a separate feature vocabulary, so the
# columns of vec_test_z could differ in number or order from vec_train_x and the trained
# classifiers would be fed misaligned features. transform() reuses the training columns
# and ignores categorical values that were never seen during fitting.
vectorizer_test = vectorizer_train  # name kept so existing references keep working
vec_test_z = vectorizer_train.transform(dicts_test_z).toarray()

With `evaluate` defined below, we are ready to learn and apply the model, similarly to Task 3 of Practicum 6. Here, however, we use the vectors just obtained as the input sets, e.g., for the Naive Bayes classifier:

In [204]:
def evaluate(predictions, true_labels):
    """Print accuracy and error rate of ``predictions`` and return the hit count.

    The two lists are compared position by position over the length of
    ``predictions``. Three lines are printed: the accuracy, the error rate,
    and both values again formatted as cells of a markdown table row.

    Args:
        predictions: Predicted labels, indexable by position.
        true_labels: Expected labels, indexable by position.

    Returns:
        The number of positions where prediction and true label are equal.
    """
    total = len(predictions)
    hits = sum(1 for idx in range(total) if predictions[idx] == true_labels[idx])
    misses = total - hits

    accuracy = hits / total
    error_rate = misses / total

    print("\tAccuracy:   ", accuracy)
    print("\tError rate: ", error_rate)
    # Same numbers once more, ready to paste into the results table.
    print("|", accuracy, "|", error_rate, "|")
    return hits
    



In [205]:

# Seed passed to every estimator that accepts `random_state`, so that re-running the
# notebook reproduces the same predictions (8 is the value GradientBoosting already used).
RANDOM_STATE = 8

# Display name -> unfitted estimator. The name is also used in the output file name.
classifiers = {
    "AdaBoostClassifier": AdaBoostClassifier(n_estimators=100, random_state=RANDOM_STATE),
    "Nearest Neighbors": KNeighborsClassifier(n_neighbors=200),
    "Naive Bayes (Gaussian)": GaussianNB(),
    "Random Forests": RandomForestClassifier(random_state=RANDOM_STATE),
    "GradientBoosting": GradientBoostingClassifier(
        learning_rate=0.1, min_samples_split=600, min_samples_leaf=55, max_depth=4,
        max_features='sqrt', subsample=0.8, random_state=RANDOM_STATE),
    "ExtraTrees": ExtraTreesClassifier(random_state=RANDOM_STATE),
    "Bagging": BaggingClassifier(random_state=RANDOM_STATE),
    # Weighted majority vote; weights are listed in the same order as the estimators.
    "Voting": VotingClassifier(
        estimators=[
            ("Nearest Neighbors", KNeighborsClassifier(n_neighbors=200)),
            ("Naive Bayes (Gaussian)", GaussianNB()),
            ('AdaBoost', AdaBoostClassifier(random_state=RANDOM_STATE)),
            ('RandomForest', RandomForestClassifier(random_state=RANDOM_STATE)),
            ('GradientBoosting', GradientBoostingClassifier(random_state=RANDOM_STATE)),
        ],
        voting='hard',
        weights=[4, 5, 2, 5, 2],
    ),
}

for name, clf in classifiers.items():
    print(name)
    print("**learning in progress**")
    clf.fit(vec_train_x, train_y)
    # Predict for the separate test file; these predictions are what gets written out.
    # To score a classifier on the held-out split instead, run:
    #     evaluate(clf.predict(vec_test_x), test_y)
    predictions = clf.predict(vec_test_z)
    OUTPUT_FILE = "data/prediction{}.csv".format(name)
    with open(OUTPUT_FILE, "w") as fout:
        fout.write("Id,Target\n")
        # Ids are 1-based row positions; format() also copes with non-string labels.
        for row_id, record in enumerate(predictions, start=1):
            fout.write("{},{}\n".format(row_id, record))
    print("Done")

AdaBoostClassifier
**learning in progress**
Done
Nearest Neighbors
**learning in progress**
Done
Naive Bayes (Gaussian)
**learning in progress**
Done
Random Forests
**learning in progress**
Done
GradientBoosting
**learning in progress**
Done
ExtraTrees
**learning in progress**
Done
Bagging
**learning in progress**
Done
Voting
**learning in progress**
Done
