### Ali Shobeiri - 260665549

In [1]:
import numpy as np
import pandas as pd
import string
from sklearn.feature_extraction.text import CountVectorizer

### Question 1

In [2]:
def process_data(file_path):
    tr = str.maketrans("", "", string.punctuation)
    reviews = []
    scores = []
    with open(file_path, encoding="UTF-8") as f:
        for line in f.readlines():
            full_line = line.split("\t")
            text = full_line[0].translate(tr).lower()
            score = int(full_line[1].rstrip())
            reviews.append(text)
            scores.append(score)
    return np.asarray(reviews), np.asarray(scores)

def generate_dataset(file_path, data, labels, vocab_map):
    with open(file_path,'w+') as f:
        for rev, label in zip(data, labels):
            review_list = []
            review = str(rev).split()
            for word in review:
                s = vocab_map.get(word, None)
                if s is not None:
                    review_list += [str(s)]
            joined = ' '.join(review_list)
            f.write("{}\t{}\n".format(joined, label))
            
def generate_vocab(file_path, vectorizer, train_vector):
    words = vectorizer.get_feature_names()
    freq = np.asarray(train_vector.sum(axis=0))[0]
    with open(file_path,'w+') as f:
        for i, word in enumerate(words):
            f.write("{}\t{}\t{}\n".format(word, i, freq[i]))

In [3]:
yelp_train_d, yelp_train_labels = process_data("./hwk3_datasets/yelp-train.txt")
yelp_test_d, yelp_test_labels= process_data("./hwk3_datasets/yelp-test.txt")
yelp_valid_d, yelp_valid_labels = process_data("./hwk3_datasets/yelp-valid.txt")

imdb_train_d, imdb_train_labels = process_data("./hwk3_datasets/imdb-train.txt")
imdb_test_d, imdb_test_labels = process_data("./hwk3_datasets/imdb-test.txt")
imdb_valid_d, imdb_valid_labels = process_data("./hwk3_datasets/imdb-valid.txt")

In [4]:
# We set binary equals True because we are looking for binary bag of words
yelp_vectorizer = CountVectorizer(max_features=10000, binary=True) 

yelp_train = yelp_vectorizer.fit_transform(yelp_train_d)
yelp_test = yelp_vectorizer.transform(yelp_test_d)
yelp_valid = yelp_vectorizer.transform(yelp_valid_d)
yelp_vocab_map = yelp_vectorizer.vocabulary_

generate_dataset('yelp-train.txt', yelp_train_d, yelp_train_labels, yelp_vocab_map)
generate_dataset('yelp-test.txt', yelp_test_d, yelp_test_labels, yelp_vocab_map)
generate_dataset('yelp-valid.txt', yelp_valid_d, yelp_valid_labels, yelp_vocab_map)
generate_vocab("yelp-vocab.txt", yelp_vectorizer, yelp_train)

imdb_vectorizer = CountVectorizer(max_features=10000, binary=True) 

imdb_train = imdb_vectorizer.fit_transform(imdb_train_d)
imdb_test = imdb_vectorizer.transform(imdb_test_d)
imdb_valid = imdb_vectorizer.transform(imdb_valid_d)
imdb_vocab_map = imdb_vectorizer.vocabulary_

generate_dataset('imdb-train.txt', imdb_train_d, imdb_train_labels, imdb_vocab_map)
generate_dataset('imdb-test.txt', imdb_test_d, imdb_test_labels, imdb_vocab_map)
generate_dataset('imdb-valid.txt', imdb_valid_d, imdb_valid_labels, imdb_vocab_map)
generate_vocab("imdb-vocab.txt", imdb_vectorizer, imdb_train)

In [5]:
unique_train, counts_train = np.unique(yelp_train_labels, return_counts=True)
unique_test, counts_test = np.unique(yelp_test_labels, return_counts=True)
unique_valid, counts_valid = np.unique(yelp_valid_labels, return_counts=True)

yelp_train_freq = list(zip(unique_train, counts_train))
yelp_test_freq = list(zip(unique_test, counts_test))
yelp_valid_freq = list(zip(unique_valid, counts_valid))


print("Yelp Train")
for label, count in yelp_train_freq:
    print("Label: ", label, "Count: ", count)
print("\n")
    
print("Yelp Test")
for label, count in yelp_test_freq:
    print("Label: ", label, "Count: ", count)
print("\n")
    
print("Yelp Valid")
for label, count in yelp_valid_freq:
    print("Label: ", label, "Count: ", count)
print("\n")

Yelp Train
Label:  1 Count:  522
Label:  2 Count:  641
Label:  3 Count:  997
Label:  4 Count:  2468
Label:  5 Count:  2372


Yelp Test
Label:  1 Count:  143
Label:  2 Count:  190
Label:  3 Count:  300
Label:  4 Count:  702
Label:  5 Count:  665


Yelp Valid
Label:  1 Count:  84
Label:  2 Count:  96
Label:  3 Count:  164
Label:  4 Count:  356
Label:  5 Count:  300




In [6]:
unique_train, counts_train = np.unique(imdb_train_labels, return_counts=True)
unique_test, counts_test = np.unique(imdb_test_labels, return_counts=True)
unique_valid, counts_valid = np.unique(imdb_valid_labels, return_counts=True)

yelp_train_freq = list(zip(unique_train, counts_train))
yelp_test_freq = list(zip(unique_test, counts_test))
yelp_valid_freq = list(zip(unique_valid, counts_valid))


print("Imdb Train")
for label, count in yelp_train_freq:
    print("Label: ", label, "Count: ", count)
print("\n")
    
print("Imdb Test")
for label, count in yelp_test_freq:
    print("Label: ", label, "Count: ", count)
print("\n")
    
print("Imdb Valid")
for label, count in yelp_valid_freq:
    print("Label: ", label, "Count: ", count)
print("\n")

Imdb Train
Label:  0 Count:  7500
Label:  1 Count:  7500


Imdb Test
Label:  0 Count:  12500
Label:  1 Count:  12500


Imdb Valid
Label:  0 Count:  5000
Label:  1 Count:  5000




### Question 2

### Yelp BBoW

In [7]:
import string, random, os
import sklearn.naive_bayes
from sklearn import svm, metrics
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import f1_score, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
random = DummyClassifier(strategy="uniform", random_state=69)
majority = DummyClassifier(strategy="most_frequent")

# Yelp Train
random.fit(yelp_train, yelp_train_labels)
majority.fit(yelp_train, yelp_train_labels)

DummyClassifier(constant=None, random_state=None, strategy='most_frequent')

In [9]:
print("Train: Dummy Uniform F1 Measure: ", f1_score(yelp_train_labels, 
                                    random.predict(yelp_train), 
                                    average="micro"))
print("Train: Dummy Majority F1 Measure: ", f1_score(yelp_train_labels, 
                                     majority.predict(yelp_train), 
                                     average="micro"))

print("Test: Dummy Uniform F1 Measure: ", f1_score(yelp_test_labels, 
                                    random.predict(yelp_test), 
                                    average="micro"))
print("Test: Dummy Majority F1 Measure: ", f1_score(yelp_test_labels, 
                                     majority.predict(yelp_test), 
                                     average="micro"))

print("Valid: Dummy Uniform F1 Measure: ", f1_score(yelp_valid_labels, 
                                    random.predict(yelp_valid), 
                                    average="micro"))
print("Valid: Dummy Majority F1 Measure: ", f1_score(yelp_valid_labels, 
                                     majority.predict(yelp_valid), 
                                     average="micro"))

Train: Dummy Uniform F1 Measure:  0.1945714285714286
Train: Dummy Majority F1 Measure:  0.3525714285714286
Test: Dummy Uniform F1 Measure:  0.2045
Test: Dummy Majority F1 Measure:  0.351
Valid: Dummy Uniform F1 Measure:  0.197
Valid: Dummy Majority F1 Measure:  0.356


In [10]:
def eval_model(model, dataset_type, train_type):
    if type(model) != sklearn.naive_bayes.GaussianNB:
        if dataset_type == "YELP":
            train = yelp_train
            train_label = yelp_train_labels
            test = yelp_test
            test_label = yelp_test_labels
            valid = yelp_valid
            valid_label = yelp_valid_labels
            avg = "micro"
        elif dataset_type == "IMDB":
            train = imdb_train
            train_label = imdb_train_labels
            test = imdb_test
            test_label = imdb_test_labels
            valid = imdb_valid
            valid_label = imdb_valid_labels
            avg = "binary"

        else:
            return -1
    else:
        if dataset_type == "YELP":
            train = yelp_train.toarray()
            train_label = yelp_train_labels
            test = yelp_test.toarray()
            test_label = yelp_test_labels
            valid = yelp_valid.toarray()
            valid_label = yelp_valid_labels
            avg = "micro"
        elif dataset_type == "IMDB":
            train = imdb_train.toarray()
            train_label = imdb_train_labels
            test = imdb_test.toarray()
            test_label = imdb_test_labels
            valid = imdb_valid.toarray()
            valid_label = imdb_valid_labels
            avg = "binary"

        else:
            return -1
    
    model.fit(train, train_label)
    print("F1 Measure:")
    if train_type == "TRAIN" or train_type == "ALL":
        train_pred = model.predict(train)
        train_f1 = f1_score(train_label, train_pred, average=avg)
        print("Train F1: ", train_f1)
    if train_type == "TEST" or train_type == "ALL":
        test_pred = model.predict(test)
        test_f1 = f1_score(test_label, test_pred, average=avg)
        print("Test F1: ", test_f1)
    if train_type == "VALID" or train_type == "ALL": 
        valid_pred = model.predict(valid)
        valid_f1 = f1_score(valid_label, valid_pred, average=avg)
        print("Validation F1: ", valid_f1)
        
    if train_type == "ALL": 
        return train_f1, test_f1, valid_f1
    elif train_type == "VALID":
        return valid_f1
    elif train_type == "TEST":
        return test_f1

In [11]:
eval_model(random, "YELP", "BOTH")

F1 Measure:


In [12]:
bayes = BernoulliNB
dtree = DecisionTreeClassifier

nbayes_params = ParameterGrid({'alpha': [0.001, 0.01, 0.05, .1, 0.15, .5,1,2]})
dtree_params = ParameterGrid({'random_state': [69],
                             'criterion': ['gini','entropy'],
                             'max_features': [None, "sqrt", "log2"], 
                             'max_depth': [None, 10, 50, 100, 500],
                             'min_samples_leaf': [1, 0.2, 0.49],
                             'min_samples_split': [2, 3, 4, 5, 10]})
svm_params = ParameterGrid({'random_state': [69],
                            'loss': ['hinge','squared_hinge'],
                            'C': [.5, 2.0, 5.0, 50.0]})

classifier_list = [bayes, dtree, svm.LinearSVC]
class_labels = ["Naive Bayes", "Decision Tree", "Linear SVM"]
param_list = [nbayes_params, dtree_params, svm_params]

classifiers = zip(classifier_list, param_list, class_labels)

In [13]:
best_params_list = []
for classifier, params, labels in classifiers:
    best_score = 0
    best_params = None
    print(labels)
    for param in params:
        if param:
            print("Params are: ", param)
        score = eval_model(classifier(**param), "YELP", "VALID")
        best_score = max(score, best_score)
        if best_score == score:
            best_params = param
        print("\n")
    best_params_list.append(best_params)

Naive Bayes
Params are:  {'alpha': 0.001}
F1 Measure:
Validation F1:  0.426


Params are:  {'alpha': 0.01}
F1 Measure:
Validation F1:  0.428


Params are:  {'alpha': 0.05}
F1 Measure:
Validation F1:  0.415


Params are:  {'alpha': 0.1}
F1 Measure:
Validation F1:  0.41


Params are:  {'alpha': 0.15}
F1 Measure:
Validation F1:  0.403


Params are:  {'alpha': 0.5}
F1 Measure:
Validation F1:  0.395


Params are:  {'alpha': 1}
F1 Measure:
Validation F1:  0.388


Params are:  {'alpha': 2}
F1 Measure:
Validation F1:  0.377


Decision Tree
Params are:  {'criterion': 'gini', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'random_state': 69}
F1 Measure:
Validation F1:  0.317


Params are:  {'criterion': 'gini', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 3, 'random_state': 69}
F1 Measure:
Validation F1:  0.321


Params are:  {'criterion': 'gini', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_

F1 Measure:
Validation F1:  0.356


Params are:  {'criterion': 'gini', 'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 0.49, 'min_samples_split': 4, 'random_state': 69}
F1 Measure:
Validation F1:  0.356


Params are:  {'criterion': 'gini', 'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 0.49, 'min_samples_split': 5, 'random_state': 69}
F1 Measure:
Validation F1:  0.356


Params are:  {'criterion': 'gini', 'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 0.49, 'min_samples_split': 10, 'random_state': 69}
F1 Measure:
Validation F1:  0.356


Params are:  {'criterion': 'gini', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'random_state': 69}
F1 Measure:
Validation F1:  0.401


Params are:  {'criterion': 'gini', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 3, 'random_state': 69}
F1 Measure:
Validation F1:  0.401


Params are:  {'criterion': 'gini', 'max_depth': 10, 'max_

F1 Measure:
Validation F1:  0.356


Params are:  {'criterion': 'gini', 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 0.49, 'min_samples_split': 5, 'random_state': 69}
F1 Measure:
Validation F1:  0.356


Params are:  {'criterion': 'gini', 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 0.49, 'min_samples_split': 10, 'random_state': 69}
F1 Measure:
Validation F1:  0.356


Params are:  {'criterion': 'gini', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'random_state': 69}
F1 Measure:
Validation F1:  0.327


Params are:  {'criterion': 'gini', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 3, 'random_state': 69}
F1 Measure:
Validation F1:  0.327


Params are:  {'criterion': 'gini', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'random_state': 69}
F1 Measure:
Validation F1:  0.322


Params are:  {'criterion': 'gini', 'max_depth': 50, 'max_features': 

F1 Measure:
Validation F1:  0.317


Params are:  {'criterion': 'gini', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 3, 'random_state': 69}
F1 Measure:
Validation F1:  0.321


Params are:  {'criterion': 'gini', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'random_state': 69}
F1 Measure:
Validation F1:  0.315


Params are:  {'criterion': 'gini', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'random_state': 69}
F1 Measure:
Validation F1:  0.319


Params are:  {'criterion': 'gini', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'random_state': 69}
F1 Measure:
Validation F1:  0.32


Params are:  {'criterion': 'gini', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 2, 'random_state': 69}
F1 Measure:
Validation F1:  0.38499999999999995


Params are:  {'criterion': 'gini', 'max_depth': 100, 'max_

F1 Measure:
Validation F1:  0.317


Params are:  {'criterion': 'gini', 'max_depth': 500, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 3, 'random_state': 69}
F1 Measure:
Validation F1:  0.321


Params are:  {'criterion': 'gini', 'max_depth': 500, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'random_state': 69}
F1 Measure:
Validation F1:  0.315


Params are:  {'criterion': 'gini', 'max_depth': 500, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'random_state': 69}
F1 Measure:
Validation F1:  0.319


Params are:  {'criterion': 'gini', 'max_depth': 500, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'random_state': 69}
F1 Measure:
Validation F1:  0.32


Params are:  {'criterion': 'gini', 'max_depth': 500, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 2, 'random_state': 69}
F1 Measure:
Validation F1:  0.38499999999999995


Params are:  {'criterion': 'gini', 'max_depth': 500, 'max_

F1 Measure:
Validation F1:  0.345


Params are:  {'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 3, 'random_state': 69}
F1 Measure:
Validation F1:  0.346


Params are:  {'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'random_state': 69}
F1 Measure:
Validation F1:  0.346


Params are:  {'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'random_state': 69}
F1 Measure:
Validation F1:  0.345


Params are:  {'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'random_state': 69}
F1 Measure:
Validation F1:  0.359


Params are:  {'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 2, 'random_state': 69}
F1 Measure:
Validation F1:  0.363


Params are:  {'criterion': 'entropy', 'max_depth': 

F1 Measure:
Validation F1:  0.409


Params are:  {'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 3, 'random_state': 69}
F1 Measure:
Validation F1:  0.412


Params are:  {'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'random_state': 69}
F1 Measure:
Validation F1:  0.41


Params are:  {'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'random_state': 69}
F1 Measure:
Validation F1:  0.413


Params are:  {'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'random_state': 69}
F1 Measure:
Validation F1:  0.411


Params are:  {'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 2, 'random_state': 69}
F1 Measure:
Validation F1:  0.363


Params are:  {'criterion': 'entropy', 'max_depth': 10, 'max_fe

F1 Measure:
Validation F1:  0.35500000000000004


Params are:  {'criterion': 'entropy', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 3, 'random_state': 69}
F1 Measure:
Validation F1:  0.35


Params are:  {'criterion': 'entropy', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'random_state': 69}
F1 Measure:
Validation F1:  0.34


Params are:  {'criterion': 'entropy', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'random_state': 69}
F1 Measure:
Validation F1:  0.327


Params are:  {'criterion': 'entropy', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'random_state': 69}
F1 Measure:
Validation F1:  0.342


Params are:  {'criterion': 'entropy', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 2, 'random_state': 69}
F1 Measure:
Validation F1:  0.363


Params are:  {'criterion': 'entropy', 'max_depth'

F1 Measure:
Validation F1:  0.345


Params are:  {'criterion': 'entropy', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 3, 'random_state': 69}
F1 Measure:
Validation F1:  0.346


Params are:  {'criterion': 'entropy', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'random_state': 69}
F1 Measure:
Validation F1:  0.346


Params are:  {'criterion': 'entropy', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'random_state': 69}
F1 Measure:
Validation F1:  0.345


Params are:  {'criterion': 'entropy', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'random_state': 69}
F1 Measure:
Validation F1:  0.359


Params are:  {'criterion': 'entropy', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 2, 'random_state': 69}
F1 Measure:
Validation F1:  0.363


Params are:  {'criterion': 'entropy', 'max_depth': 100, 

F1 Measure:
Validation F1:  0.345


Params are:  {'criterion': 'entropy', 'max_depth': 500, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 3, 'random_state': 69}
F1 Measure:
Validation F1:  0.346


Params are:  {'criterion': 'entropy', 'max_depth': 500, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'random_state': 69}
F1 Measure:
Validation F1:  0.346


Params are:  {'criterion': 'entropy', 'max_depth': 500, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'random_state': 69}
F1 Measure:
Validation F1:  0.345


Params are:  {'criterion': 'entropy', 'max_depth': 500, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'random_state': 69}
F1 Measure:
Validation F1:  0.359


Params are:  {'criterion': 'entropy', 'max_depth': 500, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 2, 'random_state': 69}
F1 Measure:
Validation F1:  0.363


Params are:  {'criterion': 'entropy', 'max_depth': 500, 

F1 Measure:
Validation F1:  0.457


Params are:  {'C': 0.5, 'loss': 'squared_hinge', 'random_state': 69}
F1 Measure:
Validation F1:  0.465


Params are:  {'C': 2.0, 'loss': 'hinge', 'random_state': 69}
F1 Measure:
Validation F1:  0.439


Params are:  {'C': 2.0, 'loss': 'squared_hinge', 'random_state': 69}
F1 Measure:
Validation F1:  0.437


Params are:  {'C': 5.0, 'loss': 'hinge', 'random_state': 69}
F1 Measure:
Validation F1:  0.432


Params are:  {'C': 5.0, 'loss': 'squared_hinge', 'random_state': 69}
F1 Measure:
Validation F1:  0.443


Params are:  {'C': 50.0, 'loss': 'hinge', 'random_state': 69}
F1 Measure:
Validation F1:  0.434


Params are:  {'C': 50.0, 'loss': 'squared_hinge', 'random_state': 69}
F1 Measure:
Validation F1:  0.435




In [14]:
for classifier, params in zip(class_labels, best_params_list):
    print("Classifier: ", classifier)
    print("Best Parameters: ", params)
    print("\n")

Classifier:  Naive Bayes
Best Parameters:  {'alpha': 0.01}


Classifier:  Decision Tree
Best Parameters:  {'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'random_state': 69}


Classifier:  Linear SVM
Best Parameters:  {'C': 0.5, 'loss': 'squared_hinge', 'random_state': 69}




In [15]:
for classifier, param, class_label in zip(classifier_list, best_params_list, class_labels):
    print("Classifier", class_label, ": ")
    eval_model(classifier(**param), "YELP", "ALL")
    print("\n")

Classifier Naive Bayes : 
F1 Measure:
Train F1:  0.7478571428571429
Test F1:  0.4395
Validation F1:  0.428


Classifier Decision Tree : 
F1 Measure:
Train F1:  0.5115714285714286
Test F1:  0.3885
Validation F1:  0.413


Classifier Linear SVM : 
F1 Measure:
Train F1:  0.9931428571428571
Test F1:  0.4475
Validation F1:  0.465




### Question 3

In [16]:
from sklearn.preprocessing import Normalizer

yelp_vectorizer = CountVectorizer(max_features=10000) 

yelp_train = yelp_vectorizer.fit_transform(yelp_train_d)
yelp_test = yelp_vectorizer.transform(yelp_test_d)
yelp_valid = yelp_vectorizer.transform(yelp_valid_d)

norm = Normalizer(norm='l1')
yelp_train = norm.transform(yelp_train)
yelp_test = norm.transform(yelp_test)
yelp_valid = norm.transform(yelp_valid)

In [17]:
random.fit(yelp_train, yelp_train_labels)
majority.fit(yelp_train, yelp_train_labels)
print("Train: Dummy Uniform F1 Measure: ", f1_score(yelp_train_labels, 
                                    random.predict(yelp_train), 
                                    average="micro"))
print("Train: Dummy Majority F1 Measure: ", f1_score(yelp_train_labels, 
                                     majority.predict(yelp_train), 
                                     average="micro"))

print("Test: Dummy Uniform F1 Measure: ", f1_score(yelp_test_labels, 
                                    random.predict(yelp_test), 
                                    average="micro"))
print("Test: Dummy Majority F1 Measure: ", f1_score(yelp_test_labels, 
                                     majority.predict(yelp_test), 
                                     average="micro"))

print("Valid: Dummy Uniform F1 Measure: ", f1_score(yelp_valid_labels, 
                                    random.predict(yelp_valid), 
                                    average="micro"))
print("Valid: Dummy Majority F1 Measure: ", f1_score(yelp_valid_labels, 
                                     majority.predict(yelp_valid), 
                                     average="micro"))

Train: Dummy Uniform F1 Measure:  0.1945714285714286
Train: Dummy Majority F1 Measure:  0.3525714285714286
Test: Dummy Uniform F1 Measure:  0.2045
Test: Dummy Majority F1 Measure:  0.351
Valid: Dummy Uniform F1 Measure:  0.197
Valid: Dummy Majority F1 Measure:  0.356


In [18]:
bayes = GaussianNB
dtree = DecisionTreeClassifier

nbayes_params = ParameterGrid({})
dtree_params = ParameterGrid({'random_state': [69],
                             'criterion': ['gini','entropy'],
                             'max_features': [None, "sqrt", "log2"], 
                             'max_depth': [None, 10, 50, 100, 500],
                             'min_samples_leaf': [1, 0.2, 0.49],
                             'min_samples_split': [2, 3, 4, 5, 10]})
svm_params = ParameterGrid({'random_state': [69],
                            'loss': ['hinge','squared_hinge'],
                            'C': [.5, 2.0, 5.0, 50.0]})

classifier_list = [bayes, dtree, svm.LinearSVC]
class_labels = ["Naive Bayes", "Decision Tree", "Linear SVM"]
param_list = [nbayes_params, dtree_params, svm_params]

classifiers = zip(classifier_list, param_list, class_labels)

In [19]:
best_params_list = []
for classifier, params, labels in classifiers:
    best_score = 0
    best_params = None
    print(labels)
    for param in params:
        if param:
            print("Params are: ", param)
        score = eval_model(classifier(**param), "YELP", "VALID")
        best_score = max(score, best_score)
        if best_score == score:
            best_params = param
        print("\n")
    best_params_list.append(best_params)

Naive Bayes
F1 Measure:
Validation F1:  0.294


Decision Tree
Params are:  {'criterion': 'gini', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'random_state': 69}
F1 Measure:
Validation F1:  0.341


Params are:  {'criterion': 'gini', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 3, 'random_state': 69}
F1 Measure:
Validation F1:  0.322


Params are:  {'criterion': 'gini', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'random_state': 69}
F1 Measure:
Validation F1:  0.327


Params are:  {'criterion': 'gini', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'random_state': 69}
F1 Measure:
Validation F1:  0.338


Params are:  {'criterion': 'gini', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'random_state': 69}
F1 Measure:
Validation F1:  0.335


Params are:  {'criterion': 'gini', 'max_de

F1 Measure:
Validation F1:  0.388


Params are:  {'criterion': 'gini', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 3, 'random_state': 69}
F1 Measure:
Validation F1:  0.39


Params are:  {'criterion': 'gini', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'random_state': 69}
F1 Measure:
Validation F1:  0.386


Params are:  {'criterion': 'gini', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'random_state': 69}
F1 Measure:
Validation F1:  0.386


Params are:  {'criterion': 'gini', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'random_state': 69}
F1 Measure:
Validation F1:  0.388


Params are:  {'criterion': 'gini', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 2, 'random_state': 69}
F1 Measure:
Validation F1:  0.38499999999999995


Params are:  {'criterion': 'gini', 'max_depth': 10, 'max_featur

F1 Measure:
Validation F1:  0.35


Params are:  {'criterion': 'gini', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'random_state': 69}
F1 Measure:
Validation F1:  0.329


Params are:  {'criterion': 'gini', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'random_state': 69}
F1 Measure:
Validation F1:  0.333


Params are:  {'criterion': 'gini', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'random_state': 69}
F1 Measure:
Validation F1:  0.343


Params are:  {'criterion': 'gini', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 2, 'random_state': 69}
F1 Measure:
Validation F1:  0.38499999999999995


Params are:  {'criterion': 'gini', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 3, 'random_state': 69}
F1 Measure:
Validation F1:  0.38499999999999995


Params are:  {'criterion': 'gini', 'max_depth':

F1 Measure:
Validation F1:  0.322


Params are:  {'criterion': 'gini', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'random_state': 69}
F1 Measure:
Validation F1:  0.327


Params are:  {'criterion': 'gini', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'random_state': 69}
F1 Measure:
Validation F1:  0.338


Params are:  {'criterion': 'gini', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'random_state': 69}
F1 Measure:
Validation F1:  0.335


Params are:  {'criterion': 'gini', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 2, 'random_state': 69}
F1 Measure:
Validation F1:  0.38499999999999995


Params are:  {'criterion': 'gini', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 3, 'random_state': 69}
F1 Measure:
Validation F1:  0.38499999999999995


Params are:  {'criterion': 'gini', 'max_d

F1 Measure:
Validation F1:  0.322


Params are:  {'criterion': 'gini', 'max_depth': 500, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'random_state': 69}
F1 Measure:
Validation F1:  0.327


Params are:  {'criterion': 'gini', 'max_depth': 500, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'random_state': 69}
F1 Measure:
Validation F1:  0.338


Params are:  {'criterion': 'gini', 'max_depth': 500, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'random_state': 69}
F1 Measure:
Validation F1:  0.335


Params are:  {'criterion': 'gini', 'max_depth': 500, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 2, 'random_state': 69}
F1 Measure:
Validation F1:  0.38499999999999995


Params are:  {'criterion': 'gini', 'max_depth': 500, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 3, 'random_state': 69}
F1 Measure:
Validation F1:  0.38499999999999995


Params are:  {'criterion': 'gini', 'max_d

F1 Measure:
Validation F1:  0.338


Params are:  {'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'random_state': 69}
F1 Measure:
Validation F1:  0.344


Params are:  {'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'random_state': 69}
F1 Measure:
Validation F1:  0.346


Params are:  {'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'random_state': 69}
F1 Measure:
Validation F1:  0.338


Params are:  {'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 2, 'random_state': 69}
F1 Measure:
Validation F1:  0.366


Params are:  {'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 3, 'random_state': 69}
F1 Measure:
Validation F1:  0.366


Params are:  {'criterion': 'entropy', 'max_depth'

F1 Measure:
Validation F1:  0.387


Params are:  {'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'random_state': 69}
F1 Measure:
Validation F1:  0.39


Params are:  {'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'random_state': 69}
F1 Measure:
Validation F1:  0.384


Params are:  {'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'random_state': 69}
F1 Measure:
Validation F1:  0.39


Params are:  {'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 2, 'random_state': 69}
F1 Measure:
Validation F1:  0.366


Params are:  {'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 3, 'random_state': 69}
F1 Measure:
Validation F1:  0.366


Params are:  {'criterion': 'entropy', 'max_depth': 10, 'max_f

F1 Measure:
Validation F1:  0.335


Params are:  {'criterion': 'entropy', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'random_state': 69}
F1 Measure:
Validation F1:  0.339


Params are:  {'criterion': 'entropy', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'random_state': 69}
F1 Measure:
Validation F1:  0.334


Params are:  {'criterion': 'entropy', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'random_state': 69}
F1 Measure:
Validation F1:  0.341


Params are:  {'criterion': 'entropy', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 2, 'random_state': 69}
F1 Measure:
Validation F1:  0.366


Params are:  {'criterion': 'entropy', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 3, 'random_state': 69}
F1 Measure:
Validation F1:  0.366


Params are:  {'criterion': 'entropy', 'max_depth': 50, 'max

F1 Measure:
Validation F1:  0.338


Params are:  {'criterion': 'entropy', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'random_state': 69}
F1 Measure:
Validation F1:  0.344


Params are:  {'criterion': 'entropy', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'random_state': 69}
F1 Measure:
Validation F1:  0.346


Params are:  {'criterion': 'entropy', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'random_state': 69}
F1 Measure:
Validation F1:  0.338


Params are:  {'criterion': 'entropy', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 2, 'random_state': 69}
F1 Measure:
Validation F1:  0.366


Params are:  {'criterion': 'entropy', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 3, 'random_state': 69}
F1 Measure:
Validation F1:  0.366


Params are:  {'criterion': 'entropy', 'max_depth': 100

F1 Measure:
Validation F1:  0.338


Params are:  {'criterion': 'entropy', 'max_depth': 500, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'random_state': 69}
F1 Measure:
Validation F1:  0.344


Params are:  {'criterion': 'entropy', 'max_depth': 500, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'random_state': 69}
F1 Measure:
Validation F1:  0.346


Params are:  {'criterion': 'entropy', 'max_depth': 500, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'random_state': 69}
F1 Measure:
Validation F1:  0.338


Params are:  {'criterion': 'entropy', 'max_depth': 500, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 2, 'random_state': 69}
F1 Measure:
Validation F1:  0.366


Params are:  {'criterion': 'entropy', 'max_depth': 500, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 3, 'random_state': 69}
F1 Measure:
Validation F1:  0.366


Params are:  {'criterion': 'entropy', 'max_depth': 500

F1 Measure:
Validation F1:  0.451


Params are:  {'C': 2.0, 'loss': 'squared_hinge', 'random_state': 69}
F1 Measure:
Validation F1:  0.465


Params are:  {'C': 5.0, 'loss': 'hinge', 'random_state': 69}
F1 Measure:
Validation F1:  0.467


Params are:  {'C': 5.0, 'loss': 'squared_hinge', 'random_state': 69}
F1 Measure:
Validation F1:  0.488


Params are:  {'C': 50.0, 'loss': 'hinge', 'random_state': 69}
F1 Measure:
Validation F1:  0.497


Params are:  {'C': 50.0, 'loss': 'squared_hinge', 'random_state': 69}
F1 Measure:
Validation F1:  0.493




In [20]:
for classifier, params in zip(class_labels, best_params_list):
    print("Classifier: ", classifier)
    print("Best Parameters: ", params)
    print("\n")

Classifier:  Naive Bayes
Best Parameters:  {}


Classifier:  Decision Tree
Best Parameters:  {'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'random_state': 69}


Classifier:  Linear SVM
Best Parameters:  {'C': 50.0, 'loss': 'hinge', 'random_state': 69}




In [21]:
for classifier, param, class_label in zip(classifier_list, best_params_list, class_labels):
    print("Classifier", class_label, ": ")
    eval_model(classifier(**param), "YELP", "ALL")
    print("\n")

Classifier Naive Bayes : 
F1 Measure:
Train F1:  0.805
Test F1:  0.3075
Validation F1:  0.294


Classifier Decision Tree : 
F1 Measure:
Train F1:  0.5302857142857142
Test F1:  0.39
Validation F1:  0.391


Classifier Linear SVM : 
F1 Measure:
Train F1:  0.7327142857142859
Test F1:  0.518
Validation F1:  0.497




### Question 4

In [22]:
random.fit(imdb_train, imdb_train_labels)
majority.fit(imdb_train, imdb_train_labels)

DummyClassifier(constant=None, random_state=None, strategy='most_frequent')

In [23]:
print("Train: Dummy Uniform F1 Measure: ", f1_score(imdb_train_labels, 
                                    random.predict(imdb_train), 
                                    average="micro"))
print("Test: Dummy Uniform F1 Measure: ", f1_score(imdb_test_labels, 
                                    random.predict(imdb_test), 
                                    average="micro"))

print("Valid: Dummy Uniform F1 Measure: ", f1_score(imdb_valid_labels, 
                                    random.predict(imdb_valid), 
                                    average="micro"))

Train: Dummy Uniform F1 Measure:  0.4992
Test: Dummy Uniform F1 Measure:  0.5008
Valid: Dummy Uniform F1 Measure:  0.4979


In [24]:
bayes = BernoulliNB
dtree = DecisionTreeClassifier

nbayes_params = ParameterGrid({'alpha': [0.001, 0.01, 0.05, .1, 0.15, .5,1,2]})
dtree_params = ParameterGrid({'random_state': [69],
                             'criterion': ['gini','entropy'],
                             'max_features': [None, "sqrt", "log2"], 
                             'max_depth': [None, 10, 50, 100, 500],
                             'min_samples_leaf': [1, 0.2, 0.49],
                             'min_samples_split': [2, 3, 4, 5, 10]})
svm_params = ParameterGrid({'random_state': [69],
                            'loss': ['hinge','squared_hinge'],
                            'C': [.5, 2.0, 5.0, 50.0]})

classifier_list = [bayes, dtree, svm.LinearSVC]
class_labels = ["Naive Bayes", "Decision Tree", "Linear SVM"]
param_list = [nbayes_params, dtree_params, svm_params]

classifiers = zip(classifier_list, param_list, class_labels)

In [25]:
best_params_list = []
for classifier, params, labels in classifiers:
    best_score = 0
    best_params = None
    print(labels)
    for param in params:
        if param:
            print("Params are: ", param)
        score = eval_model(classifier(**param), "YELP", "VALID")
        best_score = max(score, best_score)
        if best_score == score:
            best_params = param
        print("\n")
    best_params_list.append(best_params)

Naive Bayes
Params are:  {'alpha': 0.001}
F1 Measure:
Validation F1:  0.421


Params are:  {'alpha': 0.01}
F1 Measure:
Validation F1:  0.425


Params are:  {'alpha': 0.05}
F1 Measure:
Validation F1:  0.418


Params are:  {'alpha': 0.1}
F1 Measure:
Validation F1:  0.402


Params are:  {'alpha': 0.15}
F1 Measure:
Validation F1:  0.396


Params are:  {'alpha': 0.5}
F1 Measure:
Validation F1:  0.391


Params are:  {'alpha': 1}
F1 Measure:
Validation F1:  0.383


Params are:  {'alpha': 2}
F1 Measure:
Validation F1:  0.37899999999999995


Decision Tree
Params are:  {'criterion': 'gini', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'random_state': 69}
F1 Measure:
Validation F1:  0.341


Params are:  {'criterion': 'gini', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 3, 'random_state': 69}
F1 Measure:
Validation F1:  0.322


Params are:  {'criterion': 'gini', 'max_depth': None, 'max_features': None, 'min_samples_

F1 Measure:
Validation F1:  0.356


Params are:  {'criterion': 'gini', 'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 0.49, 'min_samples_split': 4, 'random_state': 69}
F1 Measure:
Validation F1:  0.356


Params are:  {'criterion': 'gini', 'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 0.49, 'min_samples_split': 5, 'random_state': 69}
F1 Measure:
Validation F1:  0.356


Params are:  {'criterion': 'gini', 'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 0.49, 'min_samples_split': 10, 'random_state': 69}
F1 Measure:
Validation F1:  0.356


Params are:  {'criterion': 'gini', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'random_state': 69}
F1 Measure:
Validation F1:  0.388


Params are:  {'criterion': 'gini', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 3, 'random_state': 69}
F1 Measure:
Validation F1:  0.39


Params are:  {'criterion': 'gini', 'max_depth': 10, 'max_f

F1 Measure:
Validation F1:  0.35


Params are:  {'criterion': 'gini', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 3, 'random_state': 69}
F1 Measure:
Validation F1:  0.35


Params are:  {'criterion': 'gini', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'random_state': 69}
F1 Measure:
Validation F1:  0.329


Params are:  {'criterion': 'gini', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'random_state': 69}
F1 Measure:
Validation F1:  0.333


Params are:  {'criterion': 'gini', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'random_state': 69}
F1 Measure:
Validation F1:  0.343


Params are:  {'criterion': 'gini', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 2, 'random_state': 69}
F1 Measure:
Validation F1:  0.38499999999999995


Params are:  {'criterion': 'gini', 'max_depth': 50, 'max_feature

F1 Measure:
Validation F1:  0.322


Params are:  {'criterion': 'gini', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'random_state': 69}
F1 Measure:
Validation F1:  0.327


Params are:  {'criterion': 'gini', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'random_state': 69}
F1 Measure:
Validation F1:  0.338


Params are:  {'criterion': 'gini', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'random_state': 69}
F1 Measure:
Validation F1:  0.335


Params are:  {'criterion': 'gini', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 2, 'random_state': 69}
F1 Measure:
Validation F1:  0.38499999999999995


Params are:  {'criterion': 'gini', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 3, 'random_state': 69}
F1 Measure:
Validation F1:  0.38499999999999995


Params are:  {'criterion': 'gini', 'max_d

F1 Measure:
Validation F1:  0.322


Params are:  {'criterion': 'gini', 'max_depth': 500, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'random_state': 69}
F1 Measure:
Validation F1:  0.327


Params are:  {'criterion': 'gini', 'max_depth': 500, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'random_state': 69}
F1 Measure:
Validation F1:  0.338


Params are:  {'criterion': 'gini', 'max_depth': 500, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'random_state': 69}
F1 Measure:
Validation F1:  0.335


Params are:  {'criterion': 'gini', 'max_depth': 500, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 2, 'random_state': 69}
F1 Measure:
Validation F1:  0.38499999999999995


Params are:  {'criterion': 'gini', 'max_depth': 500, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 3, 'random_state': 69}
F1 Measure:
Validation F1:  0.38499999999999995


Params are:  {'criterion': 'gini', 'max_d

F1 Measure:
Validation F1:  0.338


Params are:  {'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'random_state': 69}
F1 Measure:
Validation F1:  0.344


Params are:  {'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'random_state': 69}
F1 Measure:
Validation F1:  0.346


Params are:  {'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'random_state': 69}
F1 Measure:
Validation F1:  0.338


Params are:  {'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 2, 'random_state': 69}
F1 Measure:
Validation F1:  0.366


Params are:  {'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 3, 'random_state': 69}
F1 Measure:
Validation F1:  0.366


Params are:  {'criterion': 'entropy', 'max_depth'

F1 Measure:
Validation F1:  0.387


Params are:  {'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'random_state': 69}
F1 Measure:
Validation F1:  0.39


Params are:  {'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'random_state': 69}
F1 Measure:
Validation F1:  0.384


Params are:  {'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'random_state': 69}
F1 Measure:
Validation F1:  0.39


Params are:  {'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 2, 'random_state': 69}
F1 Measure:
Validation F1:  0.366


Params are:  {'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 3, 'random_state': 69}
F1 Measure:
Validation F1:  0.366


Params are:  {'criterion': 'entropy', 'max_depth': 10, 'max_f

F1 Measure:
Validation F1:  0.335


Params are:  {'criterion': 'entropy', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'random_state': 69}
F1 Measure:
Validation F1:  0.339


Params are:  {'criterion': 'entropy', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'random_state': 69}
F1 Measure:
Validation F1:  0.334


Params are:  {'criterion': 'entropy', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'random_state': 69}
F1 Measure:
Validation F1:  0.341


Params are:  {'criterion': 'entropy', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 2, 'random_state': 69}
F1 Measure:
Validation F1:  0.366


Params are:  {'criterion': 'entropy', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 3, 'random_state': 69}
F1 Measure:
Validation F1:  0.366


Params are:  {'criterion': 'entropy', 'max_depth': 50, 'max

F1 Measure:
Validation F1:  0.338


Params are:  {'criterion': 'entropy', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'random_state': 69}
F1 Measure:
Validation F1:  0.344


Params are:  {'criterion': 'entropy', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'random_state': 69}
F1 Measure:
Validation F1:  0.346


Params are:  {'criterion': 'entropy', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'random_state': 69}
F1 Measure:
Validation F1:  0.338


Params are:  {'criterion': 'entropy', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 2, 'random_state': 69}
F1 Measure:
Validation F1:  0.366


Params are:  {'criterion': 'entropy', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 3, 'random_state': 69}
F1 Measure:
Validation F1:  0.366


Params are:  {'criterion': 'entropy', 'max_depth': 100

F1 Measure:
Validation F1:  0.338


Params are:  {'criterion': 'entropy', 'max_depth': 500, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'random_state': 69}
F1 Measure:
Validation F1:  0.344


Params are:  {'criterion': 'entropy', 'max_depth': 500, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'random_state': 69}
F1 Measure:
Validation F1:  0.346


Params are:  {'criterion': 'entropy', 'max_depth': 500, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'random_state': 69}
F1 Measure:
Validation F1:  0.338


Params are:  {'criterion': 'entropy', 'max_depth': 500, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 2, 'random_state': 69}
F1 Measure:
Validation F1:  0.366


Params are:  {'criterion': 'entropy', 'max_depth': 500, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 3, 'random_state': 69}
F1 Measure:
Validation F1:  0.366


Params are:  {'criterion': 'entropy', 'max_depth': 500

F1 Measure:
Validation F1:  0.451


Params are:  {'C': 2.0, 'loss': 'squared_hinge', 'random_state': 69}
F1 Measure:
Validation F1:  0.465


Params are:  {'C': 5.0, 'loss': 'hinge', 'random_state': 69}
F1 Measure:
Validation F1:  0.467


Params are:  {'C': 5.0, 'loss': 'squared_hinge', 'random_state': 69}
F1 Measure:
Validation F1:  0.488


Params are:  {'C': 50.0, 'loss': 'hinge', 'random_state': 69}
F1 Measure:
Validation F1:  0.497


Params are:  {'C': 50.0, 'loss': 'squared_hinge', 'random_state': 69}
F1 Measure:
Validation F1:  0.493




In [26]:
for classifier, params in zip(class_labels, best_params_list):
    print("Classifier: ", classifier)
    print("Best Parameters: ", params)
    print("\n")
    
for classifier, param, class_label in zip(classifier_list, best_params_list, class_labels):
    print("Classifier", class_label, ": ")
    eval_model(classifier(**param), "IMDB", "ALL")
    print("\n")

Classifier:  Naive Bayes
Best Parameters:  {'alpha': 0.01}


Classifier:  Decision Tree
Best Parameters:  {'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'random_state': 69}


Classifier:  Linear SVM
Best Parameters:  {'C': 50.0, 'loss': 'hinge', 'random_state': 69}


Classifier Naive Bayes : 
F1 Measure:
Train F1:  0.8718817787418656
Test F1:  0.8318656900666611
Validation F1:  0.8423615337796714


Classifier Decision Tree : 
F1 Measure:
Train F1:  0.7681204120658812
Test F1:  0.7304399173309714
Validation F1:  0.7329089399391089


Classifier Linear SVM : 
F1 Measure:
Train F1:  1.0
Test F1:  0.8307865529998392
Validation F1:  0.8381543389050146




### Question 5

In [27]:
imdb_vectorizer = CountVectorizer(max_features=10000)

imdb_train = imdb_vectorizer.fit_transform(imdb_train_d)
imdb_test = imdb_vectorizer.transform(imdb_test_d)
imdb_valid = imdb_vectorizer.transform(imdb_valid_d)

imdb_train = norm.transform(imdb_train)
imdb_test = norm.transform(imdb_test)
imdb_valid = norm.transform(imdb_valid)

In [28]:
random.fit(imdb_train, imdb_train_labels)
majority.fit(imdb_train, imdb_train_labels)
print("Train: Dummy Uniform F1 Measure: ", f1_score(imdb_train_labels, 
                                    random.predict(imdb_train), 
                                    average="micro"))
print("Test: Dummy Uniform F1 Measure: ", f1_score(imdb_test_labels, 
                                    random.predict(imdb_test), 
                                    average="micro"))

print("Valid: Dummy Uniform F1 Measure: ", f1_score(imdb_valid_labels, 
                                    random.predict(imdb_valid), 
                                    average="micro"))

Train: Dummy Uniform F1 Measure:  0.4992
Test: Dummy Uniform F1 Measure:  0.5008
Valid: Dummy Uniform F1 Measure:  0.4979


In [29]:
bayes = GaussianNB
dtree = DecisionTreeClassifier

nbayes_params = ParameterGrid({})
dtree_params = ParameterGrid({'random_state': [69],
                             'criterion': ['gini','entropy'],
                             'max_features': [None, "sqrt", "log2"], 
                             'max_depth': [None, 10, 50, 100, 500],
                             'min_samples_leaf': [1, 0.2, 0.49],
                             'min_samples_split': [2, 3, 4, 5, 10]})
svm_params = ParameterGrid({'random_state': [69],
                            'loss': ['hinge','squared_hinge'],
                            'C': [.5, 2.0, 5.0, 50.0]})

classifier_list = [bayes, dtree, svm.LinearSVC]
class_labels = ["Naive Bayes", "Decision Tree", "Linear SVM"]
param_list = [nbayes_params, dtree_params, svm_params]

classifiers = zip(classifier_list, param_list, class_labels)

In [30]:
best_params_list = []
for classifier, params, labels in classifiers:
    best_score = 0
    best_params = None
    print(labels)
    for param in params:
        if param:
            print("Params are: ", param)
        score = eval_model(classifier(**param), "YELP", "VALID")
        best_score = max(score, best_score)
        if best_score == score:
            best_params = param
        print("\n")
    best_params_list.append(best_params)

Naive Bayes
F1 Measure:
Validation F1:  0.294


Decision Tree
Params are:  {'criterion': 'gini', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'random_state': 69}
F1 Measure:
Validation F1:  0.341


Params are:  {'criterion': 'gini', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 3, 'random_state': 69}
F1 Measure:
Validation F1:  0.322


Params are:  {'criterion': 'gini', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'random_state': 69}
F1 Measure:
Validation F1:  0.327


Params are:  {'criterion': 'gini', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'random_state': 69}
F1 Measure:
Validation F1:  0.338


Params are:  {'criterion': 'gini', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'random_state': 69}
F1 Measure:
Validation F1:  0.335


Params are:  {'criterion': 'gini', 'max_de

F1 Measure:
Validation F1:  0.388


Params are:  {'criterion': 'gini', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 3, 'random_state': 69}
F1 Measure:
Validation F1:  0.39


Params are:  {'criterion': 'gini', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'random_state': 69}
F1 Measure:
Validation F1:  0.386


Params are:  {'criterion': 'gini', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'random_state': 69}
F1 Measure:
Validation F1:  0.386


Params are:  {'criterion': 'gini', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'random_state': 69}
F1 Measure:
Validation F1:  0.388


Params are:  {'criterion': 'gini', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 2, 'random_state': 69}
F1 Measure:
Validation F1:  0.38499999999999995


Params are:  {'criterion': 'gini', 'max_depth': 10, 'max_featur

F1 Measure:
Validation F1:  0.35


Params are:  {'criterion': 'gini', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'random_state': 69}
F1 Measure:
Validation F1:  0.329


Params are:  {'criterion': 'gini', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'random_state': 69}
F1 Measure:
Validation F1:  0.333


Params are:  {'criterion': 'gini', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'random_state': 69}
F1 Measure:
Validation F1:  0.343


Params are:  {'criterion': 'gini', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 2, 'random_state': 69}
F1 Measure:
Validation F1:  0.38499999999999995


Params are:  {'criterion': 'gini', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 3, 'random_state': 69}
F1 Measure:
Validation F1:  0.38499999999999995


Params are:  {'criterion': 'gini', 'max_depth':

F1 Measure:
Validation F1:  0.322


Params are:  {'criterion': 'gini', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'random_state': 69}
F1 Measure:
Validation F1:  0.327


Params are:  {'criterion': 'gini', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'random_state': 69}
F1 Measure:
Validation F1:  0.338


Params are:  {'criterion': 'gini', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'random_state': 69}
F1 Measure:
Validation F1:  0.335


Params are:  {'criterion': 'gini', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 2, 'random_state': 69}
F1 Measure:
Validation F1:  0.38499999999999995


Params are:  {'criterion': 'gini', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 3, 'random_state': 69}
F1 Measure:
Validation F1:  0.38499999999999995


Params are:  {'criterion': 'gini', 'max_d

F1 Measure:
Validation F1:  0.322


Params are:  {'criterion': 'gini', 'max_depth': 500, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'random_state': 69}
F1 Measure:
Validation F1:  0.327


Params are:  {'criterion': 'gini', 'max_depth': 500, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'random_state': 69}
F1 Measure:
Validation F1:  0.338


Params are:  {'criterion': 'gini', 'max_depth': 500, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'random_state': 69}
F1 Measure:
Validation F1:  0.335


Params are:  {'criterion': 'gini', 'max_depth': 500, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 2, 'random_state': 69}
F1 Measure:
Validation F1:  0.38499999999999995


Params are:  {'criterion': 'gini', 'max_depth': 500, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 3, 'random_state': 69}
F1 Measure:
Validation F1:  0.38499999999999995


Params are:  {'criterion': 'gini', 'max_d

F1 Measure:
Validation F1:  0.338


Params are:  {'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'random_state': 69}
F1 Measure:
Validation F1:  0.344


Params are:  {'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'random_state': 69}
F1 Measure:
Validation F1:  0.346


Params are:  {'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'random_state': 69}
F1 Measure:
Validation F1:  0.338


Params are:  {'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 2, 'random_state': 69}
F1 Measure:
Validation F1:  0.366


Params are:  {'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 3, 'random_state': 69}
F1 Measure:
Validation F1:  0.366


Params are:  {'criterion': 'entropy', 'max_depth'

F1 Measure:
Validation F1:  0.387


Params are:  {'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'random_state': 69}
F1 Measure:
Validation F1:  0.39


Params are:  {'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'random_state': 69}
F1 Measure:
Validation F1:  0.384


Params are:  {'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'random_state': 69}
F1 Measure:
Validation F1:  0.39


Params are:  {'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 2, 'random_state': 69}
F1 Measure:
Validation F1:  0.366


Params are:  {'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 3, 'random_state': 69}
F1 Measure:
Validation F1:  0.366


Params are:  {'criterion': 'entropy', 'max_depth': 10, 'max_f

F1 Measure:
Validation F1:  0.335


Params are:  {'criterion': 'entropy', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'random_state': 69}
F1 Measure:
Validation F1:  0.339


Params are:  {'criterion': 'entropy', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'random_state': 69}
F1 Measure:
Validation F1:  0.334


Params are:  {'criterion': 'entropy', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'random_state': 69}
F1 Measure:
Validation F1:  0.341


Params are:  {'criterion': 'entropy', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 2, 'random_state': 69}
F1 Measure:
Validation F1:  0.366


Params are:  {'criterion': 'entropy', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 3, 'random_state': 69}
F1 Measure:
Validation F1:  0.366


Params are:  {'criterion': 'entropy', 'max_depth': 50, 'max

F1 Measure:
Validation F1:  0.338


Params are:  {'criterion': 'entropy', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'random_state': 69}
F1 Measure:
Validation F1:  0.344


Params are:  {'criterion': 'entropy', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'random_state': 69}
F1 Measure:
Validation F1:  0.346


Params are:  {'criterion': 'entropy', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'random_state': 69}
F1 Measure:
Validation F1:  0.338


Params are:  {'criterion': 'entropy', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 2, 'random_state': 69}
F1 Measure:
Validation F1:  0.366


Params are:  {'criterion': 'entropy', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 3, 'random_state': 69}
F1 Measure:
Validation F1:  0.366


Params are:  {'criterion': 'entropy', 'max_depth': 100

F1 Measure:
Validation F1:  0.338


Params are:  {'criterion': 'entropy', 'max_depth': 500, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'random_state': 69}
F1 Measure:
Validation F1:  0.344


Params are:  {'criterion': 'entropy', 'max_depth': 500, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'random_state': 69}
F1 Measure:
Validation F1:  0.346


Params are:  {'criterion': 'entropy', 'max_depth': 500, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'random_state': 69}
F1 Measure:
Validation F1:  0.338


Params are:  {'criterion': 'entropy', 'max_depth': 500, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 2, 'random_state': 69}
F1 Measure:
Validation F1:  0.366


Params are:  {'criterion': 'entropy', 'max_depth': 500, 'max_features': None, 'min_samples_leaf': 0.2, 'min_samples_split': 3, 'random_state': 69}
F1 Measure:
Validation F1:  0.366


Params are:  {'criterion': 'entropy', 'max_depth': 500

F1 Measure:
Validation F1:  0.451


Params are:  {'C': 2.0, 'loss': 'squared_hinge', 'random_state': 69}
F1 Measure:
Validation F1:  0.465


Params are:  {'C': 5.0, 'loss': 'hinge', 'random_state': 69}
F1 Measure:
Validation F1:  0.467


Params are:  {'C': 5.0, 'loss': 'squared_hinge', 'random_state': 69}
F1 Measure:
Validation F1:  0.488


Params are:  {'C': 50.0, 'loss': 'hinge', 'random_state': 69}
F1 Measure:
Validation F1:  0.497


Params are:  {'C': 50.0, 'loss': 'squared_hinge', 'random_state': 69}
F1 Measure:
Validation F1:  0.493




In [31]:
for classifier, params in zip(class_labels, best_params_list):
    print("Classifier: ", classifier)
    print("Best Parameters: ", params)
    print("\n")
    
for classifier, param, class_label in zip(classifier_list, best_params_list, class_labels):
    print("Classifier", class_label, ": ")
    eval_model(classifier(**param), "IMDB", "ALL")
    print("\n")

Classifier:  Naive Bayes
Best Parameters:  {}


Classifier:  Decision Tree
Best Parameters:  {'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'random_state': 69}


Classifier:  Linear SVM
Best Parameters:  {'C': 50.0, 'loss': 'hinge', 'random_state': 69}


Classifier Naive Bayes : 
F1 Measure:
Train F1:  0.857504749841672
Test F1:  0.6629839176727403
Validation F1:  0.748626953950148


Classifier Decision Tree : 
F1 Measure:
Train F1:  0.769124046162578
Test F1:  0.7244016964556194
Validation F1:  0.7267712791250595


Classifier Linear SVM : 
F1 Measure:
Train F1:  0.8977054817165907
Test F1:  0.866031746031746
Validation F1:  0.8646623952686052


