In [1]:
import sys
from re import escape
from config import *
from svm import *
from ensembling import *
from utils import *
import warnings
warnings.filterwarnings('ignore')

pre_config= PRE_PROCESSING_CONFIG
config_name= "AdaBoost"
required_config_for_preprocessing = pre_config[config_name]


In [2]:
## initially I manipulated threshold variance by hand because i didnt think of changing utils
# later I changed utils in order to incorporate that into grid search


if __name__ == '__main__':
    # Preprocess the datasets
    train_processor = MNISTPreprocessor('./dataset/train', required_config_for_preprocessing)
    train_X, train_y = train_processor.get_all_data()
    train_X, train_y = filter_dataset(train_X, train_y, ENTRY_NUMBER_LAST_DIGIT)
    train_y = convert_labels_to_svm_labels(train_y, ENTRY_NUMBER_LAST_DIGIT)
    train_dict = {"X": train_X, "y": train_y}

    val_processor = MNISTPreprocessor('./dataset/val', required_config_for_preprocessing)
    val_X, val_y = val_processor.get_all_data()
    val_X, val_y = filter_dataset(val_X, val_y, ENTRY_NUMBER_LAST_DIGIT)
    val_y = convert_labels_to_svm_labels(val_y, ENTRY_NUMBER_LAST_DIGIT)
    val_dict = {"X": val_X, "y": val_y}

In [3]:
# Optimal number of trees in AdaBoost

train_dataset = train_dict
val_dataset = val_dict

best_score = 0
best_model = None
for n_trees in [36]:
    model=AdaBoostClassifier(num_trees=n_trees)
    model.fit(train_dataset["X"], train_dataset["y"])
    val_results = model.predict(val_dataset['X'])
    val_score_model, accuracy = val_score(val_dataset['y'], val_results)
    print(f"Score for {n_trees} trees: {val_score_model}")
    print(f"Accuracy for {n_trees} trees: {accuracy}")
    
    
# validation f1 score strictly increases with increase in number of trees
# variance 0.9 with 40 trees gave validation score of 0.89 and with 10 trees gave 0.856. However n trees took about 12*n seconds.
# variance 0.4 with 40 trees gave validation score of 0.857 and with 10 trees gave 0.852. However n trees took about n/10 seconds.
# variance 0.7 with 35 trees gave validation score of 0.86 taking 40 seconds.
# 0.8 with 40 trees gives 0.8737 but takes 140 seconds
# 0.8 with 35 gives 0.8739 but takes 1m 54 secs
# 0.8 with 37 gives 0.8736 but takes 120 sec
       







Fitting with 36 trees
Score for 36 trees: 0.8752125255048
Accuracy for 36 trees: 0.914375


In [None]:
# random forest gridsearch.  
# all the parameters were not feasible at the same time hence I first optimized forest parameters and later tree parameters (assuming they have less correlation)
# HAD TO MODIFY UTILS IN ORDER TO RUN THESE AND FURTHER CELLS
best_model=[]
best_score=0

for thr in [0.35]:
    train_processor = MNISTPreprocessor('./dataset/train', required_config_for_preprocessing, threshold=thr)
    train_X, train_y = train_processor.get_all_data()
    train_X, train_y = filter_dataset(train_X, train_y, ENTRY_NUMBER_LAST_DIGIT)
    train_y = convert_labels_to_svm_labels(train_y, ENTRY_NUMBER_LAST_DIGIT)
    train_dict = {"X": train_X, "y": train_y}

    val_processor = MNISTPreprocessor('./dataset/val', required_config_for_preprocessing, threshold=thr)
    val_X, val_y = val_processor.get_all_data()
    val_X, val_y = filter_dataset(val_X, val_y, ENTRY_NUMBER_LAST_DIGIT)
    val_y = convert_labels_to_svm_labels(val_y, ENTRY_NUMBER_LAST_DIGIT)
    val_dict = {"X": val_X, "y": val_y}
    
    
    for n_trees in [20]:
        for bootstrap_fraction in [0.3]:
            for weighted_forest in [True]:
                for min_samples_split in [7]:
                    for max_depth in [20]:
                        for min_gain in [0.2]:
                            model=RandomForestClassifier(num_trees=n_trees, bootstrap_fraction=bootstrap_fraction,min_samples_split=min_samples_split,max_depth=max_depth,min_gain=min_gain, weighted_forest=weighted_forest)
                            model.fit(train_dict["X"], train_dict["y"])
                            val_results = model.predict(val_dict['X'])
                            val_score_model = val_score(val_dict['y'], val_results)
                            if val_score_model > best_score:
                                best_score = val_score_model
                                best_model = [min_samples_split,max_depth,min_gain,weighted_forest,best_score]
                                print(best_model[0],best_model[1],best_model[2],best_model[3],best_model[4])
 # 0.4 5 0.3 True 0.8595407667322156
 # 0.35 20 0.4 True 0.8594469888605343
 # 0.5 3 0.1 True 0.8593139894651725
 # 0.3 3 0.2 True 0.8583072642147459
 # 0.3 3 0.3 True 0.8578628030262498
 # 0.3 20 0.1 True 0.8575444219737729
 # 0.6 10 0.1 True 0.8561717370019466
 # 0.3 10 0.5 True 0.854487667351149
 # 0.3 5 0.5 True 0.8543867058638921
 

Fitting with 20 trees
7 20 0.1 True 0.8540294838976275
Fitting with 20 trees
7 20 0.2 True 0.8575754782872846
Fitting with 20 trees
Fitting with 20 trees
Fitting with 20 trees
Fitting with 20 trees


In [None]:
best_model=[]
best_score=0

for thr in [0.2,0.4,0.6,0.8]:
    train_processor = MNISTPreprocessor('./dataset/train', required_config_for_preprocessing, threshold=thr)
    train_X, train_y = train_processor.get_all_data()
    train_X, train_y = filter_dataset(train_X, train_y, ENTRY_NUMBER_LAST_DIGIT)
    train_y = convert_labels_to_svm_labels(train_y, ENTRY_NUMBER_LAST_DIGIT)
    train_dict = {"X": train_X, "y": train_y}

    val_processor = MNISTPreprocessor('./dataset/val', required_config_for_preprocessing, threshold=thr)
    val_X, val_y = val_processor.get_all_data()
    val_X, val_y = filter_dataset(val_X, val_y, ENTRY_NUMBER_LAST_DIGIT)
    val_y = convert_labels_to_svm_labels(val_y, ENTRY_NUMBER_LAST_DIGIT)
    val_dict = {"X": val_X, "y": val_y}
    
    model=SoftMarginSVMQP(C=1e9, kernel='linear')
    model.fit(train_dict["X"], train_dict["y"])
    val_results = model.predict(val_dict['X'])
    val_score_model = val_score(val_dict['y'], val_results)
    if val_score_model > best_score:
        best_score = val_score_model
        best_model = [thr,best_score]
        print(best_model[0],best_model[1])
    
    

In [None]:
best_model=[]
best_score=0

for thr in [0.2,0.4,0.6,0.8]:
    train_processor = MNISTPreprocessor('./dataset/train', required_config_for_preprocessing, threshold=thr)
    train_X, train_y = train_processor.get_all_data()
    train_X, train_y = filter_dataset(train_X, train_y, ENTRY_NUMBER_LAST_DIGIT)
    train_y = convert_labels_to_svm_labels(train_y, ENTRY_NUMBER_LAST_DIGIT)
    train_dict = {"X": train_X, "y": train_y}

    val_processor = MNISTPreprocessor('./dataset/val', required_config_for_preprocessing, threshold=thr)
    val_X, val_y = val_processor.get_all_data()
    val_X, val_y = filter_dataset(val_X, val_y, ENTRY_NUMBER_LAST_DIGIT)
    val_y = convert_labels_to_svm_labels(val_y, ENTRY_NUMBER_LAST_DIGIT)
    val_dict = {"X": val_X, "y": val_y}
    
    for gama in [1e-2,1e-1,1,1e1]:
    
        model=SoftMarginSVMQP(C=1e9, kernel='rbf',gamma=gama)
        model.fit(train_dict["X"], train_dict["y"])
        val_results = model.predict(val_dict['X'])
        val_score_model = val_score(val_dict['y'], val_results)
        if val_score_model > best_score:
            best_score = val_score_model
            best_model = [thr,best_score,gama]
            print(best_model[0],best_model[1],best_model[2])
    
    

In [None]:
best_model=[]
best_score=0

for thr in [0.2,0.4,0.6,0.8]:
    train_processor = MNISTPreprocessor('./dataset/train', required_config_for_preprocessing, threshold=thr)
    train_X, train_y = train_processor.get_all_data()
    train_X, train_y = filter_dataset(train_X, train_y, ENTRY_NUMBER_LAST_DIGIT)
    train_y = convert_labels_to_svm_labels(train_y, ENTRY_NUMBER_LAST_DIGIT)
    train_dict = {"X": train_X, "y": train_y}

    val_processor = MNISTPreprocessor('./dataset/val', required_config_for_preprocessing, threshold=thr)
    val_X, val_y = val_processor.get_all_data()
    val_X, val_y = filter_dataset(val_X, val_y, ENTRY_NUMBER_LAST_DIGIT)
    val_y = convert_labels_to_svm_labels(val_y, ENTRY_NUMBER_LAST_DIGIT)
    val_dict = {"X": val_X, "y": val_y}
    
    for c in [1e-2,1e-1,1,1e1,1e2]:
    
        model=SoftMarginSVMQP(C=c, kernel='linear')
        model.fit(train_dict["X"], train_dict["y"])
        val_results = model.predict(val_dict['X'])
        val_score_model = val_score(val_dict['y'], val_results)
        if val_score_model > best_score:
            best_score = val_score_model
            best_model = [thr,best_score,c]
            print(best_model[0],best_model[1],best_model[c])
    
    

In [None]:
best_model=[]
best_score=0

for thr in [0.2,0.4,0.6,0.8]:
    train_processor = MNISTPreprocessor('./dataset/train', required_config_for_preprocessing, threshold=thr)
    train_X, train_y = train_processor.get_all_data()
    train_X, train_y = filter_dataset(train_X, train_y, ENTRY_NUMBER_LAST_DIGIT)
    train_y = convert_labels_to_svm_labels(train_y, ENTRY_NUMBER_LAST_DIGIT)
    train_dict = {"X": train_X, "y": train_y}

    val_processor = MNISTPreprocessor('./dataset/val', required_config_for_preprocessing, threshold=thr)
    val_X, val_y = val_processor.get_all_data()
    val_X, val_y = filter_dataset(val_X, val_y, ENTRY_NUMBER_LAST_DIGIT)
    val_y = convert_labels_to_svm_labels(val_y, ENTRY_NUMBER_LAST_DIGIT)
    val_dict = {"X": val_X, "y": val_y}
    
    for c in [1e-2,1e-1,1,1e1,1e2]:
        for gama in [1e-2,1e-1,1,1e1]:
    
            model=SoftMarginSVMQP(C=c, kernel='linear')
            model.fit(train_dict["X"], train_dict["y"])
            val_results = model.predict(val_dict['X'])
            val_score_model = val_score(val_dict['y'], val_results)
            if val_score_model > best_score:
                best_score = val_score_model
                best_model = [thr,best_score,c,gama]
                print(best_model[0],best_model[1],best_model[2],best_model[3])
    
    

In [4]:
if __name__ == '__main__':
    # Preprocess the datasets
    train_processor = MNISTPreprocessor('./dataset/train', required_config_for_preprocessing)
    train_X, train_y = train_processor.get_all_data()
    train_X, train_y = filter_dataset(train_X, train_y, ENTRY_NUMBER_LAST_DIGIT)
    train_y = convert_labels_to_svm_labels(train_y, ENTRY_NUMBER_LAST_DIGIT)
    train_dict = {"X": train_X, "y": train_y}

    val_processor = MNISTPreprocessor('./dataset/val', required_config_for_preprocessing)
    val_X, val_y = val_processor.get_all_data()
    val_X, val_y = filter_dataset(val_X, val_y, ENTRY_NUMBER_LAST_DIGIT)
    val_y = convert_labels_to_svm_labels(val_y, ENTRY_NUMBER_LAST_DIGIT)
    val_dict = {"X": val_X, "y": val_y}
    
    model=AdaBoostClassifier(num_trees=36)
    model.fit(train_dict["X"], train_dict["y"])
    val_results = model.predict(val_dict['X'])
    wrong=[]
    for i in range(len(val_results)):
        if val_results[i]!=train_dict["y"][i]:
             wrong.append(i)
        if(len(wrong)==4):
            break
    

Fitting with 36 trees


In [5]:
print(wrong)

[1051, 1200, 1201, 1202]


In [4]:
pre_config= PRE_PROCESSING_CONFIG
config_name= "RandomForest"
required_config_for_preprocessing = pre_config[config_name]

if __name__ == '__main__':
    # Preprocess the datasets
    train_processor = MNISTPreprocessor('./dataset/train', required_config_for_preprocessing)
    train_X, train_y = train_processor.get_all_data()
    train_X, train_y = filter_dataset(train_X, train_y, ENTRY_NUMBER_LAST_DIGIT)
    train_y = convert_labels_to_svm_labels(train_y, ENTRY_NUMBER_LAST_DIGIT)
    train_dict = {"X": train_X, "y": train_y}

    val_processor = MNISTPreprocessor('./dataset/val', required_config_for_preprocessing)
    val_X, val_y = val_processor.get_all_data()
    val_X, val_y = filter_dataset(val_X, val_y, ENTRY_NUMBER_LAST_DIGIT)
    val_y = convert_labels_to_svm_labels(val_y, ENTRY_NUMBER_LAST_DIGIT)
    val_dict = {"X": val_X, "y": val_y}
    
    model=RandomForestClassifier(num_trees=20,bootstrap_fraction=0.3,min_samples_split=7,max_depth=20,min_gain=0.2, weighted_forest=True)
    model.fit(train_dict["X"], train_dict["y"])
    val_results = model.predict(val_dict['X'])
    val_score_model,accuracy = val_score(val_dict['y'], val_results)
    
    wrong=[]
    for i in range(len(val_results)):
        if val_results[i]!=train_dict["y"][i]:
             wrong.append(i)
        if(len(wrong)==4):
            break
    print(wrong)

Fitting with 20 trees
[1200, 1201, 1202, 1203]


In [None]:
pre_config= PRE_PROCESSING_CONFIG
config_name= "hard_margin_linear"
required_config_for_preprocessing = pre_config[config_name]

if __name__ == '__main__':
    # Preprocess the datasets
    train_processor = MNISTPreprocessor('./dataset/train', required_config_for_preprocessing)
    train_X, train_y = train_processor.get_all_data()
    train_X, train_y = filter_dataset(train_X, train_y, ENTRY_NUMBER_LAST_DIGIT)
    train_y = convert_labels_to_svm_labels(train_y, ENTRY_NUMBER_LAST_DIGIT)
    train_dict = {"X": train_X, "y": train_y}

    val_processor = MNISTPreprocessor('./dataset/val', required_config_for_preprocessing)
    val_X, val_y = val_processor.get_all_data()
    val_X, val_y = filter_dataset(val_X, val_y, ENTRY_NUMBER_LAST_DIGIT)
    val_y = convert_labels_to_svm_labels(val_y, ENTRY_NUMBER_LAST_DIGIT)
    val_dict = {"X": val_X, "y": val_y}
    
    model=RandomForestClassifier(num_trees=20,bootstrap_fraction=0.3,min_samples_split=7,max_depth=20,min_gain=0.2, weighted_forest=True)
    model.fit(train_dict["X"], train_dict["y"])
    val_results = model.predict(val_dict['X'])
    wrong=[]
    for i in range(len(val_results)):
        if val_results[i]!=train_dict["y"][i]:
             wrong.append(i)
        if(len(wrong)==4):
            break
    print(wrong)