In [1]:
import numpy as np
import pandas as pd
import copy
import sklearn.metrics
from sklearn.tree import DecisionTreeClassifier

In [20]:
def precision_recall_f1(preds, truth):
    tp, fp, tn, fn = 0, 0, 0, 0
    for j in range(len(preds)):
        if truth[j] == 1 and preds[j] == 1:
            tp += 1
        elif truth[j] == 1:
            fn += 1
        elif preds[j] == 1:
            fp += 1
        else:
            tn += 1
    if tp + fp > 0:
        precision = tp / (tp + fp)
    else:
        precision = 0
    recall = tp / (tp + fn)
    if precision + recall > 0:
        f1 = 2 * precision * recall / (precision + recall)
    else: 
        f1 = 0
    return (precision, recall, f1)

In [2]:
df = pd.read_csv('data/YearPredictionMSD.txt', header=None)

In [3]:
for year in range(1922, 2011):
    #print("Songs by year " + str(year) + ": " + str(len(df[df[0] <= year])))
    print("Proportion by year " + str(year) + ": " + str(len(df[df[0] <= year])/len(df)))

Proportion by year 1922: 1.1642685967652738e-05
Proportion by year 1923: 1.1642685967652738e-05
Proportion by year 1924: 2.1344924274030017e-05
Proportion by year 1925: 3.4928057902958214e-05
Proportion by year 1926: 7.179656346719188e-05
Proportion by year 1927: 0.00015329536524076105
Proportion by year 1928: 0.00025419864362708476
Proportion by year 1929: 0.0004346602761257022
Proportion by year 1930: 0.0005122781825767205
Proportion by year 1931: 0.0005801938507213614
Proportion by year 1932: 0.0006015387749953914
Proportion by year 1933: 0.0006131814609630442
Proportion by year 1934: 0.0006694544431400324
Proportion by year 1935: 0.0007160251870106433
Proportion by year 1936: 0.0007645363785425298
Proportion by year 1937: 0.0008188689130582426
Proportion by year 1938: 0.0008557374186224762
Proportion by year 1939: 0.0009236530867671171
Proportion by year 1940: 0.001024556365153441
Proportion by year 1941: 0.0010866506903142554
Proportion by year 1942: 0.0011332214341848665
Proporti

In [4]:
year = df[0]
pd.set_option('display.max_rows', None)
print(year.value_counts().sort_index())
pd.reset_option('display.max_rows')

1922        6
1924        5
1925        7
1926       19
1927       42
1928       52
1929       93
1930       40
1931       35
1932       11
1933        6
1934       29
1935       24
1936       25
1937       28
1938       19
1939       35
1940       52
1941       32
1942       24
1943       14
1944       15
1945       30
1946       29
1947       57
1948       43
1949       60
1950       83
1951       74
1952       77
1953      133
1954      123
1955      275
1956      565
1957      597
1958      583
1959      592
1960      424
1961      571
1962      605
1963      902
1964      945
1965     1120
1966     1377
1967     1718
1968     1867
1969     2210
1970     2349
1971     2131
1972     2288
1973     2596
1974     2184
1975     2482
1976     2179
1977     2502
1978     2926
1979     3108
1980     3101
1981     3162
1982     3597
1983     3386
1984     3368
1985     3578
1986     4219
1987     5122
1988     5611
1989     6670
1990     7256
1991     8647
1992     9543
1993    10525
1994  

In [5]:
df['before_1989'] = df[0] <= 1989
df['before_1989'] = df['before_1989'].astype(int)

In [6]:
df['before_1989'].value_counts()

0    433113
1     82232
Name: before_1989, dtype: int64

In [7]:
target_feature = 'before_1989'

In [8]:
df.loc[df[target_feature] == 0, target_feature] = -1

In [9]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,82,83,84,85,86,87,88,89,90,before_1989
0,2001,49.94357,21.47114,73.07750,8.74861,-17.40628,-13.09905,-25.01202,-12.23257,7.83089,...,-54.40548,58.99367,15.37344,1.11144,-23.08793,68.40795,-1.82223,-27.46348,2.26327,-1
1,2001,48.73215,18.42930,70.32679,12.94636,-10.32437,-24.83777,8.76630,-0.92019,18.76548,...,-19.68073,33.04964,42.87836,-9.90378,-32.22788,70.49388,12.04941,58.43453,26.92061,-1
2,2001,50.95714,31.85602,55.81851,13.41693,-6.57898,-18.54940,-3.27872,-2.35035,16.07017,...,26.05866,-50.92779,10.93792,-0.07568,43.20130,-115.00698,-0.05859,39.67068,-0.66345,-1
3,2001,48.24750,-1.89837,36.29772,2.58776,0.97170,-26.21683,5.05097,-10.34124,3.55005,...,-171.70734,-16.96705,-46.67617,-12.51516,82.58061,-72.08993,9.90558,199.62971,18.85382,-1
4,2001,50.97020,42.20998,67.09964,8.46791,-15.85279,-16.81409,-12.48207,-9.37636,12.63699,...,-55.95724,64.92712,-17.72522,-1.49237,-7.50035,51.76631,7.88713,55.66926,28.74903,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515340,2006,51.28467,45.88068,22.19582,-5.53319,-3.61835,-16.36914,2.12652,5.18160,-8.66890,...,-3.75991,-30.92584,26.33968,-5.03390,21.86037,-142.29410,3.42901,-41.14721,-15.46052,-1
515341,2006,49.87870,37.93125,18.65987,-3.63581,-27.75665,-18.52988,7.76108,3.56109,-2.50351,...,-32.75535,-61.05473,56.65182,15.29965,95.88193,-10.63242,12.96552,92.11633,10.88815,-1
515342,2006,45.12852,12.65758,-38.72018,8.80882,-29.29985,-2.28706,-18.40424,-22.28726,-4.52429,...,-71.15954,-123.98443,121.26989,10.89629,34.62409,-248.61020,-6.07171,53.96319,-8.09364,-1
515343,2006,44.16614,32.38368,-3.34971,-2.49165,-19.59278,-18.67098,8.78428,4.02039,-12.01230,...,282.77624,-4.63677,144.00125,21.62652,-29.72432,71.47198,20.32240,14.83107,39.74909,-1


In [10]:
X = df.drop([0, target_feature], axis = 1) # dropping year attribute (determines label we want to classify)
y = df[target_feature]

In [11]:
X_train, X_test = X[:463715], X[463715:]
y_train, y_test = y[:463715], y[463715:]

In [12]:
#converting to np arrays to be able to use training functions
X_train = X_train.to_numpy()
y_train = y_train.to_numpy()
X_test = X_test.to_numpy()
y_test = y_test.to_numpy()

In [13]:
# Metric: choice of evaluation metric (f1, precision, recall, etc.)
# Proportion: proportion of test set to predict as 1s, if needed (logistic regression may predict all 0 by default)
def my_cross_val_imbalanced(model, metric, proportion, X, y, k=10):
    (n, d) = X.shape
    validation_metrics = np.zeros(k)
    for i in range(k):
        val_set = X[round(i*n/k):round((i+1)*n/k), :]
        val_labels = y[round(i*n/k):round((i+1)*n/k)]
        train_set = np.delete(X, [j for j in range(round(i*n/k), round((i+1)*n/k))], 0)
        train_labels = np.delete(y, [j for j in range(round(i*n/k), round((i+1)*n/k))], 0)
        model.fit(train_set, train_labels)
        if proportion == None:
            y_preds = model.predict(val_set)
        else:
            y_preds = model.predict_proportion(val_set, proportion)
        
        tp, fp, tn, fn = 0, 0, 0, 0
        score = 0
        for j in range(len(y_preds)):
            if val_labels[j] == 1 and y_preds[j] == 1:
                tp += 1
            elif val_labels[j] == 1:
                fn += 1
            elif y_preds[j] == 1:
                fp += 1
            else:
                tn += 1
        if tp == 0: # to avoid division by zero error for trivial models
            precision = 0
            recall = 0
        else:
            precision = tp / (tp + fp)
            recall = tp / (tp + fn)  
        if metric == 'precision':
            score = precision
        if metric == 'recall':
            score = recall
        if metric == 'f1':
            if precision + recall == 0:
                score = 0
            else:
                score = 2 * precision * recall / (precision + recall)
        if metric == 'auprc':
            score = sklearn.metrics.average_precision_score(val_labels, y_preds)
        validation_metrics[i] = score
    return validation_metrics

In [22]:
class MyWeightedAdaboost:
    
    def __init__(self, estimator, num_estimators, w1):
        
        self.estimator=estimator # just declaring the type of estimator! do not change it but make copies
        self.num_estimators=num_estimators
        self.classifiers = []
        self.alphas = []
        self.w1 = w1
        
    def fit(self, X, y): # y labeled in 1 and -1 (make sure to do that before!)
        
        (n, d) = X.shape
        weights = np.array([(1/n) for i in range(n)])
        adaboost_predictions = np.zeros(n)
        
        # Initializing extra weight for ones
        one_indices = np.where(y==1)
        weights[one_indices] *= self.w1
        weights = weights/np.sum(weights)
        
        for t in range(self.num_estimators):
            
            #X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=t)
            estimator = copy.deepcopy(self.estimator)
            
            #generating dataset to fit from weight distribution
            samples = np.random.choice(np.array(range(n)), size=n, replace=True, p=weights)
            X_samp = X[samples]
            y_samp = y[samples]
            
            estimator.fit(X_samp, y_samp)
            print(np.argmax(estimator.feature_importances_)) # prints most important column, also add in dataframe to figure out what it is???
            predictions = estimator.predict(X) #do you just do the training set here?
            
            error = 0
            for i in range(n):
                if predictions[i] != y[i]: # introduce parameter here to add extra class weight for when y = 1?
                    error += weights[i] 
            #error = error / (np.sum(weights))

            alpha = (1/2)*np.log((1 - error)/error)
            
            for i in range(n):
                if predictions[i] != y[i]: #same here introduce some weight...
                    weights[i] *= np.exp(alpha)
                else:
                    weights[i] *= np.exp(-alpha)
            
            #Normalize weights
            weights = weights/np.sum(weights)
            
            self.classifiers.append(estimator)
            self.alphas.append(alpha)
            
            # Add weak classifier weights 
            #self.classifiers.append(alpha*predictions)
        #print(self.classifiers)
        #print(self.alphas)
        
    def predict(self, X):
        (n, d) = X.shape
        predictions = np.zeros(n)
        for t in range(self.num_estimators):
            predictions += self.alphas[t] * self.classifiers[t].predict(X)
        print(predictions)
        return np.sign(predictions)
    
    def predict_values(self, X):
        (n, d) = X.shape
        predictions = np.zeros(n)
        for t in range(self.num_estimators):
            predictions += self.alphas[t] * self.classifiers[t].predict(X)
        #print(predictions)
        return predictions

In [18]:
#SVM
eta_vals = [0.00001, 0.0001, 0.001]
C_vals = [1, 10, 100]
weights = [1, 2, 5, 10, 20, 50]
best_score = 0
best_eta, best_c, best_weight = (0, 0, 0)

(_, num_features) = X_train.shape
proportion = len(y_train[y_train == 1])/len(y_train)

for eta_val in eta_vals:
    for c_val in C_vals:
        for w in weights:

            # instantiate svm object
            svm = MyWeightedSVM(num_features, 100000, eta_val, c_val, w)

            # call to your CV function to compute error rates for each fold
            #cv_scores = my_cross_val_imbalanced(svm, 'f1', proportion, X_train, y_train, k=10)
            cv_scores = my_cross_val_imbalanced(svm, 'f1', None, X_train, y_train, k=10)

            # print error rates from CV
            print("Eta: " + str(eta_val))
            print("C: " + str(c_val))
            print("Weight: " + str(w))
            for i in range(10):
                print("F1 score for fold " + str(i) + ": " + str(cv_scores[i]))
            mean_score = sum(cv_scores)/len(cv_scores)
            print("Mean validation F1 score: " + str(mean_score))
            print("Validation F1 score stdev: " + str(np.std(cv_scores)))
            if mean_score >= best_score:
                best_score = mean_score
                best_eta, best_c, best_weight = (eta_val, c_val, w)

# instantiate svm object for best value of eta and C
print("Best eta value: " + str(best_eta))
print("Best C value: " + str(best_c))
best_svm = MyWeightedSVM(num_features, 100000, best_eta, best_c, best_weight)

# fit model using all training data
best_svm.fit(X_train, y_train)

# predict on test data
#y_preds = best_svm.predict_proportion(X_test, proportion)
y_preds = best_svm.predict(X_test)

# compute F1 score on test data
(precision, recall, f1) = precision_recall_f1(y_preds, y_test)
auprc = sklearn.metrics.average_precision_score(y_test, y_preds)

print("Test precision: " + str(precision))
print("Test recall: " + str(recall))
print("Test F1 score: " + str(f1))
print("Test AUPRC score: " + str(auprc)) 

Eta: 1e-05
C: 1
Weight: 1
F1 score for fold 0: 0.20201207243460761
F1 score for fold 1: 0.22524420609078719
F1 score for fold 2: 0.2101028752308098
F1 score for fold 3: 0.22360162769846195
F1 score for fold 4: 0.20371757336419966
F1 score for fold 5: 0.2064295575570311
F1 score for fold 6: 0.20394307350829086
F1 score for fold 7: 0.20428500249128048
F1 score for fold 8: 0.20043202199384694
F1 score for fold 9: 0.19704301075268815
Mean validation F1 score: 0.2076811021122004
Validation F1 score stdev: 0.008991268339634869
Eta: 1e-05
C: 1
Weight: 2
F1 score for fold 0: 0.16549678695007414
F1 score for fold 1: 0.1658905192865302
F1 score for fold 2: 0.1602910990856503
F1 score for fold 3: 0.17426485099664496
F1 score for fold 4: 0.18631216149679958
F1 score for fold 5: 0.1763622974963181
F1 score for fold 6: 0.1499287410926366
F1 score for fold 7: 0.16834666943780524
F1 score for fold 8: 0.16430441455267672
F1 score for fold 9: 0.17441096404768608
Mean validation F1 score: 0.1685608504442

Eta: 1e-05
C: 100
Weight: 20
F1 score for fold 0: 0.27878689147728813
F1 score for fold 1: 0.3146337718175769
F1 score for fold 2: 0.2920981213606303
F1 score for fold 3: 0.2760309895902665
F1 score for fold 4: 0.27747400021443125
F1 score for fold 5: 0.2998530646515533
F1 score for fold 6: 0.2942698706099815
F1 score for fold 7: 0.26549100782279095
F1 score for fold 8: 0.3102055704645528
F1 score for fold 9: 0.2896915370419193
Mean validation F1 score: 0.28985348250509907
Validation F1 score stdev: 0.014876637662554561
Eta: 1e-05
C: 100
Weight: 50
F1 score for fold 0: 0.27390204435125465
F1 score for fold 1: 0.31277350808821924
F1 score for fold 2: 0.2859498239828885
F1 score for fold 3: 0.2727681380922413
F1 score for fold 4: 0.2732710958034986
F1 score for fold 5: 0.29443407130790433
F1 score for fold 6: 0.29260679568358156
F1 score for fold 7: 0.2618242481626764
F1 score for fold 8: 0.30487643716135854
F1 score for fold 9: 0.28684280996161743
Mean validation F1 score: 0.28592489725

Eta: 0.0001
C: 100
Weight: 5
F1 score for fold 0: 0.23726622027245442
F1 score for fold 1: 0.2662045950455654
F1 score for fold 2: 0.24960056808095152
F1 score for fold 3: 0.2331187508560471
F1 score for fold 4: 0.2400697984111677
F1 score for fold 5: 0.2498676081200353
F1 score for fold 6: 0.24173175630439636
F1 score for fold 7: 0.22397696130800315
F1 score for fold 8: 0.26121908127208476
F1 score for fold 9: 0.24636844209497577
Mean validation F1 score: 0.24494237817656814
Validation F1 score stdev: 0.011988843169201128
Eta: 0.0001
C: 100
Weight: 10
F1 score for fold 0: 0.25669650454375276
F1 score for fold 1: 0.29153405474220245
F1 score for fold 2: 0.2701526491048762
F1 score for fold 3: 0.25396403284246916
F1 score for fold 4: 0.2569531065637326
F1 score for fold 5: 0.27618268128990014
F1 score for fold 6: 0.2696378097679616
F1 score for fold 7: 0.24580930216870717
F1 score for fold 8: 0.2891126535666569
F1 score for fold 9: 0.2682277107245042
Mean validation F1 score: 0.26782705

Eta: 0.001
C: 100
Weight: 1
F1 score for fold 0: 0.13819261213720319
F1 score for fold 1: 0.13588339631371943
F1 score for fold 2: 0.1353528843055108
F1 score for fold 3: 0.15016778523489932
F1 score for fold 4: 0.12811151676070362
F1 score for fold 5: 0.14293602587362939
F1 score for fold 6: 0.12179539021431458
F1 score for fold 7: 0.12818464269862406
F1 score for fold 8: 0.13689058281721456
F1 score for fold 9: 0.1309659553673237
Mean validation F1 score: 0.13484807917231428
Validation F1 score stdev: 0.007694276706889087
Eta: 0.001
C: 100
Weight: 2
F1 score for fold 0: 0.015136920324755745
F1 score for fold 1: 0.011104548139397518
F1 score for fold 2: 0.008997429305912597
F1 score for fold 3: 0.011219045012997673
F1 score for fold 4: 0.012967305835287626
F1 score for fold 5: 0.011630847029077115
F1 score for fold 6: 0.007930416986441546
F1 score for fold 7: 0.015203671830177852
F1 score for fold 8: 0.009764433052605883
F1 score for fold 9: 0.014349073832507175
Mean validation F1 sco

NameError: name 'MySVM' is not defined

In [19]:
w1_vals = [1, 2, 5, 10, 20, 50]

best_score = 0
best_weight = 0

for w in w1_vals:

    adaboost = MyWeightedAdaboost(DecisionTreeClassifier(max_depth=1), 50, w)

    cv_scores = my_cross_val_imbalanced(adaboost, 'f1', None, X_train, y_train, k=10)
    
    print("Weight: " + str(w))
    for i in range(10):
        print("F1 score for fold " + str(i) + ": " + str(cv_scores[i]))
    mean_score = sum(cv_scores)/len(cv_scores)
    print("Mean validation F1 score: " + str(mean_score))
    print("Validation F1 score stdev: " + str(np.std(cv_scores)))
    if mean_score >= best_score:
        best_score = mean_score
        best_weight = w

print("Best weight: " + str(best_weight))
best_adaboost = MyWeightedAdaboost(DecisionTreeClassifier(max_depth=1), 50, best_weight)

best_adaboost.fit(X_train, y_train)

y_preds = best_adaboost.predict(X_test)
#y_preds = best_adaboost.predict_proportion(X_test, proportion)

# compute F1 score on test data
(precision, recall, f1) = precision_recall_f1(y_preds, y_test)
auprc = sklearn.metrics.average_precision_score(y_test, y_preds)

print("Test precision: " + str(precision))
print("Test recall: " + str(recall))
print("Test F1 score: " + str(f1))
print("Test AUPRC score: " + str(auprc)) 

0
0
2
5
13
13
0
0
56
19
56
40
40
38
73
0
0
62
62
1
39
84
84
0
0
77
2
0
0
1
27
0
5
5
89
19
11
2
1
0
35
35
83
0
5
22
82
87
30
35
[-1.2781628  -1.09902983 -1.47393076 ... -1.58742231 -1.27099472
 -1.71309036]
0
0
2
5
13
13
0
56
19
40
73
0
0
19
19
62
62
38
1
0
39
39
84
5
38
0
2
1
5
77
84
0
0
27
27
35
89
35
0
1
0
0
19
19
11
83
68
30
5
22
[-1.82933222 -1.42492116 -1.66112943 ... -0.14927004 -0.67178517
 -0.30430882]
0
0
2
5
5
13
0
0
56
56
62
62
19
19
0
1
73
40
40
0
0
39
84
84
77
2
0
2
27
27
1
0
2
38
0
22
35
35
89
0
0
0
11
45
0
13
84
0
0
57
[ 0.37311424  0.08939389  0.07618956 ... -1.96438545 -1.62690732
 -1.70923001]
0
0
2
5
5
13
0
0
56
56
19
19
62
62
0
13
73
0
40
0
2
1
0
5
39
84
77
0
1
27
48
35
35
89
0
84
0
38
11
83
0
0
38
2
0
0
1
39
68
45
[-1.69321844 -1.67999812 -1.85362029 ... -0.67706805 -0.69277823
 -1.07845719]
0
0
2
5
5
19
56
56
62
0
13
0
40
73
73
1
0
0
39
0
0
84
2
2
77
0
0
13
0
1
27
27
0
0
84
35
0
89
19
5
19
19
35
35
83
68
57
57
0
38
[-1.17182063 -0.99287512 -1.18828077 ... -1.06204

0
39
89
48
2
89
38
38
84
35
19
35
45
0
0
11
1
0
0
68
0
[ 1.30294431  1.51816719  1.19528446 ... -0.87753103 -0.59412113
 -0.47835375]
0
2
13
5
56
62
0
1
62
73
0
19
0
0
40
84
39
84
0
0
77
1
0
27
2
0
89
35
2
19
5
56
35
35
45
0
1
1
83
84
57
57
0
38
0
68
0
0
11
1
[-0.23379431 -1.07888545 -1.00233432 ...  0.62504344  0.25454871
 -0.65133393]
0
2
13
0
56
40
5
5
1
62
73
39
39
0
0
84
19
77
19
0
1
0
0
2
27
35
35
0
89
5
0
5
22
2
19
57
38
83
84
5
57
1
5
68
45
0
0
0
2
1
[ 0.4236476   0.29134744 -0.13205879 ... -0.00065995  0.39609053
  0.46297731]
0
2
13
0
56
40
5
5
73
1
62
19
19
5
39
84
77
0
19
0
38
38
27
27
0
1
0
0
89
35
35
45
39
2
2
83
0
0
2
1
68
0
0
0
57
5
1
0
22
84
[ 0.31703559  0.41730871 -0.12806287 ...  0.84371349  0.33781316
 -0.02260021]
0
2
13
0
56
40
5
73
5
62
1
62
5
19
19
77
84
84
0
0
39
11
1
0
27
2
0
39
68
83
35
35
1
39
0
45
35
87
87
30
2
2
89
89
1
0
57
38
19
19
[ 1.00737847  0.58983723  0.48602656 ...  0.57456326 -0.26288875
  0.01079471]
0
2
13
0
56
40
5
5
73
1
62
62
0
0
39
84
19
8

NameError: name 'precision_recall_f1' is not defined

In [21]:
# Weight 5...
(precision, recall, f1) = precision_recall_f1(y_preds, y_test)
auprc = sklearn.metrics.average_precision_score(y_test, y_preds)

print("Test precision: " + str(precision))
print("Test recall: " + str(recall))
print("Test F1 score: " + str(f1))
print("Test AUPRC score: " + str(auprc)) 

Test precision: 0.316182842287695
Test recall: 0.7179045745204132
Test F1 score: 0.4390133854714995
Test AUPRC score: 0.2714206409182094


In [23]:
print("Best weight: " + str(best_weight))
best_adaboost = MyWeightedAdaboost(DecisionTreeClassifier(max_depth=1), 50, best_weight)

best_adaboost.fit(X_train, y_train)

y_preds = best_adaboost.predict(X_test)
#y_preds = best_adaboost.predict_proportion(X_test, proportion)
y_values = best_adaboost.predict_values(X_test)

# compute F1 score on test data
(precision, recall, f1) = precision_recall_f1(y_preds, y_test)
auprc = sklearn.metrics.average_precision_score(y_test, y_values)

print("Test precision: " + str(precision))
print("Test recall: " + str(recall))
print("Test F1 score: " + str(f1))
print("Test AUPRC score: " + str(auprc)) 

Best weight: 5
0
2
5
5
13
0
62
0
0
19
56
19
19
0
1
73
40
40
0
0
39
39
84
77
2
2
0
0
1
0
19
38
27
27
89
0
35
13
0
0
0
45
5
0
0
84
68
83
11
0
[ 0.63834925 -0.6702301  -0.11222252 ... -0.6315668  -0.41247379
 -0.5587478 ]
Test precision: 0.3234995487364621
Test recall: 0.7052385636989671
Test F1 score: 0.44354215003866976
Test AUPRC score: 0.38378946447779805
