In [1]:
import numpy
from urllib.request import urlopen
import scipy.optimize
import random
from sklearn import svm
from sklearn import linear_model
from collections import defaultdict
from sklearn.model_selection import train_test_split

In [2]:
def parseDataFromFile(fname):
    """Lazily yield one parsed record per non-blank line of ``fname``.

    Each line is parsed as a single Python literal (e.g. a dict) with
    ``ast.literal_eval``, so a line can only produce data and can never
    execute code the way a bare ``eval`` would.

    Parameters:
        fname: path of the text file to read.

    Yields:
        The Python object described by each non-blank line, in file order.

    Raises:
        ValueError / SyntaxError: if a line is not a valid Python literal.
    """
    import ast  # local import keeps this cell self-contained

    # The context manager closes the file once the generator is exhausted.
    with open(fname) as source:
        for l in source:
            record_text = l.strip()
            if not record_text:
                # tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            yield ast.literal_eval(record_text)
# Load every review eagerly; each record is whatever parseDataFromFile yields
# for one line of the file.
data = list(parseDataFromFile('beer_50000.json'))

# Seed the global RNG before shuffling: train_test_split below is seeded, but
# it operates on the already-shuffled list, so without this seed the split
# would differ on every Restart & Run All.
random.seed(42)
random.shuffle(data)

# 50/50 split of the records.  NOTE: despite their names, X_train holds the
# training *records* and y_test holds the held-out *records* (not features and
# labels); the names are kept because the later cells refer to them.
X_train, y_test = train_test_split(data, test_size=0.5, random_state=42)


# Count reviews per beer style over the full dataset and keep only the styles
# with more than 1000 reviews; each kept style gets a consecutive integer id.
categoryCounts = defaultdict(int)
for d in data:
    categoryCounts[d['beer/style']] += 1
categories = [c for c in categoryCounts if categoryCounts[c] > 1000]
catID = dict(zip(list(categories),range(len(categories))))

1. Train a logistic regressor using this one-hot encoding to predict whether beers have an ABV greater than 7 percent (i.e., `d['beer/ABV'] > 7`). Train the classifier on the training set and report its performance in terms of the accuracy and Balanced Error Rate (BER) on the test set, using a regularization constant of C = 10. For all experiments use the `class_weight='balanced'` option (2 marks).

In [3]:
# One-hot lookup table: entry k is the indicator vector for category id k.
num_categories = len(catID)
one_hot_encoded_catID = []
for style in catID:
    indicator = [0] * num_categories
    indicator[catID[style]] = 1
    one_hot_encoded_catID.append(indicator)

# Training features (style one-hot) and labels (ABV above 7 or not).
one_hot_encoded = []
true_answer = []
for d in X_train:
    if d['beer/style'] in categories:
        one_hot_encoded.append(one_hot_encoded_catID[catID[d['beer/style']]])
    else:
        # styles outside the kept categories map to the all-zero vector
        one_hot_encoded.append([0] * num_categories)
    true_answer.append('true' if d['beer/ABV'] > 7 else 'false')

# Fit the balanced logistic regressor on the training half.
mod = linear_model.LogisticRegression(C=10.0, fit_intercept = True, class_weight='balanced')
mod.fit(one_hot_encoded, true_answer)

# Encode the held-out half the same way and predict.
one_hot_encoded_test = [
    one_hot_encoded_catID[catID[d['beer/style']]]
    if d['beer/style'] in categories
    else [0] * num_categories
    for d in y_test
]
predict = mod.predict(one_hot_encoded_test)

# Ground-truth labels for the held-out half.
true_output = ['true' if d['beer/ABV'] > 7 else 'false' for d in y_test]

# Confusion-matrix counts, gathered in a single pass over the predictions.
TP_count = 0  # True Positive (TP)
TN_count = 0  # True Negative (TN)
FP_count = 0  # False Positive (FP)
FN_count = 0  # False Negative (FN)
for predicted, actual in zip(predict, true_output):
    if predicted == 'true' and actual == 'true':
        TP_count += 1
    elif predicted == 'false' and actual == 'false':
        TN_count += 1
    elif predicted == 'true' and actual == 'false':
        FP_count += 1
    elif predicted == 'false' and actual == 'true':
        FN_count += 1

# Classification accuracy
total_count = TP_count + TN_count + FP_count + FN_count
classification_accuracy = (TP_count + TN_count) / total_count
print('classification accuracy: ', classification_accuracy)

# Balanced Error Rate (BER) = mean of the false positive and false negative rates
FPR = FP_count / (FP_count + TN_count)
FNR = FN_count / (FN_count + TP_count)
BER = (FPR + FNR) / 2
print('Balanced Error Rate: ', BER)

classification accuracy:  0.849
Balanced Error Rate:  0.1625314568615568


2. Extend your model to include two additional features: (1) a vector of five ratings (review/aroma, review/overall, etc.); and (2) the review length (in characters). The length feature should be scaled to be between 0 and 1 by dividing by the maximum length. Using the same value of C from the previous question, report the BER of the new classifier (1 mark).

In [4]:
# One column per rating, read from the training records.
appearance, overall, aroma, taste, palate = (
    [d[rating_key] for d in X_train]
    for rating_key in (
        'review/appearance',
        'review/overall',
        'review/aroma',
        'review/taste',
        'review/palate',
    )
)

# Review length in characters, scaled by the longest training review.
review_length = [len(d['review/text']) for d in X_train]
max_length = max(review_length)
norm_review_length = [length / max_length for length in review_length]

# One row per training review: the five ratings followed by the scaled length.
additional_feature = [
    list(row)
    for row in zip(appearance, overall, aroma, taste, palate, norm_review_length)
]

# Full feature vector: style one-hot followed by the additional features.
new_feature = [
    style_vector + extra
    for style_vector, extra in zip(one_hot_encoded, additional_feature)
]

# Fit the balanced logistic regressor on the extended features.
mod = linear_model.LogisticRegression(C=10.0, fit_intercept = True, class_weight='balanced', max_iter=1000)
mod.fit(new_feature, true_answer)

LogisticRegression(C=10.0, class_weight='balanced', max_iter=1000)

In [5]:
# Style one-hot vectors for the held-out records.
one_hot_encoded_test = []
for d in y_test:
    if d['beer/style'] in categories:
        one_hot_encoded_test.append(one_hot_encoded_catID[catID[d['beer/style']]])
    else:
        # styles outside the kept categories map to the all-zero vector
        one_hot_encoded_test.append([0] * len(catID))

# Rating columns for the held-out records.
appearance = [ d['review/appearance'] for d in y_test]
overall = [ d['review/overall'] for d in y_test]
aroma = [ d['review/aroma'] for d in y_test]
taste = [ d['review/taste'] for d in y_test]
palate = [ d['review/palate'] for d in y_test]

# Review length, scaled by the longest held-out review.
# NOTE(review): the training features were scaled by the longest *training*
# review, so the two halves use different divisors - confirm this is intended.
review_length_test = [ len(d['review/text']) for d in y_test]
max_length = max(review_length_test)
norm_review_length = [ d / max_length for d in review_length_test]

# One row per held-out review: five ratings followed by the scaled length.
# zip() sizes the result from the held-out columns themselves, so this stays
# correct even when the training and held-out halves differ in length.
additional_feature = [
    list(row)
    for row in zip(appearance, overall, aroma, taste, palate, norm_review_length)
]

# Combine features: style one-hot followed by the additional features.
new_feature_test = [
    one_hot_encoded_test[i] + additional_feature[i] for i in range(len(y_test))
]

# Predict with the model fitted on the training half.
predict = mod.predict(new_feature_test)

# Ground-truth labels for the held-out half.
true_output = ['true' if d['beer/ABV'] > 7 else 'false' for d in y_test]

# Confusion-matrix counts, gathered in a single pass over the predictions.
TP_count = 0  # True Positive (TP)
TN_count = 0  # True Negative (TN)
FP_count = 0  # False Positive (FP)
FN_count = 0  # False Negative (FN)
for predicted, actual in zip(predict, true_output):
    if predicted == 'true' and actual == 'true':
        TP_count += 1
    elif predicted == 'false' and actual == 'false':
        TN_count += 1
    elif predicted == 'true' and actual == 'false':
        FP_count += 1
    elif predicted == 'false' and actual == 'true':
        FN_count += 1

# Classification accuracy
classification_accuracy = (TP_count + TN_count) / (TP_count + TN_count + FP_count + FN_count)
print('classification accuracy: ', classification_accuracy)

# False Positive Rate (FPR)
FPR = FP_count / (FP_count + TN_count)

# False Negative Rate (FNR)
FNR = FN_count / (FN_count + TP_count)

# Balanced Error Rate (BER)
BER = (FPR + FNR) / 2
print('Balanced Error Rate: ', BER)

classification accuracy:  0.86004
Balanced Error Rate:  0.14569674331617521


3. Implement a complete regularization pipeline with the balanced classifier. Split your test data from above in half so that you have 50%/25%/25% train/validation/test fractions. Consider values of C in the range {10^−6, 10^−5, 10^−4, 10^−3}. Report (or plot) the train, validation, and test BER for each value of C. Based on these values, which classifier would you select (in terms of generalization performance) and why (1 mark)?

In [6]:
# Randomly split the held-out rows (by position) into a test half and a
# validation half, giving 50%/25%/25% train/validation/test overall.
indices = list(range(len(new_feature_test)))
random.shuffle(indices)  # shuffles in place and returns None, so nothing to bind

# Derive the cut point from the data instead of hard-coding the row count.
half = len(indices) // 2
test_indices = indices[:half]
val_indices = indices[half:]

# Features and labels are selected with the same indices so they stay aligned.
test_data = [ new_feature_test[i] for i in test_indices]
test_true_output = [ true_output[i] for i in test_indices]
val_data = [ new_feature_test[i] for i in val_indices]
val_true_output = [ true_output[i] for i in val_indices]

In [7]:
c_list = [0.000001, 0.00001, 0.0001, 0.001]

# The held-out features and labels do not depend on C, so they are built once
# here rather than on every pass of the sweep.
one_hot_encoded_test = []
for d in y_test:
    if d['beer/style'] in categories:
        one_hot_encoded_test.append(one_hot_encoded_catID[catID[d['beer/style']]])
    else:
        # styles outside the kept categories map to the all-zero vector
        one_hot_encoded_test.append([0] * len(catID))

appearance = [ d['review/appearance'] for d in y_test]
overall = [ d['review/overall'] for d in y_test]
aroma = [ d['review/aroma'] for d in y_test]
taste = [ d['review/taste'] for d in y_test]
palate = [ d['review/palate'] for d in y_test]

# Review length, scaled by the longest held-out review.
review_length_test = [ len(d['review/text']) for d in y_test]
max_length = max(review_length_test)
norm_review_length = [ d / max_length for d in review_length_test]

# One row per held-out review: five ratings followed by the scaled length.
# zip() sizes the result from the held-out columns themselves, so this stays
# correct even when the training and held-out halves differ in length.
additional_feature = [
    list(row)
    for row in zip(appearance, overall, aroma, taste, palate, norm_review_length)
]

# Combine features: style one-hot followed by the additional features.
new_feature_test = [
    one_hot_encoded_test[i] + additional_feature[i] for i in range(len(y_test))
]

# Ground-truth labels for the held-out half.
true_output = ['true' if d['beer/ABV'] > 7 else 'false' for d in y_test]

# NOTE(review): each C is scored on all of y_test, not separately on the
# training, validation and test portions - confirm against the question.
for c in c_list:
    # Train on the training half with this regularization constant.
    mod = linear_model.LogisticRegression(C=c, fit_intercept = True, class_weight='balanced', max_iter=1000)
    mod.fit(new_feature, true_answer)

    predict = mod.predict(new_feature_test)

    # Confusion-matrix counts, gathered in a single pass over the predictions.
    TP_count = 0  # True Positive (TP)
    TN_count = 0  # True Negative (TN)
    FP_count = 0  # False Positive (FP)
    FN_count = 0  # False Negative (FN)
    for predicted, actual in zip(predict, true_output):
        if predicted == 'true' and actual == 'true':
            TP_count += 1
        elif predicted == 'false' and actual == 'false':
            TN_count += 1
        elif predicted == 'true' and actual == 'false':
            FP_count += 1
        elif predicted == 'false' and actual == 'true':
            FN_count += 1

    # Classification accuracy
    classification_accuracy = (TP_count + TN_count) / (TP_count + TN_count + FP_count + FN_count)
    print('classification accuracy: ', classification_accuracy)

    # False Positive Rate (FPR)
    FPR = FP_count / (FP_count + TN_count)

    # False Negative Rate (FNR)
    FNR = FN_count / (FN_count + TP_count)

    # Balanced Error Rate (BER)
    BER = (FPR + FNR) / 2
    print('Balanced Error Rate: ', BER)

['false' 'false' 'false' ... 'true' 'true' 'false']
classification accuracy:  0.67744
Balanced Error Rate:  0.3173496784251614
['false' 'false' 'false' ... 'true' 'true' 'false']
classification accuracy:  0.67952
Balanced Error Rate:  0.3153626595470173
['false' 'false' 'false' ... 'true' 'true' 'false']
classification accuracy:  0.70184
Balanced Error Rate:  0.29397579545249913
['false' 'false' 'false' ... 'true' 'true' 'false']
classification accuracy:  0.80464
Balanced Error Rate:  0.19717741582680248


In [36]:
c_list = [0.000001, 0.00001, 0.0001, 0.001]

# The held-out features and labels do not depend on C, so they are built once
# here rather than on every pass of the sweep.
one_hot_encoded_test = []
for d in y_test:
    if d['beer/style'] in categories:
        one_hot_encoded_test.append(one_hot_encoded_catID[catID[d['beer/style']]])
    else:
        # styles outside the kept categories map to the all-zero vector
        one_hot_encoded_test.append([0] * len(catID))

appearance = [ d['review/appearance'] for d in y_test]
overall = [ d['review/overall'] for d in y_test]
aroma = [ d['review/aroma'] for d in y_test]
taste = [ d['review/taste'] for d in y_test]
palate = [ d['review/palate'] for d in y_test]

# Review length, scaled by the longest held-out review.
review_length_test = [ len(d['review/text']) for d in y_test]
max_length = max(review_length_test)
norm_review_length = [ d / max_length for d in review_length_test]

# One row per held-out review: five ratings followed by the scaled length.
# zip() sizes the result from the held-out columns themselves, so this stays
# correct even when the training and held-out halves differ in length.
additional_feature = [
    list(row)
    for row in zip(appearance, overall, aroma, taste, palate, norm_review_length)
]

# Combine features: style one-hot followed by the additional features.
new_feature_test = [
    one_hot_encoded_test[i] + additional_feature[i] for i in range(len(y_test))
]

# Ground-truth labels for the held-out half.
true_output = ['true' if d['beer/ABV'] > 7 else 'false' for d in y_test]

# NOTE(review): the model is fitted on test_data, which was sampled from the
# same held-out rows it is then scored on - confirm this is the intended
# comparison rather than fitting on the training half only.
for c in c_list:
    mod = linear_model.LogisticRegression(C=c, fit_intercept = True, class_weight='balanced', max_iter=1000)
    mod.fit(test_data, test_true_output)

    predict = mod.predict(new_feature_test)

    # Confusion-matrix counts, gathered in a single pass over the predictions.
    TP_count = 0  # True Positive (TP)
    TN_count = 0  # True Negative (TN)
    FP_count = 0  # False Positive (FP)
    FN_count = 0  # False Negative (FN)
    for predicted, actual in zip(predict, true_output):
        if predicted == 'true' and actual == 'true':
            TP_count += 1
        elif predicted == 'false' and actual == 'false':
            TN_count += 1
        elif predicted == 'true' and actual == 'false':
            FP_count += 1
        elif predicted == 'false' and actual == 'true':
            FN_count += 1

    # Classification accuracy
    classification_accuracy = (TP_count + TN_count) / (TP_count + TN_count + FP_count + FN_count)
    print('classification accuracy: ', classification_accuracy)

    # False Positive Rate (FPR)
    FPR = FP_count / (FP_count + TN_count)

    # False Negative Rate (FNR)
    FNR = FN_count / (FN_count + TP_count)

    # Balanced Error Rate (BER)
    BER = (FPR + FNR) / 2
    print('Balanced Error Rate: ', BER)

classification accuracy:  0.67696
Balanced Error Rate:  0.31818149161416154
classification accuracy:  0.67764
Balanced Error Rate:  0.31742726504617924
classification accuracy:  0.68784
Balanced Error Rate:  0.30759283560239353
classification accuracy:  0.76208
Balanced Error Rate:  0.2368807976755498


In [37]:
c_list = [0.000001, 0.00001, 0.0001, 0.001]

# The held-out features and labels do not depend on C, so they are built once
# here rather than on every pass of the sweep.
one_hot_encoded_test = []
for d in y_test:
    if d['beer/style'] in categories:
        one_hot_encoded_test.append(one_hot_encoded_catID[catID[d['beer/style']]])
    else:
        # styles outside the kept categories map to the all-zero vector
        one_hot_encoded_test.append([0] * len(catID))

appearance = [ d['review/appearance'] for d in y_test]
overall = [ d['review/overall'] for d in y_test]
aroma = [ d['review/aroma'] for d in y_test]
taste = [ d['review/taste'] for d in y_test]
palate = [ d['review/palate'] for d in y_test]

# Review length, scaled by the longest held-out review.
review_length_test = [ len(d['review/text']) for d in y_test]
max_length = max(review_length_test)
norm_review_length = [ d / max_length for d in review_length_test]

# One row per held-out review: five ratings followed by the scaled length.
# zip() sizes the result from the held-out columns themselves, so this stays
# correct even when the training and held-out halves differ in length.
additional_feature = [
    list(row)
    for row in zip(appearance, overall, aroma, taste, palate, norm_review_length)
]

# Combine features: style one-hot followed by the additional features.
new_feature_test = [
    one_hot_encoded_test[i] + additional_feature[i] for i in range(len(y_test))
]

# Ground-truth labels for the held-out half.
true_output = ['true' if d['beer/ABV'] > 7 else 'false' for d in y_test]

# NOTE(review): the model is fitted on val_data, which was sampled from the
# same held-out rows it is then scored on - confirm this is the intended
# comparison rather than fitting on the training half only.
for c in c_list:
    mod = linear_model.LogisticRegression(C = c, fit_intercept = True, class_weight='balanced', max_iter=1000)
    mod.fit(val_data, val_true_output)

    predict = mod.predict(new_feature_test)

    # Confusion-matrix counts, gathered in a single pass over the predictions.
    TP_count = 0  # True Positive (TP)
    TN_count = 0  # True Negative (TN)
    FP_count = 0  # False Positive (FP)
    FN_count = 0  # False Negative (FN)
    for predicted, actual in zip(predict, true_output):
        if predicted == 'true' and actual == 'true':
            TP_count += 1
        elif predicted == 'false' and actual == 'false':
            TN_count += 1
        elif predicted == 'true' and actual == 'false':
            FP_count += 1
        elif predicted == 'false' and actual == 'true':
            FN_count += 1

    # Classification accuracy
    classification_accuracy = (TP_count + TN_count) / (TP_count + TN_count + FP_count + FN_count)
    print('classification accuracy: ', classification_accuracy)

    # False Positive Rate (FPR)
    FPR = FP_count / (FP_count + TN_count)

    # False Negative Rate (FNR)
    FNR = FN_count / (FN_count + TP_count)

    # Balanced Error Rate (BER)
    BER = (FPR + FNR) / 2
    print('Balanced Error Rate: ', BER)
    

classification accuracy:  0.67736
Balanced Error Rate:  0.3180571555835834
classification accuracy:  0.67832
Balanced Error Rate:  0.3170526972065056
classification accuracy:  0.69056
Balanced Error Rate:  0.30517658343793774
classification accuracy:  0.76704
Balanced Error Rate:  0.23239469246012506


The classifier fit on the 50% training split and evaluated against the other 50% is the better choice: its accuracy is higher and its BER is lower, and both improve as C increases.

4. (CSE158 only) An ablation study measures the marginal benefit of various features by re-training the model with one feature ‘ablated’ (i.e., deleted) at a time. Considering each of the three features in your classifier above (i.e., beer style, ratings, and length), report the BER with only the other two features and the third deleted (1 mark).

In [38]:
# Every feature row is laid out as:
#   [style one-hot (len(catID) entries), five ratings, scaled review length]
# The slice offsets are derived from that layout instead of being hard-coded,
# so they stay correct if the number of kept styles changes.
num_style_features = len(catID)
num_rating_features = 5
style_and_ratings = num_style_features + num_rating_features

# Ablate style: drop the one-hot prefix from the training, test and validation rows.
new_feature_no_style = [ d[num_style_features:] for d in new_feature]
test_data_no_style = [ d[num_style_features:] for d in test_data]
val_data_no_style = [ d[num_style_features:] for d in val_data]

# Ablate length: keep the one-hot prefix and the ratings, dropping the final entry.
new_feature_no_length = [ d[0:style_and_ratings] for d in new_feature]
test_data_no_length = [ d[0:style_and_ratings] for d in test_data]
val_data_no_length = [ d[0:style_and_ratings] for d in val_data]


In [39]:
c_list = [0.000001, 0.00001, 0.0001, 0.001]

# Style is ablated here, so only the ratings and the scaled length are needed
# for the held-out rows. They do not depend on C and are built once, before
# the sweep.
appearance = [ d['review/appearance'] for d in y_test]
overall = [ d['review/overall'] for d in y_test]
aroma = [ d['review/aroma'] for d in y_test]
taste = [ d['review/taste'] for d in y_test]
palate = [ d['review/palate'] for d in y_test]

# Review length, scaled by the longest held-out review.
review_length_test = [ len(d['review/text']) for d in y_test]
max_length = max(review_length_test)
norm_review_length = [ d / max_length for d in review_length_test]

# One row per held-out review: five ratings followed by the scaled length.
# zip() sizes the result from the held-out columns themselves, so this stays
# correct even when the training and held-out halves differ in length.
additional_feature = [
    list(row)
    for row in zip(appearance, overall, aroma, taste, palate, norm_review_length)
]

# With style removed, the feature rows are exactly the additional features.
new_feature_test = list(additional_feature)

# Ground-truth labels for the held-out half.
true_output = ['true' if d['beer/ABV'] > 7 else 'false' for d in y_test]

for c in c_list:
    # Train on the style-ablated training features.
    mod = linear_model.LogisticRegression(C=c, fit_intercept = True, class_weight='balanced', max_iter=1000)
    mod.fit(new_feature_no_style, true_answer)

    predict = mod.predict(new_feature_test)

    # Confusion-matrix counts, gathered in a single pass over the predictions.
    TP_count = 0  # True Positive (TP)
    TN_count = 0  # True Negative (TN)
    FP_count = 0  # False Positive (FP)
    FN_count = 0  # False Negative (FN)
    for predicted, actual in zip(predict, true_output):
        if predicted == 'true' and actual == 'true':
            TP_count += 1
        elif predicted == 'false' and actual == 'false':
            TN_count += 1
        elif predicted == 'true' and actual == 'false':
            FP_count += 1
        elif predicted == 'false' and actual == 'true':
            FN_count += 1

    # Classification accuracy
    classification_accuracy = (TP_count + TN_count) / (TP_count + TN_count + FP_count + FN_count)
    print('classification accuracy: ', classification_accuracy)

    # False Positive Rate (FPR)
    FPR = FP_count / (FP_count + TN_count)

    # False Negative Rate (FNR)
    FNR = FN_count / (FN_count + TP_count)

    # Balanced Error Rate (BER)
    BER = (FPR + FNR) / 2
    print('Balanced Error Rate: ', BER)
    
    

classification accuracy:  0.65504
Balanced Error Rate:  0.3398369290926173
classification accuracy:  0.6554
Balanced Error Rate:  0.3394496324233687
classification accuracy:  0.6604
Balanced Error Rate:  0.33472216505826424
classification accuracy:  0.67424
Balanced Error Rate:  0.3222523758482609


In [40]:
c_list = [0.000001, 0.00001, 0.0001, 0.001]

# Style is ablated here, so only the ratings and the scaled length are needed
# for the held-out rows. They do not depend on C and are built once, before
# the sweep.
appearance = [ d['review/appearance'] for d in y_test]
overall = [ d['review/overall'] for d in y_test]
aroma = [ d['review/aroma'] for d in y_test]
taste = [ d['review/taste'] for d in y_test]
palate = [ d['review/palate'] for d in y_test]

# Review length, scaled by the longest held-out review.
review_length_test = [ len(d['review/text']) for d in y_test]
max_length = max(review_length_test)
norm_review_length = [ d / max_length for d in review_length_test]

# One row per held-out review: five ratings followed by the scaled length.
# zip() sizes the result from the held-out columns themselves, so this stays
# correct even when the training and held-out halves differ in length.
additional_feature = [
    list(row)
    for row in zip(appearance, overall, aroma, taste, palate, norm_review_length)
]

# With style removed, the feature rows are exactly the additional features.
new_feature_test = list(additional_feature)

# Ground-truth labels for the held-out half.
true_output = ['true' if d['beer/ABV'] > 7 else 'false' for d in y_test]

# NOTE(review): the model is fitted on test_data_no_style, whose rows come
# from the same held-out set it is then scored on - confirm this is intended.
for c in c_list:
    mod = linear_model.LogisticRegression(C=c, fit_intercept = True, class_weight='balanced', max_iter=1000)
    mod.fit(test_data_no_style, test_true_output)

    predict = mod.predict(new_feature_test)

    # Confusion-matrix counts, gathered in a single pass over the predictions.
    TP_count = 0  # True Positive (TP)
    TN_count = 0  # True Negative (TN)
    FP_count = 0  # False Positive (FP)
    FN_count = 0  # False Negative (FN)
    for predicted, actual in zip(predict, true_output):
        if predicted == 'true' and actual == 'true':
            TP_count += 1
        elif predicted == 'false' and actual == 'false':
            TN_count += 1
        elif predicted == 'true' and actual == 'false':
            FP_count += 1
        elif predicted == 'false' and actual == 'true':
            FN_count += 1

    # Classification accuracy
    classification_accuracy = (TP_count + TN_count) / (TP_count + TN_count + FP_count + FN_count)
    print('classification accuracy: ', classification_accuracy)

    # False Positive Rate (FPR)
    FPR = FP_count / (FP_count + TN_count)

    # False Negative Rate (FNR)
    FNR = FN_count / (FN_count + TP_count)

    # Balanced Error Rate (BER)
    BER = (FPR + FNR) / 2
    print('Balanced Error Rate: ', BER)

classification accuracy:  0.65544
Balanced Error Rate:  0.3393272677855469
classification accuracy:  0.65568
Balanced Error Rate:  0.33910306929515
classification accuracy:  0.65684
Balanced Error Rate:  0.3379174453909252
classification accuracy:  0.66728
Balanced Error Rate:  0.32807981283590737


In [41]:
c_list = [0.000001, 0.00001, 0.0001, 0.001]

# Style is ablated here, so only the ratings and the scaled length are needed
# for the held-out rows. They do not depend on C and are built once, before
# the sweep.
appearance = [ d['review/appearance'] for d in y_test]
overall = [ d['review/overall'] for d in y_test]
aroma = [ d['review/aroma'] for d in y_test]
taste = [ d['review/taste'] for d in y_test]
palate = [ d['review/palate'] for d in y_test]

# Review length, scaled by the longest held-out review.
review_length_test = [ len(d['review/text']) for d in y_test]
max_length = max(review_length_test)
norm_review_length = [ d / max_length for d in review_length_test]

# One row per held-out review: five ratings followed by the scaled length.
# zip() sizes the result from the held-out columns themselves, so this stays
# correct even when the training and held-out halves differ in length.
additional_feature = [
    list(row)
    for row in zip(appearance, overall, aroma, taste, palate, norm_review_length)
]

# With style removed, the feature rows are exactly the additional features.
new_feature_test = list(additional_feature)

# Ground-truth labels for the held-out half.
true_output = ['true' if d['beer/ABV'] > 7 else 'false' for d in y_test]

# NOTE(review): the model is fitted on val_data_no_style, whose rows come
# from the same held-out set it is then scored on - confirm this is intended.
for c in c_list:
    mod = linear_model.LogisticRegression(C=c, fit_intercept = True, class_weight='balanced', max_iter=1000)
    mod.fit(val_data_no_style, val_true_output)

    predict = mod.predict(new_feature_test)

    # Confusion-matrix counts, gathered in a single pass over the predictions.
    TP_count = 0  # True Positive (TP)
    TN_count = 0  # True Negative (TN)
    FP_count = 0  # False Positive (FP)
    FN_count = 0  # False Negative (FN)
    for predicted, actual in zip(predict, true_output):
        if predicted == 'true' and actual == 'true':
            TP_count += 1
        elif predicted == 'false' and actual == 'false':
            TN_count += 1
        elif predicted == 'true' and actual == 'false':
            FP_count += 1
        elif predicted == 'false' and actual == 'true':
            FN_count += 1

    # Classification accuracy
    classification_accuracy = (TP_count + TN_count) / (TP_count + TN_count + FP_count + FN_count)
    print('classification accuracy: ', classification_accuracy)

    # False Positive Rate (FPR)
    FPR = FP_count / (FP_count + TN_count)

    # False Negative Rate (FNR)
    FNR = FN_count / (FN_count + TP_count)

    # Balanced Error Rate (BER)
    BER = (FPR + FNR) / 2
    print('Balanced Error Rate: ', BER)

classification accuracy:  0.65504
Balanced Error Rate:  0.33979159670714765
classification accuracy:  0.6554
Balanced Error Rate:  0.33945529897155235
classification accuracy:  0.65676
Balanced Error Rate:  0.33813384192565027
classification accuracy:  0.67032
Balanced Error Rate:  0.32589161833200764


In [42]:
c_list = [0.000001, 0.00001, 0.0001, 0.001]

# Review length is ablated here, so the held-out rows are the style one-hot
# followed by the five ratings. They do not depend on C and are built once,
# before the sweep.
one_hot_encoded_test = []
for d in y_test:
    if d['beer/style'] in categories:
        one_hot_encoded_test.append(one_hot_encoded_catID[catID[d['beer/style']]])
    else:
        # styles outside the kept categories map to the all-zero vector
        one_hot_encoded_test.append([0] * len(catID))

appearance = [ d['review/appearance'] for d in y_test]
overall = [ d['review/overall'] for d in y_test]
aroma = [ d['review/aroma'] for d in y_test]
taste = [ d['review/taste'] for d in y_test]
palate = [ d['review/palate'] for d in y_test]

# One row per held-out review holding just the five ratings.
# zip() sizes the result from the held-out columns themselves, so this stays
# correct even when the training and held-out halves differ in length.
additional_feature = [
    list(row) for row in zip(appearance, overall, aroma, taste, palate)
]

# Combine features: style one-hot followed by the ratings.
new_feature_test = [
    one_hot_encoded_test[i] + additional_feature[i] for i in range(len(y_test))
]

# Ground-truth labels for the held-out half.
true_output = ['true' if d['beer/ABV'] > 7 else 'false' for d in y_test]

for c in c_list:
    # Train on the length-ablated training features.
    mod = linear_model.LogisticRegression(C=c, fit_intercept = True, class_weight='balanced', max_iter=1000)
    mod.fit(new_feature_no_length, true_answer)

    predict = mod.predict(new_feature_test)

    # Confusion-matrix counts, gathered in a single pass over the predictions.
    TP_count = 0  # True Positive (TP)
    TN_count = 0  # True Negative (TN)
    FP_count = 0  # False Positive (FP)
    FN_count = 0  # False Negative (FN)
    for predicted, actual in zip(predict, true_output):
        if predicted == 'true' and actual == 'true':
            TP_count += 1
        elif predicted == 'false' and actual == 'false':
            TN_count += 1
        elif predicted == 'true' and actual == 'false':
            FP_count += 1
        elif predicted == 'false' and actual == 'true':
            FN_count += 1

    # Classification accuracy
    classification_accuracy = (TP_count + TN_count) / (TP_count + TN_count + FP_count + FN_count)
    print('classification accuracy: ', classification_accuracy)

    # False Positive Rate (FPR)
    FPR = FP_count / (FP_count + TN_count)

    # False Negative Rate (FNR)
    FNR = FN_count / (FN_count + TP_count)

    # Balanced Error Rate (BER)
    BER = (FPR + FNR) / 2
    print('Balanced Error Rate: ', BER)

classification accuracy:  0.67716
Balanced Error Rate:  0.3181306566952399
classification accuracy:  0.67864
Balanced Error Rate:  0.31665743456685336
classification accuracy:  0.70048
Balanced Error Rate:  0.29557538615869644
classification accuracy:  0.80484
Balanced Error Rate:  0.19687376793982686


In [43]:
# Sweep small regularization constants C for a class-balanced logistic regression
# and report accuracy and Balanced Error Rate (BER) on y_test for each C.
# NOTE(review): the model is fit on test_data_no_length / test_true_output and
# scored on y_test -- confirm how those two sets relate; if they overlap, the
# numbers printed below are not a held-out estimate.
c_list = [0.000001, 0.00001, 0.0001, 0.001]

# ---- Evaluation features and labels: independent of C, so built once ----

# One-hot encoding of the beer style; styles outside `categories` get all zeros.
one_hot_encoded_test = []
for d in y_test:
    if d['beer/style'] in categories:
        one_hot_encoded_test.append(one_hot_encoded_catID[catID[d['beer/style']]])
    else:
        vector = [0 for y in range(len(catID))]
        one_hot_encoded_test.append(vector)

# Rating columns used as additional features.
appearance = [d['review/appearance'] for d in y_test]
overall = [d['review/overall'] for d in y_test]
aroma = [d['review/aroma'] for d in y_test]
taste = [d['review/taste'] for d in y_test]
palate = [d['review/palate'] for d in y_test]

# Normalized review length. It is computed here but deliberately NOT added to
# the feature vector in this cell.
review_length_test = [len(d['review/text']) for d in y_test]
max_length = max(review_length_test)
norm_review_length = [d / max_length for d in review_length_test]

# The rating lists above come from y_test, so the rows must be indexed by
# len(y_test). Sizing this loop by len(X_train) only worked while both splits
# happened to have the same number of rows.
additional_feature = [
    [appearance[i], overall[i], aroma[i], taste[i], palate[i]]
    for i in range(len(y_test))
]

# Combine features: one-hot style vector followed by the five ratings.
new_feature_test = [
    one_hot_encoded_test[i] + additional_feature[i]
    for i in range(len(y_test))
]

# Ground truth: 'true' when ABV is strictly greater than 7, else 'false'.
true_output = ['true' if d['beer/ABV'] > 7 else 'false' for d in y_test]

# ---- Fit and evaluate one model per C ----
for c in c_list:
    # Train model
    mod = linear_model.LogisticRegression(C=c, fit_intercept=True, class_weight='balanced', max_iter=1000)
    mod.fit(test_data_no_length, test_true_output)

    # Predict
    predict = mod.predict(new_feature_test)

    # Confusion counts gathered in a single pass over the predictions:
    # TP = true positive, TN = true negative, FP = false positive, FN = false negative.
    TP_count = 0
    TN_count = 0
    FP_count = 0
    FN_count = 0
    for predicted, actual in zip(predict, true_output):
        if predicted == 'true' and actual == 'true':
            TP_count += 1
        elif predicted == 'false' and actual == 'false':
            TN_count += 1
        elif predicted == 'true' and actual == 'false':
            FP_count += 1
        elif predicted == 'false' and actual == 'true':
            FN_count += 1

    # Classification accuracy
    classification_accuracy = (TP_count + TN_count) / (TP_count + TN_count + FP_count + FN_count)
    print('classification accuracy: ', classification_accuracy)

    # False Positive Rate (FPR)
    FPR = FP_count / (FP_count + TN_count)

    # False Negative Rate (FNR)
    FNR = FN_count / (FN_count + TP_count)

    # Balanced Error Rate (BER): mean of the two per-class error rates
    BER = (FPR + FNR) / 2
    print('Balanced Error Rate: ', BER)

classification accuracy:  0.6764
Balanced Error Rate:  0.3186649555878015
classification accuracy:  0.67712
Balanced Error Rate:  0.3179243615384065
classification accuracy:  0.68768
Balanced Error Rate:  0.3077536343590255
classification accuracy:  0.76184
Balanced Error Rate:  0.23712199581049775


In [44]:
# Sweep small regularization constants C for a class-balanced logistic regression
# fit on val_data_no_length / val_true_output, and report accuracy and Balanced
# Error Rate (BER) on y_test for each C.
# NOTE(review): confirm that val_data_no_length does not overlap y_test;
# otherwise the numbers printed below are not a held-out estimate.
c_list = [0.000001, 0.00001, 0.0001, 0.001]

# ---- Evaluation features and labels: they do not depend on C, build them once ----

# One-hot encoding of the beer style; styles outside `categories` get all zeros.
one_hot_encoded_test = []
for d in y_test:
    if d['beer/style'] in categories:
        one_hot_encoded_test.append(one_hot_encoded_catID[catID[d['beer/style']]])
    else:
        vector = [0 for y in range(len(catID))]
        one_hot_encoded_test.append(vector)

# Rating columns used as additional features.
appearance = [d['review/appearance'] for d in y_test]
overall = [d['review/overall'] for d in y_test]
aroma = [d['review/aroma'] for d in y_test]
taste = [d['review/taste'] for d in y_test]
palate = [d['review/palate'] for d in y_test]

# Normalized review length. It is computed here but deliberately NOT added to
# the feature vector in this cell.
review_length_test = [len(d['review/text']) for d in y_test]
max_length = max(review_length_test)
norm_review_length = [d / max_length for d in review_length_test]

# Build one row of ratings per y_test record. The original loop was sized by
# len(X_train), which is only correct when both splits have equal length and
# raises IndexError otherwise.
additional_feature = []
for row in zip(appearance, overall, aroma, taste, palate):
    additional_feature.append(list(row))

# Combine features: one-hot style vector followed by the five ratings.
new_feature_test = []
for style_vector, ratings in zip(one_hot_encoded_test, additional_feature):
    new_feature_test.append(style_vector + ratings)

# Ground truth: 'true' when ABV is strictly greater than 7, else 'false'.
true_output = []
for d in y_test:
    true_output.append('true' if d['beer/ABV'] > 7 else 'false')

# ---- Fit and evaluate one model per C ----
for c in c_list:
    # Train model
    mod = linear_model.LogisticRegression(C=c, fit_intercept=True, class_weight='balanced', max_iter=1000)
    mod.fit(val_data_no_length, val_true_output)

    # Predict
    predict = mod.predict(new_feature_test)

    # Confusion counts gathered in a single pass over the predictions:
    # TP = true positive, TN = true negative, FP = false positive, FN = false negative.
    TP_count = 0
    TN_count = 0
    FP_count = 0
    FN_count = 0
    for predicted, actual in zip(predict, true_output):
        if predicted == 'true' and actual == 'true':
            TP_count += 1
        elif predicted == 'false' and actual == 'false':
            TN_count += 1
        elif predicted == 'true' and actual == 'false':
            FP_count += 1
        elif predicted == 'false' and actual == 'true':
            FN_count += 1

    # Classification accuracy
    classification_accuracy = (TP_count + TN_count) / (TP_count + TN_count + FP_count + FN_count)
    print('classification accuracy: ', classification_accuracy)

    # False Positive Rate (FPR)
    FPR = FP_count / (FP_count + TN_count)

    # False Negative Rate (FNR)
    FNR = FN_count / (FN_count + TP_count)

    # Balanced Error Rate (BER): mean of the two per-class error rates
    BER = (FPR + FNR) / 2
    print('Balanced Error Rate: ', BER)

classification accuracy:  0.67784
Balanced Error Rate:  0.31751809383185037
classification accuracy:  0.67816
Balanced Error Rate:  0.3172134959631376
classification accuracy:  0.69056
Balanced Error Rate:  0.3051935830824889
classification accuracy:  0.76484
Balanced Error Rate:  0.23432518122872106
