In [1]:
# code for question 1

import arff
import numpy as np
from itertools import product
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score, KFold
from sklearn.utils import resample
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import ttest_ind

seeds = [2, 3, 5, 7, 11, 13, 17, 23, 29, 31, 37]
score_list = []

for fname in ["anneal.arff", "audiology.arff", "autos.arff", "credit-a.arff", \
              "hypothyroid.arff", "letter.arff", "microarray.arff", "vote.arff"]:
    dataset = arff.load(open(fname), 'r')
    data = np.array(dataset['data'])

    X = np.array(data)[:, :-1]
    Y = np.array(data)[:, -1]

    # turn unknown/none/? into a separate value
    for i, j in product(range(len(data)), range(len(data[0]) - 1)):
        if X[i, j] is None:
            X[i, j] = len(dataset['attributes'][j][1])

    # a hack to turn negative categories positive for autos.arff
    for i in range(Y.shape[0]):
        if Y[i] < 0:
            Y[i] += 7

    # identify and extract categorical/non-categorical features
    categorical, non_categorical = [], []
    for i in range(len(dataset['attributes']) - 1):
        if isinstance(dataset['attributes'][i][1], str):
            non_categorical.append(X[:, i])
        else:
            categorical.append(X[:, i])

    categorical = np.array(categorical).T
    non_categorical = np.array(non_categorical).T

    if categorical.shape[0] == 0:
        transformed_X = non_categorical
    else:
        # encode categorical features
        encoder = OneHotEncoder(n_values = 'auto',
                                categorical_features = 'all',
                                dtype = np.int32,
                                sparse = False,
                                handle_unknown = 'error')
        encoder.fit(categorical)
        categorical = encoder.transform(categorical)
        if non_categorical.shape[0] == 0:
            transformed_X = categorical
        else:
            transformed_X = np.concatenate((categorical, non_categorical), axis = 1)

    # concatenate the feature array and the labels for resampling purpose
    Y = np.array([Y], dtype = np.int)
    input_data = np.concatenate((transformed_X, Y.T), axis = 1)

    # build the models
    models = [DummyClassifier(strategy = 'most_frequent')] \
              + [KNeighborsClassifier(n_neighbors = 1, algorithm = "brute")] * 5 \
              + [DecisionTreeClassifier()] * 5

    # resample and run cross validation
    portion = [1.0, 0.1, 0.25, 0.5, 0.75, 1.0, 0.1, 0.25, 0.5, 0.75, 1.0]
    sample, scores = [None] * 11, [None] * 11
    for i in range(11):
        sample[i] = resample(input_data,
                             replace = False,
                             n_samples = int(portion[i] * input_data.shape[0]),
                             random_state = seeds[i])
        score = [None] * 10
        for j in range(10):
            score[j] = np.mean(cross_val_score(models[i],
                                               sample[i][:, :-1],
                                               sample[i][:, -1].astype(np.int),
                                               scoring = 'accuracy',
                                               cv = KFold(10, True, seeds[j])))
        scores[i] = score

    score_list.append((fname[:-5], 1 - np.array(scores)))

# print the results
header = ["{:^123}".format("Nearest Neighbour Results") + '\n' + '-' * 123  + '\n' + \
          "{:^15} | {:^10} | {:^16} | {:^16} | {:^16} | {:^16} | {:^16}" \
          .format("Dataset", "Baseline", "10%", "25%", "50%", "75%", "100%"),
          "{:^123}".format("Decision Tree Results") + '\n' + '-' * 123  + '\n' + \
          "{:^15} | {:^10} | {:^16} | {:^16} | {:^16} | {:^16} | {:^16}" \
          .format("Dataset", "Baseline", "10%", "25%", "50%", "75%", "100%")]
offset = [1, 6]

for k in range(2):
    print(header[k])
    for i in range(8):
        scores = score_list[i][1]
        p_value = [None] * 5
        for j in range(5):
            _, p_value[j] = ttest_ind(scores[0], scores[j + offset[k]], equal_var = False)

        print("{:<15} | {:>10.2%}".format(score_list[i][0], np.mean(scores[0])), end = '')
        for j in range(5):
            print(" | {:>6.2%} ({:>5.2%}) {}" .format(np.mean(scores[j + offset[k]]),
                                                      np.std(scores[j + offset[k]]),
                                                      '*' if p_value[j] < 0.05 else ' '), end = '')
        print()
    print()

                                                 Nearest Neighbour Results                                                 
---------------------------------------------------------------------------------------------------------------------------
    Dataset     |  Baseline  |       10%        |       25%        |       50%        |       75%        |       100%      
anneal          |     23.83% | 20.31% (0.94%) * | 18.00% (1.33%) * | 11.12% (0.66%) * |  9.11% (0.37%) * |  7.44% (0.44%) *
audiology       |     74.77% | 60.17% (2.17%) * | 42.00% (2.56%) * | 31.85% (2.13%) * | 29.62% (1.78%) * | 26.47% (1.81%) *
autos           |     67.35% | 64.50% (1.50%) * | 61.40% (2.21%) * | 65.96% (2.02%)   | 52.92% (2.39%) * | 57.37% (0.95%) *
credit-a        |     44.49% | 39.98% (1.05%) * | 41.35% (0.99%) * | 32.04% (1.50%) * | 34.63% (0.79%) * | 34.71% (0.73%) *
hypothyroid     |      7.71% |  8.27% (0.52%) * |  7.33% (0.18%) * |  4.74% (0.14%) * |  5.01% (0.13%) * |  4.79% (0.10%) *
letter  

In [11]:

errs=[]
mean = 0
for k in range(2): 
    errs=[np.mean([np.mean(score_list[i][1][j+offset[k]]) for i in range(8)]) for j in range(5)]
    err0 = np.mean([score_list[i][1][0] for i in range(8)])
    print([(err0-e)/err0*100 for e in errs])


errs = np.mean([(np.mean(score_list[i][1][0])-np.mean(score_list[i][1][1]))/np.mean(score_list[i][1][0])*100 for i in range(8)])
print(errs)
errs = np.mean([(np.mean(score_list[i][1][0])-np.mean(score_list[i][1][5]))/np.mean(score_list[i][1][0])*100 for i in range(8)])
print(errs)
errs = np.mean([(np.mean(score_list[i][1][0])-np.mean(score_list[i][1][6]))/np.mean(score_list[i][1][0])*100 for i in range(8)])
print(errs)
errs = np.mean([(np.mean(score_list[i][1][0])-np.mean(score_list[i][1][10]))/np.mean(score_list[i][1][0])*100 for i in range(8)])
print(errs)
    

[31.557774316174719, 40.558189843069975, 49.792104941866995, 51.932469021123261, 52.013933334915599]
[36.161000883402004, 53.095090487293405, 60.334961248573919, 66.159896624432804, 67.335859969459477]
23.6029803015


In [10]:
# code for question 2

import arff, numpy as np
import pandas as pd
from sklearn.base import TransformerMixin
from sklearn import tree
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import sys
import warnings

# fixed random seed
np.random.seed(1)

def warn(*args, **kwargs):
    pass

def label_enc(labels):
    le = preprocessing.LabelEncoder()
    le.fit(labels)
    return le

def features_encoders(features,categorical_features='all'):
    n_samples, n_features = features.shape
    label_encoders = [preprocessing.LabelEncoder() for _ in range(n_features)]

    X_int = np.zeros_like(features, dtype=np.int)

    for i in range(n_features):
        feature_i = features[:, i]
        label_encoders[i].fit(feature_i)
        X_int[:, i] = label_encoders[i].transform(feature_i)
        
    enc = preprocessing.OneHotEncoder(categorical_features=categorical_features)
    return enc.fit(X_int),label_encoders

def feature_transform(features,label_encoders, one_hot_encoder):
    
    n_samples, n_features = features.shape
    X_int = np.zeros_like(features, dtype=np.int)
    
    for i in range(n_features):
        feature_i = features[:, i]
        X_int[:, i] = label_encoders[i].transform(feature_i)

    return one_hot_encoder.transform(X_int).toarray()

warnings.warn = warn

class DataFrameImputer(TransformerMixin):

    def fit(self, X, y=None):

        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
            index=X.columns)

        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)


def load_data(path):
    dataset = arff.load(open(path, 'r'))
    data = np.array(dataset['data'])
    data = pd.DataFrame(data)
    data = DataFrameImputer().fit_transform(data).values
    attr = dataset['attributes']

    # mask categorical features
    masks = []
    for i in range(len(attr)-1):
        if attr[i][1] != 'REAL':
            masks.append(i)
    return data, masks

def preprocess(data,masks, noise_ratio):
    # split data
    train_data, test_data = train_test_split(data,test_size=0.3,random_state=0)

    # test data
    test_features = test_data[:,0:test_data.shape[1]-1]
    test_labels = test_data[:,test_data.shape[1]-1]

    # training data
    features = train_data[:,0:train_data.shape[1]-1]
    labels = train_data[:,train_data.shape[1]-1]

    classes = list(set(labels))
    # categorical features need to be encoded
    if len(masks):
        one_hot_enc, label_encs = features_encoders(data[:,0:data.shape[1]-1],masks)
        test_features = feature_transform(test_features,label_encs,one_hot_enc)
        features = feature_transform(features,label_encs,one_hot_enc)

    le = label_enc(data[:,data.shape[1]-1])
    labels = le.transform(train_data[:,train_data.shape[1]-1])
    test_labels = le.transform(test_data[:,test_data.shape[1]-1])
    
    # add noise
    np.random.seed(1234)
    noise = np.random.randint(len(classes)-1, size=int(len(labels)*noise_ratio))+1
    
    noise = np.concatenate((noise,np.zeros(len(labels) - len(noise),dtype=np.int)))
    labels = (labels + noise) % len(classes)

    return features,labels,test_features,test_labels

# load data
paths = ['balance-scale','primary-tumor',
         'glass','heart-h']
noise = [0,0.2,0.5,0.8]

scores = []
params = []

for path in paths:
    score = []
    param = []
    path += '.arff'
    data, masks = load_data(path)
    
    # training on data with %50 noise and default parameters
    features, labels, test_features, test_labels = preprocess(data, masks, 0.5)
    tree = DecisionTreeClassifier(random_state=0,min_samples_leaf=2, min_impurity_decrease=0)
    tree.fit(features, labels)
    tree_preds = tree.predict(test_features)
    tree_performance = accuracy_score(test_labels, tree_preds)
    score.append(tree_performance)
    param.append(tree.get_params()['min_samples_leaf'])
    
    # training on data with noise %0, %20, %50, %80
    for noise_ratio in noise:
        features, labels, test_features, test_labels = preprocess(data, masks, noise_ratio)
        param_grid = {'min_samples_leaf': np.arange(2,30,5)}

        grid_tree = GridSearchCV(DecisionTreeClassifier(random_state=0), param_grid,cv=10,return_train_score=True)
        grid_tree.fit(features, labels)

        estimator = grid_tree.best_estimator_
        tree_preds = grid_tree.predict(test_features)
        tree_performance = accuracy_score(test_labels, tree_preds)
        score.append(tree_performance)
        param.append(estimator.get_params()['min_samples_leaf'])

    scores.append(score)
    params.append(param)

# print the results
header = "{:^123}".format("Decision Tree Results") + '\n' + '-' * 123  + '\n' + \
"{:^15} | {:^16} | {:^16} | {:^16} | {:^16} | {:^16} |".format("Dataset", "Default", "0%", "20%", "50%", "80%")


# print result table
print(header)
for i in range(len(scores)):
    #scores = score_list[i][1]
    print("{:<16}".format(paths[i]),end="")
    for j in range(len(params[i])):
        print("|  {:>6.2%} ({:>2})     " .format(scores[i][j],params[i][j]),end="")
    print('|\n')
print('\n')

                                                   Decision Tree Results                                                   
---------------------------------------------------------------------------------------------------------------------------
    Dataset     |     Default      |        0%        |       20%        |       50%        |       80%        |
balance-scale   |  36.70% ( 2)     |  76.06% ( 2)     |  71.28% (12)     |  65.43% (27)     |  18.09% (27)     |

primary-tumor   |  25.49% ( 2)     |  37.25% (12)     |  42.16% (12)     |  43.14% (12)     |  26.47% ( 7)     |

glass           |  44.62% ( 2)     |  69.23% ( 7)     |  66.15% (22)     |  35.38% (17)     |  29.23% (17)     |

heart-h         |  35.96% ( 2)     |  67.42% ( 7)     |  78.65% (22)     |  56.18% (17)     |  20.22% (27)     |





In [12]:
# code for question 3

import arff,numpy as np
from sklearn import datasets, linear_model
from sklearn.model_selection import cross_val_predict,cross_val_score
from sklearn import metrics

#--------------Show the attributes--------------

dataset = arff.load(open('houses.arff',"r",encoding = "ISO-8859-1")) 
attributes = np.array(dataset['attributes'])
print('attributes: ',attributes)

#--------------linear regression--------------

regr = linear_model.LinearRegression()
data = np.array(dataset['data'])
houses_X = data[:,1:] #X vector
houses_Y = data[:,0] #Y vector

regr.fit(houses_X, houses_Y)
intercept = regr.intercept_

print('Intercept:\n%.2e' % intercept,end='\n')
print('Coefficients:')
for coef in regr.coef_:
    print('%.2e' % coef,end=" ")

file = open('q3.out','a')
file.write('Intercept:\n%.2e\n' % intercept)
file.write('Coefficients:\n')
for coef in regr.coef_:
    file.write('%.2e' % coef)
    file.write(' ')
file.write('\n')

#--------------10-fold cross validation--------------

predicted = cross_val_predict(regr, houses_X, houses_Y, cv=10)
RMSE = np.sqrt(metrics.mean_squared_error(houses_Y, predicted))

print ('\nRMSE:\n%.2e\n' % RMSE)
file.write('RMSE:\n%.2e\n' % RMSE)
file.close()



attributes:  [['median_house_value' 'REAL']
 ['median_income' 'REAL']
 ['housing_median_age' 'REAL']
 ['total_rooms' 'REAL']
 ['total_bedrooms' 'REAL']
 ['population' 'REAL']
 ['households' 'REAL']
 ['latitude' 'REAL']
 ['longitude' 'REAL']]
Intercept:
-3.59e+06
Coefficients:
4.02e+04 1.16e+03 -8.18e+00 1.13e+02 -3.85e+01 4.83e+01 -4.26e+04 -4.28e+04 
RMSE:
7.13e+04



In [13]:
# code for question 4

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.feature_selection import SelectKBest, chi2

df_trte = pd.read_csv('snippets_all.csv')
df_tr = pd.read_csv('snippets_train.csv')
df_te = pd.read_csv('snippets_test.csv')

# set up the vocabulary (the global set of "words" or tokens) for training and test datasets
vectorizer = CountVectorizer()
vectorizer.fit(df_trte.snippet)

# apply this vocabulary to transform the text snippets to vectors of word counts
X_train = vectorizer.transform(df_tr.snippet)
X_test = vectorizer.transform(df_te.snippet)
y_train = df_tr.section
y_test = df_te.section

# Debugging
# print("X train: ", X_train.shape)
# print("X test: ", X_test.shape)
# print("Y train: ", y_train.shape)
# print("Y test: ", y_test.shape)

# learn a Naive Bayes classifier on the training set
clf = MultinomialNB(alpha=0.5)
MultinomialNB(alpha=0.5, class_prior=None, fit_prior=True)
clf.fit(X_train, y_train)
pred_train = clf.predict(X_train)
score_train = metrics.accuracy_score(y_train, pred_train)
pred_test = clf.predict(X_test)
score_test = metrics.accuracy_score(y_test, pred_test)
print("Train/test accuracy using all features: ", score_train, score_test)


# Use Chi^2 to select top 10000 features
ch2_10000 = SelectKBest(chi2, k=10000)
ch2_10000.fit(X_train, y_train)
# Project training data onto top 10000 selected features
X_train_kbest_10000 = ch2_10000.transform(X_train)
# Train NB Classifier using top 10 selected features
clf_kbest_10000 = MultinomialNB(alpha=0.5, class_prior=None, fit_prior=True)
clf_kbest_10000.fit(X_train_kbest_10000,y_train)
# Predictive accuracy on training set
pred_train_kbest_10000 = clf_kbest_10000.predict(X_train_kbest_10000)
score_train_kbest_10000 = metrics.accuracy_score(y_train,pred_train_kbest_10000)
# Project test data onto top 10000 selected features
X_test_kbest_10000 = ch2_10000.transform(X_test)
# Predictive accuracy on test set
pred_test_kbest_10000 = clf_kbest_10000.predict(X_test_kbest_10000)
score_test_kbest_10000 = metrics.accuracy_score(y_test,pred_test_kbest_10000)
print("Train/test accuracy for top 10K features", score_train_kbest_10000, score_test_kbest_10000)

# Use Chi^2 to select top 1000 features
ch2_1000 = SelectKBest(chi2, k=1000)
ch2_1000.fit(X_train, y_train)
# Project training data onto top 1000 selected features
X_train_kbest_1000 = ch2_1000.transform(X_train)
# Train NB Classifier using top 1000 selected features
clf_kbest_1000 = MultinomialNB(alpha=0.5, class_prior=None, fit_prior=True)
clf_kbest_1000.fit(X_train_kbest_1000,y_train)
# Predictive accuracy on training set
pred_train_kbest_1000 = clf_kbest_1000.predict(X_train_kbest_1000)
score_train_kbest_1000 = metrics.accuracy_score(y_train,pred_train_kbest_1000)
# Project test data onto top 1000 selected features
X_test_kbest_1000 = ch2_1000.transform(X_test)
# Predictive accuracy on test set
pred_test_kbest_1000 = clf_kbest_1000.predict(X_test_kbest_1000)
score_test_kbest_1000 = metrics.accuracy_score(y_test,pred_test_kbest_1000)
print("Train/test accuracy for top 1K features", score_train_kbest_1000, score_test_kbest_1000)

# Use Chi^2 to select top 100 features
ch2_100 = SelectKBest(chi2, k=100)
ch2_100.fit(X_train, y_train)
# Project training data onto top 100 selected features
X_train_kbest_100 = ch2_100.transform(X_train)
# Train NB Classifier using top 100 selected features
clf_kbest_100 = MultinomialNB(alpha=0.5, class_prior=None, fit_prior=True)
clf_kbest_100.fit(X_train_kbest_100,y_train)
# Predictive accuracy on training set
pred_train_kbest_100 = clf_kbest_100.predict(X_train_kbest_100)
score_train_kbest_100 = metrics.accuracy_score(y_train,pred_train_kbest_100)
# Project test data onto top 100 selected features
X_test_kbest_100 = ch2_100.transform(X_test)
# Predictive accuracy on test set
pred_test_kbest_100 = clf_kbest_100.predict(X_test_kbest_100)
score_test_kbest_100 = metrics.accuracy_score(y_test,pred_test_kbest_100)
print("Train/test accuracy for top 100 features", score_train_kbest_100, score_test_kbest_100)

# Use Chi^2 to select top 10 features
ch2_10 = SelectKBest(chi2, k=10)
ch2_10.fit(X_train, y_train)
# Project training data onto top 10 selected features
X_train_kbest_10 = ch2_10.transform(X_train)
# Train NB Classifier using top 10 selected features
clf_kbest_10 = MultinomialNB(alpha=0.5, class_prior=None, fit_prior=True)
clf_kbest_10.fit(X_train_kbest_10,y_train)
# Predictive accuracy on training set
pred_train_kbest_10 = clf_kbest_10.predict(X_train_kbest_10)
score_train_kbest_10 = metrics.accuracy_score(y_train,pred_train_kbest_10)
# Project test data onto top 10 selected features
X_test_kbest_10 = ch2_10.transform(X_test)
# Predictive accuracy on test set
pred_test_kbest_10 = clf_kbest_10.predict(X_test_kbest_10)
score_test_kbest_10 = metrics.accuracy_score(y_test,pred_test_kbest_10)
print("Train/test accuracy for top 10 features", score_train_kbest_10, score_test_kbest_10)



Train/test accuracy using all features:  0.979324055666 0.805263157895
Train/test accuracy for top 10K features 0.971769383698 0.805263157895
Train/test accuracy for top 1K features 0.953876739563 0.694298245614
Train/test accuracy for top 100 features 0.718787276342 0.463157894737
Train/test accuracy for top 10 features 0.409244532803 0.189035087719
