# CS155: Miniproject 1
Kavya Sreedhar, Audrey Wang, Anne Zhou

In [17]:
import numpy as np 
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [18]:
# Fix the seed of NumPy's global random number generator so that
# repeated runs of the notebook draw the same random numbers.
SEED = 1
np.random.seed(SEED)

# Define function for loading files
def load_data(filename, skiprows=1, delimiter=' '):
    """
    Function loads data stored in the file filename and returns it as a numpy ndarray.

    Inputs:
        filename: path of the text file to read, given as a string.
        skiprows: number of lines at the top of the file to skip before
            parsing; the default of 1 skips the first line.
        delimiter: string that separates the values on each line; defaults
            to a single space, which was previously hard-coded.

    Outputs:
        Data contained in the file, returned as a numpy ndarray
    """
    return np.loadtxt(filename, skiprows=skiprows, delimiter=delimiter)

In [4]:
# Read the training and test files from the working directory.
train = load_data('training_data.txt')
X_test = load_data('test_data.txt')

# In the training array column 0 is used as the target and every
# remaining column as an input feature.
y_train, X_train = train[:, 0], train[:, 1:]

# Row counts of the two feature arrays.
N_train, N_test = X_train.shape[0], X_test.shape[0]

In [19]:
# Normalize data.
# Every feature column is divided by its maximum over the training set;
# the same training-set maxima are reused for the test set so that both
# arrays end up on one common scale.
max_vals = X_train.max(axis=0)
# A column whose training maximum is 0 would cause a division by zero and
# put NaN/inf into the features, so such columns are divided by 1 instead.
scale = np.where(max_vals != 0, max_vals, 1.0)
X_train = X_train / scale
X_test = X_test / scale

In [21]:
# Compare candidate model families by their cross-validation score
# (each estimator's default scorer) to decide which one to tune further.
# Only the first 4000 training samples are used for this comparison.
X_val, y_val = X_train[:4000], y_train[:4000]

# One estimator per family, all with default hyper-parameters.
sv = svm.SVC()
log = LogisticRegression()
random_forest = RandomForestClassifier()
gradient_boost = GradientBoostingClassifier()

# Labels are listed in the same order in which the estimators are scored.
types = ['svm', 'logistic regression', 'random forest', 'gradient boost']
scores = [
    cross_val_score(estimator, X_val, y_val)
    for estimator in (sv, log, random_forest, gradient_boost)
]

# Report the mean score followed by the individual fold scores.
for label, fold_scores in zip(types, scores):
    print('%s: %f' % (label, np.mean(fold_scores)))
    print(fold_scores)
    print()

svm: 0.530500
[ 0.53073463  0.5303826   0.5303826 ]

logistic regression: 0.829997
[ 0.84332834  0.8372093   0.80945236]

random forest: 0.753246
[ 0.76911544  0.76294074  0.72768192]

gradient boost: 0.786494
[ 0.81034483  0.79969992  0.74943736]



In [29]:
# Tune logistic regression on the full training set: try several values
# of C, each with and without balanced class weights.
c_arr = [0.3, 0.5, 0.8, 1.0, 2.0, 5.0, 10.0]

# Reference point: default C, balanced class weights only.
log1 = LogisticRegression(class_weight='balanced')
score1 = cross_val_score(log1, X_train, y_train)
print('only balanced: %f' % np.mean(score1))
print(score1)

for c_val in c_arr:
    print(c_val)

    # Same C, without class re-weighting.
    log2 = LogisticRegression(C=c_val)
    score2 = cross_val_score(log2, X_train, y_train)
    print('only c: %f' % np.mean(score2))
    print(score2)

    # Same C, with balanced class weights.
    log4 = LogisticRegression(C=c_val, class_weight='balanced')
    score4 = cross_val_score(log4, X_train, y_train)
    print('both c and balanced: %f' % np.mean(score4))
    print(score4)

    print()
    


only balanced: 0.847000
[ 0.84970751  0.84490775  0.84638464]
0.3
only c: 0.846350
[ 0.84865757  0.84325784  0.84713471]
both c and balanced: 0.846500
[ 0.85015749  0.84280786  0.84653465]

0.5
only c: 0.846800
[ 0.84940753  0.84460777  0.84638464]
both c and balanced: 0.847500
[ 0.84985751  0.84460777  0.8480348 ]

0.8
only c: 0.846250
[ 0.84925754  0.84415779  0.84533453]
both c and balanced: 0.846750
[ 0.84865757  0.84580771  0.84578458]

1.0
only c: 0.846100
[ 0.84940753  0.84310784  0.84578458]
both c and balanced: 0.847000
[ 0.84970751  0.84490775  0.84638464]

2.0
only c: 0.846700
[ 0.84970751  0.84445778  0.84593459]
both c and balanced: 0.847300
[ 0.84955752  0.84520774  0.84713471]

5.0
only c: 0.845450
[ 0.84580771  0.84370781  0.84683468]
both c and balanced: 0.845550
[ 0.84610769  0.84355782  0.8469847 ]

10.0
only c: 0.843050
[ 0.84355782  0.84040798  0.84518452]
both c and balanced: 0.843250
[ 0.84355782  0.84025799  0.84593459]



In [30]:
# Make predictions to output file.
# Fit the selected model (C=2.0 with balanced class weights) on the whole
# training set and predict a label for every test sample.
clf = LogisticRegression(C=2.0, class_weight='balanced')
clf.fit(X_train, y_train)
predictions = clf.predict(X_test).flatten()

# Write a header followed by one "Id,Prediction" row per test sample, with
# ids starting at 1. The with-block guarantees the file is closed even if
# a write fails part-way through.
with open('predictions.txt', 'w') as f:
    f.write('Id,Prediction\n')
    for row_id, label in enumerate(predictions, start=1):
        f.write('%d,%d\n' % (row_id, label))

In [None]:
# import tensorflow as tf 
# import keras
# from keras.models import Sequential
# from keras.layers.core import Dense, Activation, Flatten, Dropout
# from keras.layers import Conv2D, MaxPooling2D, Flatten, BatchNormalization
# from keras import regularizers

In [5]:
# N = 1000 # Number of input features (the model's input dimension)

# # Define the model.
# model = Sequential()
# model.add(Dense(1000, input_shape=(N,)))
# model.add(Activation('relu'))
# model.add(Dropout(0.1))

# model.add(Dense(900))
# model.add(BatchNormalization())
# model.add(Activation('relu'))

# model.add(Dense(800))
# model.add(Activation('relu'))
# model.add(Dropout(0.1))

# model.add(Dense(10))
# model.add(Dense(1))

# # Print number of params
# model.count_params()

# # Compile the model
# model.compile(optimizer='adam',
#               loss='mse',
#               metrics=['accuracy'])

# # Train the model for 1 epoch
# history = model.fit(X_train, y_train, epochs=1, batch_size=32)

# # Evaluate the model
# model.evaluate(x=X_train, y=y_train)

Epoch 1/1


[0.074854408216476445, 0.92405000000000004]

10000
