In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import scipy.io as sio
import csv
import pylab as plt
import random

In [3]:
def save_prediction(pred, filename):
    """
    Save the output of the regression as csv file

    Parameters
    ----------
    pred : array
        Predictions; flattened to one dimension before writing.
    filename : str
        Output path without extension; '.csv' is appended.

    The file starts with a ``dataid,prediction`` header row, followed by one
    row per prediction whose id counts up from 1.
    """
    pred = pred.flatten()
    filename = filename + '.csv'
    # newline='' leaves line endings to the csv module; without it, platforms
    # that translate '\n' on write end up with a blank line after every row.
    with open(filename, mode='w', newline='') as result:
        csv_writer = csv.writer(result, delimiter=',', quoting=csv.QUOTE_MINIMAL)
        csv_writer.writerow(['dataid', 'prediction'])
        csv_writer.writerows(
            (str(row_id), value) for row_id, value in enumerate(pred, start=1)
        )

In [4]:
# Read the MATLAB file into a dict of arrays keyed by variable name.
mat_contents = sio.loadmat('./MSdata.mat')

In [5]:
# Pull the test features, training features and training targets out of the
# loaded .mat dictionary.
test_x, train_x, train_y = (
    mat_contents[key] for key in ('testx', 'trainx', 'trainy')
)

In [6]:
# Scale every feature column by its largest absolute training value.
max_train_x = np.abs(train_x).max(axis=0)
norm_train_x = train_x / max_train_x

In [7]:
# Keep only the feature columns whose standard deviation (after scaling)
# exceeds 0.05; the rest are treated as not meaningful.
std_features = norm_train_x.std(axis=0)
index = np.nonzero(std_features > 0.05)
traincut_x = norm_train_x[:, index[0]]

In [None]:
# SVM Regression
# Import the estimator class directly so that the `svm` variable (kept for the
# cells below) does not rebind the name of the sklearn module it came from.
from sklearn.svm import SVR
svm = SVR(kernel='rbf', cache_size=2000, C=1)
# Fit on the first 200000 rows only; targets are flattened to 1-D.
svm.fit(traincut_x[0:200000], train_y[0:200000].flatten())

In [11]:
# The SVR was fitted on the std-filtered columns (index[0]) of the scaled
# training matrix, so the test matrix needs the same scaling AND the same
# column selection; otherwise the feature count differs from the fit.
svm_pred_y = svm.predict(np.divide(test_x, max_train_x)[:, index[0]])

NotFittedError: This SVR instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

In [None]:
# Write the SVR predictions to svm_result.csv.
save_prediction(pred=svm_pred_y, filename='svm_result')

In [None]:
# Ordinary least squares: plain linear regression without any penalty term,
# fitted on the raw (unscaled) training features.
reg = linear_model.LinearRegression()
reg.fit(X=train_x, y=train_y)

In [None]:
# OLS predictions for the raw test features.
ols_pred_y = reg.predict(X=test_x)

In [None]:
# Ridge regression (penalty strength alpha = 0.5) on the scaled features.
ridge = linear_model.Ridge(alpha=.5)
ridge.fit(X=norm_train_x, y=train_y)

In [None]:
# Scale the test features with the training maxima before predicting.
ridge_pred_y = ridge.predict(X=test_x / max_train_x)

In [None]:
# Lasso regression (penalty strength alpha = 0.2) on the scaled features.
lasso = linear_model.Lasso(alpha=0.2)
lasso.fit(X=norm_train_x, y=train_y)

In [None]:
# Scale the test features with the training maxima before predicting.
lasso_pred_y = lasso.predict(X=test_x / max_train_x)

### Feature selection

In [None]:
# Draw a small random subset of training rows for the exploratory plots below.
nsamp = 200
# The indices are used on train_x / train_y, so they must be drawn over the
# rows of train_x. Sizing the range by len(test_x) restricts the sample or
# runs out of bounds whenever the two arrays differ in length.
# A locally seeded generator keeps the subset identical across re-runs.
sample_index = random.Random(0).sample(range(len(train_x)), nsamp)
subtest_x = train_x[sample_index,:]
subtest_y = train_y[sample_index]

In [None]:
# Display the dimensions of the sampled subset.
subtest_x.shape

In [None]:
# Overlay the distributions of features 0 and 10 from the sampled rows.
# Each histogram is labelled so the two overlays can be told apart.
for col in (0, 10):
    plt.hist(subtest_x[:, col], label=str(col))
plt.legend()

In [None]:
# Overlay the distributions of features 20, 21, 22 and 24, one labelled
# histogram per feature.
for col in (20, 21, 22, 24):
    plt.hist(subtest_x[:, col], label=str(col))
plt.title('Features at 20-24 index')
plt.legend()

In [None]:
# Show the sampled feature matrix as an image, colour scale limited to 0..100.
plt.figure(figsize=(10, 6))
plt.imshow(subtest_x, vmin=0, vmax=100)

In [None]:
# Column index of every negative entry in the sampled matrix (one index per
# entry, so a column appears once for each negative value it contains).
low_saturated = np.nonzero(subtest_x < 0)[1]

In [None]:
def remove_similar_feature_row(train_x, train_y, cutoff=100):
    """
    Drop the samples whose feature values are nearly indistinguishable.

    The standard deviation is taken across the features of each row; a row
    is kept only when that value is strictly greater than ``cutoff``.

    Parameters
    ----------
    train_x : array-like, 2-D
        Feature matrix, one sample per row.
    train_y : array-like
        Targets with one entry (or row) per sample of ``train_x``.
    cutoff : float, optional
        Minimum per-row standard deviation required to keep a sample.

    Returns
    -------
    X, Y : numpy.ndarray
        The kept rows of ``train_x`` and ``train_y``. Both are new arrays,
        so the inputs are never modified through them.
    """
    features = np.asarray(train_x)
    targets = np.asarray(train_y)
    keep = features.std(axis=1) > cutoff
    # Boolean-mask indexing already returns copies, so no explicit copy of
    # the full arrays is needed beforehand.
    return features[keep], targets[keep]

In [None]:
# Row-filter the raw training set with the default cutoff.
# NOTE(review): this rebinds traincut_x, which an earlier cell used for the
# column-filtered scaled matrix; the two are different things.
traincut_x, traincut_y = remove_similar_feature_row(train_x=train_x, train_y=train_y)

In [None]:
# Ordinary least squares on the row-filtered training set.
regcut = linear_model.LinearRegression()
regcut.fit(X=traincut_x, y=traincut_y)

In [None]:
# Predictions of the row-filtered OLS model for the raw test features.
testcut_pred_y = regcut.predict(X=test_x)

In [None]:
# Write the row-filtered OLS predictions to cut_linearreg.csv.
save_prediction(pred=testcut_pred_y, filename='cut_linearreg')

In [None]:
# Bayesian ridge regression on the row-filtered training set.
bayes_cut = linear_model.BayesianRidge()
# BayesianRidge expects a 1-D target; ravel() passes the targets in that form
# (a no-op if they are already 1-D), matching the flatten() used for the SVR.
bayes_cut.fit(traincut_x, traincut_y.ravel())

In [None]:
# Predictions of the Bayesian ridge model for the raw test features.
bayescut_pred_y = bayes_cut.predict(X=test_x)

In [None]:
# Write the Bayesian ridge predictions to cut_bayes.csv.
save_prediction(pred=bayescut_pred_y, filename='cut_bayes')

In [None]:
# Number of negative entries found in feature column 10 of the sample.
np.count_nonzero(low_saturated == 10)

In [None]:
# Overlay the distributions of features 3 to 6 from the sampled rows.
# Each histogram is labelled so the overlays can be told apart.
for col in range(3, 7):
    plt.hist(subtest_x[:, col], label=str(col))
plt.legend()

In [None]:
# Overlay the distributions of features 13 to 16 from the sampled rows.
# Each histogram is labelled so the overlays can be told apart.
for col in range(13, 17):
    plt.hist(subtest_x[:, col], label=str(col))
plt.legend()


In [None]:
# Display the shape of feature column 13 of the sampled subset.
subtest_x[:,13].shape

In [None]:
# Number of negative entries found in feature column 13 of the sample.
# NOTE(review): the cell originally read `saturated`, which no visible cell
# defines; `low_saturated` is presumably what was meant. Confirm.
len(np.where(low_saturated == 13)[0])

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from torch.autograd import Variable 

In [None]:
# Cast the targets to 32-bit integers (any fractional part is dropped).
# NOTE(review): this rebinds train_y for every cell that runs afterwards.
train_y = train_y.astype(np.int32)

In [None]:
# Tensor views of the NumPy training arrays (memory is shared, not copied).
trainx, trainy = (torch.from_numpy(arr) for arr in (train_x, train_y))

In [None]:
batch_size = 100

# float32 tensors of the training features and targets. The deprecated
# Variable wrapper is dropped, since tensors carry autograd state themselves.
# The arrays are converted straight to float32, without going through float64.
X = torch.from_numpy(train_x).float()
y = torch.from_numpy(train_y).float()

# Mini-batch iterator over (features, target) pairs, reshuffled every epoch.
train_data = TensorDataset(X, y)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

In [None]:
# Single linear layer mapping every feature of X to one output. The input
# width is read from X rather than hard-coded, so it follows the data.
lin = nn.Linear(X.shape[1], 1)
opt = torch.optim.SGD(lin.parameters(), lr=0.1)
loss_fn = F.mse_loss
# Mean squared error of the untrained layer over the whole training set.
loss = loss_fn(lin(X), y)

In [None]:
# Clip the total gradient norm of the linear layer's parameters to 0.25.
# NOTE(review): no backward() call precedes this cell in the visible notebook,
# so the parameters presumably hold no gradients yet and this has no effect.
# Confirm, and remove if so.
torch.nn.utils.clip_grad_norm_(lin.parameters(), 0.25)

In [None]:
def get_batch(source, i, bptt=20000):
    """
    Slice an input window and its one-step-shifted target out of ``source``.

    Parameters
    ----------
    source : sequence supporting ``len`` and slicing
    i : int
        Start position of the input window.
    bptt : int, optional
        Maximum window length (20000 by default).

    Returns
    -------
    (data, target)
        ``data`` is ``source[i:i+n]`` and ``target`` is ``source[i+1:i+1+n]``,
        where ``n`` is ``bptt`` capped so that the target never runs past the
        end of ``source``.
    """
    # One element is held back so the shifted target has the same length.
    seq_len = min(bptt, len(source) - 1 - i)
    stop = i + seq_len
    return source[i:stop], source[i + 1:stop + 1]

In [None]:
def fit(num_epochs, model, loss_fn, opt, data_loader=None, features=None,
        targets=None, max_grad_norm=0.25):
    """
    Train ``model`` with mini-batch gradient descent and gradient clipping.

    Parameters
    ----------
    num_epochs : int
        Number of full passes over the batches.
    model : callable torch module
    loss_fn : callable ``(prediction, target) -> loss tensor``
    opt : torch optimizer built over ``model``'s parameters
    data_loader : iterable of ``(xb, yb)`` batches, optional
        Defaults to the notebook-level ``train_loader``.
    features, targets : tensors, optional
        Data for the final loss report; default to the notebook-level
        ``X`` and ``y``.
    max_grad_norm : float, optional
        Upper bound on the total gradient norm for each step.

    Prints the loss over ``features`` / ``targets`` once training finishes.
    """
    loader = train_loader if data_loader is None else data_loader
    eval_x = X if features is None else features
    eval_y = y if targets is None else targets

    for epoch in range(num_epochs):
        for xb, yb in loader:
            # Start every step from clean gradients, whatever ran before.
            opt.zero_grad()
            loss = loss_fn(model(xb), yb)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            opt.step()

    # Reporting only: no autograd graph is needed for the full-data pass.
    with torch.no_grad():
        print('Training loss: ', loss_fn(model(eval_x), eval_y))

In [None]:
# Train the linear layer for 1000 epochs over the training loader.
fit(num_epochs=1000, model=lin, loss_fn=loss_fn, opt=opt)

In [None]:
# Test features as a float64 tensor, then as the float32 tensor fed to the model.
testx = torch.from_numpy(test_x).double()
testX = testx.float()

In [None]:
# Apply the linear layer to the test features.
# NOTE(review): this runs outside torch.no_grad(), so the result presumably
# still tracks gradients and needs detach() before conversion to NumPy. Confirm.
pred = lin(testX)

In [None]:
# Display the prediction tensor.
pred

In [None]:
# NOTE(review): test_pred_y is not assigned in any visible cell, so this
# presumably raises NameError on a fresh kernel. Confirm which prediction
# array was meant.
test_pred_y

In [None]:
def get_batch(source, i):
    """
    Slice an input window and its flattened, one-step-shifted target from ``source``.

    NOTE(review): reads ``args.bptt``, and ``args`` is not defined in any
    visible cell. This definition also replaces the ``get_batch`` defined
    earlier in the notebook. Confirm which one is intended.
    """
    # Window length: at most args.bptt, capped so the shifted target fits.
    seq_len = min(args.bptt, len(source) - 1 - i)
    data = source[i:i+seq_len]
    # .view(-1) flattens the target; assumes source slices are tensors (TODO confirm).
    target = source[i+1:i+1+seq_len].view(-1)
    return data, target

def train():
    """
    Run one pass over ``train_data`` in windows of ``args.bptt`` steps,
    updating ``model`` with a manual SGD step after every window and
    printing running statistics every ``args.log_interval`` batches.

    NOTE(review): relies on names that no visible cell defines (``model``,
    ``corpus``, ``args``, ``criterion``, ``repackage_hidden``, ``lr``,
    ``epoch``). It also calls ``train_data.size(0)``, while the
    ``train_data`` built earlier in the notebook is a TensorDataset.
    Presumably adapted from a recurrent language-model script. Confirm
    before use.
    """
    # Neither module is imported at notebook level.
    import math
    import time

    # Turn on training mode which enables dropout.
    # (This is a module method; the data tensor X has no train().)
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for p in model.parameters():
            # Plain SGD step, p <- p - lr * grad, using the keyword form of
            # add_ instead of the deprecated (scalar, tensor) overload.
            p.data.add_(p.grad.data, alpha=-lr)

        total_loss += loss.item()

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // args.bptt, lr,
                elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

In [None]:
# Save the result into 'result.csv'
# Uses the shared helper instead of a second hand-written copy of the CSV
# writer; for a single-column prediction array the rows written are the same.
# NOTE(review): test_pred_y is not assigned in any visible cell. Confirm which
# prediction array is meant before running.
save_prediction(test_pred_y, 'result')
    