Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
benhamner committed Apr 30, 2012
0 parents commit 271ed13
Show file tree
Hide file tree
Showing 5 changed files with 119 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
*.csv
*.m~
10 changes: 10 additions & 0 deletions README.md
@@ -0,0 +1,10 @@
Winning Code for the EMC Data Science Global Hackathon (Air Quality Prediction)
-------------------------------------------------------------------------------

https://www.kaggle.com/c/dsg-hackathon

To execute this code,

1. Download TrainingData.csv from https://www.kaggle.com/c/dsg-hackathon/data and put it in this folder
2. Run make_predictions.m from the Matlab command prompt
3. Copy the resulting predictions from predictions.csv (written by make_predictions.m) into your submission file
45 changes: 45 additions & 0 deletions features.m
@@ -0,0 +1,45 @@
function [fea_train, train_targets, fea_test, test_chunk_id] = features(data, prediction_offset)
% FEATURES  Build sliding-window training and test matrices from the data matrix.
%
%   [fea_train, train_targets, fea_test, test_chunk_id] = FEATURES(data, prediction_offset)
%
%   Inputs
%     data              - numeric matrix with at least 95 columns. Column 2 is
%                         used as the chunk identifier; columns 4:6 and 7:95
%                         supply the feature values.
%     prediction_offset - number of rows between the last row of a feature
%                         window and the row whose values are the target.
%
%   Outputs
%     fea_train     - one row per training window: columns 4:6 of the window's
%                     first row, followed by columns 7:95 of the window's 8
%                     rows flattened column by column.
%     train_targets - columns 57:95 (the last 39) of the target row for each
%                     training window.
%     fea_test      - same layout as fea_train, one row per window that ends
%                     on the last row of a chunk.
%     test_chunk_id - row vector with the chunk id of each test window.

window_len = 8;
n_features = 3 + 89*window_len;
first_target_col = 95 - 39 + 1;

% Distance from the first row of a window to its target row.
target_shift = window_len + prediction_offset - 1;

fea_train = zeros(40000, n_features);
fea_test = zeros(500, n_features);
train_targets = zeros(40000, 39);
test_chunk_id = [];

n_train = 0;
n_test = 0;
n_rows = size(data,1);

for row = 1:n_rows-target_shift
    target_row = row + target_shift;

    % Training sample: window start and target row belong to the same chunk.
    if data(row,2) == data(target_row,2)
        n_train = n_train + 1;
        fea_train(n_train,:) = window_row(data, row, window_len);
        train_targets(n_train,:) = data(target_row, first_target_col:95);
    end

    % Test sample: this row is the last of its chunk, so take the window
    % that ends here.
    if data(row,2) ~= data(row+1,2)
        n_test = n_test + 1;
        start_row = row - window_len + 1;
        fea_test(n_test,:) = window_row(data, start_row, window_len);
        test_chunk_id(end+1) = data(start_row,2);
    end
end

% The final chunk has no following row to mark its end, so its closing
% window is added explicitly.
n_test = n_test + 1;
start_row = n_rows - window_len + 1;
fea_test(n_test,:) = window_row(data, start_row, window_len);
test_chunk_id(end+1) = data(start_row,2);

% Drop the unused preallocated rows.
fea_train = fea_train(1:n_train,:);
train_targets = train_targets(1:n_train,:);
fea_test = fea_test(1:n_test,:);

function fea = window_row(data, first_row, window_len)
% WINDOW_ROW  Feature row for the window starting at first_row: columns 4:6
% of that row, then columns 7:95 of the window rows flattened column-major.
block = data(first_row:first_row+window_len-1, 7:95);
fea = [data(first_row,4:6), block(:)'];
36 changes: 36 additions & 0 deletions make_predictions.m
@@ -0,0 +1,36 @@
function make_predictions()
% MAKE_PREDICTIONS  Train bagged regression trees and write predictions.csv.
%
%   Reads the data with read_data(), then for every prediction offset builds
%   features with features() and, for each target column, trains a 12-tree
%   TreeBagger regression ensemble on the rows whose target is non-negative.
%   The prediction for test chunk c at offset index p is stored in row
%   (c-1)*n_offsets + p of the output matrix. Chunks in 1:210 that produced
%   no test window are filled with column medians. The result is written to
%   predictions.csv in the current folder.

prediction_offsets = [1 2 3 4 5 10 17 24 48 72];
n_offsets = numel(prediction_offsets);
n_chunks = 210;
n_targets = 39;

data = read_data();

test_predictions = zeros(n_chunks*n_offsets, n_targets);

% Open a worker pool only when none is running, so a second call in the
% same session does not fail, and close the pool again if we opened it.
if matlabpool('size') == 0
    matlabpool('open', 4);
    pool_closer = onCleanup(@() matlabpool('close'));
end
options = statset('UseParallel','always');

for p = 1:n_offsets
    prediction_offset = prediction_offsets(p);
    [fea_train, train_targets, fea_test, test_chunk_id] = features(data, prediction_offset);

    % Output rows for this offset, one per test chunk.
    out_rows = (test_chunk_id - 1)*n_offsets + p;

    tic
    for i = 1:size(train_targets,2)
        fprintf('offset %d of %d, target %d of %d\n', p, n_offsets, i, size(train_targets,2));

        % Keep only rows with a non-negative target; read_data stores 'NA'
        % as a large negative sentinel, so this drops missing targets.
        locs = find(train_targets(:,i) >= 0);
        tm = TreeBagger(12, fea_train(locs,:), train_targets(locs,i), ...
            'method','regression','minleaf',200,'options',options);
        pred = predict(tm, fea_test);
        test_predictions(out_rows, i) = pred;
    end
    toc
end

% Fill chunks that have no test window with the column median.
% NOTE(review): test_chunk_id here is the one from the last offset, and the
% median is taken over all rows (including still-unfilled zero rows and rows
% filled on earlier iterations). Left unchanged to keep the output identical.
for i = 1:n_chunks
    if ~any(i == test_chunk_id)
        for j = 1:n_targets
            test_predictions((i-1)*n_offsets+1:i*n_offsets, j) = median(test_predictions(:,j));
        end
    end
end

dlmwrite('predictions.csv', test_predictions);
26 changes: 26 additions & 0 deletions read_data.m
@@ -0,0 +1,26 @@
function data = read_data()
% READ_DATA  Parse TrainingData.csv from the current folder into a numeric matrix.
%
%   data = READ_DATA() skips the header line and returns one row per data
%   line, each with 95 columns:
%     - column 5 holds a quoted weekday name and is converted to its index
%       in the list "Saturday"=1, "Sunday"=2, ..., "Friday"=7;
%     - fields equal to NA are stored as the sentinel -1000000;
%     - every other field is parsed as a number.
%
%   Raises an error when the file cannot be opened, when a line has fewer
%   than 95 fields, when the weekday is not recognised, or when a field is
%   neither NA nor numeric.

n_cols = 95;
missing_value = -1000000;
days = {'"Saturday"','"Sunday"','"Monday"','"Tuesday"','"Wednesday"','"Thursday"','"Friday"'};

fid = fopen('TrainingData.csv');
if fid == -1
    error('read_data:cannotOpen', 'Could not open TrainingData.csv in %s.', pwd);
end
% Close the file on every exit path, including errors raised below.
file_closer = onCleanup(@() fclose(fid));

fgetl(fid);  % discard the header line

% Preallocate for the expected number of rows; trimmed to the real count below.
data = zeros(37821, n_cols);
row_cnt = 0;

while ~feof(fid)
    line = fgetl(fid);
    % fgetl returns -1 at end of file; also skip blank lines.
    if ~ischar(line) || isempty(strtrim(line))
        continue;
    end
    row_cnt = row_cnt + 1;

    C = strread(line,'%s','delimiter',',');
    if numel(C) < n_cols
        error('read_data:shortRow', 'Data row %d has %d fields, expected %d.', ...
            row_cnt, numel(C), n_cols);
    end

    for i = 1:n_cols
        token = C{i};
        if i == 5
            day_idx = find(strcmp(days, token), 1);
            if isempty(day_idx)
                error('read_data:badWeekday', 'Data row %d: unknown weekday %s.', row_cnt, token);
            end
            data(row_cnt,5) = day_idx;
        elseif strcmp(token,'NA')
            data(row_cnt,i) = missing_value;
        else
            % str2double instead of str2num: str2num evaluates the text as
            % MATLAB code, which is unsafe and slow on file contents.
            value = str2double(token);
            if isnan(value) && ~strcmpi(strtrim(token),'nan')
                error('read_data:badNumber', 'Data row %d, column %d: cannot parse "%s".', ...
                    row_cnt, i, token);
            end
            data(row_cnt,i) = value;
        end
    end
end

% Remove preallocated rows that were never filled.
data = data(1:row_cnt,:);

0 comments on commit 271ed13

Please sign in to comment.