Skip to content
Browse files

Initial commit

  • Loading branch information...
0 parents commit 271ed137083b34d8ec5bf1fe924046e8375e6f1f @benhamner committed
Showing with 119 additions and 0 deletions.
  1. +2 −0 .gitignore
  2. +10 −0 README.md
  3. +45 −0 features.m
  4. +36 −0 make_predictions.m
  5. +26 −0 read_data.m
2 .gitignore
@@ -0,0 +1,2 @@
+*.csv
+*.m~
10 README.md
@@ -0,0 +1,10 @@
+Winning Code for the EMC Data Science Global Hackathon (Air Quality Prediction)
+-------------------------------------------------------------------------------
+
+https://www.kaggle.com/c/dsg-hackathon
+
+To execute this code,
+
+1. Download TrainingData.csv from https://www.kaggle.com/c/dsg-hackathon/data and put it in this folder
+2. Run make_predictions.m from the Matlab command prompt
+3. Copy the resulting predictions from predictions.csv (written by make_predictions.m) into your submission file
45 features.m
@@ -0,0 +1,45 @@
+function [fea_train, train_targets, fea_test, test_chunk_id] = features(data, prediction_offset)
+% FEATURES Build lagged training and test feature matrices from the data matrix.
+%
+%   [fea_train, train_targets, fea_test, test_chunk_id] = features(data, prediction_offset)
+%
+%   Inputs
+%     data              - numeric matrix with at least 95 columns. Column 2 is
+%                         the chunk id, columns 4:6 are copied verbatim from the
+%                         first row of each window, columns 7:95 are the lagged
+%                         inputs, and the last 39 of those (57:95) are the targets.
+%     prediction_offset - number of rows between the end of a training window
+%                         and the row whose targets are to be predicted.
+%
+%   Outputs
+%     fea_train     - one row per training window: [data(first,4:6), window(:)'],
+%                     where window = data(first:first+7, 7:95) flattened
+%                     column-major.
+%     train_targets - targets (columns 57:95) taken prediction_offset rows after
+%                     the end of the matching training window.
+%     fea_test      - same layout as fea_train, built from the LAST 8 rows of
+%                     every chunk.
+%     test_chunk_id - row vector with the chunk id of every row of fea_test.
+%
+%   A window is only used when it lies entirely inside one chunk; chunks with
+%   fewer than 8 rows therefore produce no test row.
+
+time_back = 8;                 % rows of history in every window
+static_cols = 4:6;             % copied once per window
+feature_cols = 7:95;           % lagged over the whole window
+target_cols = 95-39+1:95;      % the last 39 columns are predicted
+
+n_rows = size(data,1);
+n_fea = numel(static_cols) + numel(feature_cols)*time_back;
+
+% ---- training windows --------------------------------------------------
+% Last row at which a window plus its target row still fits in the data.
+last_start = n_rows - time_back - prediction_offset + 1;
+fea_train = zeros(max(last_start,0), n_fea);
+train_targets = zeros(max(last_start,0), numel(target_cols));
+
+fea_cnt = 0;
+for first = 1:last_start
+    target_row = first + time_back + prediction_offset - 1;
+    % Window and target must belong to the same chunk.
+    if data(first,2) == data(target_row,2)
+        fea_cnt = fea_cnt + 1;
+        window = data(first:first+time_back-1, feature_cols);
+        fea_train(fea_cnt,:) = [data(first,static_cols), window(:)'];
+        train_targets(fea_cnt,:) = data(target_row, target_cols);
+    end
+end
+
+train_targets = train_targets(1:fea_cnt,:);
+fea_train = fea_train(1:fea_cnt,:);
+
+% ---- test windows ------------------------------------------------------
+% Chunk ends are located over ALL rows, independently of prediction_offset.
+% (Scanning only the training range missed boundaries near the end of data.)
+if n_rows > 0
+    chunk_ends = [find(diff(data(:,2)) ~= 0); n_rows];
+else
+    chunk_ends = [];
+end
+
+fea_test = zeros(numel(chunk_ends), n_fea);
+test_chunk_id = [];
+test_cnt = 0;
+for k = 1:numel(chunk_ends)
+    last = chunk_ends(k);
+    first = last - time_back + 1;
+    % Skip chunks too short to fill a window instead of reading rows that
+    % belong to the previous chunk (or indexing before row 1).
+    if first < 1 || data(first,2) ~= data(last,2)
+        continue
+    end
+    test_cnt = test_cnt + 1;
+    window = data(first:last, feature_cols);
+    fea_test(test_cnt,:) = [data(first,static_cols), window(:)'];
+    test_chunk_id(end+1) = data(last,2); %#ok<AGROW>
+end
+
+fea_test = fea_test(1:test_cnt,:);
36 make_predictions.m
@@ -0,0 +1,36 @@
+function make_predictions()
+% MAKE_PREDICTIONS Train bagged regression trees and write predictions.csv.
+%
+%   For every prediction offset and every target column, a 12-tree TreeBagger
+%   regressor is fitted on the rows whose target is non-negative (missing
+%   targets are encoded as a large negative number by read_data) and applied
+%   to the test windows returned by features.
+%
+%   Output layout: test_predictions has one row per (chunk, offset) pair, with
+%   row index chunk_id*10 - 10 + p for the p-th offset, and one column per
+%   target. Rows for which no prediction could be made are filled with the
+%   per-column median of the rows that were predicted.
+%   NOTE(review): assumes chunk ids are integers in 1..210 - confirm against
+%   the data file.
+
+prediction_offsets = [1 2 3 4 5 10 17 24 48 72];
+n_offsets = numel(prediction_offsets);
+n_chunks = 210;
+n_targets = 39;
+
+data = read_data();
+
+test_predictions = zeros(n_chunks*n_offsets, n_targets);
+% Tracks which output rows actually received a model prediction.
+has_prediction = false(n_chunks*n_offsets, 1);
+
+% Only open a worker pool when none is running, and make sure a pool opened
+% here is closed again even if training fails part-way through.
+if matlabpool('size') == 0
+    matlabpool open 4
+    pool_guard = onCleanup(@() matlabpool('close')); %#ok<NASGU>
+end
+options = statset('UseParallel','always');
+
+for p = 1:n_offsets
+    prediction_offset = prediction_offsets(p);
+    [fea_train, train_targets, fea_test, test_chunk_id] = features(data, prediction_offset);
+    % Output rows addressed by this offset, one per test chunk.
+    rows = test_chunk_id*n_offsets - n_offsets + p;
+    tic
+    for i = 1:size(train_targets,2)
+        disp([p, i])    % progress indicator
+        valid = train_targets(:,i) >= 0;
+        tm = TreeBagger(12, fea_train(valid,:), train_targets(valid,i), ...
+            'method','regression','minleaf',200,'options',options);
+        pred = predict(tm, fea_test);
+        if ~isempty(rows)
+            test_predictions(rows, i) = pred(:);
+        end
+    end
+    has_prediction(rows) = true;
+    toc
+end
+
+% Fill the rows without a prediction. The median is taken over predicted
+% rows only, so the zero placeholders and earlier fills cannot bias it.
+missing = ~has_prediction;
+if any(missing)
+    for j = 1:size(test_predictions,2)
+        test_predictions(missing, j) = median(test_predictions(has_prediction, j));
+    end
+end
+
+dlmwrite('predictions.csv',test_predictions);
26 read_data.m
@@ -0,0 +1,26 @@
+function data = read_data()
+% READ_DATA Load TrainingData.csv from the current folder into a numeric matrix.
+%
+%   data = read_data()
+%
+%   The first line of the file is discarded (presumably the header row).
+%   Every following line contributes one row with 95 columns:
+%     * column 5 holds a quoted weekday name and is encoded as its index in
+%       {"Saturday","Sunday",...,"Friday"} (Saturday = 1, Friday = 7);
+%     * the literal text NA is encoded as -1000000;
+%     * every other field is parsed as a number.
+%
+%   Raises an error if the file cannot be opened or a field is not numeric.
+
+n_cols = 95;
+missing_value = -1000000;
+days = {'"Saturday"','"Sunday"','"Monday"','"Tuesday"','"Wednesday"','"Thursday"','"Friday"'};
+
+fid = fopen('TrainingData.csv');
+if fid == -1
+    error('read_data:cannotOpen', 'Could not open TrainingData.csv');
+end
+% Close the file on every exit path, including errors raised while parsing.
+file_guard = onCleanup(@() fclose(fid)); %#ok<NASGU>
+
+fgetl(fid);     % discard the first line
+
+% Preallocated for the expected file size; grows if needed, trimmed below.
+data = zeros(37821, n_cols);
+row_cnt = 0;
+
+while ~feof(fid)
+    txt = fgetl(fid);
+    if ~ischar(txt)
+        break       % fgetl returns -1 once nothing is left to read
+    end
+    row_cnt = row_cnt + 1;
+    C = strread(txt,'%s','delimiter',',');
+    for i = 1:n_cols
+        if i == 5
+            data(row_cnt,5) = find(strcmp(days,C{5}));
+        elseif strcmp(C{i},'NA')
+            data(row_cnt,i) = missing_value;
+        else
+            % str2double instead of str2num: str2num evaluates its input as
+            % MATLAB code, which is unsafe on text read from a file.
+            value = str2double(C{i});
+            if isnan(value)
+                error('read_data:badField', ...
+                    'Row %d, column %d: cannot parse "%s" as a number', ...
+                    row_cnt, i, C{i});
+            end
+            data(row_cnt,i) = value;
+        end
+    end
+end
+
+% Drop preallocated rows that were never filled.
+data = data(1:row_cnt,:);
+

0 comments on commit 271ed13

Please sign in to comment.
Something went wrong with that request. Please try again.