-
Notifications
You must be signed in to change notification settings - Fork 3
/
MatchTrainingToTestSet.py
56 lines (39 loc) · 2.16 KB
/
MatchTrainingToTestSet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
__author__ = 'carrillo'
####################################
# Selects a subset of training examples to match the test set.
# Performs a nearest-neighbour search within the training set for each test sample.
####################################
import pandas as pd
from sklearn.neighbors import KDTree
# Opens the data and replaces Depth nominal to numeric value if necessary.
def getData( fileName ):
    """Load a CSV file and encode the nominal soil-depth labels as numbers.

    Every cell equal to 'Topsoil' becomes 0 and every cell equal to
    'Subsoil' becomes 1, in whichever column it occurs; all other values
    are left untouched.

    Args:
        fileName: Path of the CSV file (or any other source accepted by
            ``pandas.read_csv``). The first row is read as the header.

    Returns:
        pandas.DataFrame holding the file content with the labels replaced.
    """
    data = pd.read_csv(fileName, header=0)
    # One pass with a value mapping instead of two in-place scans of the frame.
    data = data.replace({'Topsoil': 0, 'Subsoil': 1})
    # Newer pandas releases no longer downcast after replace(), which would
    # leave the converted column as dtype object; re-infer so it is numeric.
    return data.infer_objects()
# Drops ID values in test and training and target values in training sets.
def extractFeatures( set ):
    """Return a copy of a sample table reduced to its feature columns.

    The "PIDN" identifier column is always removed. If the table has a "P"
    column, the columns "Ca", "P", "pH", "SOC" and "Sand" are removed as
    well. The input frame itself is not modified.

    Args:
        set: pandas.DataFrame containing a "PIDN" column. (The name shadows
            the builtin ``set`` inside this function only; it is kept so
            existing keyword callers keep working.)

    Returns:
        pandas.DataFrame without the identifier and target columns.

    Raises:
        KeyError: If "PIDN" is missing, or if "P" is present but one of the
            other four target columns is not.
    """
    # axis must be passed by keyword: pandas 2.0 removed the positional form.
    features = set.drop("PIDN", axis=1)
    if "P" in set:
        features = features.drop(["Ca", "P", "pH", "SOC", "Sand"], axis=1)
    return features
# Finds the n nearest neighbors of the training set for each test sample. Returns the indices of the nearest neighbor set.
def findNearestNeighborsOfTestInTrain( test, train, n_neighbors ):
    """Collect the training rows that are nearest neighbours of any test row.

    Both tables are reduced to their feature columns with
    ``extractFeatures``; a KD-tree with Euclidean metric is built on the
    training features and queried with every test row.

    Args:
        test: pandas.DataFrame of test samples.
        train: pandas.DataFrame of training samples.
            NOTE(review): the feature columns of both tables presumably have
            to match in number and order for the distances to be meaningful;
            confirm against the input files.
        n_neighbors: Number of neighbours requested per test row. Capped at
            the number of training rows.

    Returns:
        set of int: row positions in ``train`` (as indexed by the KD-tree)
        of all retrieved neighbours, with duplicates collapsed.
    """
    testFeatures = extractFeatures(test)
    trainFeatures = extractFeatures(train)
    # KDTree.query raises ValueError when k exceeds the number of indexed
    # points, so never ask for more neighbours than there are training rows.
    k = min(n_neighbors, len(trainFeatures))
    kdt = KDTree(trainFeatures, leaf_size=30, metric='euclidean')
    trainIndices = kdt.query(testFeatures, k=k, return_distance=False)
    # The query result has one row of neighbour indices per test sample;
    # flatten it and deduplicate.
    return set(trainIndices.flatten().tolist())
n_neighbors = 10 # Number of training neighbours retrieved for each test sample.

# Load data.
# NOTE: `id` shadows the builtin of the same name; the name is kept unchanged
# so that anything relying on this module-level variable keeps working.
id = "DecaySubtractedPrincipalComponents"
train = getData('/Users/carrillo/workspace/Kaggle/resources/AfSIS/training' + id + '.csv')
test = getData('/Users/carrillo/workspace/Kaggle/resources/AfSIS/test' + id + '.csv')

# Get indices of matching training samples and subset training set.
trainIndex = findNearestNeighborsOfTestInTrain( test, train, n_neighbors )
# A set has no defined iteration order; sorting keeps the selected rows in
# their original file order and makes the output reproducible.
trainSubset = train.iloc[ sorted( trainIndex ), : ]

# Write to file.
# A single parenthesised argument is valid both as a Python 2 print statement
# and as a Python 3 print() call.
print( 'Reduced %d training samples to %d by matching with test set.' % ( len( train ), len( trainSubset ) ) )
trainSubset.to_csv('/Users/carrillo/workspace/Kaggle/resources/AfSIS/training' + id + '_matchedWithTestSet.csv',header=True,index=False)
# The test set is written back out with its depth labels encoded numerically.
test.to_csv('/Users/carrillo/workspace/Kaggle/resources/AfSIS/test' + id + '_DepthNumeric.csv',header=True,index=False)