In [90]:
%matplotlib inline
import os
import pandas as pd
import numpy as np
import datetime as dt
from sklearn import preprocessing
# Directory that holds the competition CSV files (relative to the notebook).
filepath = './data/competition_data/'
# Every entry name in that directory, in whatever order the OS reports them.
files = os.listdir(filepath)
files

['bill_of_materials.csv',
 'components.csv',
 'comp_adaptor.csv',
 'comp_boss.csv',
 'comp_elbow.csv',
 'comp_float.csv',
 'comp_hfl.csv',
 'comp_nut.csv',
 'comp_other.csv',
 'comp_sleeve.csv',
 'comp_straight.csv',
 'comp_tee.csv',
 'comp_threaded.csv',
 'specs.csv',
 'test_set.csv',
 'train_set.csv',
 'tube.csv',
 'tube_end_form.csv',
 'type_component.csv',
 'type_connection.csv',
 'type_end_form.csv']

In [91]:
# Load the bill of materials by name instead of by position in `files`:
# os.listdir() returns entries in arbitrary order, so files[0] is not
# guaranteed to be 'bill_of_materials.csv' on every machine.
billdf = pd.read_csv(filepath + 'bill_of_materials.csv')
billdf.head()

Unnamed: 0,tube_assembly_id,component_id_1,quantity_1,component_id_2,quantity_2,component_id_3,quantity_3,component_id_4,quantity_4,component_id_5,quantity_5,component_id_6,quantity_6,component_id_7,quantity_7,component_id_8,quantity_8
0,TA-00001,C-1622,2,C-1629,2.0,,,,,,,,,,,,
1,TA-00002,C-1312,2,,,,,,,,,,,,,,
2,TA-00003,C-1312,2,,,,,,,,,,,,,,
3,TA-00004,C-1312,2,,,,,,,,,,,,,,
4,TA-00005,C-1624,1,C-1631,1.0,C-1641,1.0,,,,,,,,,,


In [92]:
# Read tube.csv into a DataFrame and preview its first rows.
tube_csv = os.path.join(filepath, 'tube.csv')
tubedf = pd.read_csv(tube_csv)
tubedf.head()

Unnamed: 0,tube_assembly_id,material_id,diameter,wall,length,num_bends,bend_radius,end_a_1x,end_a_2x,end_x_1x,end_x_2x,end_a,end_x,num_boss,num_bracket,other
0,TA-00001,SP-0035,12.7,1.65,164,5,38.1,N,N,N,N,EF-003,EF-003,0,0,0
1,TA-00002,SP-0019,6.35,0.71,137,8,19.05,N,N,N,N,EF-008,EF-008,0,0,0
2,TA-00003,SP-0019,6.35,0.71,127,7,19.05,N,N,N,N,EF-008,EF-008,0,0,0
3,TA-00004,SP-0019,6.35,0.71,137,9,19.05,N,N,N,N,EF-008,EF-008,0,0,0
4,TA-00005,SP-0029,19.05,1.24,109,4,50.8,N,N,N,N,EF-003,EF-003,0,0,0


In [93]:
# Read the train and test quote tables; 'quote_date' is parsed as a date
# column in both so the two frames share the same dtypes.
date_cols = ['quote_date']
traindf = pd.read_csv(os.path.join(filepath, 'train_set.csv'), parse_dates=date_cols)
testdf = pd.read_csv(os.path.join(filepath, 'test_set.csv'), parse_dates=date_cols)

## Create Training Set and Test Set

As stated before, we will first focus on the `bill_of_materials.csv` and `tube.csv` files.

In [94]:
# Attach the bill-of-materials and tube attributes to every quote row.
# how='left' keeps every row of the quote tables: the default inner join
# silently drops quotes whose tube_assembly_id is missing from a lookup
# table, which would lose test rows that still need a prediction.
train = traindf
test = testdf
for lookup in (billdf, tubedf):
    train = pd.merge(train, lookup, on='tube_assembly_id', how='left')
    test = pd.merge(test, lookup, on='tube_assembly_id', how='left')

# With a left join the row count can only change if a lookup table contains
# duplicate tube_assembly_id keys; fail loudly rather than train on
# duplicated quotes.
assert len(train) == len(traindf), 'merge changed the number of train rows'
assert len(test) == len(testdf), 'merge changed the number of test rows'

## Cleaning data frames

- Make a separate vector of `cost` for training.
- Drop the `id` column from test because it looks useless; it is basically the same thing as the index.
- Replace `NaN` values in the `quantity` columns with 0.
- Keep in mind: there are `NaN` values in the `material_id` and `component_id` columns. I did not touch them, as requested.


In [95]:
# Pull the target out as a plain NumPy array, then remove the columns that
# must not be used as features: 'cost' from train and 'id' from test.
cost = train.cost.values
train, test = train.drop(['cost'], axis=1), test.drop(['id'], axis=1)

In [96]:
# Show the column layout and shape of both frames.
# Single-argument print(...) calls are used because they emit the same text
# under Python 2 and Python 3; the bare print statements were a syntax error
# on Python 3.
print('Train columns')
print('')
print(train.columns)
print('\n\n')
print('Test columns')
print(test.columns)
print('\n\n')
# '%s' formatting keeps the "label value" layout without relying on the
# multi-argument print statement.
print('Train shape: %s' % (train.shape,))
print('Test shape: %s' % (test.shape,))

Train columns

Index([u'tube_assembly_id', u'supplier', u'quote_date', u'annual_usage',
       u'min_order_quantity', u'bracket_pricing', u'quantity',
       u'component_id_1', u'quantity_1', u'component_id_2', u'quantity_2',
       u'component_id_3', u'quantity_3', u'component_id_4', u'quantity_4',
       u'component_id_5', u'quantity_5', u'component_id_6', u'quantity_6',
       u'component_id_7', u'quantity_7', u'component_id_8', u'quantity_8',
       u'material_id', u'diameter', u'wall', u'length', u'num_bends',
       u'bend_radius', u'end_a_1x', u'end_a_2x', u'end_x_1x', u'end_x_2x',
       u'end_a', u'end_x', u'num_boss', u'num_bracket', u'other'],
      dtype='object')



Test columns
Index([u'tube_assembly_id', u'supplier', u'quote_date', u'annual_usage',
       u'min_order_quantity', u'bracket_pricing', u'quantity',
       u'component_id_1', u'quantity_1', u'component_id_2', u'quantity_2',
       u'component_id_3', u'quantity_3', u'component_id_4', u'quantity_4',
       u'comp

In [97]:
# Missing quantities mean "component slot not used", so treat them as 0.
# fillna() targets real missing values; .replace('NaN', 0) looks for the
# *string* 'NaN' and leaves actual NaN untouched on current pandas.
# Columns are picked by name ('quantity', 'quantity_1' ... 'quantity_8')
# instead of by position so a change in column order cannot redirect the fill.
quantity_cols = [c for c in train.columns if c.startswith('quantity')]
train[quantity_cols] = train[quantity_cols].fillna(0)
train[quantity_cols].head()

Unnamed: 0,quantity,quantity_1,quantity_2,quantity_3,quantity_4,quantity_5,quantity_6,quantity_7,quantity_8
0,1,2,0,0,0,0,0,0,0
1,2,2,0,0,0,0,0,0,0
2,5,2,0,0,0,0,0,0,0
3,10,2,0,0,0,0,0,0,0
4,25,2,0,0,0,0,0,0,0


In [98]:
# Same cleanup for the test frame: missing quantities become 0.
# fillna() targets real missing values; .replace('NaN', 0) looks for the
# *string* 'NaN' and leaves actual NaN untouched on current pandas.
# Columns are picked by name ('quantity', 'quantity_1' ... 'quantity_8')
# instead of by position so a change in column order cannot redirect the fill.
quantity_cols = [c for c in test.columns if c.startswith('quantity')]
test[quantity_cols] = test[quantity_cols].fillna(0)
test[quantity_cols].head()

Unnamed: 0,quantity,quantity_1,quantity_2,quantity_3,quantity_4,quantity_5,quantity_6,quantity_7,quantity_8
0,1,2,2,0,0,0,0,0,0
1,2,2,2,0,0,0,0,0,0
2,5,2,2,0,0,0,0,0,0
3,10,2,2,0,0,0,0,0,0
4,25,2,2,0,0,0,0,0,0


## Other processing

I found that scikit-learn does not handle string-valued categorical data well; see
[this Kaggle forum thread on handling categorical data with sklearn](https://www.kaggle.com/c/titanic/forums/t/5379/handling-categorical-data-with-sklearn).

I'll use the LabelEncoder method as well. I won't touch the `NaN` or 'quote_date' values. Feel free to use what I did above, which is what you asked for.

In [102]:
# Integer-encode the string-valued columns of both frames.
# `preprocessing` is already imported in the first cell; the previous
# `sklearn.preprocessing.LabelEncoder()` raised NameError because the name
# `sklearn` itself was never bound.
#
# Columns are listed by name. The earlier positional list hit two numeric
# columns (index 22 = quantity_8, index 28 = bend_radius) and skipped the
# string column material_id (index 23).
categorical_cols = (
    ['tube_assembly_id', 'supplier', 'bracket_pricing']
    + ['component_id_%d' % i for i in range(1, 9)]
    + ['material_id',
       'end_a_1x', 'end_a_2x', 'end_x_1x', 'end_x_2x',
       'end_a', 'end_x']
)

for col in categorical_cols:
    # Cast to str so missing values become the single label 'nan' and get one
    # consistent code of their own (LabelEncoder output has no way to keep a
    # missing value as missing).
    train_labels = train[col].astype(str)
    test_labels = test[col].astype(str)

    # Fit once on the union of both frames: fitting separately on train and
    # test assigns different integers to the same label, which makes the
    # encoded test features meaningless to a model trained on train.
    le = preprocessing.LabelEncoder()
    le.fit(pd.concat([train_labels, test_labels]))

    train[col] = le.transform(train_labels)
    test[col] = le.transform(test_labels)

In [105]:
# Sanity check after encoding: number of distinct component_id_1 codes and
# the overall shape of the training frame.
# print(...) with one argument behaves the same on Python 2 and Python 3.
print(len(train['component_id_1'].unique()))
print(train.shape)

643
(30213, 38)


In [1]:
# From Bharat analysis (check txt file)
# NOTE(review): the meaning of these counts is not stated in this notebook;
# confirm against the referenced txt file.

x = [4219,3236,3085,2973,2819,875,602,571,540,436,402,360]
# The first five counts, compared below against the full total.
sumfall = x[:5]
# print(...) with one argument works on both Python 2 and Python 3; the bare
# print statements were a syntax error on Python 3.
print(sum(x))
print(sum(sumfall))
# float() forces true division on Python 2 as well.
print(float(sum(sumfall))/sum(x))

# NOTE(review): 18660 and 19800 are unexplained constants; presumably taken
# from the same analysis, so verify.
print(18660/19800.0)

20118
16332
0.811810319117
0.942424242424
