In [87]:
%matplotlib inline
import os, glob, sys
import pandas as pd
import numpy as np
import datetime as dt
from math import sqrt
from collections import Counter
import copy

In [493]:
allfiles = glob.glob(os.path.join('..','data\competition_data','*.csv'))
comp_files = glob.glob(os.path.join('..','data\competition_data','comp_*'))
newtrain_df = pd.read_csv(os.path.join('..','train.csv'),index_col = 0)

all_files = {}
for afile in allfiles:
    key = afile.split('\\')[-1].split('.')[0]
    all_files[key] = pd.read_csv(afile, header = 0)
all_files['train_set'] = pd.read_csv('../data/competition_data/train_set.csv', header = 0, parse_dates = ['quote_date'])
all_files['test_set'] = pd.read_csv('../data/competition_data/test_set.csv', header = 0, parse_dates = ['quote_date'])

# Components
comp_dict = {}
for compfile in comp_files:
    key = compfile.split('\\')[-1].split('.')[0]
    comp_dict[key] = pd.read_csv(compfile,header=0, index_col=0)

# The rest of the files
rest = ['components.csv','specs.csv','tube_end_form.csv','type_component.csv','type_connection.csv','type_end_form.csv']
restfile = []
for x in rest:
    restfile += [os.path.join('..','data\competition_data',x)]
rest_files = {}
for r in restfile:
    key = r.split('\\')[-1].split('.')[0]
    rest_files[key] = pd.read_csv(r ,header=0)


In [472]:
train = pd.merge(all_files['train_set'],all_files['bill_of_materials'], on = 'tube_assembly_id')
test = pd.merge(all_files['test_set'],all_files['bill_of_materials'], on = 'tube_assembly_id')
train = pd.merge(train,all_files['tube'], on = 'tube_assembly_id')
test = pd.merge(test,all_files['tube'], on = 'tube_assembly_id')

train['year'] = train['quote_date'].dt.year
train['month'] = train['quote_date'].dt.month
train['day'] = [date.days for date in train['quote_date'] - dt.date(1982,9,22)]

test['year'] = test['quote_date'].dt.year
test['month'] = test['quote_date'].dt.month
test['day'] = [date.days for date in test['quote_date'] - dt.date(1985,11,16)]


comp_id = ['component_id_'+str(i) for i in range(1,9)]
weight_id = ['weight_id_'+str(i) for i in range(1,9)]

# Add weight to train
for key1 in comp_id:
    i = comp_id.index(key1)
    weight = []
    for key2 in sorted(train[key1].unique(),reverse = True):
        for filename in comp_dict:
            if key2 in comp_dict[filename].index:
                weight.append(comp_dict[filename].loc[key2]['weight'])
    else:
        weight.append(np.nan)
    dfTemp = pd.DataFrame({key1 : sorted(train[key1].unique(),reverse = True),weight_id[i]: weight})
    train = pd.merge(train,dfTemp, how = 'left', on = key1)
train = train[['tube_assembly_id', 'supplier', 'year','month','day', 'annual_usage', 'min_order_quantity', 'bracket_pricing', 'quantity', 'cost', 'component_id_1', 'quantity_1','weight_id_1', 'component_id_2', 'quantity_2', 'weight_id_2', 'component_id_3', 'quantity_3', 'weight_id_3', 'component_id_4', 'quantity_4','weight_id_4', 'component_id_5', 'quantity_5', 'weight_id_5', 'component_id_6', 'quantity_6', 'weight_id_6', 'component_id_7', 'quantity_7', 'weight_id_7', 'component_id_8', 'quantity_8', 'weight_id_8', 'material_id', 'diameter', 'wall', 'length', 'num_bends', 'bend_radius', 'end_a_1x', 'end_a_2x', 'end_x_1x', 'end_x_2x', 'end_a', 'end_x', 'num_boss', 'num_bracket', 'other']]

# Add weight to test


for key in comp_id:  
    j = comp_id.index(key)
    temp = pd.DataFrame({comp_id[j]: []})
    for filename in comp_dict:
        for key2 in sorted(test[comp_id[j]].unique(),reverse = True):
            i = sorted(test[comp_id[j]].unique(),reverse = True).index(key2)
            if key2 in comp_dict[filename].index:
                temp.loc[i,comp_id[j]] = key2
                temp.loc[i,weight_id[j]] = comp_dict[filename].loc[key2]['weight']
    test = pd.merge(test,temp,how = 'left',on = comp_id[j]) 
test = test[['id', 'tube_assembly_id', 'supplier', 'year','month','day', 'annual_usage', 'min_order_quantity', 'bracket_pricing', 'quantity', 'component_id_1', 'quantity_1', 'weight_id_1', 'component_id_2', 'quantity_2', 'weight_id_2', 'component_id_3', 'quantity_3', 'weight_id_3', 'component_id_4', 'quantity_4', 'weight_id_4', 'component_id_5', 'quantity_5', 'weight_id_5', 'component_id_6', 'quantity_6', 'weight_id_6', 'component_id_7', 'quantity_7', 'component_id_8', 'quantity_8', 'material_id', 'diameter', 'wall', 'length', 'num_bends', 'bend_radius', 'end_a_1x', 'end_a_2x', 'end_x_1x', 'end_x_2x', 'end_a', 'end_x', 'num_boss', 'num_bracket', 'other']]

In [473]:
print list(train.columns)
print
print list(test.columns)

['tube_assembly_id', 'supplier', 'year', 'month', 'day', 'annual_usage', 'min_order_quantity', 'bracket_pricing', 'quantity', 'cost', 'component_id_1', 'quantity_1', 'weight_id_1', 'component_id_2', 'quantity_2', 'weight_id_2', 'component_id_3', 'quantity_3', 'weight_id_3', 'component_id_4', 'quantity_4', 'weight_id_4', 'component_id_5', 'quantity_5', 'weight_id_5', 'component_id_6', 'quantity_6', 'weight_id_6', 'component_id_7', 'quantity_7', 'weight_id_7', 'component_id_8', 'quantity_8', 'weight_id_8', 'material_id', 'diameter', 'wall', 'length', 'num_bends', 'bend_radius', 'end_a_1x', 'end_a_2x', 'end_x_1x', 'end_x_2x', 'end_a', 'end_x', 'num_boss', 'num_bracket', 'other']

['id', 'tube_assembly_id', 'supplier', 'year', 'month', 'day', 'annual_usage', 'min_order_quantity', 'bracket_pricing', 'quantity', 'component_id_1', 'quantity_1', 'weight_id_1', 'component_id_2', 'quantity_2', 'weight_id_2', 'component_id_3', 'quantity_3', 'weight_id_3', 'component_id_4', 'quantity_4', 'weight_i

**Add component columns to each `component_id_*`**

In [474]:
# TRAIN

def comp_column(name):
    if name not in ['component_id_'+str(i) for i in range(1,9)]:
        return "Error"
    else:
        temp = pd.DataFrame({name: []})
        for filename in comp_dict:
            tempdict = {x: [] for x in comp_dict[filename].drop('weight',axis = 1).columns}
            for key in sorted(train[name].unique(),reverse = True):
                i = sorted(train[name].unique(),reverse = True).index(key)
                if key in comp_dict[filename].index:
                    for col in tempdict.keys():
                        temp.loc[i,name] = key
                        temp.loc[i,col] = comp_dict[filename].drop('weight',axis = 1).loc[key][col]
        
        # Dropping any pure NaN columns
        tempcol = temp.columns.copy()
        dropcol = []
        for x in tempcol:
            if sum(pd.isnull(temp[x])) == len(temp[x]):
                dropcol.append(x)
        temp = temp.drop(dropcol,axis=1)
        
        # rename columns for number
        c = name.split('_')[-1]
        temp.columns = [temp.columns[i] + '_comp' + c if i > 0 else temp.columns[i] for i in range(len(temp.columns))]
        return temp
    
def mergeframe(comp_num):
    global train
    
    # Call comp_column()
    comp = 'component_id_'+str(comp_num)
    merge = comp_column(comp)
       
    train = pd.merge(train,merge, how = 'left', on = comp)
    startIndex = list(train.columns).index('weight_id_'+str(comp_num))
    x = list(train.columns.copy())
    newColOrder = []
    for i in range(len(x)):
        if i <= startIndex:
            newColOrder += [x[i]]
        elif i > startIndex and i < startIndex+len(merge.columns):
            newColOrder += [x[-len(merge.columns)+i-startIndex]]
        else:
            newColOrder += [x[i-len(merge.columns)+1]]
    return train[newColOrder]  

# TEST

def comp_columnTest(name):
    if name not in ['component_id_'+str(i) for i in range(1,9)]:
        return "Error"
    else:
        temp = pd.DataFrame({name: []})
        for filename in comp_dict:
            tempdict = {x: [] for x in comp_dict[filename].drop('weight',axis = 1).columns}
            for key in sorted(test[name].unique(),reverse = True):
                i = sorted(test[name].unique(),reverse = True).index(key)
                if key in comp_dict[filename].index:
                    for col in tempdict.keys():
                        temp.loc[i,name] = key
                        temp.loc[i,col] = comp_dict[filename].drop('weight',axis = 1).loc[key][col]
        
        # Dropping any pure NaN columns
        tempcol = temp.columns.copy()
        dropcol = []
        for x in tempcol:
            if sum(pd.isnull(temp[x])) == len(temp[x]):
                dropcol.append(x)
        temp = temp.drop(dropcol,axis=1)
        
        # rename columns for number
        c = name.split('_')[-1]
        temp.columns = [temp.columns[i] + '_comp' + c if i > 0 else temp.columns[i] for i in range(len(temp.columns))]
        return temp
    
def mergeframetest(comp_num):
    global test
    
    # Call comp_column()
    comp = 'component_id_'+str(comp_num)
    merge = comp_columnTest(comp)
       
    test = pd.merge(test,merge, how = 'left', on = comp)
    startIndex = list(test.columns).index('weight_id_'+str(comp_num))
    x = list(test.columns.copy())
    newColOrder = []
    for i in range(len(x)):
        if i <= startIndex:
            newColOrder += [x[i]]
        elif i > startIndex and i < startIndex+len(merge.columns):
            newColOrder += [x[-len(merge.columns)+i-startIndex]]
        else:
            newColOrder += [x[i-len(merge.columns)+1]]
    return test[newColOrder]       
            

            
# comp_column('component_id_7')
#
# comp_columnTest('component_id_6')

In [475]:
for i in range(1,7):
    test = mergeframetest(i)
    
for i in range(1,9):
    train = mergeframe(i)

In [476]:
print '# of train columns:', len(train.columns)
print '# of test columns:',  len(test.columns)

# of train columns: 310
# of test columns: 312


**Checking for full null columns in test**

Any fully null column will be dropped

In [477]:
print 'test'
droptestcol = []
for x in test.columns:
    if sum(pd.isnull(test[x])) == len(test[x]):
        droptestcol += [x]
        print x
print
print 'train'
droptraincol = []
for x in train.columns:
    if sum(pd.isnull(train[x])) == len(train[x]):
        droptraincol += [x]
        print x
test = test.drop(droptestcol, axis = 1)
train = train.drop(droptraincol, axis = 1)
print '# of train columns:', len(train.columns)
print '# of test columns:',  len(test.columns)

test
component_id_7
quantity_7
component_id_8
quantity_8

train
# of train columns: 310
# of test columns: 308


**TUBE_END_FORM** incorporation

In [596]:
def tube_end(col):
    global train
    train2 = train.copy()
    re = rest_files['tube_end_form'].copy()
    tem = []
    if col not in ['end_a','end_x']:
        return 'Error'
    else:
        for i in train2[col]:
            if i == 'NONE':
                tem += [np.nan]
            else:
                t = re[re['end_form_id'] == i]['forming']
                tem += [t.reset_index()['forming'][0]]
    train[col] = tem
    return train

def tube_endTest(col):
    global test
    test2 = test.copy()
    re = rest_files['tube_end_form'].copy()
    tem = []
    if col not in ['end_a','end_x']:
        return 'Error'
    else:
        for i in test2[col]:
            if i == 'NONE':
                tem += [np.nan]
            else:
                t = re[re['end_form_id'] == i]['forming']
                tem += [t.reset_index()['forming'][0]]
    test[col] = tem
    return test


train = tube_end('end_a')
train = tube_end('end_x')

test = tube_endTest('end_a')
test = tube_endTest('end_x')

**TYPE_END_FORM** incorporation

In [661]:
endformtype = train.columns[train.columns.str.contains('end_form')]
endformtypeTest = test.columns[test.columns.str.contains('end_form')]

def type_end(col):
    train2 = train.copy()
    re = rest_files['type_end_form'].copy()
    typeEnd = []
    if col not in endformtype:
        return 'Error'
    else:
        for i in train2[col]:
            if i == 'NONE' or i == np.nan or pd.isnull(i):
                typeEnd += [np.nan]
            else:
                t = re[re['end_form_id'] == i]['name']
                typeEnd += [t.reset_index()['name'][0]]
    return typeEnd


def type_endTest(col):
    test2 = test.copy()
    re = rest_files['type_end_form'].copy()
    typeEnd = []
    if col not in endformtypeTest:
        return 'Error'
    else:
        for i in test2[col]:
            if i == 'NONE' or i == np.nan or pd.isnull(i):
                typeEnd += [np.nan]
            else:
                t = re[re['end_form_id'] == i]['name']
                typeEnd += [t.reset_index()['name'][0]]
    return typeEnd


for i in endformtype:
    train[i] = type_end(i)
for i in endformtypeTest:
    test[i] = type_endTest(i)

IndexError: index out of bounds

In [665]:
train.to_csv('train.csv', index = False)
test.to_csv('test.csv', index = False)

In [600]:
test[['end_a_1x','end_a_2x','end_x_1x','end_x_2x','end_a','end_x']].head()

Unnamed: 0,end_a_1x,end_a_2x,end_x_1x,end_x_2x,end_a,end_x
0,N,N,N,N,No,No
1,N,N,N,N,No,No
2,N,N,N,N,No,No
3,N,N,N,N,No,No
4,N,N,N,N,No,No


In [603]:
rest_files.keys()

['tube_end_form',
 'type_connection',
 'components',
 'type_end_form',
 'specs',
 'type_component']

In [604]:
rest_files['type_end_form'].head()

Unnamed: 0,end_form_id,name
0,A-001,Male (Stud)
1,A-002,Male (Swivel)
2,A-003,Braze-Weld Boss
3,A-004,Braze-Weld Socket
4,A-005,Swivel Nut


In [663]:
for i in range(len(endformtypeTest)):
    print sum(pd.notnull(test[endformtypeTest[i]]))


7683
7683
8
948
948
12
5210
5210
18
85
85
5
11
11
6
6


In [656]:
for i in range(len(endformtype)):
    print sum(pd.notnull(train[endformtype[i]]))

7630
7630
1058
1058
16
5320
5320
4
25
49
49
4
5
5
8
8


## Test mergeframe

In [329]:
comp_dict.keys()

['comp_sleeve',
 'comp_nut',
 'comp_adaptor',
 'comp_elbow',
 'comp_other',
 'comp_threaded',
 'comp_hfl',
 'comp_straight',
 'comp_boss',
 'comp_float',
 'comp_tee']

In [313]:
def mergeframeTEST(comp_num):
    train2 = copy.deepcopy(train)
    
    # Call comp_column()
    comp = 'component_id_'+str(comp_num)
    merge = comp_column(comp)
       
    train2 = pd.merge(train2,merge, how = 'left', on = comp)
    startIndex = list(train2.columns).index('weight_id_'+str(comp_num))
    x = list(train2.columns.copy())
    newColOrder = []
    for i in range(len(x)):
        if i <= startIndex:
            newColOrder += [x[i]]
        elif i > startIndex and i < startIndex+len(merge.columns):
            newColOrder += [x[-len(merge.columns)+i-startIndex]]
        else:
            newColOrder += [x[i-len(merge.columns)+1]]
    return newColOrder 

In [305]:
train2 = copy.deepcopy(train)
merge = comp_column('component_id_6')
print len(merge.columns)
train2 = pd.merge(train2,merge, how = 'left', on = 'component_id_6')
t = list(train2.columns).index('weight_id_6')
x = list(train2.columns.copy())
l = []
for i in range(len(x)):
    if i <= t:
        l += [x[i]]
    elif i > t and i < t+len(merge.columns):
        l += [x[-len(merge.columns)+i-t]]
    else:
        l += [x[i-len(merge.columns)+1]]

print len(x)
print len(l)
l, x
# [x[i] if i < t and i > t+len(merge.columns) else x[-1*(len(merge.columns)-i)] for i in range(len(x))]

21
weight_id_6
46
67
67


(['tube_assembly_id',
  'supplier',
  'quote_date',
  'annual_usage',
  'min_order_quantity',
  'bracket_pricing',
  'quantity',
  'cost',
  'component_id_1',
  'quantity_1',
  'weight_id_1',
  'component_id_2',
  'quantity_2',
  'weight_id_2',
  'component_id_3',
  'quantity_3',
  'weight_id_3',
  'component_id_4',
  'quantity_4',
  'weight_id_4',
  'component_id_5',
  'quantity_5',
  'weight_id_5',
  'component_id_6',
  'quantity_6',
  'weight_id_6',
  'part_name_comp6',
  'orientation_comp6',
  'nominal_size_2_comp6',
  'adaptor_angle_comp6',
  'component_type_id_comp6',
  'connection_type_id_1_comp6',
  'thread_size_1_comp6',
  'end_form_id_1_comp6',
  'end_form_id_2_comp6',
  'unique_feature_comp6',
  'length_1_comp6',
  'length_2_comp6',
  'thread_pitch_1_comp6',
  'height_over_tube_comp6',
  'groove_comp6',
  'connection_type_id_comp6',
  'base_type_comp6',
  'outside_shape_comp6',
  'type_comp6',
  'base_diameter_comp6',
  'component_id_7',
  'quantity_7',
  'weight_id_7',
  'c

### Examining the merge process for different `component_id_*`

I explored `component_id_6,7,8` and they seem to work well

In [253]:
train2 = copy.deepcopy(train)
print len(train2.iloc[:,0])
train2 = pd.merge(train2,comp_column('component_id_7'), how = 'left', on = 'component_id_7')
print train2[pd.notnull(train['component_id_7'])].iloc[:,-10:]
print len(train2.iloc[:,0])

30213
      end_a_1x end_a_2x end_x_1x end_x_2x   end_a   end_x  num_boss  \
6179         N        N        N        N  EF-021  EF-021         0   
6180         N        N        N        N  EF-021  EF-021         0   
6181         N        N        N        N  EF-021  EF-021         0   
6533         N        N        Y        Y  EF-023  EF-023         0   
17706        N        N        Y        Y  EF-023  EF-023         1   
17707        N        N        Y        Y  EF-023  EF-023         1   
17708        N        N        Y        Y  EF-023  EF-023         1   
18896        N        Y        N        Y  EF-023  EF-023         0   

       num_bracket  other part_name_c7  
6179             0      0        PLATE  
6180             0      0        PLATE  
6181             0      0        PLATE  
6533             0      6        PLATE  
17706            0      2        PLATE  
17707            0      2        PLATE  
17708            0      2        PLATE  
18896            0      8 

### Testing a function to see how each `component_id_*` gets a data frame associated with it

Looking to build a function `comp_column(name)` that will take care of this for me. This is my scratch work for testing the function and seeing if it is working properly. Combining dictionaries didn't work well because the alignment between different columns is off - especially when in `component_id_6` C-2005 doesn't have a `part_name`, but it gets assigned one when concatenating the dictionaries together, which is bad. So I opted for a bruteforce approach of building each row one by one (3 nested for-loops, ugly).

In [250]:
test = pd.DataFrame({'component_id_6': []})
print sorted(train['component_id_6'].unique(),reverse = True)
for filename in comp_dict:
    tempdict = {x: [] for x in comp_dict[filename].drop('weight',axis = 1).columns}
    for key in sorted(train['component_id_6'].unique(),reverse = True):
        i = sorted(train['component_id_6'].unique(),reverse = True).index(key)
        if key in comp_dict[filename].index:
            for col in tempdict.keys():
                # test[i,col] = comp_dict[filename].drop('weight',axis = 1).loc[key][col]
                # tempdict[col].append(comp_dict[filename].drop('weight',axis = 1).loc[key][col])
                test.loc[i,'component_id_6'] = key
                test.loc[i,col] = comp_dict[filename].drop('weight',axis = 1).loc[key][col]
                print key, col, comp_dict[filename].drop('weight',axis = 1).loc[key][col]
    #if len(tempdict[tempdict.keys()[0]])>0:
        #tempframe = pd.DataFrame(tempdict)   
        #test = pd.concat([test,tempframe],axis=1)
                # .append(comp_dict[filename].loc[key][col])
# train2 = pd.DataFrame({key1 : sorted(train[key1].unique(),reverse = True),weight_id[i]: weight})
# print tempdict
# print test
tempcol = test.columns.copy()
drop = []
for x in tempcol:
    if sum(pd.isnull(test[x])) == len(test[x]):
        drop.append(x)
test = test.drop(drop,axis=1)
c = 'component_id_6'.split('_')[-1]
test.columns = [test.columns[i] + '_c' + c if i > 0 else test.columns[i] for i in range(len(test.columns))]
print test.iloc[:,:10]
print test.iloc[:,10:]

['C-2005', 'C-1920', 'C-1560', 'C-0981', 'C-0967', 'C-0965', 'C-0934', 'C-0663', 'C-0378', 'C-0353', nan]
C-1920 part_name PLATE
C-1560 part_name FLANGE
C-0981 part_name PLATE
C-0967 part_name BRACKET
C-0934 part_name PLATE
C-0663 part_name TUBE
C-0378 part_name VALVE BODY
C-0353 part_name PIPE
C-2005 orientation Yes
C-2005 nominal_size_3 nan
C-2005 nominal_size_2 19.05
C-2005 nominal_size_1 nan
C-2005 nominal_size_4 nan
C-2005 adaptor_angle 90.0
C-2005 component_type_id CP-015
C-2005 hex_size nan
C-2005 thread_size_4 nan
C-2005 connection_type_id_1 B-002
C-2005 connection_type_id_2 nan
C-2005 connection_type_id_3 nan
C-2005 connection_type_id_4 nan
C-2005 thread_size_1 1.187
C-2005 thread_size_2 nan
C-2005 thread_size_3 nan
C-2005 end_form_id_4 nan
C-2005 end_form_id_1 A-001
C-2005 end_form_id_2 A-004
C-2005 end_form_id_3 nan
C-2005 overall_length nan
C-2005 unique_feature Yes
C-2005 length_1 37.3
C-2005 length_3 nan
C-2005 length_2 37.3
C-2005 length_4 nan
C-2005 thread_pitch_4 nan
C

### testing if the merge works properly

testing this script for adding the `weight` column

In [100]:
i = comp_id.index('component_id_6')
print sorted(train['component_id_6'].unique(), reverse = True)
weight = []
for key2 in sorted(train['component_id_6'].unique(), reverse = True):
    for filename in comp_dict:
        if key2 in comp_dict[filename].index:
            weight.append(comp_dict[filename].loc[key2]['weight'])
            print filename, key2, comp_dict[filename].loc[key2]['weight']
            print 
else:
    weight.append(np.nan)
train3 = pd.DataFrame({'component_id_6': sorted(train['component_id_6'].unique(), reverse = True),
                       'weight': weight})
train3.head(20)


['C-2005', 'C-1920', 'C-1560', 'C-0981', 'C-0967', 'C-0965', 'C-0934', 'C-0663', 'C-0378', 'C-0353', nan]
comp_threaded C-2005 0.326

comp_other C-1920 0.15

comp_other C-1560 6.9

comp_other C-0981 0.524

comp_other C-0967 1.138

comp_boss C-0965 0.5

comp_other C-0934 0.701

comp_other C-0663 3.771

comp_other C-0378 0.094

comp_other C-0353 0.326



Unnamed: 0,component_id_6,weight
0,C-2005,0.326
1,C-1920,0.15
2,C-1560,6.9
3,C-0981,0.524
4,C-0967,1.138
5,C-0965,0.5
6,C-0934,0.701
7,C-0663,3.771
8,C-0378,0.094
9,C-0353,0.326


In [438]:
# testing on test

test2 = test.copy()
print test2.columns
comp_id = ['component_id_'+str(i) for i in range(1,9)]
weight_id = ['weight_id_'+str(i) for i in range(1,9)]
temp = pd.DataFrame({'component_id_6': []})
for filename in comp_dict:
    for key2 in sorted(test2['component_id_6'].unique(),reverse = True):
        i = sorted(test2['component_id_6'].unique(),reverse = True).index(key2)
        if key2 in comp_dict[filename].index:
            print filename, key2
            temp.loc[i,'component_id_6'] = key2
            temp.loc[i,weight_id[5]] = comp_dict[filename].loc[key2]['weight']
test2 = pd.merge(test2,temp,how = 'left',on = 'component_id_6')

print test2.columns
print temp

Index([u'id', u'tube_assembly_id', u'supplier', u'quote_date', u'annual_usage',
       u'min_order_quantity', u'bracket_pricing', u'quantity',
       u'component_id_1', u'quantity_1', u'component_id_2', u'quantity_2',
       u'component_id_3', u'quantity_3', u'component_id_4', u'quantity_4',
       u'component_id_5', u'quantity_5', u'component_id_6', u'quantity_6',
       u'component_id_7', u'quantity_7', u'component_id_8', u'quantity_8',
       u'material_id', u'diameter', u'wall', u'length', u'num_bends',
       u'bend_radius', u'end_a_1x', u'end_a_2x', u'end_x_1x', u'end_x_2x',
       u'end_a', u'end_x', u'num_boss', u'num_bracket', u'other'],
      dtype='object')
comp_other C-1049
comp_other C-0557
comp_other C-0279
comp_other C-0176
comp_threaded C-1639
comp_straight C-2021
comp_boss C-1065
comp_boss C-0965
Index([u'id', u'tube_assembly_id', u'supplier', u'quote_date', u'annual_usage',
       u'min_order_quantity', u'bracket_pricing', u'quantity',
       u'component_id_1', u'quan

### Examining Jingyi's files

In [22]:
training1 = pd.read_csv("../train.csv", index_col = 0, header = 0)
trainning2 = pd.read_csv("../train4.csv", index_col = 0, header = 0)
print training2.columns

Index([u'tube_assembly_id', u'supplier', u'bracket_pricing', u'quantity',
       u'cost', u'component_id_1_name', u'component_1_weight', u'quantity_1',
       u'component_id_2_name', u'component_2_weight', u'quantity_2',
       u'component_id_3_name', u'component_3_weight', u'quantity_3',
       u'material_id', u'diameter', u'wall', u'length', u'num_bends',
       u'bend_radius', u'end_a_2x', u'end_x_2x', u'end_a', u'end_x', u'year',
       u'month', u'day'],
      dtype='object')
