In [256]:
# NOTE(review): %pylab populates the interactive namespace from numpy and
# matplotlib (see the message it prints). numpy is never imported explicitly
# in the cells visible here, so this appears to be where the `np` name used by
# a later cell (np.inf) comes from -- keep that in mind before replacing it.
%pylab inline

from collections import defaultdict
import pandas as pd

from soln.dataset import get_augmented_train_and_test_set
from soln.dataset import get_component_info_df
from soln.dataset import load_raw_components

# None removes the column limit, so wide DataFrames display every column.
pd.set_option('display.max_columns', None)

Populating the interactive namespace from numpy and matplotlib


In [257]:
# Load the raw component data; the loader returns three objects that are
# unpacked here and reused by later cells.
comp_types, group_dfs, cluster_dfs = load_raw_components()
# group_dfs is keyed by component group name; peek at the columns of the
# 'other' group (component_id, part_name, weight in the recorded output).
group_dfs['other'].columns

Index([u'component_id', u'part_name', u'weight'], dtype='object')

In [192]:
# Look for columns that occur in multiple groups.
#
# Written to run unchanged on Python 2 and Python 3: no tuple-unpacking
# lambda parameters, no dict.iteritems(), and only single-argument print().

# Map each column name to the list of group names whose frame contains it.
col_to_groups = defaultdict(list)
for group_name, df in group_dfs.items():
    for col in df.columns:
        col_to_groups[col].append(group_name)

# (column, groups) pairs, most widely shared columns first. sorted() is
# stable, so columns shared by the same number of groups keep dict order.
col_groups = sorted(
    col_to_groups.items(),
    key=lambda item: len(item[1]),
    reverse=True,
)

# Total number of groups, then one line per column:
# name, number of groups containing it, and the group names.
print(len(group_dfs))
for col, groups in col_groups:
    print('%s %d %s' % (col, len(groups), groups))

11
weight 11 ['threaded', 'sleeve', 'adaptor', 'nut', 'float', 'tee', 'boss', 'other', 'hfl', 'elbow', 'straight']
component_id 11 ['threaded', 'sleeve', 'adaptor', 'nut', 'float', 'tee', 'boss', 'other', 'hfl', 'elbow', 'straight']
orientation 10 ['threaded', 'sleeve', 'adaptor', 'nut', 'float', 'tee', 'boss', 'hfl', 'elbow', 'straight']
component_type_id 10 ['threaded', 'sleeve', 'adaptor', 'nut', 'float', 'tee', 'boss', 'hfl', 'elbow', 'straight']
unique_feature 7 ['threaded', 'sleeve', 'adaptor', 'tee', 'boss', 'elbow', 'straight']
bolt_pattern_wide 5 ['float', 'tee', 'boss', 'elbow', 'straight']
overall_length 5 ['threaded', 'adaptor', 'tee', 'elbow', 'straight']
bolt_pattern_long 5 ['float', 'tee', 'boss', 'elbow', 'straight']
thickness 4 ['float', 'tee', 'elbow', 'straight']
groove 4 ['tee', 'boss', 'elbow', 'straight']
mj_class_code 3 ['tee', 'elbow', 'straight']
nominal_size_2 2 ['threaded', 'adaptor']
nominal_size_1 2 ['threaded', 'adaptor']
adaptor_angle 2 ['threaded', 'adap

In [258]:
# Build the combined component-info frame from the three raw inputs loaded
# earlier; later cells use it as the master list of components.
cinfo_df = get_component_info_df(comp_types, group_dfs, cluster_dfs)
# Summarise columns, dtypes and non-null counts (the recorded output reports
# 2047 entries and 48 columns, many of them sparsely populated).
cinfo_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2047 entries, 0 to 2046
Data columns (total 48 columns):
adaptor_angle          55 non-null float64
base_diameter          57 non-null float64
base_type              124 non-null object
blind_hole             23 non-null object
bolt_pattern_long      2047 non-null float64
bolt_pattern_wide      2047 non-null float64
component_group_id     2047 non-null object
component_id           2047 non-null object
component_type_id      2047 non-null object
corresponding_shell    6 non-null object
coupling_class         6 non-null object
diameter               23 non-null float64
drop_length            175 non-null float64
elbow_angle            130 non-null float64
extension_length       174 non-null float64
groove                 2047 non-null bool
head_diameter          70 non-null float64
height_over_tube       147 non-null float64
hex_nut_size           42 non-null float64
hex_size               129 non-null float64
hose_diameter          6 no

In [195]:
# Load the augmented train and test sets. %time reports how long the call
# takes (about 14 s wall time in the recorded run), so expect a pause here.
%time aug_train_set, aug_test_set = get_augmented_train_and_test_set()

CPU times: user 13.6 s, sys: 72 ms, total: 13.7 s
Wall time: 13.9 s


In [211]:
from soln.utils import count_components

# Per-component counts for each split. The helper's generic 'count' column is
# renamed per split so the two can sit side by side after merging.
# NOTE(review): presumably count_components returns one row per component_id
# with a 'count' column -- confirm against soln.utils.
train_counts = count_components(aug_train_set, cinfo_df).rename(
    columns={'count': 'train_count'})
test_counts = count_components(aug_test_set, cinfo_df).rename(
    columns={'count': 'test_count'})

# Component identifiers joined with their train and test counts on
# component_id (default inner join, as before).
all_counts = (
    cinfo_df[['component_id', 'component_type_id', 'component_group_id']]
    .merge(train_counts, on='component_id')
    .merge(test_counts, on='component_id')
)

In [212]:
# Preview only the first rows instead of dumping the whole frame.
all_counts.head(10)

Unnamed: 0,component_id,component_type_id,component_group_id,train_count,test_count
0,C-0007,CP-014,threaded,13,7
1,C-0030,CP-015,threaded,0,0
2,C-0041,CP-014,threaded,0,0
3,C-0043,CP-014,threaded,0,0
4,C-0044,CP-014,threaded,0,0
5,C-0069,CP-015,threaded,0,0
6,C-0070,CP-015,threaded,0,0
7,C-0072,CP-015,threaded,0,0
8,C-0073,CP-015,threaded,0,0
9,C-0074,CP-014,threaded,0,0


In [214]:
# How many components exist in total, how many are used only by the training
# set, and how many only by the test set. Single-argument print() keeps the
# cell valid on both Python 2 and Python 3.
train_only = (all_counts.train_count > 0) & (all_counts.test_count == 0)
test_only = (all_counts.train_count == 0) & (all_counts.test_count > 0)

print(len(all_counts))
# Summing a boolean mask counts its True entries, which equals the length of
# the filtered frame without materialising it.
print(int(train_only.sum()))
print(int(test_only.sum()))

2047
520
482


In [223]:
# Inclusive (min, max) count ranges; a component's bin is the index of the
# range its count falls in.
# NOTE: neighbouring ranges share an endpoint (5, 10, 20, 50 and 100 each
# satisfy two ranges). Ranges are applied in order and a later match
# overwrites an earlier one, so those counts land in the higher bin
# (e.g. 5 -> bin 3, 10 -> bin 4).
bins = [(0, 0), (1, 1), (2, 5), (5, 10), (10, 20), (20, 50), (50, 100), (100, np.inf)]


def assign_count_bins(counts, bins):
    """Return the bin index of every value in ``counts``.

    Parameters
    ----------
    counts : pandas.Series
        Numeric counts to classify.
    bins : sequence of (min, max) pairs
        Inclusive ranges, applied in order; when a value satisfies more than
        one range, the last matching range wins.

    Returns
    -------
    pandas.Series
        Integer bin indices sharing the index of ``counts``; values matching
        no range are -1.
    """
    bin_index = pd.Series(-1, index=counts.index)
    for i, (cmin, cmax) in enumerate(bins):
        in_range = (counts >= cmin) & (counts <= cmax)
        # Assign through a single .loc call on the Series itself. The previous
        # form (frame.column.loc[mask] = i) was chained assignment, which
        # pandas does not guarantee will write back to the frame.
        bin_index.loc[in_range] = i
    return bin_index


all_counts['train_bin'] = assign_count_bins(all_counts.train_count, bins)
all_counts['test_bin'] = assign_count_bins(all_counts.test_count, bins)

In [240]:
# Cross-tabulate components by (train_bin, test_bin).
grouped = all_counts.groupby(['train_bin', 'test_bin'])
# size() counts the rows in each pair; unstack() pivots test_bin into columns,
# giving a train_bin x test_bin table. Pairs with no components have no entry
# and show up as NaN in the recorded output.
df = grouped.size().unstack()
df

test_bin,0,1,2,3,4,5,6,7
train_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,346.0,399.0,81.0,2.0,,,,
1,407.0,140.0,83.0,9.0,,,,
2,111.0,94.0,112.0,38.0,2.0,,,
3,2.0,5.0,43.0,37.0,8.0,,,
4,,,,13.0,41.0,2.0,,
5,,,,,9.0,23.0,1.0,
6,,,,,,3.0,6.0,
7,,,,,,,,30.0


In [242]:
# Break down, by component group, the components that appear in only one of
# the two splits. Single-argument print() keeps the cell valid on both
# Python 2 and Python 3.

# Used by the training set but never by the test set.
in_train_not_test = all_counts[(all_counts.train_count > 0) & (all_counts.test_count == 0)]
print(in_train_not_test.component_group_id.value_counts())

# Used by the test set but never by the training set.
in_test_not_train = all_counts[(all_counts.train_count == 0) & (all_counts.test_count > 0)]
print(in_test_not_train.component_group_id.value_counts())

other       313
straight     70
elbow        36
boss         36
threaded     31
adaptor      13
nut           9
float         7
hfl           3
sleeve        1
tee           1
dtype: int64
other       301
straight     77
elbow        35
boss         27
threaded     20
nut           8
sleeve        4
adaptor       4
hfl           2
tee           2
float         2
dtype: int64


In [243]:
# The 'other' group has a part_name column rather than structured attributes
# (its columns are component_id, part_name, weight), so look at how often
# each part name occurs, most frequent first.
df_other = group_dfs['other']
df_other.part_name.value_counts()

FLANGE              158
PLATE                94
TUBE                 91
ADAPTER              53
BOSS                 41
ELBOW                34
BLOCK                27
BRACKET              27
TUBE AS              27
FITTING              12
CLIP                 10
ORIFICE              10
CONNECTOR-WELD        9
NUT-WELD              8
TUBE AS.              8
WASHER                8
HEAD-FLANGED          8
NUT-A/C               7
CONNECTOR             7
ADAPTER-OIL LIN       6
NUT                   6
LUG                   6
SPACER                6
COUPLING AS           6
ELBOW-HYDRAULIC       6
SEAL-O-RING           6
PIPE                  5
RING                  5
COUPLING              5
NUT-FUEL INJ          5
                   ... 
PLUG-PIPE             1
TUBE AS-RH            1
TUBE AS-O SUPPL       1
ADAPTER RING          1
SHEET                 1
COUPLING-PIPE         1
SLEEVE-REDUCING       1
HASP AS               1
BLOCK CONNECTION      1
ELBOW-AIR (CRS)       1
CAP-ORFS        

In [260]:
# Distribution of cluster labels across components. dropna=False keeps the
# NaN bucket, i.e. components with no cluster assigned (1686 of them in the
# recorded output, where every named cluster is a straight_clu_* label).
cinfo_df.cluster.value_counts(dropna=False)

NaN                1686
straight_clu_16      45
straight_clu_13      36
straight_clu_23      29
straight_clu_1       26
straight_clu_15      26
straight_clu_12      25
straight_clu_10      25
straight_clu_21      25
straight_clu_20      19
straight_clu_6       17
straight_clu_19      13
straight_clu_2        9
straight_clu_17       8
straight_clu_25       8
straight_clu_24       8
straight_clu_3        8
straight_clu_18       7
straight_clu_14       7
straight_clu_11       5
straight_clu_5        5
straight_clu_4        4
straight_clu_8        2
straight_clu_9        2
straight_clu_7        1
straight_clu_22       1
dtype: int64