In [None]:
# %load dataclean.py
import os, glob
from collections import defaultdict
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import make_scorer

class PruneLabelEncoder(LabelEncoder):
    """LabelEncoder variant that folds infrequent labels into one shared code.

    ``fit`` learns the usual label -> integer mapping and counts how often each
    integer code occurs in the fitted series.  ``transform`` then rewrites every
    code whose count was below ``cutoff`` to a single representative code,
    ``low_cnt_target``.
    """

    def __init__(self):
        # The original spelling ``__init___`` (three trailing underscores) was
        # an ordinary, never-called method rather than the constructor.
        super(PruneLabelEncoder, self).__init__()

    def fit(self, series, cutoff=10):
        """Fit the encoder and record per-code frequencies.

        series -- the values to encode
        cutoff -- codes seen fewer than this many times are treated as rare

        Returns ``self`` so calls can be chained (the original returned None).
        """
        self.cutoff = cutoff
        # Generate the transformation classes and also the map for low output munging
        super(PruneLabelEncoder, self).fit(series)
        trans_series = super(PruneLabelEncoder, self).transform(series)
        self.val_count_map = defaultdict(int)
        for i in trans_series:
            self.val_count_map[i] += 1
        # Reset on every fit so a refit never keeps a target from earlier data.
        self.low_cnt_target = None
        # identify the first key with low frequency and use it for all low freq vals
        for key, val in self.val_count_map.items():
            if val < self.cutoff:
                self.low_cnt_target = key
                break
        return self

    def transform(self, series):
        """Encode ``series`` and replace every rare code with ``low_cnt_target``.

        Returns the array produced by the parent ``transform`` after the rare
        codes have been overwritten in it.
        """
        trans_series = super(PruneLabelEncoder, self).transform(series)
        if self.low_cnt_target is None:
            # No code fell below the cutoff during fit, so nothing to merge.
            return trans_series
        # Transform all the low frequency keys into the low frequency target key
        for key, val in self.val_count_map.items():
            if val < self.cutoff:
                trans_series[trans_series==key] = self.low_cnt_target
        return trans_series


def whichcolumns(df, percentnull = 0.9):
    """Return the names of the columns whose share of null values is high.

    df          -- DataFrame to inspect
    percentnull -- inclusive threshold on the fraction of null entries
                   (default 0.9, i.e. 90% or more null)

    The names are returned as a list, in column order.
    """
    return [name for name in df.columns
            if np.mean(pd.isnull(df[name])) >= percentnull]


def dropcolumns(df, percentnull = 0.9):
    """Return a copy of ``df`` without its mostly-null columns.

    df          -- DataFrame to prune; it is not modified
    percentnull -- inclusive threshold on the fraction of null entries above
                   which a column is removed (default 0.9)

    All qualifying columns are collected first and removed with a single
    ``drop`` call, so the frame is rebuilt once instead of once per column.
    A new DataFrame is returned even when no column qualifies.
    """
    sparse = [name for name in df.columns
              if np.mean(pd.isnull(df[name])) >= percentnull]
    return df.drop(sparse, axis = 1)



def encode(df, TRANSFORM_CUTOFF):
    '''
    Takes in a dataframe and a cutoff value for bucketing encoding values

    If the frequency of an encoded value is below the cutoff, it will bucket 
    everything to the first value it encounters that is below the cutoff value

    Returns a new, encoded dataframe; the dataframe passed in is left
    untouched.  If any column has fewer than 2 unique values, the string
    'Error: Fewer than 2 unique values in a column' is returned instead.
    '''
    # Checking if there are 2 or more unique values in each column
    for x in df.columns:
        if len(df[x].unique()) < 2:
            return 'Error: Fewer than 2 unique values in a column'

    # Work on a copy so the caller's frame is never modified (the original made
    # a copy but then wrote the encoded columns back into ``df`` itself).
    encoded = df.copy()

    for col in encoded.columns:
        # A column is label-encoded when any of its unique values is a string.
        # Looking only at the second unique value missed columns where that
        # particular value happened to be NaN.
        if any(isinstance(value, str) for value in encoded[col].unique()):
            le = PruneLabelEncoder()
            le.fit(encoded[col],TRANSFORM_CUTOFF)
            encoded[col] = le.transform(encoded[col])

    return encoded


In [3]:
# Read traintest.csv into a DataFrame; header = 0 takes the column names from the first row.
traintest = pd.read_csv('traintest.csv', header = 0)

  data = self._reader.read(nrows)


In [4]:
# Show the positions of 'component_id_1' and 'part_name_comp8' in traintest's column list.
column_names = list(traintest.columns)
print("First index: %d" % column_names.index('component_id_1'))
print("Last index: %d" % column_names.index('part_name_comp8'))

First index: 10
Last index: 319


In [7]:
# Names of the traintest columns that are at least 99.9% null.
col999 = whichcolumns(traintest, 0.999)
print(col999)

['blind_hole_comp1', 'diameter_comp1', 'mj_plug_class_code_comp1', 'plug_diameter_comp1', 'connection_type_id_3_comp1', 'thread_size_3_comp1', 'end_form_id_3_comp1', 'length_3_comp1', 'thread_pitch_3_comp1', 'blind_hole_comp2', 'diameter_comp2', 'mj_plug_class_code_comp2', 'plug_diameter_comp2', 'nominal_size_3_comp2', 'connection_type_id_3_comp2', 'thread_size_3_comp2', 'end_form_id_3_comp2', 'length_3_comp2', 'thread_pitch_3_comp2', 'blind_hole_comp3', 'diameter_comp3', 'seat_angle_comp3', 'mj_plug_class_code_comp3', 'plug_diameter_comp3', 'nominal_size_3_comp3', 'thread_size_4_comp3', 'connection_type_id_3_comp3', 'connection_type_id_4_comp3', 'thread_size_3_comp3', 'end_form_id_4_comp3', 'end_form_id_3_comp3', 'length_3_comp3', 'length_4_comp3', 'thread_pitch_4_comp3', 'thread_pitch_3_comp3', 'hose_diameter_comp3', 'material_comp3', 'corresponding_shell_comp3', 'coupling_class_comp3', 'head_diameter_comp3', 'base_diameter_comp3', 'shoulder_diameter_comp3', 'thread_pitch_comp4', 'he

In [8]:
# Number of column names collected in col999.
len(col999)

144

In [9]:
# Remove the columns that are 99% or more null, then list the columns that remain.
drop99 = dropcolumns(traintest, 0.99)
drop99.columns.tolist()

['tube_assembly_id',
 'supplier',
 'year',
 'month',
 'day',
 'annual_usage',
 'min_order_quantity',
 'bracket_pricing',
 'quantity',
 'cost',
 'component_id_1',
 'quantity_1',
 'weight_id_1',
 'component_type_id_comp1',
 'plating_comp1',
 'orientation_comp1',
 'intended_nut_thread_comp1',
 'unique_feature_comp1',
 'connection_type_id_comp1',
 'intended_nut_pitch_comp1',
 'length_comp1',
 'seat_angle_comp1',
 'thread_pitch_comp1',
 'hex_nut_size_comp1',
 'thread_size_comp1',
 'end_form_id_2_comp1',
 'overall_length_comp1',
 'end_form_id_1_comp1',
 'nominal_size_2_comp1',
 'nominal_size_1_comp1',
 'thread_pitch_1_comp1',
 'thread_pitch_2_comp1',
 'connection_type_id_1_comp1',
 'connection_type_id_2_comp1',
 'thread_size_1_comp1',
 'thread_size_2_comp1',
 'hex_size_comp1',
 'bolt_pattern_wide_comp1',
 'thickness_comp1',
 'bolt_pattern_long_comp1',
 'groove_comp1',
 'part_name_comp1',
 'height_over_tube_comp1',
 'base_type_comp1',
 'outside_shape_comp1',
 'type_comp1',
 'component_id_2',


In [10]:
# Column names component_id_1 .. component_id_4.
comp_id = ['component_id_%d' % n for n in range(1, 5)]
comp_id

['component_id_1', 'component_id_2', 'component_id_3', 'component_id_4']

In [16]:
# Columns of drop99 whose name contains 'component_type'.
type_mask = drop99.columns.str.contains('component_type')
comp_type_id = list(drop99.columns[type_mask])
comp_type_id

['component_type_id_comp1',
 'component_type_id_comp2',
 'component_type_id_comp3',
 'component_type_id_comp4']

In [19]:
# Distinct values found in the first component id column of drop99.
drop99[comp_id[0]].unique()

array(['NUT-FLARED', 'NUT-SWIVEL', 'BOSS', 'COLLAR', 'NUT-INV FLARED',
       'NUT', 'ELBOW', nan, 'ADAPTER-O SUPPL', 'FLANGE', 'ADAPTER',
       'ADAPTER-O LINE', 'SLEEVE-FITTING', 'CONNECTOR-SEAL', 'FITTING-A/C',
       'HEAD-FLANGED', 'NUT-A/C', 'FLANGE-ONE PIEC', 'CONNECTOR-SPL',
       'FITTING-NUT', 'PLATE', 'FLANGE-BRAZE', 'ELBOW-90 DEG',
       'ADAPTER-O DRAIN', 'PLUG', 'STUD-WELD', 'CONNECTOR',
       'ADAPTER-DRAIN', 'WASHER', 'CONNECTOR-WELD', 'ELBOW-WATER',
       'ELBOW-AFTCLR', 'ADAPTER-STR', 'COLLAR-O-RING', 'FITTING',
       'COUPLING-PIPE', 'SCREEN', 'LUG', 'CLIP', 'BOSS-OUTLET',
       'SLEEVE-ORFS', 'FLANGE-ONE PIECE', 'BOSS-FITTING', 'COUPLET-WELD',
       'BLOCK', 'MANIFOLD', 'TUBE', 'NUT-WELD', 'ADAPTER-INLET',
       'CONNECTOR-FLARE', 'SEAL-O-RING', 'BAR', 'HEAD-FLANGE',
       'ELBOW-HYDRAULIC', 'COUPLING AS', 'TUBE AS-O SUPPL',
       'WASHER-FUEL LIN', 'NUT-45 (DEG)', 'TUBE AS', 'FLANGE-WELD',
       'CAP-A/C', 'COUPLET', 'SCREEN AS.', 'CLEAT', 'ORIFICE',
  

In [20]:
# Distinct values found in the first component type column of drop99.
drop99[comp_type_id[0]].unique()

array(['Straight Adapter', 'ORFS Nut', 'Threaded Boss', nan,
       'JIC 37 - 45 Nut', '2-bolt Braze/Weld Elbow',
       '2-bolt Braze/Weld Straight', 'Sleeves', 'Threaded Straight',
       'MJ Flange Head', '4-bolt Braze/Weld Straight',
       '4-bolt Braze/Weld Elbow', 'Threaded Elbow', 'MJ Plug',
       '4-bolt Tig Straight', '4-bolt MJ Straight', '2-bolt Boss',
       'Weld On Nut', 'Braze/Weld Flange Head', '2-bolt MJ Straight',
       '4-bolt Tig Elbow', '4-bolt MJ Elbow',
       'Free-Floating 4-bolt Captive Flange', '4-bolt Boss',
       'Free-Floating 4-bolt Spacer', 'Threaded Tee'], dtype=object)

In [23]:
# Remove the 'tube_assembly_id' column from drop99.
# The result is bound back to the same name, so running this cell a second
# time looks for a column that has already been removed.
drop99 = drop99.drop('tube_assembly_id',axis = 1)

In [27]:
# Run encode() on drop99 with a frequency cutoff of 10; the result replaces drop99.
drop99 = encode(drop99, 10)

  flag = np.concatenate(([True], aux[1:] != aux[:-1]))
  return aux[:-1][aux[1:] == aux[:-1]]


In [30]:
# Preview the first rows of the component id columns after encoding.
drop99[comp_id].head()

Unnamed: 0,component_id_1,component_id_2,component_id_3,component_id_4
0,105,0,0,0
1,105,0,0,0
2,105,0,0,0
3,105,0,0,0
4,105,0,0,0


In [32]:
# Write drop99 to traintest99drop.csv; index = False leaves the row index out of the file.
drop99.to_csv('traintest99drop.csv', index = False)

In [33]:
# Remove the columns that are 99.9% or more null, then remove 'tube_assembly_id'.
drop999 = dropcolumns(traintest, percentnull = 0.999).drop('tube_assembly_id', axis = 1)

In [35]:
# Number of columns left in drop999.
drop999.shape[1]

200

In [37]:
# Write drop999 to traintest999drop.csv; index = False leaves the row index out of the file.
drop999.to_csv('traintest999drop.csv',index = False)

In [38]:
# drop 95% or more NULL values and save to csv
drop95 = dropcolumns(traintest, percentnull = 0.95)
drop95 = drop95.drop('tube_assembly_id',axis = 1)
# index = False keeps the row index out of the file, matching the other
# traintest*drop.csv exports; axis = 1 is not a to_csv option.
drop95.to_csv('traintest95drop.csv', index = False)

In [39]:
# drop 90% or more NULL values and save to csv
drop90 = dropcolumns(traintest, percentnull = 0.90)
drop90 = drop90.drop('tube_assembly_id',axis = 1)
# index = False keeps the row index out of the file, matching the other
# traintest*drop.csv exports; axis = 1 is not a to_csv option.
drop90.to_csv('traintest90drop.csv', index = False)

In [43]:
# Print the position of each component_id_<n> column (n = 1..8) in traintest's column list.
column_names = list(traintest.columns)
for n in range(1, 9):
    print("id%d: %d" % (n, column_names.index('component_id_%d' % n)))

id1: 10
id2: 65
id3: 121
id4: 186
id5: 242
id6: 283
id7: 312
id8: 316


In [52]:
# Build, for components 1 through 7, a Series giving the fraction of null
# values in each of that component's columns of traintest.
# compdict maps 'compN' to the slice of traintest holding that component's
# columns; df maps 'compN' to the matching Series of null fractions.

traintest = pd.read_csv('traintest.csv', header = 0)

# Positional column ranges, one per component.
comp1 = traintest.iloc[:, 10:65]
comp2 = traintest.iloc[:, 65:121]
comp3 = traintest.iloc[:, 121:186]
comp4 = traintest.iloc[:, 186:242]
comp5 = traintest.iloc[:, 242:283]
comp6 = traintest.iloc[:, 283:312]
comp7 = traintest.iloc[:, 312:316]

dfcomp = [comp1, comp2, comp3, comp4, comp5, comp6, comp7]
component = ['comp%d' % i for i in range(1, 8)]
compdict = dict(zip(component, dfcomp))

df = {}
for comp in component:
    frame = compdict[comp]
    tempdict = pd.Series(
        dict((x, np.mean(pd.isnull(frame[x]))) for x in frame.columns))
    df[comp] = tempdict
    

In [53]:
# Component names for which df holds a Series of null fractions.
df.keys()

['comp2', 'comp3', 'comp1', 'comp6', 'comp7', 'comp4', 'comp5']

In [56]:
# Fraction of null values in each comp1 column.
df['comp1']

adaptor_angle_comp1           0.993862
base_diameter_comp1           0.994326
base_type_comp1               0.989479
blind_hole_comp1              0.999090
bolt_pattern_long_comp1       0.974044
bolt_pattern_wide_comp1       0.985872
component_id_1                0.048223
component_type_id_comp1       0.081674
connection_type_id_1_comp1    0.750298
connection_type_id_2_comp1    0.820391
connection_type_id_3_comp1    0.999868
connection_type_id_comp1      0.956971
diameter_comp1                0.999090
drop_length_comp1             0.992738
elbow_angle_comp1             0.992754
end_form_id_1_comp1           0.746675
end_form_id_2_comp1           0.746675
end_form_id_3_comp1           0.999868
extension_length_comp1        0.992738
groove_comp1                  0.955731
head_diameter_comp1           0.992655
height_over_tube_comp1        0.989214
hex_nut_size_comp1            0.412652
hex_size_comp1                0.753937
intended_nut_pitch_comp1      0.967757
intended_nut_thread_comp1

In [47]:
# Fraction of null values in the 'weight_id_1' column of the comp1 slice.
compdict['comp1']['weight_id_1'].isnull().mean()

0.050539306511381686

In [49]:
# Display the contents of tempdict as a Series.
# NOTE(review): this cell's execution count (49) is lower than that of the cell
# that assigns tempdict (52) -- confirm it still shows the intended data when
# the notebook is run top to bottom on a fresh kernel.
pd.Series(tempdict)

adaptor_angle_comp1           0.993862
base_diameter_comp1           0.994326
base_type_comp1               0.989479
blind_hole_comp1              0.999090
bolt_pattern_long_comp1       0.974044
bolt_pattern_wide_comp1       0.985872
component_id_1                0.048223
component_type_id_comp1       0.081674
connection_type_id_1_comp1    0.750298
connection_type_id_2_comp1    0.820391
connection_type_id_3_comp1    0.999868
connection_type_id_comp1      0.956971
diameter_comp1                0.999090
drop_length_comp1             0.992738
elbow_angle_comp1             0.992754
end_form_id_1_comp1           0.746675
end_form_id_2_comp1           0.746675
end_form_id_3_comp1           0.999868
extension_length_comp1        0.992738
groove_comp1                  0.955731
head_diameter_comp1           0.992655
height_over_tube_comp1        0.989214
hex_nut_size_comp1            0.412652
hex_size_comp1                0.753937
intended_nut_pitch_comp1      0.967757
intended_nut_thread_comp1