In [1]:
%matplotlib inline
import os, glob
from collections import defaultdict
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import make_scorer

import matplotlib.pyplot as plt
from IPython.display import display

In [49]:
train = pd.read_csv(os.path.join('..','notebooks','traintest.csv'),header=0)
#train.append(test, ignore_index = True)
names =  train.columns.values
#print names
#train.index

In [157]:
class PruneLabelEncoder(LabelEncoder):
    def __init___(self):
        super(PruneLabelEncoder, self).__init__()
    def fit(self, series, cutoff=10):
        self.cutoff = cutoff
        # Generate the transformation classes and also the map for low output munging 
        super(PruneLabelEncoder, self).fit(series)
        trans_series = super(PruneLabelEncoder, self).transform(series)
        self.val_count_map = defaultdict(int)
        for i in trans_series:
            self.val_count_map[i] += 1
        # identify the first key with low frequency and use it for all low freq vals
        for key, val in self.val_count_map.items():
            if val < self.cutoff:
                self.low_cnt_target = key
                break
    def transform(self, series):
        trans_series = super(PruneLabelEncoder, self).transform(series)
        # Transform all the low frequency keys into the low frequency target key
        for key, val in self.val_count_map.items():
            if val < self.cutoff:
                trans_series[trans_series==key] = self.low_cnt_target
        return trans_series
    
def whichcolumns(df, percentnull = 0.9):
# Returns a list of which columns have greater than or equal to 90% null values
    col = []
    for x in df.columns:
        if np.mean(pd.isnull(df[x])) >= percentnull:
            col += [x]
    return col
    
    
def dropcolumns(df, percentnull = 0.9):
# function to drop columns with 90% NaN values or user can specify percentage
    for x in df.columns:
        if np.mean(pd.isnull(df[x])) >= percentnull:
            df = df.drop(x,axis = 1)
    return df

def encode(df, TRANSFORM_CUTOFF):
    temp = df.copy()

    # Checking if there are 2 or more unique values in each column
    for x in train.columns:
        if len(train[x].unique()) < 2:
            return 'Error: Fewer than 2 unique values in a column'
    
    for col in df.columns:
        if type(df[col].unique()[1]) == str:
            le = PruneLabelEncoder()
            le.fit(df[col],TRANSFORM_CUTOFF)
            df[col] = le.transform(df[col])
            
    return df


In [141]:


c = train.loc[:,['component_id_3','cost','component_id_1','weight_id_1','bracket_pricing']].copy()
train = encode(train, 1)

In [139]:
c = train['nominal_size_1_comp1'].copy()
print "# of 'See Drawing' observations:", len(c[c == 'See Drawing'])

for x in range(len(c)):
    if pd.isnull(c[x]):
        continue
    else:
        try:
            c[x] = float(c[x])
        except:
            c[x] = np.nan

train['nominal_size_1_comp1'] = c
print train['nominal_size_1_comp1'].unique()


# of 'See Drawing' observations: 21
[nan 12.7 9.52 3.18 15.88 34.0 25.4 27.2 7.94 19.05 4.76 22.22 38.1 17.14
 17.47]


In [138]:
c = train.loc[:,'nominal_size_1_comp1'].copy()
print "# of 'See Drawing' observations:", len(c[c == 'See Drawing'])

for x in range(len(c)):
    if pd.isnull(c[x]):
        continue
    else:
        try:
            c[x] = float(c[x])
        except:
            c[x] = np.nan
        
print c.unique()

# of 'See Drawing' observations: 21
[nan 12.7 9.52 3.18 15.88 34.0 25.4 27.2 7.94 19.05 4.76 22.22 38.1 17.14
 17.47]


### Testing looping function for encoding variables

In [163]:
def whichcolumns(df, percentnull = 0.9):
# Returns a list of which columns have greater than or equal to 90% null values
    col = []
    for x in df.columns:
        if np.mean(pd.isnull(df[x])) >= percentnull:
            col += [x]
    return col

def dropcolumns(df, percentnull = 0.9):
    for x in df.columns:
        if np.mean(pd.isnull(df[x])) >= percentnull:
            df = df.drop(x,axis = 1)
    return df

c = train.loc[:,['component_id_3','cost','component_id_1','weight_id_1','bracket_pricing','nominal_size_1_comp1']].copy()
# dropcolumns(c).head()
whichcolumns(train, 0.9999)

['nominal_size_3_comp2',
 'diameter_comp3',
 'thread_size_4_comp3',
 'thread_size_3_comp3',
 'length_4_comp3',
 'thread_pitch_4_comp3',
 'thread_pitch_3_comp3',
 'nominal_size_1_comp4',
 'elbow_angle_comp4',
 'thread_size_3_comp4',
 'thread_pitch_3_comp4',
 'hose_diameter_comp4',
 'shoulder_diameter_comp4',
 'nominal_size_2_comp5',
 'nominal_size_1_comp5',
 'length_1_comp5',
 'adaptor_angle_comp5',
 'length_2_comp5',
 'extension_length_comp5',
 'drop_length_comp5',
 'head_diameter_comp5',
 'hex_size_comp6',
 'overall_length_comp6',
 'bolt_pattern_wide_comp6',
 'thickness_comp6',
 'bolt_pattern_long_comp6',
 'height_over_tube_comp6',
 'base_diameter_comp6',
 'quantity_8',
 'weight_id_8']

In [150]:
c = train.loc[:,'nominal_size_1_comp1'].copy()
np.mean(pd.isnull(c))
# list(np.unique(c))

0.94345553202752774

In [None]:
## Testing the encode function

train2 = train.copy()
c = train.loc[:,['component_id_3','cost','component_id_1','weight_id_1','bracket_pricing']].copy()
train3 = encode(train2, 1)

In [None]:
## Testing

t = train.iloc[:,1].copy()
le = PruneLabelEncoder()
le.fit(t, 50)
s = le.transform(t)
print np.bincount(s)
print np.unique(s)

In [124]:
c = train.loc[:,['component_id_3','cost','component_id_1','weight_id_1','bracket_pricing']].copy()

for x in c.columns:
    if type(c[x].unique()[1]) == str:
        le2 = PruneLabelEncoder()
        le2.fit(c[x],1)
        c[x] = le2.transform(c[x])

c.head()        
#print c2[:15]
#print c[:15]
#print type(c.unique()[1]) == str

Unnamed: 0,component_id_3,cost,component_id_1,weight_id_1,bracket_pricing
0,0,21.905933,105,0.009,1
1,0,12.341214,105,0.009,1
2,0,6.601826,105,0.009,1
3,0,4.68777,105,0.009,1
4,0,3.541561,105,0.009,1


In [130]:
c = train.loc[:,'nominal_size_1_comp1'].copy()
print "# of 'See Drawing' observations:", len(c[c == 'See Drawing'])
#le2 = PruneLabelEncoder()
#le2.fit(c,1)
#c2 = le2.transform(c)
#print c2[:15]
#print c[:15]
for x in range(len(c)):
    if pd.isnull(c[x]):
        continue
    else:
        try:
            c[x] = float(c[x])
        except:
            c[x] = np.nan
        
print c.unique()
#print type(c.unique()[1]) == str

# of 'See Drawing' observations: 21
[nan 12.7 9.52 3.18 15.88 34.0 25.4 27.2 7.94 19.05 4.76 22.22 38.1 17.14
 17.47]


In [119]:
c = train.copy()

for col in c.columns:
    if (c[col]) == str

    for x in range(len(c)):
        if pd.isnull(c[x]):
            continue
        else:
            try:
                c[x] = float(c[x])
            except:
                c[x] = np.nan

TypeError: 'Int64Index' object is not callable

In [69]:

d = train.loc[:,'bolt_pattern_wide_comp1'].copy()
d.unique()

array([    nan,   79.38,   22.2 ,   44.45,   88.9 ,   22.22,   26.19,
         30.2 ,   22.23,   77.8 ,   27.8 ,   61.93,   36.5 ,   35.7 ,
         69.9 ,   92.1 ,   42.  ,   69.92,  120.  ,   50.8 ,   26.2 ,
         31.75,   30.18,   27.76,   61.9 ,   42.9 ,   69.85,  110.  ,
         35.71,   42.88,   95.4 ,   31.8 ,   23.8 ,   17.  ])

## Checking if any column is all null or fewer than 2 unique values

In [77]:
e = train.copy()
count = 0
for x in train.columns:
    if pd.isnull(train[x]).all():
        print x
        count += 1
print count

0


In [78]:
count = 0
for x in train.columns:
    if len(train[x].unique()) < 2:
        print x
        count += 1
print count

0


In [80]:
count = 0
for x in train.columns:
    if len(train[x].unique()) == 2:
        print train[x].unique()
        count += 1
print count

['Yes' 'No']
[nan 'No']
[nan 'No']
[ nan  90.]
[nan 'MJ-006']
[nan 'ORFS-SAE J1453']
[ nan   1.]
[nan 'Male (Stud)']
[  nan  35.8]
[ nan  14.]
[nan 'Round']
[nan 'No']
[nan 'No']
[ nan  37.]
[ nan  90.]
[   nan  15.88]
[nan 'ORFS-SAE J1453']
[ nan  12.]
[nan 'No']
[nan 'No']
[ nan  37.]
[ nan  90.]
[ nan  90.]
[   nan  1.187]
[nan 'ORFS-SAE J1453']
[nan 'ORFS-SAE J1453']
[   nan  1.187]
[nan 'Male (Stud)']
[  nan  41.7]
[ nan  12.]
[ nan  12.]
[nan 'SP-0095']
[ nan  90.]
[nan 'ORFS-SAE J1453']
[ nan  90.]
[   nan  15.88]
[nan 'ORFS-SAE J1453']
[   nan  0.812]
[ nan  16.]
[ nan  4.8]
[nan 'SP-0016']
[nan 'C-0855']
[nan 'SP-0098']
[  nan  9.52]
[   nan  31.75]
[  nan  46.7]
[ nan  90.]
[  nan  44.5]
[  nan  53.1]
[nan 'MJ-003']
[nan 'Threaded']
[  nan  25.5]
[nan 'Yes']
[ nan  4.8]
[nan 'SP-0098']
[  nan  50.8]
[nan 'Saddle']
[ nan  90.]
[  nan  22.2]
[nan 'ORFS-SAE J1453']
[nan 'Male (Stud)']
[nan 'Braze-Weld Socket']
[  nan  24.7]
[  nan  37.3]
[  nan  37.3]
[nan 'MJ-003']
[  nan  36.5

In [71]:
pd.isnull(train['bolt_pattern_wide_comp1']).all()

False

In [133]:
lbl_id = []
n = len(train.index)
A = np.zeros((n,1),int)
for i in range(0,n-1):
    A[i] = i     
print A


[[    0]
 [    1]
 [    2]
 ..., 
 [60445]
 [60446]
 [    0]]


In [134]:
for i in range(1,n-1):
    print i
    if isinstance(train.iloc[:,i].values[0],int):
        names_id = 'lbl_'+names[i]+'_id'
        lbl_id.append(names_id)  
        names_trans = train.iloc[:,i].values
        names_trans = np.reshape(names_trans,(n,1))
        A = np.concatenate((A,names_trans),1)
    else:
        names_id = 'lbl_'+names[i]+'_id'
        lbl_id.append(names_id)  
        names_id = PruneLabelEncoder()
        names_id.fit(train.iloc[:,i].values, cutoff=TRANSFORM_CUTOFF)
        names_trans = names_id.transform(train.iloc[:,i].values)
        names_trans = np.reshape(names_trans,(n,1))
        A = np.concatenate((A,names_trans),1)
break

1
2
3
4
5
6
7
8
9
10


ValueError: y contains new labels: [ nan  nan  nan ...,  nan  nan  nan]

In [67]:
list(train.columns)

['tube_assembly_id',
 'supplier',
 'year',
 'month',
 'day',
 'annual_usage',
 'min_order_quantity',
 'bracket_pricing',
 'quantity',
 'cost',
 'component_id_1',
 'quantity_1',
 'weight_id_1',
 'component_type_id_comp1',
 'plating_comp1',
 'orientation_comp1',
 'intended_nut_thread_comp1',
 'unique_feature_comp1',
 'connection_type_id_comp1',
 'intended_nut_pitch_comp1',
 'length_comp1',
 'blind_hole_comp1',
 'diameter_comp1',
 'seat_angle_comp1',
 'thread_pitch_comp1',
 'hex_nut_size_comp1',
 'thread_size_comp1',
 'end_form_id_2_comp1',
 'overall_length_comp1',
 'end_form_id_1_comp1',
 'nominal_size_2_comp1',
 'nominal_size_1_comp1',
 'thread_pitch_1_comp1',
 'length_1_comp1',
 'adaptor_angle_comp1',
 'length_2_comp1',
 'thread_pitch_2_comp1',
 'connection_type_id_1_comp1',
 'connection_type_id_2_comp1',
 'thread_size_1_comp1',
 'thread_size_2_comp1',
 'hex_size_comp1',
 'extension_length_comp1',
 'mj_class_code_comp1',
 'bolt_pattern_wide_comp1',
 'thickness_comp1',
 'mj_plug_class_c