In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, tree
from sklearn.model_selection import train_test_split
import pickle

# Read the raw table from disk. low_memory=False makes pandas parse the file
# in one pass instead of in chunks, so each column gets a single inferred dtype.
data = pd.read_csv(filepath_or_buffer="bra_data.csv", low_memory=False)
data.head()

Unnamed: 0,Age,Height (ft-in),Weight (lb),Shoulder Pain,R/L/B/P,Bra Color,Bra Size,Embroidery,Jacquard,Bra Style,...,Brand,Wide Straps,Padded Straps,Wired,Molded Foam,Smooth Cup,Has Lace Details,Active,Lace Inset,Front/Back Closure
0,48,"5'0""",151,N,L,black,42A,N,N,Amber,...,Amoena,N,N,N,Y,Y,Y,N,Y,B
1,48,"5'0""",151,N,L,black,42A,N,N,Amber,...,Amoena,N,N,N,Y,Y,Y,N,Y,B
2,52,"5'0""",140,Y,B,black,38AA,N,N,Amber,...,Amoena,N,N,N,Y,Y,Y,N,Y,B
3,52,"5'0""",140,Y,B,black,38AA,N,N,Amber,...,Amoena,N,N,N,Y,Y,Y,N,Y,B
4,53,"5'1""",151,N,L,black,36A,N,N,Amber,...,Amoena,N,N,N,Y,Y,Y,N,Y,B


In [2]:
# List the dtype pandas inferred for every column of the freshly loaded frame.
data.dtypes

Age                          int64
Height (ft-in)              object
Weight (lb)                  int64
Shoulder Pain               object
R/L/B/P                     object
Bra Color                   object
Bra Size                    object
Embroidery                  object
Jacquard                    object
Bra Style                   object
Difficulty Reaching Back    object
Brand                       object
Wide Straps                 object
Padded Straps               object
Wired                       object
Molded Foam                 object
Smooth Cup                  object
Has Lace Details            object
Active                      object
Lace Inset                  object
Front/Back Closure          object
dtype: object

In [3]:
def parse_ht(x):
    """Convert a feet-and-inches height into total inches.

    Parameters
    ----------
    x : object
        Value whose ``str()`` form is feet, an apostrophe, then inches, with an
        optional trailing double quote (for example 7'0 followed by an inch
        mark). The inches part may be missing (``5'``); it then counts as 0.

    Returns
    -------
    float
        ``12 * feet + inches``.

    Raises
    ------
    IndexError
        If the text has no apostrophe separating feet from inches.
    ValueError
        If the feet part or a non-empty inches part is not a number.
    """
    parts = str(x).split("'")
    feet = float(parts[0])
    # Remove the inch mark and surrounding blanks. An empty remainder means the
    # height was written as whole feet only, which float('') would reject.
    inch_text = parts[1].replace('"', '').strip()
    inches = float(inch_text) if inch_text else 0.0
    return (12 * feet) + inches

def band_size(size):
    """Return the numeric band of a bra size string.

    The band is made of every leading digit of ``size``, so the result does not
    depend on the band being exactly two characters wide: '42A' gives 42,
    '38AA' gives 38 and '100C' gives 100.

    Parameters
    ----------
    size : str
        Size label that starts with the band digits.

    Returns
    -------
    int
        The band as an integer.

    Raises
    ------
    ValueError
        If ``size`` does not start with a digit.
    """
    # Number of leading digit characters = total length minus what is left
    # after stripping digits from the left.
    digit_count = len(size) - len(size.lstrip("0123456789"))
    band = int(size[:digit_count])
    return band

def cup_size(size):
    """Return the cup part of a bra size string.

    The cup is whatever follows the leading band digits, so the result does not
    depend on the band being exactly two characters wide: '42A' gives 'A',
    '38AA' gives 'AA' and '100C' gives 'C'.

    Parameters
    ----------
    size : str
        Size label that starts with the band digits.

    Returns
    -------
    str
        The text after the leading digits (empty if there is none).
    """
    cup = size.lstrip("0123456789")
    return cup

In [4]:
# Convert each feet-and-inches string to total inches; parse_ht is passed
# directly, so no wrapper lambda is needed.
data["Ht (inches)"] = data["Height (ft-in)"].apply(parse_ht)

In [5]:
# Split every "Bra Size" label into its numeric band and its cup letters.
data["Band Size"] = data["Bra Size"].apply(band_size)
data["Cup Size"] = data["Bra Size"].apply(cup_size)

In [6]:
# Preview the frame to check the derived "Ht (inches)", "Band Size" and
# "Cup Size" columns.
data.head()

Unnamed: 0,Age,Height (ft-in),Weight (lb),Shoulder Pain,R/L/B/P,Bra Color,Bra Size,Embroidery,Jacquard,Bra Style,...,Wired,Molded Foam,Smooth Cup,Has Lace Details,Active,Lace Inset,Front/Back Closure,Ht (inches),Band Size,Cup Size
0,48,"5'0""",151,N,L,black,42A,N,N,Amber,...,N,Y,Y,Y,N,Y,B,60.0,42,A
1,48,"5'0""",151,N,L,black,42A,N,N,Amber,...,N,Y,Y,Y,N,Y,B,60.0,42,A
2,52,"5'0""",140,Y,B,black,38AA,N,N,Amber,...,N,Y,Y,Y,N,Y,B,60.0,38,AA
3,52,"5'0""",140,Y,B,black,38AA,N,N,Amber,...,N,Y,Y,Y,N,Y,B,60.0,38,AA
4,53,"5'1""",151,N,L,black,36A,N,N,Amber,...,N,Y,Y,Y,N,Y,B,61.0,36,A


In [7]:
# Re-inspect the dtypes now that the derived columns have been added.
data.dtypes

Age                           int64
Height (ft-in)               object
Weight (lb)                   int64
Shoulder Pain                object
R/L/B/P                      object
Bra Color                    object
Bra Size                     object
Embroidery                   object
Jacquard                     object
Bra Style                    object
Difficulty Reaching Back     object
Brand                        object
Wide Straps                  object
Padded Straps                object
Wired                        object
Molded Foam                  object
Smooth Cup                   object
Has Lace Details             object
Active                       object
Lace Inset                   object
Front/Back Closure           object
Ht (inches)                 float64
Band Size                     int64
Cup Size                     object
dtype: object

In [10]:
# Store heights as integers. Casting a float column with astype(int) drops any
# fractional part rather than rounding.
data = data.astype({"Ht (inches)": int})

In [11]:
# Confirm that "Ht (inches)" now has an integer dtype.
data.dtypes

Age                          int64
Height (ft-in)              object
Weight (lb)                  int64
Shoulder Pain               object
R/L/B/P                     object
Bra Color                   object
Bra Size                    object
Embroidery                  object
Jacquard                    object
Bra Style                   object
Difficulty Reaching Back    object
Brand                       object
Wide Straps                 object
Padded Straps               object
Wired                       object
Molded Foam                 object
Smooth Cup                  object
Has Lace Details            object
Active                      object
Lace Inset                  object
Front/Back Closure          object
Ht (inches)                  int32
Band Size                    int64
Cup Size                    object
dtype: object

In [12]:
# Columns kept for modelling, in the order later cells rely on: the four
# numeric measurements first, the categorical attributes next, and the
# "Bra Style" target last.
selected_columns = [
    'Age', 'Weight (lb)', 'Ht (inches)', 'Band Size',
    'Cup Size', 'R/L/B/P', 'Shoulder Pain', 'Bra Color', 'Brand',
    'Wide Straps', 'Padded Straps', 'Wired', 'Molded Foam', 'Smooth Cup',
    'Has Lace Details', 'Active', 'Lace Inset', 'Front/Back Closure',
    'Difficulty Reaching Back', 'Embroidery', 'Jacquard',
    'Bra Style',
]
new_df = data[selected_columns]

In [13]:
# Keep only the string-valued (object dtype) columns; these are the ones that
# need encoding before they can be fed to the model.
X = new_df.select_dtypes(include="object")
X.head(n=3)

Unnamed: 0,Cup Size,R/L/B/P,Shoulder Pain,Bra Color,Brand,Wide Straps,Padded Straps,Wired,Molded Foam,Smooth Cup,Has Lace Details,Active,Lace Inset,Front/Back Closure,Difficulty Reaching Back,Embroidery,Jacquard,Bra Style
0,A,L,N,black,Amoena,N,N,N,Y,Y,Y,N,Y,B,N,N,N,Amber
1,A,L,N,black,Amoena,N,N,N,Y,Y,Y,N,Y,B,N,N,N,Amber
2,AA,B,Y,black,Amoena,N,N,N,Y,Y,Y,N,Y,B,N,N,N,Amber


In [14]:
# Everything in X except the prediction target gets label-encoded. The target
# is removed by name rather than by position, so the result no longer depends
# on "Bra Style" happening to be the last column of X.
data_to_be_encoded = X.drop(columns=["Bra Style"])
data_to_be_encoded.head()

Unnamed: 0,Cup Size,R/L/B/P,Shoulder Pain,Bra Color,Brand,Wide Straps,Padded Straps,Wired,Molded Foam,Smooth Cup,Has Lace Details,Active,Lace Inset,Front/Back Closure,Difficulty Reaching Back,Embroidery,Jacquard
0,A,L,N,black,Amoena,N,N,N,Y,Y,Y,N,Y,B,N,N,N
1,A,L,N,black,Amoena,N,N,N,Y,Y,Y,N,Y,B,N,N,N
2,AA,B,Y,black,Amoena,N,N,N,Y,Y,Y,N,Y,B,N,N,N
3,AA,B,Y,black,Amoena,N,N,N,Y,Y,Y,N,Y,B,N,N,N
4,A,L,N,black,Amoena,N,N,N,Y,Y,Y,N,Y,B,N,N,N


In [15]:
# Fit one LabelEncoder per categorical column and keep every fitted encoder.
# Refitting a single shared encoder column after column would leave it holding
# only the last column's classes, so the string -> integer mapping of all other
# columns would be lost and new samples could not be encoded consistently
# before calling predict. `label_encoders[column].transform(...)` and
# `.inverse_transform(...)` now work for any encoded column.
label_encoders = {
    column: preprocessing.LabelEncoder().fit(data_to_be_encoded[column].astype(str))
    for column in data_to_be_encoded.columns
}
X_2 = pd.DataFrame(
    {
        column: encoder.transform(data_to_be_encoded[column].astype(str))
        for column, encoder in label_encoders.items()
    },
    index=data_to_be_encoded.index,
)
# Kept for backward compatibility: as before, `le` ends up fitted on the last
# encoded column.
le = label_encoders[data_to_be_encoded.columns[-1]]
X_2.head()

Unnamed: 0,Cup Size,R/L/B/P,Shoulder Pain,Bra Color,Brand,Wide Straps,Padded Straps,Wired,Molded Foam,Smooth Cup,Has Lace Details,Active,Lace Inset,Front/Back Closure,Difficulty Reaching Back,Embroidery,Jacquard
0,0,1,0,0,1,0,0,0,1,1,1,0,1,0,0,0,0
1,0,1,0,0,1,0,0,0,1,1,1,0,1,0,0,0,0
2,1,0,1,0,1,0,0,0,1,1,1,0,1,0,0,0,0
3,1,0,1,0,1,0,0,0,1,1,1,0,1,0,0,0,0
4,0,1,0,0,1,0,0,0,1,1,1,0,1,0,0,0,0


In [16]:
# The numeric measurements are used as-is, without any encoding.
numeric_columns = ["Age", "Weight (lb)", "Ht (inches)", "Band Size"]
int_df = new_df.loc[:, numeric_columns]
int_df.head()

Unnamed: 0,Age,Weight (lb),Ht (inches),Band Size
0,48,151,60,42
1,48,151,60,42
2,52,140,60,38
3,52,140,60,38
4,53,151,61,36


In [17]:
# Index-aligned left join: numeric columns first, encoded categoricals after.
encoded_df = int_df.join(other=X_2, how="left")

encoded_df.head()

Unnamed: 0,Age,Weight (lb),Ht (inches),Band Size,Cup Size,R/L/B/P,Shoulder Pain,Bra Color,Brand,Wide Straps,...,Wired,Molded Foam,Smooth Cup,Has Lace Details,Active,Lace Inset,Front/Back Closure,Difficulty Reaching Back,Embroidery,Jacquard
0,48,151,60,42,0,1,0,0,1,0,...,0,1,1,1,0,1,0,0,0,0
1,48,151,60,42,0,1,0,0,1,0,...,0,1,1,1,0,1,0,0,0,0
2,52,140,60,38,1,0,1,0,1,0,...,0,1,1,1,0,1,0,0,0,0
3,52,140,60,38,1,0,1,0,1,0,...,0,1,1,1,0,1,0,0,0,0
4,53,151,61,36,0,1,0,0,1,0,...,0,1,1,1,0,1,0,0,0,0


In [18]:
# Re-attach the (still string-valued) target as the last column, aligned on
# the row index.
final_clean_df = encoded_df.join(other=new_df[["Bra Style"]], how="left")
final_clean_df.head()

Unnamed: 0,Age,Weight (lb),Ht (inches),Band Size,Cup Size,R/L/B/P,Shoulder Pain,Bra Color,Brand,Wide Straps,...,Molded Foam,Smooth Cup,Has Lace Details,Active,Lace Inset,Front/Back Closure,Difficulty Reaching Back,Embroidery,Jacquard,Bra Style
0,48,151,60,42,0,1,0,0,1,0,...,1,1,1,0,1,0,0,0,0,Amber
1,48,151,60,42,0,1,0,0,1,0,...,1,1,1,0,1,0,0,0,0,Amber
2,52,140,60,38,1,0,1,0,1,0,...,1,1,1,0,1,0,0,0,0,Amber
3,52,140,60,38,1,0,1,0,1,0,...,1,1,1,0,1,0,0,0,0,Amber
4,53,151,61,36,0,1,0,0,1,0,...,1,1,1,0,1,0,0,0,0,Amber


In [19]:
# Separate the label column from the feature matrix.
# NOTE(review): this rebinds `data`, which until now held the raw CSV frame,
# so earlier cells that read the raw columns cannot be re-run after this one
# without reloading the file.
label_column = "Bra Style"
target = final_clean_df[label_column]
data = final_clean_df.drop(columns=label_column)

In [20]:
# Seeded train/test split; no test_size is given, so scikit-learn's default
# split proportions apply.
# NOTE(review): the head() previews show identical consecutive rows. If
# duplicates are common, copies of one row can land in both splits and inflate
# the test score; confirm, and consider de-duplicating first.
(X_train, X_test,
 y_train, y_test) = train_test_split(data, target, random_state=42)

In [21]:
# Train a decision tree and report its accuracy on the held-out split.
# random_state is pinned because the tree builder permutes features at each
# split; without a seed, ties between equally good splits can be broken
# differently on every run, so the fitted tree and its score are not
# reproducible. 42 matches the seed used for the train/test split.
dtc = tree.DecisionTreeClassifier(random_state=42)
dtc = dtc.fit(X_train, y_train)
dtc.score(X_test, y_test)

0.9706840390879479

In [17]:
# Predict the style for one hand-written sample. The values are wrapped in a
# DataFrame labelled with the training columns so that each number is tied to
# a named feature (a bare list is purely positional and makes scikit-learn
# warn about missing feature names). A wrong number of values now fails
# immediately in the DataFrame constructor.
# NOTE(review): with the training column order (Age, Weight (lb), Ht (inches),
# Band Size, ...), this sample has height 0 and band size 1, which look like
# placeholder values. Confirm.
sample = pd.DataFrame(
    [[69, 62, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1]],
    columns=X_train.columns,
)
dtc.predict(sample)

array(['Barbara'], dtype=object)

In [18]:
# Persist the fitted tree. The context manager guarantees the file is flushed
# and closed even if pickling fails; a bare open() left the handle dangling.
filename = 'decision_tree_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(dtc, model_file)

In [19]:
# Run decision tree model
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)
print(result)

0.9706840390879479
