In [1]:
import os
from pathlib2 import Path
import re
import itertools
import copy
import numpy as np
from IPython.display import display_html 

import pyarrow.feather as feather
import pandas as pd
import xml.etree.ElementTree as ET
from dython.nominal import associations
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

## 1. Exploratory Data Analysis

This analysis is primarily focused on Categroy 909 to get an understanding of the dataset and device an algorithmic approach for missing data. Learnings from here will be expanded to other categories, however not all as some work might only be applicable to this category.

In [2]:
category_data = feather.read_feather(Path('IceCat_Cat_2833_feather/frame_IceCat_Category_909.feather'))
display(category_data.tail())
display(category_data.info())

Unnamed: 0,level_0,index,id,name,category_id,category_label,Operating temperature (T-T).1112,Operating temperature (T-T).1112.unit,Maximum data transfer rate.1165,Maximum data transfer rate.1165.unit,...,Compatibility.890,Compatibility.890.unit,Interface type.990,Interface type.990.unit,Firewall security.1612,Firewall security.1612.unit,Works with the Google Assistant.36516,Works with the Google Assistant.36516.unit,RTS/CTS threshold.22398,RTS/CTS threshold.22398.unit
2232,2232,0,IceCat_Prod_16048998,Aironet 2602E,IceCat_Category_909,Wireless Access Points,-20 - 55,°C,450.0,Mbit/s,...,,,,,,,,,,
2233,2233,0,IceCat_Prod_31985083,Aironet 3700i,IceCat_Category_909,Wireless Access Points,0 - 40,°C,1300.0,Mbit/s,...,,,,,,,,,,
2234,2234,0,IceCat_Prod_33721921,Aironet 3600e,IceCat_Category_909,Wireless Access Points,-20 - 55,°C,1000.0,Mbit/s,...,,,,,,,,,,
2235,2235,0,IceCat_Prod_11147703,NWA5550-N,IceCat_Category_909,Wireless Access Points,-40 - 60,°C,300.0,Mbit/s,...,,,,,,,,,,
2236,2236,0,IceCat_Prod_36197901,Aironet 1562E,IceCat_Category_909,Wireless Access Points,-40 - 65,°C,1300.0,Mbit/s,...,,,,,,,,,,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2237 entries, 0 to 2236
Columns: 540 entries, level_0 to RTS/CTS threshold.22398.unit
dtypes: boolean(88), float64(69), int64(2), object(381)
memory usage: 8.1+ MB


None

###  1.1 Data Pruning

The dataset has several columns that have missing data. A reasonable threshold is therefore necessary to select columns that have a substantially complete set of values. Given that 100% would result in only product reference columns, 80% was selected which resulted in 29 columns:

In [3]:
pct_complete_threshold = .8
pct_complete = (len(category_data) - category_data.isnull().sum()) / len(category_data)
missing_vals = pd.DataFrame({'col': category_data.columns,
                             'pct_complete': pct_complete})
num_cols_above_null_threshold = missing_vals[missing_vals['pct_complete'] >= (pct_complete_threshold)].count()

cols_of_interest = missing_vals[missing_vals['pct_complete'] >= pct_complete_threshold].iloc[:,0]

print(f"Columns above {pct_complete_threshold:.0%} percent complete threshold ({num_cols_above_null_threshold[0]}):")
display(cols_of_interest.values)

Columns above 80% percent complete threshold (29):


array(['level_0', 'index', 'id', 'name', 'category_id', 'category_label',
       'Operating temperature (T-T).1112',
       'Operating temperature (T-T).1112.unit',
       'Maximum data transfer rate.1165',
       'Maximum data transfer rate.1165.unit', 'Height.1464',
       'Height.1464.unit', 'Width.1649', 'Width.1649.unit', 'Depth.1650',
       'Depth.1650.unit', 'Product colour.1766',
       'Networking standards.1802', '2.4 GHz.20806', '5 GHz.20807',
       'Ethernet LAN (RJ-45) ports.2312', 'Ethernet LAN data rates.3768',
       'Security algorithms.454', 'Storage temperature (T-T).757',
       'Storage temperature (T-T).757.unit', 'Weight.94',
       'Weight.94.unit', 'Operating relative humidity (H-H).703',
       'Operating relative humidity (H-H).703.unit'], dtype=object)

Columns can be broken into two buckets:
- *Unit based columns*: these will be filled based on allowable values based on *CategoryFeatureslist.xml* file as filled values are consistent within columns
- *Non-unit based columns*: these will be further investigated for possible inclusion in imputation analysis

In [4]:
cols_units = [str for str in cols_of_interest.values if ".unit" in str]
cols_sys = ["level_0", "index", "id", "category_id", "category_label"]
cols_non_units = set(cols_of_interest.values) - set(cols_units) - set(cols_sys)
print(f"Unit based columns ({len(cols_units)}):")
display(cols_units)
print(f"System reference columns ({len(cols_sys)}):")
display(cols_sys)
print(f"Non-unit based columns ({len(cols_non_units)}):")
display(list(cols_non_units))

Unit based columns (8):


['Operating temperature (T-T).1112.unit',
 'Maximum data transfer rate.1165.unit',
 'Height.1464.unit',
 'Width.1649.unit',
 'Depth.1650.unit',
 'Storage temperature (T-T).757.unit',
 'Weight.94.unit',
 'Operating relative humidity (H-H).703.unit']

System reference columns (5):


['level_0', 'index', 'id', 'category_id', 'category_label']

Non-unit based columns (16):


['name',
 'Product colour.1766',
 '5 GHz.20807',
 'Networking standards.1802',
 'Height.1464',
 'Operating relative humidity (H-H).703',
 'Depth.1650',
 'Weight.94',
 'Storage temperature (T-T).757',
 'Maximum data transfer rate.1165',
 '2.4 GHz.20806',
 'Security algorithms.454',
 'Width.1649',
 'Operating temperature (T-T).1112',
 'Ethernet LAN (RJ-45) ports.2312',
 'Ethernet LAN data rates.3768']

When we look at the uniqueness of each column, two columns (`2.4 GHz.20806` & `5 GHz.20807`) stand out as having only one value throughout each column (other than null) and are therefore removed:

In [5]:
uniqueness = category_data[list(cols_non_units)].apply(pd.Series.nunique).sort_values()
print("Variation within each column:")
display(uniqueness)
low_variation_cols = list(uniqueness[uniqueness == 1].index)
print(f'Columns with no variation: {low_variation_cols}')
cols = [col for col in list(cols_non_units) if col not in low_variation_cols]
print(f'\nColumns to be considered:')
display(cols)

Variation within each column:


5 GHz.20807                                1
2.4 GHz.20806                              1
Ethernet LAN (RJ-45) ports.2312            8
Product colour.1766                       12
Ethernet LAN data rates.3768              18
Operating relative humidity (H-H).703     27
Storage temperature (T-T).757             31
Operating temperature (T-T).1112          33
Maximum data transfer rate.1165           62
Width.1649                               186
Height.1464                              195
Depth.1650                               214
Weight.94                                233
Networking standards.1802                365
Security algorithms.454                  422
name                                     787
dtype: int64

Columns with no variation: ['5 GHz.20807', '2.4 GHz.20806']

Columns to be considered:


['name',
 'Product colour.1766',
 'Networking standards.1802',
 'Height.1464',
 'Operating relative humidity (H-H).703',
 'Depth.1650',
 'Weight.94',
 'Storage temperature (T-T).757',
 'Maximum data transfer rate.1165',
 'Security algorithms.454',
 'Width.1649',
 'Operating temperature (T-T).1112',
 'Ethernet LAN (RJ-45) ports.2312',
 'Ethernet LAN data rates.3768']

Below is a preview of the data thus far after dropping observations w/ null values (maybe consider including in next iteration?). Note that there are columns with several attributes within a cell. For example, for 1st observation below the `Networking standards.1802` attribute includes several values separated by a comma: `IEEE 802.11a,IEEE 802.11b,IEEE 802.11g,IEEE 802.11n,IEEE 802.3af`. To appropriately capture the attributes for each observation, dummy variables will need to be created. 

In [6]:
pruned_data = category_data[cols].dropna()
pruned_data.head()

Unnamed: 0,name,Product colour.1766,Networking standards.1802,Height.1464,Operating relative humidity (H-H).703,Depth.1650,Weight.94,Storage temperature (T-T).757,Maximum data transfer rate.1165,Security algorithms.454,Width.1649,Operating temperature (T-T).1112,Ethernet LAN (RJ-45) ports.2312,Ethernet LAN data rates.3768
6,Aironet 702i,White,"IEEE 802.11a,IEEE 802.11b,IEEE 802.11g,IEEE 80...",50.4,10 - 90,177.6,480.0,-30 - 70,1000.0,"EAP,EAP-SIM,EAP-TLS,EAP-TTLS,PEAP,TKIP,WPA,WPA2",177.6,0 - 40,2.0,101001000
7,Aironet 2802i,White,"IEEE 802.11a,IEEE 802.11ac,IEEE 802.11b,IEEE 8...",55.1,10 - 90,220.4,1600.0,-30 - 70,5200.0,"802.1x RADIUS,AES,EAP-FAST,EAP-PEAP,EAP-SIM,EA...",220.0,-20 - 50,2.0,1001000
10,Aironet 2802i,White,"IEEE 802.11a,IEEE 802.11ac,IEEE 802.11b,IEEE 8...",55.1,10 - 90,220.4,1600.0,-30 - 70,5200.0,"802.1x RADIUS,AES,EAP-FAST,EAP-PEAP,EAP-SIM,EA...",220.0,-20 - 50,2.0,1001000
12,Aironet 1815w,White,"IEEE 802.11a,IEEE 802.11ac,IEEE 802.11b,IEEE 8...",31.5,10 - 90,140.0,280.0,-30 - 70,1000.0,"802.1x RADIUS,AES,EAP,EAP-FAST,EAP-PEAP,EAP-SI...",89.0,0 - 40,3.0,101001000
14,Aironet 2602i,White,"IEEE 802.11a,IEEE 802.11b,IEEE 802.11d,IEEE 80...",221.0,10 - 90,54.0,1040.0,-30 - 70,1000.0,"802.1x RADIUS,AES,EAP-FAST,EAP-PEAP,EAP-SIM,EA...",221.0,0 - 40,1.0,101001000


The columns identified that would need dummy variables created are:
- `Product colour.1766`
- `Networking standards.1802`
- `Ethernet LAN data rates.3768 `
- `Security algorithms.454` 

In [7]:
cols_need_dummies = ["Product colour.1766", "Networking standards.1802", "Ethernet LAN data rates.3768", "Security algorithms.454"]
pd.DataFrame({"col": cols_need_dummies, 
              "dummy_var_count": [pruned_data[col].str.get_dummies(",").shape[1] for col in cols_need_dummies]})

Unnamed: 0,col,dummy_var_count
0,Product colour.1766,6
1,Networking standards.1802,40
2,Ethernet LAN data rates.3768,6
3,Security algorithms.454,48


In [8]:
xgdata = pd.concat([pruned_data, 
                    pruned_data["Product colour.1766"].str.get_dummies(","),
                    pruned_data["Networking standards.1802"].str.get_dummies(","),
                    pruned_data["Ethernet LAN data rates.3768"].str.get_dummies(","),
                    pruned_data["Security algorithms.454"].str.get_dummies(",")],
                    axis=1) \
            .drop(columns=cols_need_dummies)

print(f'The new dataset is now {xgdata.shape[0]:,} by {xgdata.shape[1]}.\n\nBelow is a preview:')

The new dataset is now 1,316 by 110.

Below is a preview:


In [9]:
xgdata.head()

Unnamed: 0,name,Height.1464,Operating relative humidity (H-H).703,Depth.1650,Weight.94,Storage temperature (T-T).757,Maximum data transfer rate.1165,Width.1649,Operating temperature (T-T).1112,Ethernet LAN (RJ-45) ports.2312,...,WPA2,WPA2-AES,WPA2-CCMP,WPA2-Enterprise,WPA2-PSK,WPA2-TKIP,WPA3,WPA3-Enterprise,WPA3-PSK,WPS
6,Aironet 702i,50.4,10 - 90,177.6,480.0,-30 - 70,1000.0,177.6,0 - 40,2.0,...,1,0,0,0,0,0,0,0,0,0
7,Aironet 2802i,55.1,10 - 90,220.4,1600.0,-30 - 70,5200.0,220.0,-20 - 50,2.0,...,1,0,0,0,0,0,0,0,0,0
10,Aironet 2802i,55.1,10 - 90,220.4,1600.0,-30 - 70,5200.0,220.0,-20 - 50,2.0,...,1,0,0,0,0,0,0,0,0,0
12,Aironet 1815w,31.5,10 - 90,140.0,280.0,-30 - 70,1000.0,89.0,0 - 40,3.0,...,1,0,0,0,0,0,0,0,0,0
14,Aironet 2602i,221.0,10 - 90,54.0,1040.0,-30 - 70,1000.0,221.0,0 - 40,1.0,...,1,0,0,0,0,0,0,0,0,0


Dummy variables will also need to be created on the below categorical columns:

In [10]:
add_col_dummies = list(xgdata.select_dtypes(['object']).columns)
print(add_col_dummies)

['name', 'Operating relative humidity (H-H).703', 'Storage temperature (T-T).757', 'Operating temperature (T-T).1112']


In [11]:
xgdata_final = pd.concat([xgdata, 
                    pd.get_dummies(xgdata[add_col_dummies])],
                    axis=1) \
                 .drop(columns=add_col_dummies)

In [12]:
xgdata_final

Unnamed: 0,Height.1464,Depth.1650,Weight.94,Maximum data transfer rate.1165,Width.1649,Ethernet LAN (RJ-45) ports.2312,Black,Bronze,Gold,Grey,...,Operating temperature (T-T).1112_-40 - 60,Operating temperature (T-T).1112_-40 - 65,Operating temperature (T-T).1112_-40 - 70,Operating temperature (T-T).1112_-40 - 85,Operating temperature (T-T).1112_0 - 40,Operating temperature (T-T).1112_0 - 45,Operating temperature (T-T).1112_0 - 50,Operating temperature (T-T).1112_0 - 55,Operating temperature (T-T).1112_0 - 65,Operating temperature (T-T).1112_10 - 90
6,50.4,177.6,480.0,1000.0,177.6,2.0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
7,55.1,220.4,1600.0,5200.0,220.0,2.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,55.1,220.4,1600.0,5200.0,220.0,2.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12,31.5,140.0,280.0,1000.0,89.0,3.0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
14,221.0,54.0,1040.0,1000.0,221.0,1.0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2228,62.5,220.0,2090.0,5200.0,220.0,2.0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2230,51.0,221.0,1000.0,1000.0,221.0,3.0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2231,31.0,123.0,303.0,867.0,123.0,1.0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2233,54.0,221.0,1130.0,1300.0,221.0,2.0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


The dataset is now ready for further analysis for imputation techniques

# TO BE CONTINUED!

## 2. Data Imputation

In [71]:
def remove_data(df, p_rm, id_cols = []):
    df_copy = copy.deepcopy(df)

    for col in df_copy:
        if col in id_cols:
            #continue
            n_val = df_copy[col].notna().sum()
            n_rm = int(n_val * p_rm)
            idx_ = np.random.choice(df_copy[col].shape[0], n_rm, replace=False)

            df_copy.loc[idx_, col] = np.nan

    return df_copy 

In [86]:
category_data_xgboost = category_data[["name", "Height.1464", "Width.1649", "Depth.1650", "Maximum data transfer rate.1165", "Product colour.1766"]]
category_data_xgboost = category_data_xgboost[~(category_data_xgboost.isin(["None", "nan", "NaN"]).any(axis=1) | category_data_xgboost.isnull().any(axis=1))].reset_index(drop=True)

category_data_xgboost_prod_col_rm = remove_data(category_data_xgboost, 0.3, ["Product colour.1766"])
category_data_xgboost_col_nan_idx = category_data_xgboost_prod_col_rm[(category_data_xgboost_prod_col_rm["Product colour.1766"].isnull()) | (category_data_xgboost_prod_col_rm["Product colour.1766"] == "None")].index
category_data_xgboost_train = category_data_xgboost_prod_col_rm.drop(category_data_xgboost_col_nan_idx, axis=0)
category_data_xgboost_test = category_data_xgboost_prod_col_rm[category_data_xgboost_prod_col_rm.index.isin(category_data_xgboost_col_nan_idx)]

#specific to this use case
cols = ["Height.1464", "Width.1649", "Depth.1650", "Maximum data transfer rate.1165"]
category_data_xgboost_train[cols] = category_data_xgboost_train[cols].astype(float)
category_data_xgboost_test[cols] = category_data_xgboost_test[cols].astype(float)
###


display(category_data_xgboost_train)
display(category_data_xgboost_test)

Unnamed: 0,name,Height.1464,Width.1649,Depth.1650,Maximum data transfer rate.1165,Product colour.1766
0,Aironet 1550,163.0,312.0,229.0,1000.0,White
1,Aironet 1550,163.0,312.0,229.0,1000.0,White
2,NWA1123ACv3,37.5,140.0,141.0,866.0,White
5,Aironet 702i,50.4,177.6,177.6,1000.0,White
6,Aironet 2802i,55.1,220.0,220.4,5200.0,White
...,...,...,...,...,...,...
1770,Aironet 1550,142.0,312.0,229.0,300.0,White
1772,Aironet 1600,47.0,221.0,221.0,300.0,Grey
1773,Aironet 3802i,62.5,220.0,220.0,5200.0,White
1775,WAP125,31.0,123.0,123.0,867.0,White


Unnamed: 0,name,Height.1464,Width.1649,Depth.1650,Maximum data transfer rate.1165,Product colour.1766
3,Aironet 1572EAC,160.0,300.0,201.0,1300.0,
4,WIRELESS ACCESS POINT,160.0,300.0,201.0,1300.0,
7,Aironet 2802i,55.1,220.0,220.4,5200.0,
9,Aironet 2602i,221.0,221.0,54.0,1000.0,
12,Aironet 3802i,62.5,220.0,220.0,5200.0,
...,...,...,...,...,...,...
1769,Aironet 3802i,62.5,220.0,220.0,5200.0,
1771,DAP-2695,36.5,190.0,198.8,1750.0,
1774,Aironet 1700,51.0,221.0,221.0,1000.0,
1776,Aironet 3700i,54.0,221.0,221.0,1300.0,


## Working XGBoost

In [180]:
category_data_xgboost.groupby("Product colour.1766").count()

Unnamed: 0_level_0,name,Height.1464,Width.1649,Depth.1650,Maximum data transfer rate.1165
Product colour.1766,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Beige,2,2,2,2,2
Black,34,34,34,34,34
"Black,Silver",2,2,2,2,2
"Bronze,White",2,2,2,2,2
"Gold,White",1,1,1,1,1
Grey,245,245,245,245,245
"Grey,White",1,1,1,1,1
Silver,2,2,2,2,2
White,1490,1490,1490,1490,1490


In [None]:
input_data = category_data_xgboost[~category_data_xgboost["Product colour.1766"].isin(["Grey,White",  "Gold,White"])] # Got rid of 2 colors as they only showed up once
float_cols = ["Height.1464", "Width.1649", "Depth.1650", "Maximum data transfer rate.1165"]
int_cols = ["Product colour.1766"]
input_data[float_cols] = input_data[float_cols].astype(float)

y_mapping = {v:int(k) for k, v in enumerate(input_data["Product colour.1766"].unique())}

X_cols = input_data.drop(columns=["name", "Product colour.1766"])
y_cols = input_data["Product colour.1766"].map(y_mapping)

X_train, X_test, y_train, y_test = train_test_split(X_cols, y_cols, random_state=42, stratify=y_cols)

In [178]:
display(pd.concat([X_train, y_train], axis=1))
display(pd.concat([X_test, y_test], axis=1))


Unnamed: 0,Height.1464,Width.1649,Depth.1650,Maximum data transfer rate.1165,Product colour.1766
837,170.0,230.0,100.0,1000.0,1
510,221.0,221.0,54.0,1000.0,0
1253,54.0,221.0,221.0,1300.0,0
1564,47.0,221.0,221.0,300.0,0
606,36.0,140.0,86.0,1267.0,4
...,...,...,...,...,...
579,50.8,210.8,210.8,1000.0,0
109,33.0,150.8,150.8,1000.0,0
406,54.0,221.0,221.0,1300.0,0
1599,54.0,221.0,221.0,450.0,0


Unnamed: 0,Height.1464,Width.1649,Depth.1650,Maximum data transfer rate.1165,Product colour.1766
1169,51.0,221.0,221.0,1300.0,0
456,43.0,230.0,230.0,900.0,0
22,57.0,236.0,184.0,1800.0,2
264,55.1,220.0,220.4,5200.0,0
6,55.1,220.0,220.4,5200.0,0
...,...,...,...,...,...
833,35.0,204.0,134.0,300.0,2
1048,50.8,210.8,210.8,2000.0,0
731,61.0,200.0,150.0,1100.0,1
1074,38.0,205.4,181.6,1300.0,0


In [170]:
clf_xgb = xgb.XGBClassifier(objective="reg:logistic", seed=42)
clf = clf_xgb.fit(X_train,
            y_train)

## Sense of accuracy

In [171]:
predictions = clf.predict(X_test)
cm = confusion_matrix(y_test, predictions, labels=[0, 1, 2, 3, 5])
cm

array([[361,  11,   1,   0,   0],
       [  5,  56,   0,   0,   0],
       [  2,   0,   7,   0,   0],
       [  1,   0,   0,   0,   0],
       [  0,   0,   0,   0,   1]], dtype=int64)

In [163]:
y_mapping
# list(range(7))

{'White': 0,
 'Grey': 1,
 'Black': 2,
 'Silver': 3,
 'Beige': 4,
 'Bronze,White': 5,
 'Black,Silver': 6}

In [169]:
print(f'White accuracy: {361/(361+11+1):.1%}')
print(f'Grey accuracy: {56/(56+5):.1%}')
print(f'Black accuracy: {7/(7+2):.1%}')
print(f'Silver accuracy: {0/1:.1%}')
print(f'Bronze,Whilte: {1/1:.1%}')

White accuracy: 96.8%
Grey accuracy: 91.8%
Black accuracy: 77.8%
Silver accuracy: 0.0%
Bronze,Whilte: 100.0%


In [179]:
cross_val_score(clf, X_cols, y_cols, cv=2)

array([0.95275591, 0.96058559])

In [84]:
cols = ["Height.1464", "Width.1649", "Depth.1650", "Maximum data transfer rate.1165"]
category_data_xgboost_train[cols] = category_data_xgboost_train[cols].astype(float)
category_data_xgboost_test[cols] = category_data_xgboost_test[cols].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category_data_xgboost_test[cols] = category_data_xgboost_test[cols].astype(float)


Sources:

https://machinelearningmastery.com/data-preparation-gradient-boosting-xgboost-python/

https://medium.com/swlh/impute-missing-values-the-right-way-c63735fccccd

In [28]:
category_data_xgboost_train.groupby(["name"]).filter(lambda x: len(x) > 5)

Unnamed: 0,name,Maximum data transfer rate.1165,Product colour.1766
0,Aironet 1550,1000.0,White
1,Aironet 1550,1000.0,White
4,Aironet 1572EAC,1300.0,Grey
7,Aironet 2802i,5200.0,White
12,Aironet 1815w,1000.0,White
...,...,...,...
2225,Aironet 1550,300.0,White
2227,Aironet 1600,300.0,Grey
2228,Aironet 3802i,5200.0,White
2230,Aironet 1700,1000.0,White


In [43]:
tst = category_data[category_data["name"] == "Aironet 1572EAC"]
tst.drop(columns=["index", "id"]).drop_duplicates().to_csv('output.csv', index=False)


In [19]:
category_data_xgboost_train = pd.get_dummies(category_data_xgboost_train.drop(columns=["name", "Maximum data transfer rate.1165"])) \
                                    .join(other=category_data_xgboost_train[["name", "Maximum data transfer rate.1165"]])

model = XGBRegressor()
model.fit(train_factorized.drop("Product colour.1766", axis=1), train_factorized["Product colour.1766"])
pred_col = model.predict(test_factorized.drop("Product colour.1766", axis=1))

Unnamed: 0,Product colour.1766_Black,"Product colour.1766_Black,Grey","Product colour.1766_Black,Silver","Product colour.1766_Bronze,White","Product colour.1766_Gold,White",Product colour.1766_Grey,"Product colour.1766_Grey,Metallic",Product colour.1766_Silver,Product colour.1766_White,name,Maximum data transfer rate.1165
0,0,0,0,0,0,0,0,0,1,Aironet 1550,1000.0
1,0,0,0,0,0,0,0,0,1,Aironet 1550,1000.0
2,0,0,0,0,0,0,0,0,1,NWA1123ACv3,866.0
3,0,0,0,0,0,1,0,0,0,C9105AXW-Q,
14,0,0,0,0,0,0,0,0,1,Aironet 2602i,1000.0
...,...,...,...,...,...,...,...,...,...,...,...
2228,0,0,0,0,0,0,0,0,1,Aironet 3802i,5200.0
2231,0,0,0,0,0,0,0,0,1,WAP125,867.0
2233,0,0,0,0,0,0,0,0,1,Aironet 3700i,1300.0
2234,0,0,0,0,0,0,0,0,1,Aironet 3600e,1000.0


In [51]:
def factorize(df):
    df_copy = df.copy()
    for col in df_copy:
        unique_items = df[col].dropna().unique().tolist()
        mapping_dict = {k:v for v, k in enumerate(unique_items)}
        df_copy[col] = df_copy[col].map(mapping_dict)
    return df_copy

train_factorized = factorize(category_data_xgboost_train)
test_factorized = factorize(category_data_xgboost_test)

In [75]:
test_factorized["name"][4]

1

In [79]:
model = XGBRegressor()
model.fit(train_factorized.drop("Product colour.1766", axis=1), train_factorized["Product colour.1766"])
pred_col = model.predict(test_factorized.drop("Product colour.1766", axis=1))
pred_col.astype(int)

unique_items = category_data_xgboost_train["Product colour.1766"].dropna().unique().tolist()
mapping_dict = {k:v for v, k in enumerate(unique_items)}
encoding_map = {v:k for k,v in mapping_dict.items()}

category_data_xgboost_prod_col_rm_copy = category_data_xgboost_prod_col_rm.copy()
category_data_xgboost_prod_col_rm_copy["Product colour.1766"][category_data_xgboost_col_nan_idx] = pd.Series(pred_col.astype(int)).map(encoding_map)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category_data_xgboost_prod_col_rm_copy["Product colour.1766"][category_data_xgboost_col_nan_idx] = pd.Series(pred_col.astype(int)).map(encoding_map)


In [93]:
display(category_data_xgboost_prod_col_rm_copy[category_data_xgboost_prod_col_rm_copy.index.isin(category_data_xgboost_col_nan_idx)])
display(category_data_xgboost[category_data_xgboost.index.isin(category_data_xgboost_col_nan_idx)])

merged = category_data_xgboost_prod_col_rm_copy[category_data_xgboost_prod_col_rm_copy.index.isin(category_data_xgboost_col_nan_idx)].join(category_data_xgboost[category_data_xgboost.index.isin(category_data_xgboost_col_nan_idx)], how='inner', rsuffix='t2')
merged["acc"] = merged["Product colour.1766"] == merged["Product colour.1766t2"]
merged["acc"] = merged["acc"].apply(lambda x: int(x))
merged["acc"].sum()

Unnamed: 0,name,Maximum data transfer rate.1165,Product colour.1766
0,Aironet 1550,1000.0,White
4,Aironet 1572EAC,1300.0,White
7,Aironet 2802i,5200.0,White
8,Aironet 3502I,300.0,White
9,WNDAP360,1000.0,White
...,...,...,...
2227,Aironet 1600,300.0,White
2228,Aironet 3802i,5200.0,White
2229,WNAP210,100.0,White
2232,Aironet 2602E,450.0,White


Unnamed: 0,name,Maximum data transfer rate.1165,Product colour.1766
0,Aironet 1550,1000.0,White
4,Aironet 1572EAC,1300.0,Grey
7,Aironet 2802i,5200.0,White
8,Aironet 3502I,300.0,
9,WNDAP360,1000.0,
...,...,...,...
2227,Aironet 1600,300.0,Grey
2228,Aironet 3802i,5200.0,White
2229,WNAP210,100.0,
2232,Aironet 2602E,450.0,


368

In [51]:
tree = ET.parse('IceCat Specifications/CategoriesList.xml')
root = tree.getroot()

In [52]:
# Get all categories from specification
category_parent_list = []
for category in root[0][0].findall('Category'):
    cat_id = category.attrib
    parent_id = category.find('ParentCategory').attrib
    category_parent_list.append((int(cat_id["ID"]), int(parent_id["ID"])))

# Construct tree of categories from specification
all_nodes = {n[0]: {} for n in category_parent_list}
tree = {}
for item in category_parent_list:
    id, parent = item
    if parent != 1:
        all_nodes[parent][id] = all_nodes[id]
    else:
        tree[id] = all_nodes[id]




In [53]:
# Get categories from provided feather files
feather_file_names = os.listdir("IceCat_Cat_2833_feather")
feather_categories = [int(re.search(r'[0-9]+', file_name).group(0)) for file_name in feather_file_names]


def dict_key_filter(obj, obj_filter):
    '''
    Filters dictionary to only include `obj` keys that are included in `obj_filter`. 
    https://stackoverflow.com/questions/31710271/how-to-filter-by-keys-through-a-nested-dictionary-in-a-pythonic-way
    '''
    def inner_dict_key_filter(obj): return dict_key_filter(obj, obj_filter)
    def to_keep(subtree): return not isinstance(subtree, (dict, list)) or subtree

    def build_subtree(key, value):
        if key in obj_filter:
            return copy.deepcopy(value) # keep the branch
        elif isinstance(value, (dict, list)):
            return inner_dict_key_filter(value) # continue to search
        return [] # just an orphan value here

    if isinstance(obj, dict):
        key_subtree_pairs = ((key, build_subtree(key, value)) for key, value in obj.items())
        return {key:subtree for key, subtree in key_subtree_pairs if to_keep(subtree)}
    elif isinstance(obj, list):
        return list(filter(to_keep, map(inner_dict_key_filter, obj)))
    return []


filtered_tree = dict_key_filter(tree, feather_categories)




In [1]:
filtered_tree

NameError: name 'filtered_tree' is not defined

In [60]:

def flatten(d):    
    res = []
    if isinstance(d, dict):
        for key, val in d.items():
            res.append(key)
            res.extend(flatten(val))
    return res

collapsed_cat_filtered_tree = flatten(filtered_tree)
len(collapsed_cat_filtered_tree)

562

In [84]:
filtered_tree[2833][2][55][60]

{}

In [85]:
cpy = copy.copy(collapsed_cat_filtered_tree)
cpy.sort()
len(cpy)

562

In [92]:
level_one_count = 0
level_two_count = 0
level_three_count = 0
level_four_count = 0
level_one_items  = []
level_two_items = []
level_three_items = []
level_four_items = []
for level_one in filtered_tree.keys():
    level_one_count += 1
    level_one_items.append(level_one)
    for level_two in filtered_tree[level_one].keys():
        level_two_count += 1
        level_two_items.append(level_two)
        for level_three in filtered_tree[level_one][level_two].keys():
            level_three_items.append(level_three)
            level_three_count += 1
            for level_four in filtered_tree[level_one][level_two][level_three].keys():
                level_four_count += 1
                level_four_items.append(level_four)



In [88]:
[level_one_count, level_two_count, level_three_count, level_four_count]

[1, 11, 212, 338]

In [91]:
level_three_items

[7,
 12,
 21,
 26,
 27,
 28,
 38,
 42,
 55,
 62,
 67,
 803,
 836,
 885,
 893,
 904,
 1534,
 1535,
 1552,
 1624,
 1646,
 1657,
 1719,
 1753,
 2444,
 2841,
 2842,
 2843,
 3912,
 5210,
 5230,
 5231,
 5640,
 7100,
 7241,
 9225,
 166,
 178,
 189,
 236,
 998,
 1291,
 1762,
 2702,
 2804,
 2844,
 2901,
 5984,
 6892,
 8839,
 8917,
 151,
 153,
 154,
 155,
 156,
 159,
 203,
 896,
 897,
 1062,
 1389,
 1508,
 1573,
 2093,
 2282,
 2774,
 2775,
 3319,
 3625,
 5022,
 5400,
 5459,
 5467,
 5632,
 5831,
 6946,
 7284,
 7291,
 7319,
 7396,
 8179,
 8189,
 8194,
 8231,
 8355,
 8738,
 8909,
 8910,
 8919,
 8921,
 9228,
 192,
 194,
 195,
 196,
 198,
 202,
 204,
 282,
 1041,
 1297,
 1540,
 2675,
 2813,
 2929,
 3955,
 8154,
 8470,
 8500,
 8603,
 8685,
 287,
 2840,
 8763,
 8764,
 373,
 692,
 1305,
 4972,
 4973,
 8653,
 2503,
 8752,
 8753,
 8754,
 8755,
 8756,
 8757,
 8758,
 867,
 869,
 883,
 952,
 953,
 956,
 1121,
 1158,
 1303,
 1306,
 1493,
 1494,
 1495,
 1501,
 1569,
 1574,
 1582,
 1614,
 1638,
 1639,
 1663,
 1

In [46]:
def flatten(d):    
    res = []  # Result list
    if isinstance(d, dict):
        for key, val in d.items():
            res.append(key)
            res.extend(flatten(val))
    # elif isinstance(d, list):
    #     res = d        
    # else:
    #     raise TypeError("Undefined type for flatten: %s"%type(d))

    return res


dict1 = {
    'Bob': {
        'shepherd': [4, 6, 3],
        'collie': [23, 3, 45],
        'poodle': [2, 0, 6],
    },
    'Sarah': {
        'shepherd': [1, 2, 3],
        'collie': [3, 31, 4],
        'poodle': [21, 5, 6],
    },
    'Ann': {
        'shepherd': [4, 6, 3],
        'collie': [23, 3, 45],
        'poodle': [2, 10, 8],
    }
}

print( flatten(dict1) )

['Bob', 'shepherd', 'collie', 'poodle', 'Sarah', 'shepherd', 'collie', 'poodle', 'Ann', 'shepherd', 'collie', 'poodle']


In [19]:
level_two_items

[2, 106, 150, 191, 206, 220, 225, 242, 830, 839, 2557]

In [13]:
len(res[2833])


NameError: name 'res' is not defined

In [None]:
st ='frame_IceCat_Category_1007.feather'
re.search(r'[0-9]+', st).group(0)

'1007'