In [1]:
import os
from pathlib2 import Path
import re
import itertools
import copy
import numpy as np
from IPython.display import display_html 
import random


import pyarrow.feather as feather
import pandas as pd
import xml.etree.ElementTree as ET
from sklearn.ensemble import (GradientBoostingRegressor, GradientBoostingClassifier)
import miceforest as mf

from utils import *


## 1. Exploratory Data Analysis

This analysis is primarily focused on Categroy 909 to get an understanding of the dataset and device an algorithmic approach for missing data. Learnings from here will be expanded to other categories, however not all as some work might only be applicable to this category.

In [2]:
category_data = feather.read_feather(Path('IceCat_Cat_2833_feather/frame_IceCat_Category_909.feather'))
display(category_data.tail())
display(category_data.info())

Unnamed: 0,level_0,index,id,name,category_id,category_label,Operating temperature (T-T).1112,Operating temperature (T-T).1112.unit,Maximum data transfer rate.1165,Maximum data transfer rate.1165.unit,...,Compatibility.890,Compatibility.890.unit,Interface type.990,Interface type.990.unit,Firewall security.1612,Firewall security.1612.unit,Works with the Google Assistant.36516,Works with the Google Assistant.36516.unit,RTS/CTS threshold.22398,RTS/CTS threshold.22398.unit
2232,2232,0,IceCat_Prod_16048998,Aironet 2602E,IceCat_Category_909,Wireless Access Points,-20 - 55,°C,450.0,Mbit/s,...,,,,,,,,,,
2233,2233,0,IceCat_Prod_31985083,Aironet 3700i,IceCat_Category_909,Wireless Access Points,0 - 40,°C,1300.0,Mbit/s,...,,,,,,,,,,
2234,2234,0,IceCat_Prod_33721921,Aironet 3600e,IceCat_Category_909,Wireless Access Points,-20 - 55,°C,1000.0,Mbit/s,...,,,,,,,,,,
2235,2235,0,IceCat_Prod_11147703,NWA5550-N,IceCat_Category_909,Wireless Access Points,-40 - 60,°C,300.0,Mbit/s,...,,,,,,,,,,
2236,2236,0,IceCat_Prod_36197901,Aironet 1562E,IceCat_Category_909,Wireless Access Points,-40 - 65,°C,1300.0,Mbit/s,...,,,,,,,,,,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2237 entries, 0 to 2236
Columns: 540 entries, level_0 to RTS/CTS threshold.22398.unit
dtypes: boolean(88), float64(69), int64(2), object(381)
memory usage: 8.1+ MB


None

###  1.1 Data Pruning

The dataset has several columns that have missing data. A reasonable threshold is therefore necessary to select columns that have a substantially complete set of values. Given that 100% would result in only product reference columns, 80% was selected which resulted in 29 columns:

In [3]:
pct_complete_threshold = .8
pct_complete = (len(category_data) - category_data.isnull().sum()) / len(category_data)
missing_vals = pd.DataFrame({'col': category_data.columns,
                             'pct_complete': pct_complete})
num_cols_above_null_threshold = missing_vals[missing_vals['pct_complete'] >= (pct_complete_threshold)].count()

cols_of_interest = missing_vals[missing_vals['pct_complete'] >= pct_complete_threshold].iloc[:,0]

print(f"Columns above {pct_complete_threshold:.0%} percent complete threshold ({num_cols_above_null_threshold[0]}):")
display(cols_of_interest.values)

Columns above 80% percent complete threshold (29):


array(['level_0', 'index', 'id', 'name', 'category_id', 'category_label',
       'Operating temperature (T-T).1112',
       'Operating temperature (T-T).1112.unit',
       'Maximum data transfer rate.1165',
       'Maximum data transfer rate.1165.unit', 'Height.1464',
       'Height.1464.unit', 'Width.1649', 'Width.1649.unit', 'Depth.1650',
       'Depth.1650.unit', 'Product colour.1766',
       'Networking standards.1802', '2.4 GHz.20806', '5 GHz.20807',
       'Ethernet LAN (RJ-45) ports.2312', 'Ethernet LAN data rates.3768',
       'Security algorithms.454', 'Storage temperature (T-T).757',
       'Storage temperature (T-T).757.unit', 'Weight.94',
       'Weight.94.unit', 'Operating relative humidity (H-H).703',
       'Operating relative humidity (H-H).703.unit'], dtype=object)

Columns can be broken into two buckets:
- *Unit based columns*: these will be filled based on allowable values based on *CategoryFeatureslist.xml* file as filled values are consistent within columns
- *Non-unit based columns*: these will be further investigated for possible inclusion in imputation analysis

In [4]:
cols_units = [str for str in cols_of_interest.values if ".unit" in str]
cols_sys = ["level_0", "index", "id", "category_id", "category_label"]
cols_non_units = set(cols_of_interest.values) - set(cols_units) - set(cols_sys)
print(f"Unit based columns ({len(cols_units)}):")
display(cols_units)
print(f"System reference columns ({len(cols_sys)}):")
display(cols_sys)
print(f"Non-unit based columns ({len(cols_non_units)}):")
display(list(cols_non_units))

Unit based columns (8):


['Operating temperature (T-T).1112.unit',
 'Maximum data transfer rate.1165.unit',
 'Height.1464.unit',
 'Width.1649.unit',
 'Depth.1650.unit',
 'Storage temperature (T-T).757.unit',
 'Weight.94.unit',
 'Operating relative humidity (H-H).703.unit']

System reference columns (5):


['level_0', 'index', 'id', 'category_id', 'category_label']

Non-unit based columns (16):


['Ethernet LAN (RJ-45) ports.2312',
 'Storage temperature (T-T).757',
 'Maximum data transfer rate.1165',
 'Operating temperature (T-T).1112',
 'Weight.94',
 'Product colour.1766',
 'Width.1649',
 'Operating relative humidity (H-H).703',
 'Height.1464',
 'Security algorithms.454',
 'name',
 '2.4 GHz.20806',
 'Networking standards.1802',
 '5 GHz.20807',
 'Ethernet LAN data rates.3768',
 'Depth.1650']

When we look at the uniqueness of each column, two columns (`2.4 GHz.20806` & `5 GHz.20807`) stand out as having only one value throughout each column (other than null) and are therefore removed:

In [5]:
uniqueness = category_data[list(cols_non_units)].apply(pd.Series.nunique).sort_values()
print("Variation within each column:")
display(uniqueness)
low_variation_cols = list(uniqueness[uniqueness == 1].index)
print(f'Columns with no variation: {low_variation_cols}')
cols = [col for col in list(cols_non_units) if col not in low_variation_cols]
print(f'\nColumns to be considered:')
display(cols)

Variation within each column:


2.4 GHz.20806                              1
5 GHz.20807                                1
Ethernet LAN (RJ-45) ports.2312            8
Product colour.1766                       12
Ethernet LAN data rates.3768              18
Operating relative humidity (H-H).703     27
Storage temperature (T-T).757             31
Operating temperature (T-T).1112          33
Maximum data transfer rate.1165           62
Width.1649                               186
Height.1464                              195
Depth.1650                               214
Weight.94                                233
Networking standards.1802                365
Security algorithms.454                  422
name                                     787
dtype: int64

Columns with no variation: ['2.4 GHz.20806', '5 GHz.20807']

Columns to be considered:


['Ethernet LAN (RJ-45) ports.2312',
 'Storage temperature (T-T).757',
 'Maximum data transfer rate.1165',
 'Operating temperature (T-T).1112',
 'Weight.94',
 'Product colour.1766',
 'Width.1649',
 'Operating relative humidity (H-H).703',
 'Height.1464',
 'Security algorithms.454',
 'name',
 'Networking standards.1802',
 'Ethernet LAN data rates.3768',
 'Depth.1650']

### One Hot Encoding

Below is a preview of the data thus far after dropping observations w/ null values (maybe consider including in next iteration?). Note that there are columns with several attributes within a cell. For example, for 1st observation below the `Networking standards.1802` attribute includes several values separated by a comma: `IEEE 802.11a,IEEE 802.11b,IEEE 802.11g,IEEE 802.11n,IEEE 802.3af`. To appropriately capture the attributes for each observation, dummy variables will need to be created. 

In [6]:
pruned_data = category_data[cols].dropna()
pruned_data.head()

Unnamed: 0,Ethernet LAN (RJ-45) ports.2312,Storage temperature (T-T).757,Maximum data transfer rate.1165,Operating temperature (T-T).1112,Weight.94,Product colour.1766,Width.1649,Operating relative humidity (H-H).703,Height.1464,Security algorithms.454,name,Networking standards.1802,Ethernet LAN data rates.3768,Depth.1650
6,2.0,-30 - 70,1000.0,0 - 40,480.0,White,177.6,10 - 90,50.4,"EAP,EAP-SIM,EAP-TLS,EAP-TTLS,PEAP,TKIP,WPA,WPA2",Aironet 702i,"IEEE 802.11a,IEEE 802.11b,IEEE 802.11g,IEEE 80...",101001000,177.6
7,2.0,-30 - 70,5200.0,-20 - 50,1600.0,White,220.0,10 - 90,55.1,"802.1x RADIUS,AES,EAP-FAST,EAP-PEAP,EAP-SIM,EA...",Aironet 2802i,"IEEE 802.11a,IEEE 802.11ac,IEEE 802.11b,IEEE 8...",1001000,220.4
10,2.0,-30 - 70,5200.0,-20 - 50,1600.0,White,220.0,10 - 90,55.1,"802.1x RADIUS,AES,EAP-FAST,EAP-PEAP,EAP-SIM,EA...",Aironet 2802i,"IEEE 802.11a,IEEE 802.11ac,IEEE 802.11b,IEEE 8...",1001000,220.4
12,3.0,-30 - 70,1000.0,0 - 40,280.0,White,89.0,10 - 90,31.5,"802.1x RADIUS,AES,EAP,EAP-FAST,EAP-PEAP,EAP-SI...",Aironet 1815w,"IEEE 802.11a,IEEE 802.11ac,IEEE 802.11b,IEEE 8...",101001000,140.0
14,1.0,-30 - 70,1000.0,0 - 40,1040.0,White,221.0,10 - 90,221.0,"802.1x RADIUS,AES,EAP-FAST,EAP-PEAP,EAP-SIM,EA...",Aironet 2602i,"IEEE 802.11a,IEEE 802.11b,IEEE 802.11d,IEEE 80...",101001000,54.0


The columns identified that would need dummy variables created are:
- `Product colour.1766`
- `Networking standards.1802`
- `Ethernet LAN data rates.3768 `
- `Security algorithms.454` 

In [7]:
cols_need_dummies = ["Product colour.1766", "Networking standards.1802", "Ethernet LAN data rates.3768", "Security algorithms.454"]
pd.DataFrame({"col": cols_need_dummies, 
              "dummy_var_count": [pruned_data[col].str.get_dummies(",").shape[1] for col in cols_need_dummies]})

Unnamed: 0,col,dummy_var_count
0,Product colour.1766,6
1,Networking standards.1802,40
2,Ethernet LAN data rates.3768,6
3,Security algorithms.454,48


In [8]:
xgdata = pd.concat([pruned_data, 
                    pruned_data["Product colour.1766"].str.get_dummies(","),
                    pruned_data["Networking standards.1802"].str.get_dummies(","),
                    pruned_data["Ethernet LAN data rates.3768"].str.get_dummies(","),
                    pruned_data["Security algorithms.454"].str.get_dummies(",")],
                    axis=1) \
            .drop(columns=cols_need_dummies)

print(f'The new dataset is now {xgdata.shape[0]:,} by {xgdata.shape[1]}.\n\nBelow is a preview:')

The new dataset is now 1,316 by 110.

Below is a preview:


In [9]:
xgdata

Unnamed: 0,Ethernet LAN (RJ-45) ports.2312,Storage temperature (T-T).757,Maximum data transfer rate.1165,Operating temperature (T-T).1112,Weight.94,Width.1649,Operating relative humidity (H-H).703,Height.1464,name,Depth.1650,...,WPA2,WPA2-AES,WPA2-CCMP,WPA2-Enterprise,WPA2-PSK,WPA2-TKIP,WPA3,WPA3-Enterprise,WPA3-PSK,WPS
6,2.0,-30 - 70,1000.0,0 - 40,480.0,177.6,10 - 90,50.4,Aironet 702i,177.6,...,1,0,0,0,0,0,0,0,0,0
7,2.0,-30 - 70,5200.0,-20 - 50,1600.0,220.0,10 - 90,55.1,Aironet 2802i,220.4,...,1,0,0,0,0,0,0,0,0,0
10,2.0,-30 - 70,5200.0,-20 - 50,1600.0,220.0,10 - 90,55.1,Aironet 2802i,220.4,...,1,0,0,0,0,0,0,0,0,0
12,3.0,-30 - 70,1000.0,0 - 40,280.0,89.0,10 - 90,31.5,Aironet 1815w,140.0,...,1,0,0,0,0,0,0,0,0,0
14,1.0,-30 - 70,1000.0,0 - 40,1040.0,221.0,10 - 90,221.0,Aironet 2602i,54.0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2228,2.0,-30 - 70,5200.0,0 - 40,2090.0,220.0,10 - 90,62.5,Aironet 3802i,220.0,...,1,0,0,0,0,0,0,0,0,0
2230,3.0,-30 - 70,1000.0,0 - 40,1000.0,221.0,10 - 90,51.0,Aironet 1700,221.0,...,1,0,0,0,0,0,0,0,0,0
2231,1.0,-20 - 70,867.0,0 - 40,303.0,123.0,10 - 85,31.0,WAP125,123.0,...,1,0,0,0,0,0,0,0,0,0
2233,2.0,-30 - 70,1300.0,0 - 40,1130.0,221.0,10 - 90,54.0,Aironet 3700i,221.0,...,0,0,0,0,0,0,0,0,0,0


Dummy variables will also need to be created on the below categorical columns:

The dataset is now ready for further analysis for imputation techniques

## 2. Data Imputation

## Mice Forest

Attempting to use MICE Forest doesn't yield good results as there are several instances of rare categories for `Storage temperature (T-T).757`, `Operating relative humidity (H-H).703`,`Operating temperature (T-T).1112`. As such we use the weighted distribution to impute for these columns:

In [10]:
xgdata_mice_forest = xgdata.drop(columns=["name"]) # Name has high cardinality & is more of a reference

In [11]:
xgdata_mice_forest_amp = remove_data(xgdata_mice_forest, 0.1)
xgdata_mice_forest_amp[xgdata_mice_forest_amp.select_dtypes(["object"]).columns] = xgdata_mice_forest_amp.select_dtypes(["object"]).apply(lambda x: x.astype("category"))

Replace NAs within categorical columns with selections based on weighted distribution of values in complete dataset:

In [12]:
cols_to_impute = ["Storage temperature (T-T).757", "Operating relative humidity (H-H).703","Operating temperature (T-T).1112"]

na_idx_by_col = {}

for col in cols_to_impute:
    col_vals = xgdata_mice_forest_amp[col]
    col_vals_na = col_vals[col_vals.isna()]
    na_idx_by_col[col] = list(col_vals_na.index)
    xgdata_mice_forest_amp[col].loc[list(col_vals_na.index)] = impute_cat_weighted_dist(xgdata_mice_forest, xgdata_mice_forest_amp, col)

TypeError: Cannot setitem on a Categorical with a new category, set the categories first

In [None]:
cols_accuracy = {}

for col in cols_to_impute:
    xgdata_mice_actual = xgdata_mice_forest[col].reset_index(drop=True).loc[na_idx_by_col[col]]
    xgdata_mice_imputed = xgdata_mice_forest_amp[col].loc[na_idx_by_col[col]]
    acc = sum(1 for x, y in zip(xgdata_mice_actual, xgdata_mice_imputed) if x == y) / float(len(xgdata_mice_actual))
    cols_accuracy[col] = acc

In [None]:
cols_accuracy

{'Storage temperature (T-T).757': 0.7022900763358778,
 'Operating relative humidity (H-H).703': 0.7175572519083969,
 'Operating temperature (T-T).1112': 0.40458015267175573}

And now we run the MICE Forest algorithm:

In [None]:
kds = mf.ImputationKernel(
  xgdata_mice_forest_amp,
  save_all_iterations=True,
  random_state=1991
)

# # Run the MICE algorithm for 2 iterations
kds.mice(2)

# # Return the completed dataset.
xgdata_complete = kds.complete_data()

  warn(


Accuracy on all columns

In [None]:
# get_na_idx_by_col(xgdata_mice_forest_amp)
# xgdata_mice_forest_amp[xgdata_mice_forest_amp["Maximum data transfer rate.1165"].isnull()]
acc_by_col = get_imp_cols_accuracy(xgdata_mice_forest, xgdata_mice_forest_amp, xgdata_complete)
acc_by_col

{'Ethernet LAN (RJ-45) ports.2312': 0.916030534351145,
 'Maximum data transfer rate.1165': 0.8396946564885496,
 'Depth.1650': 0.7633587786259542,
 'Width.1649': 0.8091603053435115,
 'Weight.94': 0.7480916030534351,
 'Height.1464': 0.6335877862595419,
 'Black': 1.0,
 'Bronze': 1.0,
 'Gold': 0.9923664122137404,
 'Grey': 0.9847328244274809,
 'Silver': 1.0,
 'White': 0.9923664122137404,
 'IEEE 802.11a': 0.9847328244274809,
 'IEEE 802.11ac': 0.9618320610687023,
 'IEEE 802.11ad': 1.0,
 'IEEE 802.11ax': 1.0,
 'IEEE 802.11az': 1.0,
 'IEEE 802.11b': 0.9847328244274809,
 'IEEE 802.11d': 0.9541984732824428,
 'IEEE 802.11e': 0.9847328244274809,
 'IEEE 802.11g': 0.9694656488549618,
 'IEEE 802.11h': 0.9618320610687023,
 'IEEE 802.11i': 0.9847328244274809,
 'IEEE 802.11k': 0.9923664122137404,
 'IEEE 802.11n': 0.9923664122137404,
 'IEEE 802.11r': 0.9847328244274809,
 'IEEE 802.11u': 1.0,
 'IEEE 802.11v': 1.0,
 'IEEE 802.11w': 1.0,
 'IEEE 802.15.4': 1.0,
 'IEEE 802.1AX': 0.9923664122137404,
 'IEEE 802.

Reconstruction dataset back to original form 

In [20]:
complete_reconstructed = undummify_cols(pruned_data, xgdata_complete, cols_need_dummies)
complete_reconstructed

Unnamed: 0,index,Operating temperature (T-T).1112,Ethernet LAN (RJ-45) ports.2312,Weight.94,Width.1649,Maximum data transfer rate.1165,Operating relative humidity (H-H).703,Depth.1650,Storage temperature (T-T).757,Height.1464,Product colour.1766_undummified,Networking standards.1802_undummified,Ethernet LAN data rates.3768_undummified,Security algorithms.454_undummified
0,6,0 - 40,2.0,300.0,177.6,1000.0,10 - 90,177.6,-20 - 70,50.4,White,"IEEE 802.11a,IEEE 802.11b,IEEE 802.11g,IEEE 80...",101001000,"EAP,EAP-SIM,EAP-TLS,EAP-TTLS,PEAP,TKIP,WPA,WPA2"
1,7,-20 - 50,2.0,1600.0,220.0,5200.0,10 - 90,220.4,-30 - 70,55.1,White,"IEEE 802.11a,IEEE 802.11ac,IEEE 802.11b,IEEE 8...",1001000,"802.1x RADIUS,AES,EAP-FAST,EAP-PEAP,EAP-SIM,EA..."
2,10,-20 - 50,2.0,1600.0,220.0,5200.0,10 - 90,220.4,-30 - 70,55.1,White,"IEEE 802.11a,IEEE 802.11ac,IEEE 802.11b,IEEE 8...",1001000,"802.1x RADIUS,AES,EAP-FAST,EAP-PEAP,EAP-SIM,EA..."
3,12,0 - 40,3.0,280.0,89.0,1000.0,10 - 90,140.0,-30 - 70,31.5,White,"IEEE 802.11a,IEEE 802.11ac,IEEE 802.11b,IEEE 8...",101001000,"802.1x RADIUS,AES,EAP,EAP-FAST,EAP-PEAP,EAP-SI..."
4,14,0 - 40,1.0,1000.0,221.0,1000.0,10 - 90,51.0,-30 - 70,221.0,White,"IEEE 802.11a,IEEE 802.11b,IEEE 802.11d,IEEE 80...",101001000,"802.1x RADIUS,AES,EAP-FAST,EAP-PEAP,EAP-SIM,EA..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1311,2228,-40 - 55,2.0,2090.0,220.0,5200.0,10 - 90,220.0,-30 - 70,67.0,White,"IEEE 802.11a,IEEE 802.11ac,IEEE 802.11b,IEEE 8...",100100025005000,"802.1x RADIUS,AES,EAP,EAP-FAST,EAP-PEAP,EAP-SI..."
1312,2230,0 - 40,3.0,1000.0,221.0,1000.0,10 - 90,221.0,-30 - 70,51.0,White,"IEEE 802.11a,IEEE 802.11ac,IEEE 802.11b,IEEE 8...",101001000,"802.1x RADIUS,AES,EAP,EAP-FAST,EAP-PEAP,EAP-SI..."
1313,2231,0 - 40,1.0,303.0,123.0,867.0,10 - 85,130.0,-20 - 70,31.0,White,"IEEE 802.11ac,IEEE 802.11b,IEEE 802.11e,IEEE 8...",101001000,"WPA,WPA2"
1314,2233,0 - 40,2.0,1130.0,221.0,1300.0,10 - 90,221.0,-30 - 70,54.0,White,"IEEE 802.11a,IEEE 802.11ac,IEEE 802.11d,IEEE 8...",101001000,"802.1x RADIUS,AES,EAP-FAST,EAP-PEAP,EAP-SIM,EA..."


More work needs to be done to evaluate accuracy of one hot encoded features. The below code shows attempts to evaluate the columns however it's based on all rows. An algorithm needs to be created to only evaluate rows that were imputed upon.

In [21]:
imputed = complete_reconstructed
actual = pruned_data

dummy_acc = {}
for col in cols_need_dummies:
    imp_vals = pd.Series(list(imputed[col + "_undummified"]), index=list(actual[col].index))
    acc = sum(1 for a, i in zip(actual[col], imp_vals) if a == i) / float(len(actual))
    dummy_acc[col] = acc
# actual["Width.1649"].compare(imputed["Width.1649"])
# print(imputed["Width.1649"])
# print(actual["Width.1649"])

# w_imp = pd.Series(list(imputed["Networking standards.1802_undummified"]), index=list(actual["Networking standards.1802"].index))

# sum(1 for a, i in zip(actual["Networking standards.1802"], w_imp) if a == i) / float(len(actual))

dummy_acc

{'Product colour.1766': 0.993161094224924,
 'Networking standards.1802': 0.9012158054711246,
 'Ethernet LAN data rates.3768': 0.9825227963525835,
 'Security algorithms.454': 0.8731003039513677}

In [62]:
actual['Networking standards.1802'][6]

'IEEE 802.11a,IEEE 802.11b,IEEE 802.11g,IEEE 802.11h,IEEE 802.11i,IEEE 802.11n,IEEE 802.1x'

In [64]:
xgdata_complete.filter(regex="IEEE*")

Unnamed: 0,IEEE 802.11a,IEEE 802.11ac,IEEE 802.11ad,IEEE 802.11ax,IEEE 802.11az,IEEE 802.11b,IEEE 802.11d,IEEE 802.11e,IEEE 802.11g,IEEE 802.11h,...,IEEE 802.3ad,IEEE 802.3af,IEEE 802.3at,IEEE 802.3au,IEEE 802.3az,IEEE 802.3bz,IEEE 802.3i,IEEE 802.3u,IEEE 802.3x,IEEE 802.3z
0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1311,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1312,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1313,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1314,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
