## NASA - Clean

##Setup


In [682]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import yaml

pd.set_option('display.max_columns', None)
pd.set_option('display.width',1000)
pd.set_option('display.max_rows', None)

sns.set_style("darkgrid")

from IPython.display import display, Markdown

DEBUG = True

##Dataset Import

In [683]:
DATASET = "NASA"

import os, sys
COLAB = 'google.colab' in sys.modules

if COLAB:
  from google.colab import drive
  if not os.path.isdir("/content/gdrive"):
    drive.mount("/content/gdrive")
  ROOT = f"/content/gdrive/MyDrive/datasets/{DATASET.replace(' ','_')}/"
  if not os.path.isdir(ROOT): os.makedirs(ROOT)
else:
  ROOT = "./"

def makedirs(d):
  if COLAB:
    if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d)
  else:
    if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d, mode=0o777, exist_ok=True)

for d in ['orig','data','output']: makedirs(d)

## Dataset

In [684]:
filename = f"{ROOT}/orig/data.csv"

if os.path.isfile(filename):
  print("Using local copy ...")
else:
  print("Downloading ...")
  df = pd.read_csv("https://setu-datamining2.github.io/live/topics/21-Assignments/01-NASA_Software_Defect_Datasets/files/pc2.csv")
  df.to_csv(filename, index = False)

df = pd.read_csv(filename)
print(df.shape)
df.head()

Using local copy ...
(5589, 37)


Unnamed: 0,BRANCH_COUNT,CALL_PAIRS,LOC_CODE_AND_COMMENT,LOC_COMMENTS,CONDITION_COUNT,CYCLOMATIC_COMPLEXITY,CYCLOMATIC_DENSITY,DECISION_COUNT,DECISION_DENSITY,DESIGN_COMPLEXITY,DESIGN_DENSITY,EDGE_COUNT,ESSENTIAL_COMPLEXITY,ESSENTIAL_DENSITY,LOC_EXECUTABLE,PARAMETER_COUNT,HALSTEAD_CONTENT,HALSTEAD_DIFFICULTY,HALSTEAD_EFFORT,HALSTEAD_ERROR_EST,HALSTEAD_LENGTH,HALSTEAD_LEVEL,HALSTEAD_PROG_TIME,HALSTEAD_VOLUME,MAINTENANCE_SEVERITY,MODIFIED_CONDITION_COUNT,MULTIPLE_CONDITION_COUNT,NODE_COUNT,NORMALIZED_CYLOMATIC_COMPLEXITY,NUM_OPERANDS,NUM_OPERATORS,NUM_UNIQUE_OPERANDS,NUM_UNIQUE_OPERATORS,NUMBER_OF_LINES,PERCENT_COMMENTS,LOC_TOTAL,defects
0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,2.0,5.33,1.5,12.0,0.0,4.0,0.67,0.67,8.0,1.0,0.0,0.0,2.0,0.5,1.0,3.0,1.0,3.0,2.0,0.0,0.0,False
1,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,False
2,1.0,4.0,7.0,24.0,0.0,1.0,0.13,0.0,0.0,1.0,1.0,6.0,1.0,0.0,1.0,0.0,17.88,7.43,986.77,0.04,34.0,0.13,54.82,132.83,1.0,0.0,0.0,7.0,0.03,13.0,21.0,7.0,8.0,34.0,96.88,8.0,False
3,1.0,1.0,11.0,3.0,0.0,1.0,0.08,0.0,0.0,1.0,1.0,2.0,1.0,0.0,1.0,0.0,42.62,7.81,2598.31,0.11,77.0,0.13,144.35,332.79,1.0,0.0,0.0,3.0,0.06,29.0,48.0,13.0,7.0,17.0,93.33,12.0,False
4,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,2.0,1.0,0.0,1.0,3.0,33.44,0.63,13.06,0.01,9.0,1.6,0.73,20.9,1.0,0.0,0.0,3.0,0.33,5.0,4.0,4.0,1.0,3.0,0.0,1.0,False


## Table 1 &mdash; Summary


### Base bones approach - score max 30%

* no interpretation
* no comparison with paper

In [685]:
df.shape

(5589, 37)

### Attempt 2 - score max 40%-50% 
* interpretation
* no comparison with paper

In [686]:
print("Number of cases:", df.shape[0])
print("Number of features:", df.shape[1])

Number of cases: 5589
Number of features: 37


### Attempt 3 - score max 100%
* interpretation
* comparison with paper

Also
* use fancier print statements (not done here)

### Attempt 3 - score max 100%
* interpretation
* comparison with paper

Also
* use dataframe to display results 

In [687]:
data = []
for message, expected, observed in [
    ("Numer of cases", 5589, df.shape[0]),
    ("Numer of features", 37, df.shape[1]),
]:
  data.append([message,expected,observed])

df_result = pd.DataFrame(data,columns=['Message','Expected','Observed'])
df_result.head()
# for bonus points add a style to highlight rows that do not match

Unnamed: 0,Message,Expected,Observed
0,Numer of cases,5589,5589
1,Numer of features,37,37


### Table 3&mdash; by Features

###A - Identical features

In [688]:
def table_3_A(df):
  df_tmp = df.T.duplicated()
  df_tmp = df_tmp[df_tmp]
  p_A_count = df_tmp.shape[0]
  p_A_columns = set(df_tmp.index)
  return p_A_count, p_A_columns

table_3_a_observed,table_3_a_index = table_3_A(df)

###B - Constant features

In [689]:
df.nunique()

BRANCH_COUNT                         39
CALL_PAIRS                           28
LOC_CODE_AND_COMMENT                 79
LOC_COMMENTS                         58
CONDITION_COUNT                      36
CYCLOMATIC_COMPLEXITY                27
CYCLOMATIC_DENSITY                   63
DECISION_COUNT                       23
DECISION_DENSITY                     29
DESIGN_COMPLEXITY                    22
DESIGN_DENSITY                       43
EDGE_COUNT                           85
ESSENTIAL_COMPLEXITY                 19
ESSENTIAL_DENSITY                     2
LOC_EXECUTABLE                       17
PARAMETER_COUNT                      14
HALSTEAD_CONTENT                    866
HALSTEAD_DIFFICULTY                 480
HALSTEAD_EFFORT                    1012
HALSTEAD_ERROR_EST                   87
HALSTEAD_LENGTH                     226
HALSTEAD_LEVEL                       55
HALSTEAD_PROG_TIME                  983
HALSTEAD_VOLUME                     635
MAINTENANCE_SEVERITY                 40


In [690]:
def table_3_B(df):
  df_tmp = df.nunique()
  df_tmp = df_tmp[df_tmp==1]
  problem_B_count = df_tmp.shape[0]
  problem_B_columns = set(df_tmp.index)
  return problem_B_count, problem_B_columns

table_3_b_observed, table_3_b_index = table_3_B(df)

### C - Features with missing values

In [691]:
(df.isna().sum()>0).sum()

0

In [692]:
def table_3_C(df):
  df_tmp = df.isna().sum()
  df_tmp = df_tmp[df_tmp>0]
  problem_C_count = df_tmp.shape[0]
  problem_C_columns = set(df_tmp.index)
  return problem_C_count, problem_C_columns

table_3_c_observed, table_3_c_index = table_3_C(df)

### D - Features with conflicting values

In [693]:
x = []

In [694]:
z = []

In [695]:
x.append(((df.NUMBER_OF_LINES >= df.LOC_TOTAL)==False).sum())
if x[0]:
  z.append("LOC_TOTAL")
  z.append("NUMBER_OF_LINES")

In [696]:
x.append(((df.NUMBER_OF_LINES >= df.LOC_CODE_AND_COMMENT)==False).sum())
if x[1]:
  z.append("LOC_CODE_AND_COMMENT")
  z.append("NUMBER_OF_LINES")

In [697]:
x.append(((df.NUMBER_OF_LINES >= df.LOC_COMMENTS)==False).sum())
if x[2]:
  z.append("LOC_COMMENTS")
  z.append("NUMBER_OF_LINES")

In [698]:
x.append(((df.NUMBER_OF_LINES >= df.LOC_EXECUTABLE)==False).sum())
if x[3]:
  z.append("LOC_EXECUTABLE")
  z.append("NUMBER_OF_LINES")

In [699]:
x.append(((df.LOC_TOTAL >= df.LOC_EXECUTABLE)==False).sum())
if x[4]:
  z.append("LOC_TOTAL")
  z.append("LOC_EXECUTABLE")

In [700]:
x.append(((df.LOC_TOTAL >= df.LOC_CODE_AND_COMMENT)==False).sum())
if x[5]:
  z.append("LOC_TOTAL")
  z.append("LOC_CODE_AND_COMMENT")

In [701]:
x.append(((df.NUM_OPERANDS >= df.NUM_UNIQUE_OPERANDS)==False).sum())
if x[6]:
  z.append("NUM_OPERANDS")
  z.append("NUM_UNIQUE_OPERANDS")

In [702]:
x.append(((df.NUM_OPERATORS >= df.NUM_UNIQUE_OPERATORS)==False).sum())
if x[7]:
  z.append("NUM_OPERATORS")
  z.append("NUM_UNIQUE_OPERATORS")

In [703]:
x.append(((df.HALSTEAD_LENGTH == df.NUM_OPERATORS + df.NUM_OPERANDS)==False).sum())
if x[8]:
  z.append("HALSTEAD_LENGTH")
  z.append("NUM_OPERATORS")
  z.append("NUM_OPERANDS")

In [704]:
x.append(((df.CYCLOMATIC_COMPLEXITY <= df.NUM_OPERATORS + 1) == False).sum())
if x[9]:
  z.append("CYCLOMATIC_COMPLEXITY")
  z.append("NUM_OPERATORS")

In [705]:
x.append(((((df.CALL_PAIRS <= df.NUM_OPERATORS)==False).sum())>0).sum())
if x[10]:
  z.append("CALL_PAIRS")
  z.append("NUM_OPERATORS")

In [706]:
y = (df.NUM_OPERATORS + df.NUM_OPERANDS)*np.log2(df.NUM_UNIQUE_OPERATORS + df.NUM_UNIQUE_OPERANDS)
y = y.replace(np.nan,0)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [707]:
x.append((~np.isclose(df.HALSTEAD_VOLUME, y,atol=1)).sum())
if x[11]:
  z.append("HALSTEAD_VOLUME")
  z.append("NUM_OPERATORS")
  z.append("NUM_OPERANDS")
  z.append("NUM_UNIQUE_OPERATORS")
  z.append("NUM_UNIQUE_OPERANDS")

In [708]:
y = (2/df.NUM_UNIQUE_OPERATORS)*(df.NUM_UNIQUE_OPERANDS/df.NUM_OPERANDS)
y = y.replace(np.nan,0)
y = y.replace(np.inf,0)

In [709]:
x.append((~np.isclose(df.HALSTEAD_LEVEL, y,atol=1)).sum())
if x[12]:
  z.append("HALSTEAD_LEVEL")
  z.append("NUM_UNIQUE_OPERATORS")
  z.append("NUM_UNIQUE_OPERANDS")
  z.append("NUM_OPERANDS")

In [710]:
y = (df.NUM_UNIQUE_OPERATORS/2)*(df.NUM_OPERANDS/df.NUM_UNIQUE_OPERANDS)
y = y.replace(np.nan,0)
y = y.replace(np.inf,0)

In [711]:
x.append((~np.isclose(df.HALSTEAD_DIFFICULTY, y,atol=1)).sum())
if x[13]:
  z.append("NUM_OPERANDS")
  z.append("NUM_UNIQUE_OPERATORS")
  z.append("NUM_UNIQUE_OPERANDS")
  z.append("HALSTEAD_DIFFICULTY")

In [712]:
y = df.HALSTEAD_VOLUME/df.HALSTEAD_DIFFICULTY
y = y.replace(np.nan,0)
y = y.replace(np.inf,0)

In [713]:
x.append((~np.isclose(df.HALSTEAD_CONTENT, y,atol=1)).sum())
if x[14]:
  z.append("HALSTEAD_CONTENT")
  z.append("HALSTEAD_VOLUME")
  z.append("HALSTEAD_DIFFICULTY")

In [714]:
y = df.HALSTEAD_EFFORT/df.HALSTEAD_VOLUME
y = y.replace(np.nan,0)

In [715]:
x.append((~np.isclose(y,df.HALSTEAD_DIFFICULTY,atol=1)).sum())
if x[15]:
  z.append("HALSTEAD_DIFFICULTY")
  z.append("HALSTEAD_EFFORT")
  z.append("HALSTEAD_VOLUME")

In [716]:
y = (df.HALSTEAD_EFFORT/18)

In [717]:
x.append((~np.isclose(df.HALSTEAD_PROG_TIME, y,atol=1)).sum())
if x[16]:
  z.append("HALSTEAD_PROG_TIME")
  z.append("HALSTEAD_EFFORT")

In [718]:
def table_3_D(df):
  problem_D_count = len(set(z))
  problem_D_columns = set(z)
  return problem_D_count, problem_D_columns
table_3_d_observed, table_3_d_index = table_3_D(df)

### E - Features with implausible values

In [719]:
a = (((df.LOC_TOTAL == 0).sum())>0).sum()
a

1

In [720]:
b = ((df<0).sum()>0).sum()
b

0

In [721]:
count_columns = []
for c in list(df.columns):
  if "COUNT" in c:
    count_columns.append(c)

count_columns

['BRANCH_COUNT',
 'CONDITION_COUNT',
 'DECISION_COUNT',
 'EDGE_COUNT',
 'PARAMETER_COUNT',
 'MODIFIED_CONDITION_COUNT',
 'MULTIPLE_CONDITION_COUNT',
 'NODE_COUNT']

In [722]:
df_tmp = df[count_columns]
df_tmp_integer = df_tmp.select_dtypes(['int','float']).columns
non_integer_count = df_tmp.shape[1] - len(df_tmp_integer)
non_integer_index = set(df_tmp_integer)

In [723]:
def table_3_E(df):
  df_tmp1 = (df.LOC_TOTAL == 0).sum()
  df_tmp1 = df_tmp1[df_tmp1>0]
  problem_E1_count = df_tmp1.shape[0]
  problem_E1_columns = "LOC_TOTAL"
  df_tmp2 = (df<0).sum()
  df_tmp2 = df_tmp2[df_tmp2>0]
  problem_E2_count = df_tmp2.shape[0]
  problem_E2_columns = set(df_tmp2.index)
  c = non_integer_count
  return (problem_E1_count+problem_E2_count+c)

table_3_e_observed = table_3_E(df)

###F - Total problem features

In [724]:
def table_3_F(df):
  total = table_3_e_observed + table_3_d_observed + table_3_c_observed + table_3_b_observed + table_3_a_observed
  return total
  
table_3_f_observed = table_3_F(df)
table_3_f_observed

3

###G - Identical cases

In [725]:
(df.duplicated(keep=False)).sum()

4621

In [726]:
def table_3_G(df):
  df_tmp = df.duplicated(keep=False)
  df_tmp = df_tmp[df_tmp]
  problem_G_count = df_tmp.shape[0]
  problem_G_rows = set(df_tmp.index)
  return problem_G_count, problem_G_rows

table_3_g_observed, table_3_g_index = table_3_G(df)

###H - Inconsistent cases 

In [727]:
 def table_3_H(df):
  return
table_3_h_observed = table_3_H(df)

###I - Cases with missing values

In [728]:
(((df.isna()).sum(axis=1))>0).sum()

0

In [729]:
def table_3_I(df):
  df_tmp = df.isna().sum(axis=1)
  df_tmp = df_tmp[df_tmp>0]
  problem_I_count = df_tmp.shape[0]
  problem_I_rows = set(df_tmp.index)
  return problem_I_count, problem_I_rows

table_3_i_observed, table_3_i_index = table_3_I(df)

###J - Cases with conflicting features values

In [730]:
j = []

In [731]:
j.append(~(df.NUMBER_OF_LINES >= df.LOC_TOTAL))

In [732]:
j.append(~(df.NUMBER_OF_LINES >= df.LOC_CODE_AND_COMMENT))

In [733]:
j.append(~(df.NUMBER_OF_LINES >= df.LOC_COMMENTS))

In [734]:
j.append(~(df.NUMBER_OF_LINES >= df.LOC_EXECUTABLE))

In [735]:
j.append(~(df.LOC_TOTAL >= df.LOC_EXECUTABLE))

In [736]:
j.append(~(df.LOC_TOTAL >= df.LOC_CODE_AND_COMMENT))

In [737]:
j.append(~(df.NUM_OPERANDS >= df.NUM_UNIQUE_OPERANDS))

In [738]:
j.append(~(df.NUM_OPERATORS >= df.NUM_UNIQUE_OPERATORS))

In [739]:
j.append(~(df.HALSTEAD_LENGTH == df.NUM_OPERATORS + df.NUM_OPERANDS))

In [740]:
j.append(~(df.CYCLOMATIC_COMPLEXITY <= df.NUM_OPERATORS + 1))

In [741]:
j.append(~(df.CALL_PAIRS <= df.NUM_OPERATORS))

In [742]:
y = (df.NUM_OPERATORS + df.NUM_OPERANDS)*np.log2(df.NUM_UNIQUE_OPERATORS + df.NUM_UNIQUE_OPERANDS)
y = y.replace(np.nan,0)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [743]:
j.append((~np.isclose(df.HALSTEAD_VOLUME, y,atol=1)))

In [744]:
y = (2/df.NUM_UNIQUE_OPERATORS)*(df.NUM_UNIQUE_OPERANDS/df.NUM_OPERANDS)
y = y.replace(np.nan,0)
y = y.replace(np.inf,0)

In [745]:
j.append((~np.isclose(df.HALSTEAD_LEVEL, y,atol=1)))

In [746]:
y = (df.NUM_UNIQUE_OPERATORS/2)*(df.NUM_OPERANDS/df.NUM_UNIQUE_OPERANDS)
y = y.replace(np.nan,0)
y = y.replace(np.inf,0)

In [747]:
j.append((~np.isclose(df.HALSTEAD_DIFFICULTY, y,atol=1)))

In [748]:
y = df.HALSTEAD_VOLUME/df.HALSTEAD_DIFFICULTY
y = y.replace(np.nan,0)
y = y.replace(np.inf,0)

In [749]:
j.append((~np.isclose(df.HALSTEAD_CONTENT, y,atol=1)))

In [750]:
y = df.HALSTEAD_EFFORT/df.HALSTEAD_VOLUME
y = y.replace(np.nan,0)

In [751]:
j.append((~np.isclose(y,df.HALSTEAD_DIFFICULTY,atol=1)))

In [752]:
y = (df.HALSTEAD_EFFORT/18)

In [753]:
j.append((~np.isclose(df.HALSTEAD_PROG_TIME, y,atol=1)))

In [754]:
def table_3_J(df):
  j_conflicted_rows = sum(j)>0
  df_tmp = df.copy()
  df_tmp = df_tmp[j_conflicted_rows]
  problem_J_count = df_tmp.shape[0]
  problem_J_rows = set(df_tmp.index)
  return problem_J_count, problem_J_rows

table_3_j_observed, table_3_j_index = table_3_J(df)

### K - Cases with implausible values 

In [755]:
(df.LOC_TOTAL == 0).sum()

1084

In [756]:
((df<0).sum()).sum()

0

In [757]:
df_tmp = df[count_columns]
non_integer_rows = list(df_tmp.applymap(np.isreal).sum(axis=1)<df_tmp.shape[1])
non_integer_rows_index = []
for i in range(len(non_integer_rows)):
  if non_integer_rows[i]:
    non_integer_rows_index.append(i)
cases_non_integer = len(non_integer_rows_index)

In [758]:
def table_3_K(df):
  (df.LOC_TOTAL == 0).sum() + ((df<0).sum()).sum() + cases_non_integer
  df_tmp1 = df.LOC_TOTAL
  df_tmp1 = df_tmp1[df_tmp1==0]
  problem_K1_count = df_tmp1.shape[0]
  problem_K1_rows = set(df_tmp1.index)
  df_tmp2 = (df<0).sum(axis=1)
  df_tmp2 = df_tmp2[df_tmp2>0]
  problem_K2_count = df_tmp2.shape[0]
  problem_K2_rows = set(df_tmp2.index)
  imp = problem_K1_count + problem_K2_count + cases_non_integer
  return imp, problem_K1_rows, problem_K2_rows
table_3_k_observed, table_3_K1_index, table_3_K2_index = table_3_K(df)

### L - Total problem cases DS'

In [759]:
def table_3_L(df):
  total = table_3_i_index | table_3_j_index | table_3_K1_index | table_3_K2_index | set(non_integer_rows_index)
  return len(set(total))
  
table_3_l_observed = table_3_L(df)

### M - Total problem cases DS''

In [760]:
def table_3_M(df):
  total = table_3_g_index | table_3_i_index | table_3_j_index | table_3_K1_index | table_3_K2_index | set(non_integer_rows_index)
  return len(set(total))
  
table_3_m_observed = table_3_M(df)

### Results

In [761]:
data = []
for message, expected, observed in [
    ("Identical features", 0, table_3_a_observed),
    ("Constant features", 0, table_3_b_observed),
    ("Features with missing values", 0, table_3_c_observed),
    ("Features with conflicting values", 2, table_3_d_observed),
    ("Features with implausible values", 1, table_3_e_observed),
    ("Total problem features", 3, table_3_f_observed),
    ("Identical cases", 4621, table_3_g_observed),
    ("Inconsistent cases", 0, table_3_h_observed),
    ("Cases with missing values", 0, table_3_i_observed),
    ("Cases with conflicting features values", 129, table_3_j_observed),
    ("Cases with implausible values", 1084, table_3_k_observed),
    ("Total problem cases DS'", 1163, table_3_l_observed),
    ("Total problem cases DS''", 4297, table_3_m_observed),
]:
  data.append([message,expected,observed])

df_result = pd.DataFrame(data,columns=['Message','Expected','Observed'])

display(Markdown("**Table 3 (by cases) Comparison**"))
df_result


**Table 3 (by cases) Comparison**

Unnamed: 0,Message,Expected,Observed
0,Identical features,0,0.0
1,Constant features,0,0.0
2,Features with missing values,0,0.0
3,Features with conflicting values,2,2.0
4,Features with implausible values,1,1.0
5,Total problem features,3,3.0
6,Identical cases,4621,4621.0
7,Inconsistent cases,0,
8,Cases with missing values,0,0.0
9,Cases with conflicting features values,129,129.0


## Cleaning dataset

###Step 1: removing cases with implausible values

In [762]:
dfcopy = df.copy()
index_step1 = set(table_3_K1_index | table_3_K2_index | set(non_integer_rows_index))
dfcopy = dfcopy.drop(index_step1)
df.shape, dfcopy.shape

((5589, 37), (4505, 37))

###Step 2: removing cases with conflict feature values

In [763]:
index_step2 = table_3_j_index
index_remove_step2 = index_step2 - index_step1
dfcopy = dfcopy.drop(index_remove_step2)
df.shape, dfcopy.shape

((5589, 37), (4426, 37))

###Step 3: removing identical cases

In [764]:
df_tmp = df.duplicated(keep='first')
df_tmp = df_tmp[df_tmp]
table_3_g_index = set(df_tmp.index)

In [765]:
index_step3 = table_3_g_index
index_remove_step3 = index_step3 - index_step2 - index_step1
dfcopy = dfcopy.drop(index_remove_step3)
df.shape, dfcopy.shape

((5589, 37), (1296, 37))

###Step 4: removing inconsistent cases

###Step 5: removing cases with missing values

In [766]:
index_step5 = table_3_i_index
index_remove_step5 = index_step5 - index_step3 - index_step2 - index_step1
dfcopy = dfcopy.drop(index_remove_step5)
df.shape, dfcopy.shape

((5589, 37), (1296, 37))

###Step 6: removing constant features

In [767]:
index_step6 = table_3_b_index
index_remove_step6 = index_step6 - index_step5 - index_step3 - index_step2 - index_step1
dfcopy = dfcopy.drop(index_step6, axis=1)
df.shape, dfcopy.shape

((5589, 37), (1296, 37))

###Step 7: removing identical features

In [768]:
index_step7 = table_3_a_index
index_remove_step7 = index_step7 - index_step6 - index_step5 - index_step3 - index_step2 - index_step1
dfcopy = dfcopy.drop(index_step7, axis=1)
df.shape, dfcopy.shape

((5589, 37), (1296, 37))

##Saving dataset

In [769]:
dfcopy.to_pickle(f"{ROOT}/data/data.pkl")