In [12]:
def import_data(path):
  """Load a CSV and (temporarily) inject missing values to exercise the pipeline.

  Parameters
  ----------
  path : str, path object or file-like
      Passed straight to ``pandas.read_csv``.

  Returns
  -------
  pandas.DataFrame
      A seeded random sample of at most 2,000 rows, with an extra all-NaN
      column named ``missing`` and several cells (plus the whole first row)
      blanked out.
  """
  import pandas as pd, numpy as np
  df = pd.read_csv(path)

  # Artificially creating problems so that we can prepare for missing data.
  # Delete everything from here down to the `return` when the pipeline is ready.

  # Cap the sample at the number of rows available: asking `sample` for more
  # rows than exist (without replacement) raises ValueError on small files.
  df = df.sample(n=min(2000, len(df)), random_state=1)
  df['missing'] = np.nan       # a column that is 100% missing
  df.iloc[2:5, 4] = np.nan     # positional rows 2-4 of the fifth column
  df.iloc[2:4, 2:6] = np.nan   # positional rows 2-3 of columns 2-5
  df.iloc[0] = np.nan          # the first sampled row is entirely missing
  return df

In [13]:
def setup_model(df, label):
  """Separate a frame into its target column and its remaining feature columns.

  Returns a two-element list ``[y, X]``: ``y`` is ``df[label]`` and ``X`` is
  ``df`` without the ``label`` column. ``df`` itself is not modified.
  """
  import pandas as pd
  target = df[label]
  features = df.drop(columns=[label])
  return [target, features]

In [14]:
def dummy_code(df):
  """Return ``df`` run through ``pandas.get_dummies`` with ``drop_first=True``."""
  import pandas as pd
  return pd.get_dummies(data=df, drop_first=True)

In [15]:
def missing_data(df, label, row_thresh=0.90, col_thresh=0.70):
  """Drop sparse rows/columns, then impute whatever is still missing.

  Steps, in order:
    1. drop rows whose ``label`` value is missing;
    2. drop columns, then rows, that are entirely missing;
    3. drop rows with fewer than ``round(n_columns * row_thresh)`` present values;
    4. drop columns with fewer than ``round(n_rows * col_thresh)`` present values;
    5. if anything is still missing, dummy-code the features and fill them
       with scikit-learn's ``IterativeImputer``.

  Parameters
  ----------
  df : pandas.DataFrame
      Input data. It is NOT modified; a new frame is returned.
  label : column name
      Target column. It is excluded from dummy coding and imputation.
  row_thresh, col_thresh : float
      Fraction of values that must be present for a row / column to be kept.

  Returns
  -------
  pandas.DataFrame
      The cleaned frame. Note that the features are dummy-coded (and the
      label moved to the last column) only when step 5 actually runs.
  """
  import pandas as pd

  # Each dropna returns a new frame, so the caller's DataFrame is left intact
  # (the in-place variant silently altered the object that was passed in).

  # Drop rows with the label missing
  df = df.dropna(axis='rows', subset=[label])

  # Drop rows and columns with 100% missing
  df = df.dropna(axis='columns', how='all')
  df = df.dropna(axis='rows', how='all')

  # Drop rows with < threshold existing
  df = df.dropna(axis='rows', thresh=round(df.shape[1] * row_thresh))

  # Drop columns with < threshold existing
  df = df.dropna(axis='columns', thresh=round(df.shape[0] * col_thresh))

  # Impute remaining missing values
  if df.isna().any().any():
    # The experimental flag has to be imported before IterativeImputer itself.
    from sklearn.experimental import enable_iterative_imputer  # noqa: F401
    from sklearn.impute import IterativeImputer

    y, X = setup_model(df, label)
    # NOTE(review): missing values in categorical columns are presumably turned
    # into all-zero dummies here and so never reach the imputer - confirm.
    X = dummy_code(X)
    imp = IterativeImputer(max_iter=10, random_state=1)
    X = pd.DataFrame(imp.fit_transform(X), columns=X.columns, index=X.index)

    # X and y share the same index, so plain column assignment re-attaches the
    # label without the row multiplication an index merge produces when the
    # index contains duplicates.
    X[label] = y
    df = X

  # Return the cleaned DataFrame
  return df

In [16]:
def split_data(df, label, random=False):
    """Split ``df`` into train and test sets (70% / 30%).

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the features and the ``label`` column.
    label : column name
        Target column.
    random : bool or str, default False
        Selects the seed handed to ``train_test_split``: 1 when false, 0 when
        true. The strings ``"true"``, ``"1"`` and ``"yes"`` (any case) count as
        true and every other string as false, so the old ``"False"`` spelling
        keeps working.
        NOTE(review): both choices are fixed seeds; if ``random=True`` was
        meant to give an unseeded split, use ``None`` instead of 0 - confirm.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test]`` as returned by
        ``train_test_split(X, y, ...)``.
    """
    from sklearn.model_selection import train_test_split

    # A non-empty string is always truthy, so interpret string flags explicitly.
    if isinstance(random, str):
        random = random.strip().lower() in ('true', '1', 'yes')

    y, X = setup_model(df, label)
    random_state = 0 if random else 1
    # Pass the seed chosen above (a hard-coded 1 used to override it).
    return train_test_split(X, y, test_size=.3, random_state=random_state)

In [20]:
def cross_validate(df, label, random=False, repeat=True):
    """Return the mean cross-validated score of a random forest on ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the features and the ``label`` column.
    label : column name
        Target column. A numeric label is scored with ``RandomForestRegressor``
        and ``r2``; any other dtype with ``RandomForestClassifier`` and
        ``accuracy``.
    random : bool, default False
        Selects the seed used for the splitter and the forest: 1 when false,
        0 when true.
    repeat : bool, default True
        Use ``RepeatedKFold`` (5 splits x 3 repeats) instead of a single
        shuffled 5-fold ``KFold``.

    Returns
    -------
    float
        Mean of the per-fold scores.
    """
    from sklearn.model_selection import KFold, RepeatedKFold, cross_val_score
    from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
    import pandas as pd

    y, X = setup_model(df, label)

    random_state = 0 if random else 1

    if repeat:
        cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=random_state)
    else:
        # KFold rejects a random_state unless shuffling is enabled.
        cv = KFold(n_splits=5, shuffle=True, random_state=random_state)

    # Seed the forest as well; otherwise the score changes from run to run
    # even though the folds are fixed.
    if pd.api.types.is_numeric_dtype(y):
        model, scoring = RandomForestRegressor(random_state=random_state), 'r2'
    else:
        model, scoring = RandomForestClassifier(random_state=random_state), 'accuracy'

    scores = cross_val_score(model, X, y, scoring=scoring, cv=cv)

    # cross_val_score returns a NumPy array; use its own mean() so the function
    # does not rely on a global `np` that this cell never imports.
    return scores.mean()

In [23]:
import pandas as pd
pd.set_option('display.max_columns', None)

# Step 1: Import the data
df = import_data('network_traffic.csv')

# Step 2: Data Preparation (override only the row threshold; columns keep the default)
df = missing_data(df, 'attack', row_thresh=.92)

# Step 3A: Data Segregation - train/test split
X_train, X_test, y_train, y_test = split_data(df, 'attack')

# Step 3B: Data Segregation - cross-validation
# Keep and print the score: a bare call in the middle of a cell is never displayed.
cv_score = cross_validate(df, 'attack')
print(f'Mean cross-validation score: {cv_score:.4f}')

print(X_train.shape)
df.head()

In [None]:
# create missing data

In [None]:
# Impute or predict values