In [31]:
def import_data(path):
  """Read a CSV file and return a sample of it with missing values injected.

  The file is loaded with pandas, down-sampled to at most 2000 rows (seeded,
  so the same rows are picked every run) and then deliberately damaged so the
  rest of the pipeline can be exercised against missing data:

    * a new column 'missing' that is entirely NaN,
    * NaN in the column at position 4 for the rows at positions 2-4,
    * NaN in the columns at positions 2-5 for the rows at positions 2-3,
    * the row at position 0 set entirely to NaN.

  Parameters:
    path: location of the CSV file, passed straight to pandas.read_csv.

  Returns:
    The sampled DataFrame with the artificial missing values applied.
  """
  import pandas as pd, numpy as np
  df = pd.read_csv(path)
  # Artificially creating problems so that we can prepare for missing data
  # Delete these four lines when the pipeline is ready
  # Cap the sample at the number of rows available: sampling without
  # replacement raises ValueError when n is larger than the frame.
  df = df.sample(n=min(2000, len(df)), random_state=1)
  df['missing'] = np.nan
  df.iloc[2:5, 4] = np.nan
  df.iloc[2:4, 2:6] = np.nan
  df.iloc[0] = np.nan
  return df

In [32]:
def setup_model(df, label):
  """Separate the label column from the feature columns.

  Parameters:
    df: DataFrame holding both the features and the label.
    label: name of the column to use as the label.

  Returns:
    A two-item list [y, X]: y is the label column as a Series and X is df
    without that column. df itself is not modified.
  """
  return [df[label], df.drop(columns=[label])]

In [33]:
def dummy_code(df):
  """Return df with its categorical columns replaced by indicator columns.

  Delegates to pandas.get_dummies with drop_first=True, so the first level of
  every encoded column is dropped and acts as the reference level. The input
  frame is left unchanged.
  """
  import pandas as pd
  encoded = pd.get_dummies(data=df, drop_first=True)
  return encoded

In [34]:
def missing_data(df, label, row_thresh=0.90, col_thresh=0.70):
  """Drop sparse rows and columns, then impute whatever is still missing.

  Steps, in order:
    1. drop rows whose label is missing;
    2. drop columns, then rows, that contain no values at all;
    3. drop rows with fewer than round(n_columns * row_thresh) non-missing
       values;
    4. drop columns with fewer than round(n_rows * col_thresh) non-missing
       values;
    5. if any value is still missing, dummy-code the features and fill the
       gaps with scikit-learn's IterativeImputer.

  The DataFrame passed in is never modified; every step builds a new frame.

  Parameters:
    df: DataFrame containing the features and the label column.
    label: name of the label column. It is excluded from dummy coding and
      imputation.
    row_thresh: fraction of the columns that must be present for a row to be
      kept.
    col_thresh: fraction of the rows that must be present for a column to be
      kept.

  Returns:
    The cleaned DataFrame. When step 5 runs, the features come back
    dummy-coded with the label as the last column; when nothing is left to
    impute, the filtered frame is returned without dummy coding.
  """
  import pandas as pd

  # No inplace=True anywhere: the caller's DataFrame must stay untouched.

  # Drop rows with the label missing
  df = df.dropna(axis='index', subset=[label])

  # Drop rows and columns with 100% missing
  df = df.dropna(axis='columns', thresh=1)
  df = df.dropna(axis='index', thresh=1)

  # Drop rows with < threshold existing
  df = df.dropna(axis='index', thresh=round(df.shape[1] * row_thresh))

  # Drop columns with < threshold existing
  df = df.dropna(axis='columns', thresh=round(df.shape[0] * col_thresh))

  # Impute remaining missing values
  if df.isna().sum().sum() > 0:
    # The experimental flag has to be imported before IterativeImputer.
    from sklearn.experimental import enable_iterative_imputer  # noqa: F401
    from sklearn.impute import IterativeImputer

    y, X = setup_model(df, label)
    X = dummy_code(X)
    imp = IterativeImputer(max_iter=10, random_state=1)
    X = pd.DataFrame(imp.fit_transform(X), columns=X.columns, index=X.index)
    # X and y share the same index, so assign the label column directly; an
    # index merge would multiply rows if the index held duplicate values.
    X[label] = y
    df = X

  # Return the cleaned DataFrame
  return df

In [35]:
def split_data(df, label, random=False):
    """Split df into train and test sets, holding out 30% of the rows for testing.

    Parameters:
      df: DataFrame containing the features and the label column.
      label: name of the label column.
      random: evaluated for truthiness. False (the default) splits with seed 1;
        a truthy value splits with seed 0. Both seeds are fixed, so either
        choice is repeatable from run to run.

    Returns:
      The list produced by sklearn's train_test_split:
      [X_train, X_test, y_train, y_test].
    """
    from sklearn.model_selection import train_test_split
    y, X = setup_model(df, label)
    # The default used to be the string "False", which is truthy, and the seed
    # chosen here was never passed on; both are fixed so `random` takes effect.
    # NOTE(review): seed 0 for random=True mirrors cross_validate; confirm that
    # an unseeded split (random_state=None) was not the intent.
    random_state = 0 if random else 1
    return train_test_split(X, y, test_size=.3, random_state=random_state)

In [36]:
def cross_validate(df, label, random = False, repeat= True):
    """Score a random forest on df with 5-fold cross validation.

    A RandomForestRegressor scored with r2 is used when the label column is
    numeric; otherwise a RandomForestClassifier scored with accuracy is used.

    Parameters:
      df: DataFrame containing the features and the label column.
      label: name of the label column.
      random: evaluated for truthiness. False (the default) uses seed 1, a
        truthy value uses seed 0. The seed drives both the fold assignment and
        the forest, so repeated calls with the same arguments give the same
        score.
      repeat: when truthy, run RepeatedKFold (5 splits, 3 repeats); otherwise
        run a single shuffled KFold with 5 splits.

    Returns:
      The mean of the per-fold scores.
    """
    from sklearn.model_selection import KFold, RepeatedKFold, cross_val_score
    from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
    import pandas as pd
    import numpy as np

    y, X = setup_model(df, label)

    random_state = 0 if random else 1

    if repeat:
        cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=random_state)
    else:
        # KFold rejects a random_state unless shuffling is switched on.
        cv = KFold(n_splits=5, shuffle=True, random_state=random_state)

    # Seed the forest as well; otherwise the score changes on every call even
    # though the folds are fixed.
    if pd.api.types.is_numeric_dtype(y):
        model, scoring = RandomForestRegressor(random_state=random_state), 'r2'
    else:
        model, scoring = RandomForestClassifier(random_state=random_state), 'accuracy'

    scores = cross_val_score(model, X, y, scoring=scoring, cv=cv)
    return np.mean(scores)

In [37]:
import pandas as pd
pd.set_option('display.max_columns', None)
# Step 1: Import the data
df = import_data('network_traffic.csv')
# Step 2: Data Preparation (rows need at least 92% of their columns present)
df = missing_data(df, 'attack', row_thresh=.92)

# Step 3A: Data Segregation Train/Test split
X_train, X_test, y_train, y_test = split_data(df, 'attack')

# Step 3B: Data Segregation Cross Validation
# Keep the mean score in a variable so the cross-validation result is not lost
cv_score = cross_validate(df, 'attack')

print(X_train.shape)
print(f'Mean cross-validation score: {cv_score:.3f}')
df.head()

(1397, 110)


Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,last_flag,protocol_type_tcp,protocol_type_udp,service_X11,service_Z39_50,service_auth,service_bgp,service_courier,service_csnet_ns,service_ctf,service_daytime,service_discard,service_domain,service_domain_u,service_echo,service_eco_i,service_ecr_i,service_efs,service_exec,service_finger,service_ftp,service_ftp_data,service_gopher,service_hostnames,service_http,service_http_443,service_imap4,service_iso_tsap,service_klogin,service_kshell,service_ldap,service_link,service_login,service_mtp,service_name,service_netbios_dgm,service_netbios_ns,service_netbios_ssn,service_netstat,service_nnsp,service_nntp,service_ntp_u,service_other,service_pop_2,service_pop_3,service_printer,service_private,service_shell,service_smtp,service_sql_net,service_ssh,service_sunrpc,service_supdup,service_systat,service_telnet,service_tim_i,service_time,service_urh_i,service_urp_i,service_uucp,service_uucp_path,service_vmnet,service_whois,flag_REJ,flag_RSTO,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH,attack
5800,0.0,520.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,449.0,449.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,6.0,6.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,smurf
112275,0.0,2589.154277,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,12.0,246.0,1.0,0.0,0.08,0.15,0.0,0.0,1.0,0.98,21.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal
46320,0.0,283.0,3802.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,7.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,51.0,255.0,1.0,0.0,0.02,0.03,0.0,0.0,0.0,0.0,21.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,normal
4445,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,4.0,255.0,1.0,0.0,0.25,0.07,0.0,0.0,1.0,0.74,21.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal
37522,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,337.0,2.0,0.0,0.0,1.0,1.0,0.01,0.52,0.0,255.0,2.0,0.01,0.52,1.0,0.0,0.0,0.0,1.0,1.0,18.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,portsweep


In [38]:
# create missing data

In [39]:
# Impute or predict values