In [103]:
def import_data(path, n):
  import pandas as pd, numpy as np
  if n is None: n = df.shape[0]
  df = pd.read_csv(path)
  df = df.sample(n=n, random_state=1)
  # Artificially creating problems so that we can prepare for missing data
  # Delete these four lines when the pipeline is ready
  df['missing'] = np.nan
  df.iloc[2:5, 4] = np.nan
  df.iloc[2:4, 2:6] = np.nan
  df.iloc[0] = np.nan
  return df

In [104]:
def setup_model(df, label):
  import pandas as pd
  y = df[label]
  X = df.drop(columns=[label])
  return [y, X]

In [105]:
def dummy_code(df):
  import pandas as pd
  df = pd.get_dummies(df, drop_first=True)
  return df

In [106]:
def missing_data(df, label, row_thresh=0.90, col_thresh=0.70):
  import pandas as pd, numpy as np

  # Drop rows with the label missing
  df.dropna(axis='rows', subset=[label], inplace=True)

  # Drop rows and columns with 100% missing
  df.dropna(axis='columns', thresh=1, inplace=True)
  df.dropna(axis='rows', thresh=1, inplace=True)

  # Drop rows with < threshold existing
  df.dropna(axis='rows', thresh=round(df.shape[1]*row_thresh), inplace=True)

  # Drop columns with < threshold existing
  df.dropna(axis='columns', thresh=round(df.shape[0]*col_thresh), inplace=True)

  # Impute remaining missing values
  if df.isna().sum().sum() > 0:
    from sklearn.experimental import enable_iterative_imputer
    from sklearn.impute import IterativeImputer, KNNImputer

    y, X = setup_model(df, label)
    X = dummy_code(X)
    imp = IterativeImputer(max_iter=10, random_state=1)
    X = pd.DataFrame(imp.fit_transform(X), columns=X.columns, index=X.index)
    df = X.merge(y, left_index=True, right_index=True)

  # Return the cleaned DataFrame
  return df

In [107]:
def bin_categories(df, features=[], cutoff=0.05, replace_with='Other', messages=True):
  import pandas as pd
  
  if len(features) == 0: features = df.columns

  for feat in features:
    if feat in df.columns:
      if not pd.api.types.is_numeric_dtype(df[feat]):
        other_list = df[feat].value_counts()[df[feat].value_counts() / df.shape[0] < cutoff].index
        df.loc[df[feat].isin(other_list), feat] = replace_with
        if messages: print(f'{feat} has been binned by setting {other_list} to {replace_with}')
    else:
      if messages: print(f'{feat} not found in the DataFrame provided. No binning performed')

  return df

In [108]:
# //split data and cross validation are optional

def split_data(df, label, random="False"):
    from sklearn.model_selection import train_test_split
    y, X = setup_model(df, label)
    random_state = 1 
    if random: random_state = 0
    return train_test_split(X, y, test_size=.3, random_state=1)

In [109]:
def cross_validate(df, label, k=5, random=False, repeat=True):
  from sklearn.model_selection import KFold, RepeatedKFold, cross_val_score
  import pandas as pd
  from numpy import mean

  y, X = setup_model(df, label)

  random_state=1
  if random: random_state=0

  if repeat:
    cv = RepeatedKFold(n_splits=k, n_repeats=5, random_state=random_state)
  else:
    cv = KFold(n_splits=k, random_state=random_state, shuffle=True)

  if pd.api.types.is_numeric_dtype(df[label]):
    from sklearn.ensemble import RandomForestRegressor
    scores = cross_val_score(RandomForestRegressor(), X, y, scoring='r2', cv=cv)
  else:
    from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
    from sklearn.linear_model import RidgeClassifier, LogisticRegression

    model_rfc = RandomForestClassifier(random_state=random_state)
    model_gbc = GradientBoostingClassifier(random_state=random_state)
    model_log = LogisticRegression(random_state=random_state, max_iter=100)
    model_ridge = RidgeClassifier(random_state=random_state)

    scores_rfc = cross_val_score(model_rfc, X, y, scoring='accuracy', cv=cv)
    scores_gbc = cross_val_score(model_gbc, X, y, scoring='accuracy', cv=cv)
    scores_log = cross_val_score(model_log, X, y, scoring='accuracy', cv=cv)
    scores_ridge = cross_val_score(model_ridge, X, y, scoring='accuracy', cv=cv)

    models = {mean(scores_rfc):model_rfc, mean(scores_gbc):model_gbc, mean(scores_log):model_log, mean(scores_ridge):model_ridge}

    print(f'Accuracy (RandomForest):\t{mean(scores_rfc)}')
    print(f'Accuracy (GradientBoosting):\t{mean(scores_gbc)}')
    print(f'Accuracy (Ridge):\t\t{mean(scores_log)}')
    print(f'Accuracy (Logistic):\t\t{mean(scores_ridge)}')

    return models[max(models.keys())].fit(X, y)


    #  want the best balance of accuracy and speed
    #  lowest result is the best result 

In [110]:
def save_model(model, file_name):
    import pickle 
    pickle.dump(model, open(file_name, 'wb'))

def load_model(file_name):
    import pickle
    return pickle.load(open(file_name, 'rb'))

In [111]:
import pandas as pd
pd.set_option('display.max_columns', None)
# Step 1: Import the data
df = import_data('network_traffic.csv', n=df.shape[0])
# Step 2: Data Preparation
df = missing_data(df, 'attack',.92)

# step 3A: Data Segregation Train/Test split
# X_train, X_test, y_train, y_test = split_data(df, 'attack')
df_5 = bin_categories(df.copy(), cutoff=0.05, messages=False)
# df_2 = bin_categories(df.copy(), cutoff=0.02)
# df_1 = bin_categories(df.copy(), cutoff=0.01)


# step 3A: Data Segregation Cross Validation
# cross_validate(df, 'attack', 5, repeat=False) #builds 5 models - probably the best one 
# cross_validate(df, 'attack', 10, repeat=False) #builds 10 models
# cross_validate(df, 'attack', 5, repeat=True) #builds 25 modles 
# cross_validate(df, 'attack', 10, repeat=True) #builds 50 models

#  want the best balance of accuracy and speed
#  lowest result is the best result 

cross_validate(df_5, 'attack', 5, repeat=False) #builds 5 models - probably the best one 
# cross_validate(df_2, 'attack', 5, repeat=False) #builds 5 models - probably the best one 
# cross_validate(df_1, 'attack', 5, repeat=False) #builds 5 models - probably the best one 



#
# df.head()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracy (RandomForest):	0.9083333333333332
Accuracy (GradientBoosting):	0.8816666666666666
Accuracy (Ridge):		0.8683333333333334
Accuracy (Logistic):		0.9075000000000001


In [112]:
# create missing data

In [113]:
import pandas as pd
pd.set_option('display.max_columns', None)
# Step 1: Import the data
df = import_data('network_traffic.csv', n=df.shape[0])
# Step 2: Data Preparation
df = missing_data(df, 'attack',.92)
df = bin_categories(df.copy(), cutoff=0.05, messages=False)
# Step 3: Data Segregation: Crossvalidate
model = cross_validate(df, 'attack', 5, repeat=False)
# Step 4: Save/Deploy the trained model
save_model(model, 'saved_model.sav')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracy (RandomForest):	0.878095238095238
Accuracy (GradientBoosting):	0.8228571428571427
Accuracy (Ridge):		0.8352380952380953
Accuracy (Logistic):		0.8761904761904761


In [114]:
# Impute or predict values