In [13]:
#Step 1 - Data Cleaning + Memory Usage Adjustment

In [14]:
#Begin by importing libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA

In [15]:
#Function to reduce the memory usage
def reduce_memory_usage(df):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    for col in df.columns:
        col_type = df[col].dtype
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    return df

In [16]:
#Function to inspect and correct any mixed-type columns
def inspect_and_correct(df):
    for col in df.columns:
        mixed_types = df[col].apply(type).nunique() > 1
        if mixed_types:
            try:
                df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert to numeric if possible
            except Exception as e:
                df[col] = df[col].astype(str)  # Convert to string otherwise
    return df

In [17]:
#Loading data with low_memory option
file_path = 'ciciot2022.csv'
df = pd.read_csv(file_path, low_memory=False)

In [18]:
# Reduce memory usage and clean data
df = reduce_memory_usage(df)
df = inspect_and_correct(df)

In [19]:
# Handling Missing Values
for col in df.columns:
    if df[col].dtype == 'object':
        df[col].fillna(df[col].mode()[0], inplace=True)
    else:
        df[col].fillna(df[col].mean(), inplace=True)

In [20]:
# Removing Duplicates
df.drop_duplicates(inplace=True)

In [21]:
# Feature Engineering
# Optimized One-Hot and Label Encoding
label_encoder = LabelEncoder()
one_hot_threshold = 10  # Define a threshold for unique values

for col in df.select_dtypes(include=['object']).columns:
    if df[col].nunique() <= one_hot_threshold:
        df = pd.get_dummies(df, columns=[col], drop_first=True)
    else:
        df[col] = label_encoder.fit_transform(df[col])

In [22]:
# Normalization/Standardization
numerical_columns = df.select_dtypes(include=[np.number]).columns
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

In [23]:
# Dimensionality Reduction
pca = PCA(n_components=0.95)  # Keep 95% of variance
principalComponents = pca.fit_transform(df[numerical_columns])
df_pca = pd.DataFrame(data=principalComponents)

In [24]:
# Data Splitting
#Also need to determine our target column on what we're basing our ML algorithm on to detect anamolies/issues
X = df_pca
y = df['average']  # Replace with your actual target column
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
# Saving the Processed Data
X_train.to_csv('X_train.csv', index=False)
y_train.to_csv('y_train.csv', index=False)
X_test.to_csv('X_test.csv', index=False)
y_test.to_csv('y_test.csv', index=False)

In [26]:
#Successful Print Statement
print("Data cleaning and preprocessing completed.")

Data cleaning and preprocessing completed.


In [27]:
#Step 2 - Isolation Forest Training + Anomaly Detection

In [28]:
import pandas as pd
from sklearn.ensemble import IsolationForest

In [29]:
# Load preprocessed data
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')

In [30]:
# Isolation Forest Model using 'auto'
iso_forest = IsolationForest(n_estimators=100, contamination='auto')  # 'auto' lets the algorithm define the threshold
iso_forest.fit(X_train)

In [31]:
# Predict anomalies on training and test data
train_preds = iso_forest.predict(X_train)
test_preds = iso_forest.predict(X_test)

In [32]:
# Convert predictions to a binary label with normal data as "1" and anomalies as "-1"
# Here, -1 indicates an anomaly
X_train['anomaly'] = train_preds
X_test['anomaly'] = test_preds

In [33]:
#Analyze the results to check the proportion of anomalies detected
anomalies_train = X_train[X_train['anomaly'] == -1]
anomalies_test = X_test[X_test['anomaly'] == -1]

In [34]:
# Calculate the percentage of anomalies
percentage_anomalies_train = (len(anomalies_train) / len(X_train)) * 100
percentage_anomalies_test = (len(anomalies_test) / len(X_test)) * 100

In [35]:
# Print the number and percentage of anomalies in training and test data
print(f"Anomalies in Training Data: {len(anomalies_train)} / {len(X_train)} ({percentage_anomalies_train:.2f}% of data is anomalies)")
print(f"Anomalies in Test Data: {len(anomalies_test)} / {len(X_test)} ({percentage_anomalies_test:.2f}% of data is anomalies)")

Anomalies in Training Data: 3281 / 183604 (1.79% of data is anomalies)
Anomalies in Test Data: 829 / 45902 (1.81% of data is anomalies)
