# Anomaly Detection with Isolation Forest

In [None]:
import zipfile
import os
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
import joblib


## Data Extraction

In [None]:
# Unpack the uploaded archive into its own folder, then list what landed there.
archive_path = '/mnt/data/dataFolder.zip'
extract_dir = '/mnt/data/dataFolder'

with zipfile.ZipFile(archive_path, mode='r') as archive:
    archive.extractall(path=extract_dir)

# Top-level entries of the extraction folder (displayed as the cell output).
file_names = os.listdir(extract_dir)
file_names


## Load and Concatenate CSV Files

In [None]:
# Load every CSV file from the extracted folder and stack them row-wise into
# one DataFrame.
data_dir = os.path.join(extract_dir, 'dataFolder')

# os.listdir() returns entries in arbitrary order and also reports entries
# that are not data files (sub-directories, hidden files such as .DS_Store).
# Keep only regular *.csv files and sort them so the row order of combined_df
# is the same on every run.
data_file_names = sorted(
    name
    for name in os.listdir(data_dir)
    if name.lower().endswith('.csv')
    and os.path.isfile(os.path.join(data_dir, name))
)
if not data_file_names:
    # Fail early with a readable message instead of pd.concat's generic
    # "No objects to concatenate".
    raise FileNotFoundError('No CSV files found in %r' % data_dir)

# Maps each file name to its parsed contents.
data_frames = {}

for file_name in data_file_names:
    file_path = os.path.join(data_dir, file_name)
    df = pd.read_csv(file_path)
    data_frames[file_name] = df

# ignore_index=True discards the per-file row labels and renumbers from 0.
combined_df = pd.concat(data_frames.values(), ignore_index=True)
combined_df.head()


## Handle Missing Values

In [None]:
# Impute missing numeric values with the mean of their own column.
# numeric_only=True restricts the mean to numeric columns: without it,
# pandas >= 2.0 raises TypeError as soon as the frame contains a non-numeric
# column (e.g. strings). Non-numeric columns, and columns that are entirely
# NaN (whose mean is NaN), are left unchanged.
combined_df = combined_df.fillna(combined_df.mean(numeric_only=True))

# Remaining null count per column; all zeros means the imputation was complete.
combined_df.isnull().sum()


## Train the Isolation Forest Model

In [None]:
# Standardize the features, then fit the anomaly detector on the scaled matrix.
# NOTE(review): the scaler is fitted on every column of combined_df, so this
# assumes all columns are numeric and free of NaN - confirm against the data.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(combined_df)

# contamination=0.01 is the expected share of anomalies in the training data.
# random_state is fixed because Isolation Forest builds its trees from random
# sub-samples and random splits; without a seed every run produces (and
# saves) a different model, so results could not be reproduced.
model = IsolationForest(contamination=0.01, random_state=42)
model.fit(scaled_features)


## Save the Model and Scaler

In [None]:
# Persist the fitted detector and its scaler side by side so both can be
# reloaded together later.
model_path = '/mnt/data/isolation_forest_model.pkl'
scaler_path = '/mnt/data/standard_scaler.pkl'

for artifact, destination in ((model, model_path), (scaler, scaler_path)):
    joblib.dump(artifact, destination)

# Show where the two artifacts were written (cell output).
(model_path, scaler_path)


## Feature Engineering - PCA

In [None]:
from sklearn.decomposition import PCA

# Project the standardized features onto their first two principal components.
component_names = ['PC1', 'PC2']

pca = PCA(n_components=2)
principal_components = pca.fit_transform(scaled_features)

# Wrap the projected array in a labelled DataFrame and preview the first rows.
principal_df = pd.DataFrame(principal_components, columns=component_names)
principal_df.head()


## Save the PCA Object

In [None]:
# Persist the fitted PCA transformer to disk.
pca_path = '/mnt/data/pca.pkl'
joblib.dump(value=pca, filename=pca_path)

# Show where the PCA object was written (cell output).
pca_path
