# Importing Libraries

In [3]:
import warnings
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, ParameterGrid
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.covariance import EllipticEnvelope

# Silence every warning category for the rest of the session.
# NOTE: this also hides pandas/scikit-learn deprecation and
# chained-assignment warnings raised by later cells.
warnings.simplefilter("ignore")

# Load the dataset

In [4]:
# Read the raw sales records into a DataFrame.
# Replace 'sales_data.csv' with your actual file name or path.
df = pd.read_csv(filepath_or_buffer='sales_data.csv')

# Standard Deviation calculation and normalising

In [5]:
# Select the relevant columns for anomaly detection (sales amount, store number, date)
selected_columns = ['sales_amount', 'store_number', 'date']
data = df[selected_columns]

# Per-store mean and standard deviation of sales_amount.
# pandas' groupby std() is the sample standard deviation (ddof=1), so a store
# with a single row gets std_dev = NaN.
store_std = (
    data.groupby('store_number')['sales_amount']
    .agg(['mean', 'std'])
    .rename(columns={'mean': 'mean_sales', 'std': 'std_dev'})
    .reset_index()
)

# Attach the per-store statistics to every row (inner join on store_number).
data = pd.merge(data, store_std, on='store_number')

# Z-score normalisation: centre on the store's MEAN, then scale by its standard
# deviation. (The previous version subtracted std_dev instead of the mean, which
# merely shifted sales_amount / std_dev by a constant -1 and left each store's
# values un-centred.)
# A store with zero or undefined std_dev yields inf/NaN here; the detection cell
# replaces inf with NaN and drops those rows before fitting.
data['sales_amount_normalized'] = (
    (data['sales_amount'] - data['mean_sales']) / data['std_dev']
)

# Loading the anomaly detection algorithms

In [6]:
# Shared settings for the detectors in this cell.
CONTAMINATION = 0.05  # fraction of observations assumed to be anomalous
SEED = 42             # fixed seed so the randomised estimators are reproducible

# Initialize anomaly detection models.
# IsolationForest and EllipticEnvelope draw random subsamples internally, so
# they receive an explicit random_state; LocalOutlierFactor and OneClassSVM
# take no seed.
# NOTE: the following cell re-creates these four objects with default arguments
# before hyperparameter tuning, so the instances built here are superseded.
isolation_forest = IsolationForest(contamination=CONTAMINATION, random_state=SEED)
local_outlier_factor = LocalOutlierFactor(contamination=CONTAMINATION)
one_class_svm = OneClassSVM(nu=CONTAMINATION)
robust_covariance = EllipticEnvelope(contamination=CONTAMINATION, random_state=SEED)

In [None]:
# Fixed seed so the randomised estimators give the same result on every run.
SEED = 42

# Initialize anomaly detection models. Their tunable settings are filled in per
# store from the grids below. IsolationForest and EllipticEnvelope sample
# randomly during fit, so they are seeded; LocalOutlierFactor and OneClassSVM
# take no seed.
isolation_forest = IsolationForest(random_state=SEED)
local_outlier_factor = LocalOutlierFactor()
one_class_svm = OneClassSVM()
robust_covariance = EllipticEnvelope(random_state=SEED)

# Define parameter grids for hyperparameter tuning.
# There is no grid for robust_covariance: it keeps its constructor defaults.
param_grid_if = {'n_estimators': [100, 200, 300], 'contamination': [0.05, 0.1, 0.15]}
param_grid_lof = {'n_neighbors': [5, 10, 15], 'contamination': [0.05, 0.1, 0.15]}
param_grid_ocsvm = {'nu': [0.05, 0.1, 0.15]}

def _inlier_scores(model, features):
    """Fit ``model`` on ``features`` and return one score per row.

    The returned array follows scikit-learn's ``decision_function``
    convention for every detector: negative values mark outliers and
    non-negative values mark inliers.
    """
    model.fit(features)
    if isinstance(model, LocalOutlierFactor):
        # In outlier-detection mode LOF has no decision_function. Its raw
        # negative_outlier_factor_ is shifted by offset_ so that, like the
        # other detectors, values below zero are the flagged samples.
        return model.negative_outlier_factor_ - model.offset_
    return model.decision_function(features)


def _select_params(model, param_grid, features):
    """Return ``(best_params, best_score)`` for ``model`` over ``param_grid``.

    Each grid entry is scored by the mean in-sample score of the fitted
    model. This is an unsupervised heuristic, NOT cross-validation: there
    are no labels and no held-out split.
    """
    best_score, best_params = -np.inf, None
    for params in ParameterGrid(param_grid):
        model.set_params(**params)
        mean_score = np.mean(_inlier_scores(model, features))
        if mean_score > best_score:
            best_score, best_params = mean_score, params
    return best_params, best_score


# Per-store result frames are collected here and concatenated once at the end
# (DataFrame.append was removed in pandas 2.0 and is quadratic in a loop).
anomalous_frames = []

# Iterate over each store
for store in data['store_number'].unique():
    # Rows of the current store, with infinities turned into NaN and rows
    # lacking a usable normalised value dropped. Built as a new frame so the
    # shared `data` frame is never modified through a slice.
    store_data = (
        data.loc[data['store_number'] == store]
        .replace([np.inf, -np.inf], np.nan)
        .dropna(subset=['sales_amount_normalized'])
    )

    if store_data.empty:
        continue

    # Single-feature design matrix used by every detector.
    features = store_data[['sales_amount_normalized']]

    # Pick the best grid entry for each tunable model (see _select_params).
    best_params_if, best_score_if = _select_params(isolation_forest, param_grid_if, features)
    best_params_lof, best_score_lof = _select_params(local_outlier_factor, param_grid_lof, features)
    best_params_ocsvm, best_score_ocsvm = _select_params(one_class_svm, param_grid_ocsvm, features)

    # Refit each model with its selected parameters and score every row.
    isolation_forest.set_params(**best_params_if)
    local_outlier_factor.set_params(**best_params_lof)
    one_class_svm.set_params(**best_params_ocsvm)
    scores_if = _inlier_scores(isolation_forest, features)
    scores_lof = _inlier_scores(local_outlier_factor, features)
    scores_ocsvm = _inlier_scores(one_class_svm, features)
    scores_rc = _inlier_scores(robust_covariance, features)

    # Ensemble anomaly scores by averaging. All four arrays now share the same
    # sign convention (negative = anomalous); previously the LOF term was
    # always positive and pulled the average away from zero.
    # NOTE(review): the four scores are still on different scales, so one
    # detector may dominate the mean - consider rank-normalising or voting.
    ensemble_scores = (scores_if + scores_lof + scores_ocsvm + scores_rc) / 4

    # Collect the dates whose averaged score is negative.
    store_anomalous_dates = store_data.loc[ensemble_scores < 0, 'date']
    if not store_anomalous_dates.empty:
        anomalous_frames.append(
            pd.DataFrame({'store_number': store, 'date': store_anomalous_dates})
        )

# Build the final frame with a fresh 0..n-1 index; fall back to an empty frame
# with the expected columns when no store produced an anomaly.
if anomalous_frames:
    anomalous_dates_df = pd.concat(anomalous_frames, ignore_index=True)
else:
    anomalous_dates_df = pd.DataFrame(columns=['store_number', 'date'])

# Print the dataframe with anomalous store numbers and dates
print(anomalous_dates_df)


In [None]:
# Persist the flagged (store_number, date) pairs next to the notebook.
# The DataFrame index is written as the first, unnamed CSV column.
anomalous_dates_df.to_csv(path_or_buf='anomalous_dates_df.csv')