<a href="https://colab.research.google.com/github/aneeshabizzul/Hackathon-/blob/main/accuracy46.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import OneClassSVM
from sklearn.cluster import DBSCAN
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.preprocessing import StandardScaler

# Function to generate historical data
def generate_historical_data(n=10000):
    np.random.seed(42)
    data = {
        "Company": np.random.choice([1, 2, 3], n),
        "Account": np.random.choice([1619283, 1619205], n),
        "GL Balance": np.random.randint(20000, 100000, n),
        "IHub Balance": np.random.randint(20000, 100000, n),
    }
    df = pd.DataFrame(data)
    df["Balance Difference"] = df["GL Balance"] - df["IHub Balance"]
    df["Match Status"] = np.where(abs(df["Balance Difference"]) > 15000, "Break", "Match")
    df["Discrepancy Category"] = np.where(abs(df["Balance Difference"]) > 30000, "Significant Discrepancy", "Minor Discrepancy")
    return df

# Function to generate current data with anomalies
def generate_current_data(n=100):
    np.random.seed(99)
    data = {
        "Company": np.random.choice([1, 2, 3], n),
        "Account": np.random.choice([1619283, 1619205], n),
        "GL Balance": np.random.randint(20000, 100000, n),
        "IHub Balance": np.random.randint(20000, 100000, n),
    }
    df = pd.DataFrame(data)
    df["Balance Difference"] = df["GL Balance"] - df["IHub Balance"]
    df["Match Status"] = np.where(abs(df["Balance Difference"]) > 15000, "Break", "Match")
    df["Discrepancy Category"] = np.where(abs(df["Balance Difference"]) > 30000, "Significant Discrepancy", "Minor Discrepancy")
    return df

# Generate datasets
historical_df = generate_historical_data()
current_df = generate_current_data()

# Save datasets
historical_df.to_csv("historical_data.csv", index=False)
current_df.to_csv("current_data.csv", index=False)

# Feature selection
features = ["Company", "Account", "GL Balance", "IHub Balance", "Balance Difference"]
scaler = StandardScaler()
historical_scaled = scaler.fit_transform(historical_df[features])
current_scaled = scaler.transform(current_df[features])

# One-Class SVM for anomaly detection
oc_svm = OneClassSVM(nu=0.02, kernel='rbf', gamma='scale')
oc_svm.fit(historical_scaled)
current_df["Anomaly_SVM"] = oc_svm.predict(current_scaled)
current_df["Anomaly_SVM"] = current_df["Anomaly_SVM"].apply(lambda x: "Anomaly" if x == -1 else "Normal")

# DBSCAN for clustering
dbscan = DBSCAN(eps=np.mean(np.abs(historical_df["Balance Difference"])) * 0.5, min_samples=10)
current_df["Cluster_DBSCAN"] = dbscan.fit_predict(current_scaled)

# Isolation Forest for anomaly detection
iso_forest = IsolationForest(n_estimators=300, contamination=0.02, random_state=42)
iso_forest.fit(historical_scaled)
current_df["Anomaly_IF"] = iso_forest.predict(current_scaled)
current_df["Anomaly_IF"] = current_df["Anomaly_IF"].apply(lambda x: "Anomaly" if x == -1 else "Normal")

# Autoencoder for anomaly detection
model = Sequential([
    Dense(128, activation='relu', input_shape=(len(features),)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dense(len(features), activation='linear')
])

model.compile(optimizer='adam', loss='mse')
model.fit(historical_scaled, historical_scaled, epochs=300, batch_size=8, verbose=0)

reconstruction = model.predict(current_scaled)
reconstruction_error = np.mean(np.square(reconstruction - current_scaled), axis=1)
threshold = np.mean(reconstruction_error) + 2 * np.std(reconstruction_error)
current_df["Anomaly_AE"] = reconstruction_error > threshold
current_df["Anomaly_AE"] = current_df["Anomaly_AE"].apply(lambda x: "Anomaly" if x else "Normal")

# Evaluate accuracy (assuming 'Break' cases are anomalies)
y_true = (current_df["Match Status"] == "Break").astype(int)
y_pred_svm = (current_df["Anomaly_SVM"] == "Anomaly").astype(int)
y_pred_dbscan = (current_df["Cluster_DBSCAN"] == -1).astype(int)
y_pred_if = (current_df["Anomaly_IF"] == "Anomaly").astype(int)
y_pred_ae = (current_df["Anomaly_AE"] == "Anomaly").astype(int)

accuracy_svm = accuracy_score(y_true, y_pred_svm)
accuracy_dbscan = accuracy_score(y_true, y_pred_dbscan)
accuracy_if = accuracy_score(y_true, y_pred_if)
accuracy_ae = accuracy_score(y_true, y_pred_ae)

print("One-Class SVM Accuracy:", accuracy_svm)
print("DBSCAN Accuracy:", accuracy_dbscan)
print("Isolation Forest Accuracy:", accuracy_if)
print("Autoencoder Accuracy:", accuracy_ae)

# Display sample data
print("Historical Data Sample:")
print(historical_df.head())
print("\nCurrent Data Sample:")
print(current_df.head())


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
One-Class SVM Accuracy: 0.42
DBSCAN Accuracy: 0.41
Isolation Forest Accuracy: 0.42
Autoencoder Accuracy: 0.46
Historical Data Sample:
   Company  Account  GL Balance  IHub Balance  Balance Difference  \
0        3  1619283       61298         60965                 333   
1        1  1619205       38669         59440              -20771   
2        3  1619205       40943         83118              -42175   
3        3  1619205       58785         69217              -10432   
4        1  1619283       83427         51139               32288   

  Match Status     Discrepancy Category  
0        Match        Minor Discrepancy  
1        Break        Minor Discrepancy  
2        Break  Significant Discrepancy  
3        Match        Minor Discrepancy  
4        Break  Significant Discrepancy  

Current Data Sample:
   Company  Account  GL Balance  IHub Balance  Balance Difference  \
0        2  1619205       83127     