In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import models,layers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt

In [2]:
# Load the credit-card transactions; the relative path expects the CSV to sit
# next to the notebook.
# NOTE(review): the preview of this frame shows a leading "Unnamed: 0" header.
# If that is a real column (e.g. a saved row index) rather than a display
# artifact, it will end up among the model features -- confirm and drop it if so.
df = pd.read_csv("./creditcard.csv")
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
# Discard the Time column so it is not used as a model feature.
df = df.drop(columns=["Time"])

In [4]:
# Labels: the Class column.
y = df["Class"]
# Features: every column except Class.
x = df.drop(columns=["Class"])

In [5]:
# Standardize the features using statistics learned from the training rows only,
# so the held-out rows do not influence the mean/variance (no train/test leakage).
#
# The index split below deliberately uses the same test_size and random_state as
# the train_test_split call on the feature matrix that follows. Without
# `stratify`, the shuffle depends only on the number of rows and the seed, so
# this picks the same training rows. Keep the two calls in sync.
x = np.asarray(x)
train_rows = train_test_split(np.arange(len(x)), test_size=0.2, random_state=42)[0]

scaler = StandardScaler().fit(x[train_rows])
# `x` stays the full (now scaled) matrix so the subsequent split works unchanged.
x = scaler.transform(x)

In [6]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Encoder half: squeezes each input row into a 16-value code through
# progressively narrower ReLU layers (64 -> 32 -> 16).
encoder = models.Sequential(
    [layers.Input(shape=(x_train.shape[1],))]
    + [layers.Dense(width, activation="relu") for width in (64, 32, 16)]
)

In [8]:
# Decoder half: expands the 16-value code back to the original feature count.
# The last layer is linear, so the reconstructed values are unbounded.
decoder = models.Sequential(
    [layers.Input(shape=(16,))]
    + [layers.Dense(width, activation="relu") for width in (32, 64)]
    + [layers.Dense(x_train.shape[1], activation="linear")]
)

In [9]:
# Chain the two halves: input -> encoder -> decoder -> reconstruction.
autoencoder = models.Sequential()
autoencoder.add(encoder)
autoencoder.add(decoder)

In [10]:
# Train by minimizing the mean squared reconstruction error with Adam.
# NOTE(review): "accuracy" is a classification metric and has no clear meaning
# when the targets are continuous feature values. It is kept because the
# evaluation cell unpacks two return values from evaluate().
autoencoder.compile(
    loss="mean_squared_error",
    optimizer="adam",
    metrics=["accuracy"],
)

In [None]:
# Fit the autoencoder to reproduce its own input (targets == inputs) for 10
# epochs, reporting reconstruction loss on the held-out rows after each epoch.
history = autoencoder.fit(
    x=x_train,
    y=x_train,
    epochs=10,
    validation_data=(x_test, x_test),
)

Epoch 1/10
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 2ms/step - accuracy: 0.4405 - loss: 0.4273 - val_accuracy: 0.7513 - val_loss: 0.1092
Epoch 2/10
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 2ms/step - accuracy: 0.7709 - loss: 0.0987 - val_accuracy: 0.7905 - val_loss: 0.0799
Epoch 3/10
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step - accuracy: 0.8069 - loss: 0.0752 - val_accuracy: 0.8077 - val_loss: 0.0747
Epoch 4/10
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2ms/step - accuracy: 0.8211 - loss: 0.0647 - val_accuracy: 0.8515 - val_loss: 0.0484
Epoch 5/10
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 2ms/step - accuracy: 0.8373 - loss: 0.0550 - val_accuracy: 0.8570 - val_loss: 0.0438
Epoch 6/10
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 2ms/step - accuracy: 0.8425 - loss: 0.0541 - val_accuracy: 0.8356 - val_loss: 0.0508
Epoch 7/10

In [None]:
# Model.evaluate returns the loss first, followed by the compiled metrics, so
# the values must be unpacked as (loss, accuracy) to be printed under the
# right label.
test_loss, test_acc = autoencoder.evaluate(x_test, x_test)
print("Test Loss: ",test_loss)
print("Accuracy ",test_acc)

In [None]:
# Reconstruct the held-out rows and score each one by its mean squared
# reconstruction error across features.
predictions = autoencoder.predict(x_test)
mse = np.square(x_test - predictions).mean(axis=1)

# Flag rows whose error lies above the 95th percentile of the held-out errors.
# NOTE(review): the cut-off is derived from the same rows it is evaluated on, so
# about 5% of them are flagged by construction -- consider deriving it from the
# training errors instead.
threshold = np.percentile(mse, 95)
outliers = threshold < mse

# Compare the flags against the true fraud labels.
print("Confusion Matrix:\n", confusion_matrix(y_test, outliers))
print("Classification report:\n", classification_report(y_test, outliers))

In [None]:
# Count the flagged rows whose true label is fraud (Class == 1).
flagged_labels = y_test[outliers]
num_anomalies = (flagged_labels == 1).sum()

In [None]:
# num_anomalies holds the number of true fraud rows (Class == 1) among the rows
# flagged as outliers, not the total number of flagged rows, so the message
# says so explicitly.
print(f'Number of fraud cases flagged as anomalies: {num_anomalies}')

In [None]:
# Plot training vs. validation reconstruction loss per epoch.
# The model is compiled with loss="mean_squared_error", so the y-axis is
# labelled MSE (not MSLE).
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='loss')
ax.plot(history.history['val_loss'], label='val_loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('MSE Loss')
ax.set_title('Autoencoder reconstruction loss')
ax.legend()
plt.show()

In [None]:
'''
Here’s a detailed line-by-line explanation of the code along with potential viva questions and answers:

---

### Code Explanation

```python
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import models, layers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
```

**Explanation**:  
- `pandas` and `numpy` are used for data manipulation and handling numerical data respectively.
- `tensorflow` and `tensorflow.keras` are imported to create and train the neural network.
- `train_test_split` is used to split the dataset into training and testing sets.
- `StandardScaler` is used for feature scaling (standardization).
- `confusion_matrix` and `classification_report` are used for evaluating the model's performance.
- `matplotlib.pyplot` is used for plotting the loss curve.

**Viva Questions**:
1. **Why are `pandas` and `numpy` necessary here?**
   - `pandas` is used for data manipulation (e.g., reading CSV files and managing DataFrames), and `numpy` is used for array handling, especially during numerical operations.
   
2. **What is the purpose of `tensorflow.keras` in this code?**
   - It provides utilities to build and train deep learning models, including the layers and optimizers used in this neural network model.

3. **Why is `train_test_split` used?**
   - It splits the dataset into training and testing sets to evaluate model performance on unseen data.

4. **What is `StandardScaler` used for?**
   - It standardizes features by removing the mean and scaling to unit variance, helping the model learn effectively.

---

```python
df = pd.read_csv("./creditcard.csv")
df.head()
df = df.drop("Time", axis=1)
x = df.drop("Class", axis=1)
y = df["Class"]
```

**Explanation**:
- `pd.read_csv("./creditcard.csv")`: Loads the `creditcard.csv` file into a pandas DataFrame.
- `df.head()`: Displays the first few rows of the DataFrame for a quick preview (though the result isn't used here).
- `df.drop("Time", axis=1)`: Removes the `Time` column from the DataFrame, which is not necessary for the model.
- `x = df.drop("Class", axis=1)`: Defines `x` as the feature set by removing the `Class` column.
- `y = df["Class"]`: Defines `y` as the target variable (`Class`).

**Viva Questions**:
1. **Why drop the `Time` column?**
   - `Time` may not have any meaningful relationship with fraud prediction and could add noise to the model.

2. **What is the purpose of `x` and `y`?**
   - `x` is the input feature set, and `y` is the fraud label (whether a transaction is fraudulent or not). The autoencoder is trained on `x` only; `y` is used afterwards to check how many flagged anomalies are real frauds.

---

```python
scaler = StandardScaler()
x = scaler.fit_transform(x)
```

**Explanation**:
- `StandardScaler()` initializes the scaler to standardize the features.
- `scaler.fit_transform(x)` applies the scaler on `x` to standardize the data.

**Viva Question**:
1. **Why standardize the data?**
   - Standardizing the data ensures that all features have the same scale, which improves the convergence of the model and prevents some features
   from dominating others due to different magnitudes.

---

```python
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
```

**Explanation**:
- `train_test_split` splits the dataset into 80% for training and 20% for testing, ensuring that the model is evaluated on unseen data.
- `random_state=42` ensures that the split is reproducible.

**Viva Question**:
1. **What does `train_test_split` do?**
   - It splits the dataset into training and testing sets for model training and evaluation, ensuring that the model can generalize to new data.

---

```python
encoder = models.Sequential([
    layers.Input(shape=(x_train.shape[1],)),
    layers.Dense(64, activation="relu"),
    layers.Dense(32, activation="relu"),
    layers.Dense(16, activation="relu"),
])
```

**Explanation**:
- `Sequential()` defines a simple feedforward neural network model.
- `layers.Input(shape=(x_train.shape[1],))` specifies the input shape to be the number of features in `x_train`.
- `Dense(64, activation="relu")`, `Dense(32, activation="relu")`, `Dense(16, activation="relu")`: 
Fully connected layers with ReLU activation to learn increasingly abstract representations.

**Viva Questions**:
1. **Why use `Sequential` here?**
   - `Sequential` is used to create a simple linear stack of layers, where each layer has exactly one input and one output.

2. **Why use `ReLU` activation?**
   - ReLU (Rectified Linear Unit) introduces non-linearity and helps the model to learn complex 
   patterns by allowing positive values to pass through while blocking negative values.

---

```python
decoder = models.Sequential([
    layers.Input((16,)),
    layers.Dense(32, activation="relu"),
    layers.Dense(64, activation="relu"),
    layers.Dense(x_train.shape[1], activation="linear"),
])
```

**Explanation**:
- `decoder` is the second part of the autoencoder.
- It takes the 16-dimensional latent space (from the encoder) and reconstructs it back to the original input shape.
- The output layer has a linear activation function because the model is reconstructing the input data.

**Viva Question**:
1. **What is the purpose of the decoder in the autoencoder?**
   - The decoder reconstructs the input data from the compressed representation in the latent space.
     Training it to reconstruct well forces the autoencoder to learn the structure of the input data.

---

```python
autoencoder = models.Sequential([
    encoder,
    decoder
])
```

**Explanation**:
- Combines the encoder and decoder to form the complete autoencoder model.

**Viva Question**:
1. **What is an autoencoder and why combine encoder and decoder?**
   - An autoencoder learns to compress data (encoder) and then reconstruct it (decoder). Combining both components 
    forms a complete model that learns feature representations.

---

```python
autoencoder.compile(optimizer="adam", loss="mean_squared_error", metrics=['accuracy'])
```

**Explanation**:
- `optimizer="adam"`: Uses the Adam optimizer, which adapts the learning rate during training.
- `loss="mean_squared_error"`: The loss function measures the error between the original and reconstructed input. 
  It is suitable for autoencoders that perform reconstruction.
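- `metrics=['accuracy']`: Reports an accuracy value during training. For a reconstruction (regression) task this number is not a meaningful measure of quality; the reconstruction loss is what should be monitored.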

**Viva Question**:
1. **Why use `mean_squared_error` as the loss function?**
   - It measures the squared difference between the original input and the reconstructed input, which is ideal for reconstructing numerical data.

---

```python
test_acc, test_loss = autoencoder.evaluate(x_test, x_test)
print("Test Loss: ", test_loss)
print("Accuracy ", test_acc)
```

**Explanation**:
- Evaluates the autoencoder on the test data. Since this is an unsupervised problem (anomaly detection), both input and output are the same (`x_test`).
- `test_loss` represents how well the autoencoder can reconstruct the test data.

**Viva Question**:
1. **What does `autoencoder.evaluate(x_test, x_test)` return?**
   - It returns the loss first and then the accuracy metric (so the result should be unpacked as `test_loss, test_acc`). The loss shows how well the autoencoder reconstructs the test data.

---

```python
predictions = autoencoder.predict(x_test)
```

**Explanation**:
- Uses the trained autoencoder to make predictions (reconstruct input data) on the test set.

---

```python
mse = np.mean((x_test - predictions)**2, axis=1)
```

**Explanation**:
- Calculates the Mean Squared Error (MSE) for each sample in the test set, comparing the original data (`x_test`) 
 with the reconstructed data (`predictions`).

---

```python
threshold = np.percentile(mse, 95)
```

**Explanation**:
- Sets a threshold based on the 95th percentile of the MSE. Anomalies are defined as data points with MSE higher than this threshold.

---

```python
outliers = mse > threshold
```

**Explanation**:
- Identifies outliers where MSE exceeds the threshold, marking them as anomalies.

---

```python
print("Confusion Matrix:\n", confusion_matrix(y_test, outliers))
print("Classification report:\n", classification_report(y_test, outliers))
```

**Explanation**:
- `confusion_matrix` evaluates the model's ability to classify the anomalies correctly.
- `classification_report` provides precision, recall, and F1-score for each class.

---

```python
num_anomalies = np.sum(y_test[outliers] == 1)
print(f'Number of anomalies: {num_anomalies}')
```

**Explanation**:
- Counts how many actual fraudulent transactions (`Class == 1`) are detected as anomalies.

---

```python
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.xlabel('Epochs')
plt.ylabel('MSLE Loss')
plt.legend(['loss', 'val_loss'])
plt.show()
```

**Explanation**:
- Plots the training and validation loss curves to visualize how the model's loss evolves over epochs.

---

### Potential Viva Questions:

1. **What is an autoencoder, and how does it work?**
   - An autoencoder learns to compress data into a lower-dimensional representation (encoding) and 
   then reconstruct it back to the original input (decoding). It's useful for anomaly detection, where abnormal 
   patterns can be identified by reconstruction error.

2. **Why is `mse` used to detect anomalies?**
   - MSE measures how much the reconstructed data differs from the original input. A high MSE indicates 
     that the data point is significantly different from the norm, signaling an anomaly.

3. **What does the threshold represent in this context?**
   - The threshold is set at the 95th percentile of MSE values, meaning that only the top 5% of the data points 
   (with the highest reconstruction error) are considered anomalies.

4. **Why do we use `classification_report` and `confusion_matrix`?**
   - These are used to evaluate how well the model identifies anomalies (fraudulent transactions), 
   showing the model's precision, recall, and F1 score.

---

This detailed breakdown will help in preparing for the viva by understanding the role of each line and its significance in the context of anomaly detection using autoencoders.
'''