In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import joblib
import h5py
import numpy as np

# Read the CSV and discard every row that has no 'IsFraud' label
file_path = '/content/drive/MyDrive/Colab Notebooks/Frauddata_pca.csv'
df = pd.read_csv(file_path).dropna(subset=['IsFraud'])

# The label column is the target; all remaining columns are features
y = df['IsFraud']
X = df.drop(columns=['IsFraud'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Pipeline stages. They are bound to their own names because the save/verify
# code further down reads the fitted objects directly.
imputer = SimpleImputer(strategy='mean')  # missing feature values -> column mean
pca = PCA(n_components=10)  # keep 10 principal components
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Chain the stages in order: impute -> PCA -> random forest
pipeline = Pipeline(
    steps=[
        ('imputer', imputer),
        ('pca', pca),
        ('rf', rf),
    ]
)

# Fit on the training split and predict the held-out split in one go
# (Pipeline.fit returns the pipeline itself)
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Report accuracy on the test split
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Report the confusion matrix on the test split
conf_matrix = confusion_matrix(y_test, y_pred)
print(f"Confusion Matrix:\n{conf_matrix}")

# Save the whole fitted pipeline to .pkl format
pkl_filename = 'fraud_d1.pkl'
joblib.dump(pipeline, pkl_filename)

# Serialize the Random Forest on its own and read the pickle back as bytes so
# it can be embedded in the HDF5 file. This happens BEFORE the HDF5 file is
# opened for writing, so a failure here cannot leave a truncated .h5 behind.
rf_pkl_path = '/content/drive/MyDrive/Colab Notebooks/random_forest1.pkl'
joblib.dump(rf, rf_pkl_path)
with open(rf_pkl_path, 'rb') as f:
    model_bytes = f.read()

# View the pickle as raw unsigned bytes. uint8 is used instead of the 'S1'
# string dtype: a pickle is arbitrary binary data (it contains NUL bytes), and
# h5py maps 'S' dtypes to HDF5 fixed-length *strings*, which are meant for text.
# Reading the dataset back with `[:]` followed by `.tobytes()` yields the
# original bytes unchanged.
model_bytes_np = np.frombuffer(model_bytes, dtype=np.uint8)

# Save the PCA parameters and the model bytes to .h5 format
h5_filename = 'fraud_d1.h5'
with h5py.File(h5_filename, 'w') as h5file:
    # Fitted PCA parameters
    h5file.create_dataset('pca_components', data=pca.components_)
    h5file.create_dataset('pca_explained_variance', data=pca.explained_variance_)
    h5file.create_dataset('pca_mean', data=pca.mean_)
    h5file.create_dataset('pca_variance_ratio', data=pca.explained_variance_ratio_)

    # Pickled Random Forest, one byte per element
    h5file.create_dataset('random_forest', data=model_bytes_np)

# Sanity check of the fitted stages outside the pipeline: impute, then project
X_test_pca = pca.transform(imputer.transform(X_test))
print(f"PCA-transformed test data shape: {X_test_pca.shape}")

# Round-trip check, step 1: pull everything stored in the .h5 file into memory
with h5py.File(h5_filename, 'r') as h5file:
    pca_components = h5file['pca_components'][:]
    pca_explained_variance = h5file['pca_explained_variance'][:]
    pca_mean = h5file['pca_mean'][:]
    pca_variance_ratio = h5file['pca_variance_ratio'][:]
    model_bytes_np = h5file['random_forest'][:]

# Step 2: rebuild a PCA object by assigning the stored fitted attributes
pca_loaded = PCA(n_components=10)
pca_loaded.mean_ = pca_mean
pca_loaded.components_ = pca_components
pca_loaded.explained_variance_ = pca_explained_variance
pca_loaded.explained_variance_ratio_ = pca_variance_ratio

# Step 3: turn the stored byte array back into a pickle file and load it
model_bytes = model_bytes_np.tobytes()
with open('/content/drive/MyDrive/Colab Notebooks/random_forest1_loaded.pkl', 'wb') as f:
    f.write(model_bytes)
rf_loaded = joblib.load('/content/drive/MyDrive/Colab Notebooks/random_forest1_loaded.pkl')

# Step 4: predict with the reloaded PCA + forest. The imputer is not stored in
# the .h5 file, so the in-memory fitted `imputer` is reused here.
X_test_pca_loaded = pca_loaded.transform(imputer.transform(X_test))
y_pred_loaded = rf_loaded.predict(X_test_pca_loaded)

# Accuracy of the reloaded model
accuracy_loaded = accuracy_score(y_test, y_pred_loaded)
print(f"Accuracy of loaded model: {accuracy_loaded}")

# Confusion matrix of the reloaded model
conf_matrix_loaded = confusion_matrix(y_test, y_pred_loaded)
print(f"Confusion Matrix of loaded model:\n{conf_matrix_loaded}")

Accuracy: 0.850833767587285
Confusion Matrix:
[[2069  690]
 [ 455 4462]]
PCA-transformed test data shape: (7676, 10)
Accuracy of loaded model: 0.850833767587285
Confusion Matrix of loaded model:
[[2069  690]
 [ 455 4462]]


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import joblib
import h5py
import numpy as np

# Read the CSV and discard every row that has no 'IsFraud' label
file_path = '/content/drive/MyDrive/Colab Notebooks/Frauddata.csv'
df = pd.read_csv(file_path).dropna(subset=['IsFraud'])

# The label column is the target; all remaining columns are features
y = df['IsFraud']
X = df.drop(columns=['IsFraud'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Pipeline stages. They are bound to their own names because the save/verify
# code further down reads the fitted objects directly.
imputer = SimpleImputer(strategy='mean')  # missing feature values -> column mean
pca = PCA(n_components=10)  # keep 10 principal components
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Chain the stages in order: impute -> PCA -> random forest
pipeline = Pipeline(
    steps=[
        ('imputer', imputer),
        ('pca', pca),
        ('rf', rf),
    ]
)

# Fit on the training split and predict the held-out split in one go
# (Pipeline.fit returns the pipeline itself)
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Report accuracy on the test split
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Report the confusion matrix on the test split
conf_matrix = confusion_matrix(y_test, y_pred)
print(f"Confusion Matrix:\n{conf_matrix}")

# Save the whole fitted pipeline to .pkl format
pkl_filename = 'fraud_d1.pkl'
joblib.dump(pipeline, pkl_filename)

# Serialize the Random Forest on its own and read the pickle back as bytes so
# it can be embedded in the HDF5 file. This happens BEFORE the HDF5 file is
# opened for writing, so a failure here cannot leave a truncated .h5 behind.
rf_pkl_path = 'random_forest1.pkl'
joblib.dump(rf, rf_pkl_path)
with open(rf_pkl_path, 'rb') as f:
    model_bytes = f.read()

# View the pickle as raw unsigned bytes. uint8 is used instead of the 'S1'
# string dtype: a pickle is arbitrary binary data (it contains NUL bytes), and
# h5py maps 'S' dtypes to HDF5 fixed-length *strings*, which are meant for text.
# Reading the dataset back with `[:]` followed by `.tobytes()` yields the
# original bytes unchanged.
model_bytes_np = np.frombuffer(model_bytes, dtype=np.uint8)

# Save the PCA parameters and the model bytes to .h5 format
h5_filename = 'fraud_d1.h5'
with h5py.File(h5_filename, 'w') as h5file:
    # Fitted PCA parameters
    h5file.create_dataset('pca_components', data=pca.components_)
    h5file.create_dataset('pca_explained_variance', data=pca.explained_variance_)
    h5file.create_dataset('pca_mean', data=pca.mean_)
    h5file.create_dataset('pca_variance_ratio', data=pca.explained_variance_ratio_)

    # Pickled Random Forest, one byte per element
    h5file.create_dataset('random_forest', data=model_bytes_np)

# Sanity check of the fitted stages outside the pipeline: impute, then project
X_test_pca = pca.transform(imputer.transform(X_test))
print(f"PCA-transformed test data shape: {X_test_pca.shape}")

# Round-trip check, step 1: pull everything stored in the .h5 file into memory
with h5py.File(h5_filename, 'r') as h5file:
    pca_components = h5file['pca_components'][:]
    pca_explained_variance = h5file['pca_explained_variance'][:]
    pca_mean = h5file['pca_mean'][:]
    pca_variance_ratio = h5file['pca_variance_ratio'][:]
    model_bytes_np = h5file['random_forest'][:]

# Step 2: rebuild a PCA object by assigning the stored fitted attributes
pca_loaded = PCA(n_components=10)
pca_loaded.mean_ = pca_mean
pca_loaded.components_ = pca_components
pca_loaded.explained_variance_ = pca_explained_variance
pca_loaded.explained_variance_ratio_ = pca_variance_ratio

# Step 3: turn the stored byte array back into a pickle file and load it
model_bytes = model_bytes_np.tobytes()
with open('random_forest1_loaded.pkl', 'wb') as f:
    f.write(model_bytes)
rf_loaded = joblib.load('random_forest1_loaded.pkl')

# Step 4: predict with the reloaded PCA + forest. The imputer is not stored in
# the .h5 file, so the in-memory fitted `imputer` is reused here.
X_test_pca_loaded = pca_loaded.transform(imputer.transform(X_test))
y_pred_loaded = rf_loaded.predict(X_test_pca_loaded)

# Accuracy of the reloaded model
accuracy_loaded = accuracy_score(y_test, y_pred_loaded)
print(f"Accuracy of loaded model: {accuracy_loaded}")

# Confusion matrix of the reloaded model
conf_matrix_loaded = confusion_matrix(y_test, y_pred_loaded)
print(f"Confusion Matrix of loaded model:\n{conf_matrix_loaded}")

# Load the trained pipeline.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code; only
# load pickle files that this notebook (or another trusted source) produced.
pipeline = joblib.load('fraud_d1.pkl')

# Feature columns used in the original dataset
feature_columns = [
    'TransactionID', 'CardNumber', 'CardID', 'Amount', 'AvgTransactionAmount', 'Last1TransactionAmount',
    'Last2TransactionAmount', 'Last3TransactionAmount', 'Last4TransactionAmount', 'Last5TransactionAmount',
    'WrongPasswordAttempts', 'MultipleSwipes', 'HighValueTransactions', 'FestivalTime', 'OfferPeriod',
    'TransactionCountLastHour', 'TransactionCountLastDay', 'TransactionCountLastWeek', 'DifferentBillingAddress',
    'ExpiryYear'
]


def _prompt_float(label):
    """Ask the user for the value of ``label`` until a usable number is entered.

    A typo no longer aborts the whole cell (and discards every value already
    entered): text that ``float`` cannot parse, and infinite values (which
    scikit-learn rejects at transform time), trigger a re-prompt instead.
    ``nan`` is still accepted; the pipeline's imputer handles missing values.

    Parameters
    ----------
    label : str
        Feature name shown in the prompt.

    Returns
    -------
    float
        The parsed value.
    """
    while True:
        raw = input(f"{label}: ")
        try:
            value = float(raw)
        except ValueError:
            print(f"'{raw}' is not a valid number for {label}; please try again.")
            continue
        if np.isinf(value):
            print(f"{label} must be a finite number; please try again.")
            continue
        return value


# Get transaction details from the user
print("Enter transaction details:")
transaction_details = {}
for col in feature_columns:
    transaction_details[col] = _prompt_float(col)

# Convert the transaction details into a single-row DataFrame
transaction_df = pd.DataFrame([transaction_details])

# Ensure the DataFrame columns are in the correct order
transaction_df = transaction_df[feature_columns]

print("User input as DataFrame:")
print(transaction_df)

# Extract individual components from the loaded pipeline
imputer = pipeline.named_steps['imputer']
pca = pipeline.named_steps['pca']
rf = pipeline.named_steps['rf']

# Preprocess the user input with the same stages, in pipeline order
transaction_imputed = imputer.transform(transaction_df)
transaction_pca = pca.transform(transaction_imputed)

# Predict fraud for the new transaction
is_fraud = rf.predict(transaction_pca)

# Print the prediction result (label 1 is treated as fraud)
if is_fraud[0] == 1:
    print("The transaction is predicted to be fraudulent.")
else:
    print("The transaction is predicted to be not fraudulent.")


Accuracy: 0.6307972902553414
Confusion Matrix:
[[1276 1483]
 [1351 3566]]
PCA-transformed test data shape: (7676, 10)
Accuracy of loaded model: 0.6307972902553414
Confusion Matrix of loaded model:
[[1276 1483]
 [1351 3566]]
Enter transaction details:
TransactionID: 5678738
CardNumber: 67543000000000
CardID: 8769
Amount: 123
AvgTransactionAmount: 123.5
Last1TransactionAmount: 123
Last2TransactionAmount: 123
Last3TransactionAmount: 123
Last4TransactionAmount: 123
Last5TransactionAmount: 123
WrongPasswordAttempts: 0
MultipleSwipes: 0
HighValueTransactions: 0
FestivalTime: 0
OfferPeriod: 0
TransactionCountLastHour: 0
TransactionCountLastDay: 1
TransactionCountLastWeek: 1
DifferentBillingAddress: 0
ExpiryYear: 2029
User input as DataFrame:
   TransactionID    CardNumber  CardID  Amount  AvgTransactionAmount  \
0      5678738.0  6.754300e+13  8769.0   123.0                 123.5   

   Last1TransactionAmount  Last2TransactionAmount  Last3TransactionAmount  \
0                   123.0        