In [1]:
# Install necessary packages
!pip install -U scikit-learn joblib streamlit


Collecting scikit-learn
  Downloading scikit_learn-1.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m36.1 MB/s[0m eta [36m0:00:00[0m
Collecting streamlit
  Downloading streamlit-1.30.0-py2.py3-none-any.whl (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m63.1 MB/s[0m eta [36m0:00:00[0m
Collecting validators<1,>=0.2 (from streamlit)
  Downloading validators-0.22.0-py3-none-any.whl (26 kB)
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Downloading GitPython-3.1.41-py3-none-any.whl (196 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m196.4/196.4 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.8.1b0-py2.py3-none-any.whl (4.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m61.5 MB/s[0m e

In [2]:

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
import joblib
from google.colab import files
import streamlit as st


In [3]:
# Function to preprocess the data
def preprocess_data(data):
    # Select relevant features
    selected_features = ['age', 'transaction_amount',
                         'average_expenditure', 'comparison_with_avg_expenditure',
                         'transaction_count_7_days',
                         'Total Credit Amount','fraud_indicator']

    # Keep only the selected features
    data = data[selected_features]

    # Handle missing values
    data = data.dropna()

    # Label encoding for categorical variables
    label_encoder = LabelEncoder()
    data[data.select_dtypes(include=['object']).columns] = data.select_dtypes(include=['object']).apply(lambda col: label_encoder.fit_transform(col.astype(str)))

    return data, label_encoder

# Upload the dataset files
uploaded_files = files.upload()

# Load datasets
data1_path = '/content/TransactionDataset1.csv'
data2_path = '/content/credit-debit dataset.csv'

data1 = pd.read_csv(data1_path)
data2 = pd.read_csv(data2_path)


Saving credit-debit dataset.csv to credit-debit dataset.csv
Saving TransactionDataset1.csv to TransactionDataset1.csv


In [4]:
# Preprocess data
preprocessed_data, label_encoder = preprocess_data(pd.concat([data1, data2], axis=1))

# Separate features and target variable
X = preprocessed_data.drop('fraud_indicator', axis=1)
y = preprocessed_data['fraud_indicator']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Select relevant features for training
selected_features_for_training = ['age', 'transaction_amount',
                                   'average_expenditure', 'comparison_with_avg_expenditure',
                                   'transaction_count_7_days',
                                   'Total Credit Amount', ]

# Use only the selected features for training
X_train = X_train[selected_features_for_training]
X_test = X_test[selected_features_for_training]

# Use SimpleImputer to handle missing values by filling NaNs with the mean
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)



In [5]:
# Choose a model (Random Forest)
model = RandomForestClassifier(n_estimators=2000, random_state=42, verbose=1)

# Train the model
model.fit(X_train_scaled, y_train)

# Save the model, scaler, and label encoder
model_filename = '/content/random_forest_.pkl'
joblib.dump({
    'label_encoder': label_encoder,
    'scaler': scaler,
    'model': model,
    'features': selected_features_for_training  # Save the features used for training
}, model_filename)

print(f'Model, scaler, and label encoder saved as {model_filename}')

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Display the features used for training
print("Features used for training:")
for feature_name in selected_features_for_training:
    print(feature_name)

# Make predictions on the test set
X_test_selected = X_test[selected_features_for_training]
X_test_selected_imputed = imputer.transform(X_test_selected)
X_test_selected_scaled = scaler.transform(X_test_selected_imputed)

y_pred = model.predict(X_test_selected_scaled)

# Display classification report
classification_rep = classification_report(y_test, y_pred)
print("Classification Report:")
print(classification_rep)


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    2.4s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    8.5s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:   19.3s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:   34.0s
[Parallel(n_jobs=1)]: Done 1249 tasks      | elapsed:   53.8s
[Parallel(n_jobs=1)]: Done 1799 tasks      | elapsed:  1.3min


Model, scaler, and label encoder saved as /content/random_forest_.pkl
Features used for training:
age
transaction_amount
average_expenditure
comparison_with_avg_expenditure
transaction_count_7_days
Total Credit Amount


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    0.4s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:    0.7s
[Parallel(n_jobs=1)]: Done 1249 tasks      | elapsed:    1.0s


Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.99      0.94      2562
           1       0.97      0.78      0.86      1438

    accuracy                           0.91      4000
   macro avg       0.93      0.88      0.90      4000
weighted avg       0.92      0.91      0.91      4000



[Parallel(n_jobs=1)]: Done 1799 tasks      | elapsed:    1.5s


In [18]:
pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/232.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [29]:
# Save the Streamlit app code to a file
import subprocess
app_code = """
import pandas as pd
import numpy as np
import streamlit as st
import joblib
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns
from tempfile import NamedTemporaryFile
import os
import PyPDF2
import re

# Set up SQLite database connection
conn = sqlite3.connect('/content/user_data.db')
cursor = conn.cursor()

# Create a table for user data if it doesn't exist
cursor.execute('''
    CREATE TABLE IF NOT EXISTS user_data (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        age INTEGER,
        transaction_amount INTEGER,
        average_expenditure INTEGER,
        comparison_with_avg_expenditure INTEGER,
        transaction_count_7_days INTEGER,
        "Total Credit Amount" INTEGER,
        prediction INTEGER,
        timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP
    )
''')
conn.commit()

# Load the pre-trained model, scaler, and label encoder
model_data = joblib.load('/content/random_forest_.pkl')
label_encoder = model_data['label_encoder']
scaler = model_data['scaler']
model = model_data['model']
features = model_data['features']

# Function to extract data from PDF
def extract_data_from_pdf(pdf_filename):
    extracted_data = {}

    with open(pdf_filename, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)

        for page_num in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_num]
            text = page.extract_text()

            # Define regular expressions for each attribute
            attribute_patterns = {
                "age": r"age:\s*(\d+)",
                "transaction_amount": r"transaction_amount:\s*([\d.]+)",
                "average_expenditure": r"average_expenditure:\s*([\d.]+)",
                "comparison_with_avg_expenditure": r"comparison_with_avg_expenditure:\s*([\d.]+)",
                "transaction_count_7_days": r"transaction_count_7_days:\s*(\d+)",
                "Total Credit Amount": r"Total Credit Amount:\s*([\d.]+)",
            }

            # Extract values using regular expressions
            for feature, pattern in attribute_patterns.items():
                match = re.search(pattern, text)
                if match:
                    extracted_data[feature] = match.group(1)

    return extracted_data

# Streamlit app code
st.title('Fraud Detection App')

# File upload section
uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])

# Display user input or uploaded file data
if uploaded_file is not None:
    # Extract data from the uploaded PDF
    temp_pdf = NamedTemporaryFile(delete=False)
    temp_pdf.write(uploaded_file.read())
    temp_pdf.close()

    extracted_data = extract_data_from_pdf(temp_pdf.name)

    # Display the extracted data
    st.subheader('Uploaded PDF Data:')
    uploaded_data = pd.DataFrame(extracted_data, index=[0])
    st.table(uploaded_data)

    # Use the extracted data to fill the input space
    user_input = {}
    for feature in features:
        if feature != 'fraud_indicator' and feature in extracted_data:
            user_input[feature] = st.number_input(f'Enter {feature}', step=1, value=int(float(extracted_data[feature])))
        else:
            user_input[feature] = st.number_input(f'Enter {feature}', step=1, value=0)

    # Display user input
    st.subheader('User Input:')
    user_data_input = pd.DataFrame(user_input, index=[0])
    st.table(user_data_input)

# Display user input space if no PDF is uploaded
else:
    st.subheader('Enter Transaction Details:')
    user_input = {}

    for feature in features:
        if feature != 'fraud_indicator':
            user_input[feature] = st.number_input(f'Enter {feature}', step=1, value=0)

    # Display user input
    st.subheader('User Input:')
    user_data_input = pd.DataFrame(user_input, index=[0])
    st.table(user_data_input)

# Make prediction
user_data_processed = user_data_input.copy()  # No need to preprocess for SQLite
user_data_scaled = scaler.transform(user_data_processed)
prediction = model.predict(user_data_scaled)

# Insert user input and prediction into the database
if st.button('Submit'):
    cursor.execute('''
        INSERT INTO user_data (
            age,
            transaction_amount,
            average_expenditure,
            comparison_with_avg_expenditure,
            transaction_count_7_days,
            "Total Credit Amount",
            prediction
        )
        VALUES (?, ?, ?, ?, ?, ?, ?)
    ''', (
        user_input['age'],
        user_input['transaction_amount'],
        user_input['average_expenditure'],
        user_input['comparison_with_avg_expenditure'],
        user_input['transaction_count_7_days'],
        user_input['Total Credit Amount'],
        prediction[0]
    ))
    conn.commit()
    st.success('Data submitted successfully!')

    # Display prediction for fraud detection
    st.subheader('Prediction for Fraud Detection:')
    if prediction[0] == 1:
        st.warning('This transaction is flagged as potentially fraudulent!')
    else:
        st.success('This transaction is not flagged as fraudulent!')

# Streamlit app code for visualization on the sidebar
st.sidebar.title('Fraud Detection App - Visualization')

# Display user IDs on the sidebar
user_ids = pd.read_sql_query('SELECT id FROM user_data ORDER BY timestamp DESC LIMIT 5', conn)['id'].tolist()
selected_user_id = st.sidebar.selectbox('Select User ID:', user_ids)

# Retrieve data for the selected user ID
selected_data = pd.read_sql_query(f'SELECT * FROM user_data WHERE id={selected_user_id}', conn)

# Display selected user data on the sidebar
st.sidebar.subheader(f'Selected User Data (User ID: {selected_user_id}):')
st.sidebar.table(selected_data)

# Visualization - Transaction Amount Excess Chart
if selected_data.shape[0] > 0:
    st.sidebar.subheader('Transaction Amount Excess Chart')

    # Threshold value for transaction amount
    transaction_amount_threshold = 110000

    # Excess transactions
    excess_transactions = max(0, selected_data['transaction_amount'].values[0] - transaction_amount_threshold)

    # Visualization
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.barh(['Excess Transactions', 'Remaining Transactions'], [excess_transactions, transaction_amount_threshold - excess_transactions], color=['red', 'green'])
    ax.set_xlabel('Number of Transactions')
    ax.set_title('Transaction Amount Excess Chart')
    st.sidebar.pyplot(fig)
    st.sidebar.write(f'Transaction Amount Threshold: {transaction_amount_threshold}')

# Visualization - Comparison with Avg Expenditure Excess Chart
if selected_data.shape[0] > 0:
    st.sidebar.subheader('Comparison with Avg Expenditure Excess Chart')

    # Threshold value for comparison with avg expenditure
    comparison_with_avg_expenditure_threshold = 30000

    # Excess value
    excess_value = max(0, selected_data['comparison_with_avg_expenditure'].values[0] - comparison_with_avg_expenditure_threshold)

    # Visualization
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.barh(['Excess Value', 'Remaining Value'], [excess_value, comparison_with_avg_expenditure_threshold - excess_value], color=['red', 'green'])
    ax.set_xlabel('Value')
    ax.set_title('Comparison with Avg Expenditure Excess Chart')
    st.sidebar.pyplot(fig)
    st.sidebar.write(f'Comparison with Avg Expenditure Threshold: {comparison_with_avg_expenditure_threshold}')

# Visualization - Total Credit Amount Excess Chart
if selected_data.shape[0] > 0:
    st.sidebar.subheader('Total Credit Amount Excess Chart')

    # Threshold value for total credit amount
    total_credit_amount_threshold = 150000

    # Excess value
    excess_value_credit = max(0, selected_data['Total Credit Amount'].values[0] - total_credit_amount_threshold)

    # Visualization
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.barh(['Excess Value', 'Remaining Value'], [excess_value_credit, total_credit_amount_threshold - excess_value_credit], color=['red', 'green'])
    ax.set_xlabel('Value')
    ax.set_title('Total Credit Amount Excess Chart')
    st.sidebar.pyplot(fig)
    st.sidebar.write(f'Total Credit Amount Threshold: {total_credit_amount_threshold}')

# Close the database connection when done
conn.close()

# Sidebar
st.sidebar.title('Additional Information')
st.sidebar.markdown('This Streamlit app is for demonstration purposes only.')
"""

with open('/content/streamlit_app.py', 'w') as f:
    f.write(app_code)


In [14]:
# Download ngrok binary for Linux
!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip

# Unzip the downloaded file
!unzip ngrok-stable-linux-amd64.zip

# Move ngrok to /usr/local/bin (or any directory in your PATH)
!sudo mv ngrok /usr/local/bin/

# Clean up the downloaded files (optional)
!rm ngrok-stable-linux-amd64.zip


--2024-01-17 22:33:56--  https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
Resolving bin.equinox.io (bin.equinox.io)... 18.205.222.128, 54.237.133.81, 52.202.168.65, ...
Connecting to bin.equinox.io (bin.equinox.io)|18.205.222.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13921656 (13M) [application/octet-stream]
Saving to: ‘ngrok-stable-linux-amd64.zip’


2024-01-17 22:33:57 (22.2 MB/s) - ‘ngrok-stable-linux-amd64.zip’ saved [13921656/13921656]

Archive:  ngrok-stable-linux-amd64.zip
  inflating: ngrok                   


In [16]:
!ngrok authtoken 2b2dbgzgUwpEFc6LV0p39tU1zJR_3sMRYo8mQp3KCYbxNYMh



Authtoken saved to configuration file: /root/.ngrok2/ngrok.yml


In [12]:
!ngrok config add-authtoken 2b2dbgzgUwpEFc6LV0p39tU1zJR_3sMRYo8mQp3KCYbxNYMh

/bin/bash: line 1: ngrok: command not found


In [None]:
import subprocess
from multiprocessing import Process

# Function to run a command in a separate process and capture output
def run_command(command):
    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = process.communicate()
    output = stdout.decode('utf-8') + stderr.decode('utf-8')
    print(f"Command: {command}\nOutput: {output}")
    return output

# Define the commands
streamlit_command = ['streamlit', 'run', '/content/streamlit_app.py']
localtunnel_command = ['npx', 'localtunnel', '--port', '8501']
ngrok_command = ['ngrok', 'http', '8501']

# Run each command in a separate process
processes = [
    Process(target=run_command, args=(streamlit_command,)),
    Process(target=run_command, args=(localtunnel_command,)),
    Process(target=run_command, args=(ngrok_command,))
]

# Start all processes
for process in processes:
    process.start()

# Wait for all processes to finish
for process in processes:
    process.join()
