In [83]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the raw transaction records from the CSV export into a DataFrame.
csv_path = 'Big_Black_Money_Dataset.csv'
df = pd.read_csv(csv_path)

# Show the leading rows as a quick sanity check of the parsed columns.
df.head()

Unnamed: 0,Transaction ID,Country,Amount (USD),Transaction Type,Date of Transaction,Person Involved,Industry,Destination Country,Reported by Authority,Source of Money,Money Laundering Risk Score,Shell Companies Involved,Financial Institution,Tax Haven Country
0,TX0000000001,Brazil,3267530.0,Offshore Transfer,2013-01-01 00:00:00,Person_1101,Construction,USA,True,Illegal,6,1,Bank_40,Singapore
1,TX0000000002,China,4965767.0,Stocks Transfer,2013-01-01 01:00:00,Person_7484,Luxury Goods,South Africa,False,Illegal,9,0,Bank_461,Bahamas
2,TX0000000003,UK,94167.5,Stocks Transfer,2013-01-01 02:00:00,Person_3655,Construction,Switzerland,True,Illegal,1,3,Bank_387,Switzerland
3,TX0000000004,UAE,386420.1,Cash Withdrawal,2013-01-01 03:00:00,Person_3226,Oil & Gas,Russia,False,Illegal,7,2,Bank_353,Panama
4,TX0000000005,South Africa,643378.4,Cryptocurrency,2013-01-01 04:00:00,Person_7975,Real Estate,USA,True,Illegal,1,9,Bank_57,Luxembourg


In [84]:
# Distinct values of the target column, in order of first appearance.
risk_scores = df['Money Laundering Risk Score']
risk_scores.unique()

array([ 6,  9,  1,  7,  3,  8,  5,  4, 10,  2])

In [85]:
# Drop the identifier-like columns (person, financial institution,
# transaction ID) from the working frame.
# Reassigning instead of inplace=True, together with errors='ignore', keeps
# this cell re-runnable: a second execution is a no-op rather than a KeyError.
identifier_columns = ['Person Involved', 'Financial Institution', 'Transaction ID']
df = df.drop(columns=identifier_columns, errors='ignore')
df.head()

Unnamed: 0,Country,Amount (USD),Transaction Type,Date of Transaction,Industry,Destination Country,Reported by Authority,Source of Money,Money Laundering Risk Score,Shell Companies Involved,Tax Haven Country
0,Brazil,3267530.0,Offshore Transfer,2013-01-01 00:00:00,Construction,USA,True,Illegal,6,1,Singapore
1,China,4965767.0,Stocks Transfer,2013-01-01 01:00:00,Luxury Goods,South Africa,False,Illegal,9,0,Bahamas
2,UK,94167.5,Stocks Transfer,2013-01-01 02:00:00,Construction,Switzerland,True,Illegal,1,3,Switzerland
3,UAE,386420.1,Cash Withdrawal,2013-01-01 03:00:00,Oil & Gas,Russia,False,Illegal,7,2,Panama
4,South Africa,643378.4,Cryptocurrency,2013-01-01 04:00:00,Real Estate,USA,True,Illegal,1,9,Luxembourg


In [86]:
# Report, for sender and destination countries alike, how many distinct
# countries occur and which ones they are.
# Fix: the second pair of messages previously reused the "Anzahl Länder"
# (number of countries) label although it prints the list of country names.
print(f"Anzahl Länder (Sender): {df['Country'].nunique()}")
print(f"Anzahl Länder (Empfänger): {df['Destination Country'].nunique()}")

print(f"Länder (Sender): {df['Country'].unique()}")
print(f"Länder (Empfänger): {df['Destination Country'].unique()}")

Anzahl Länder (Sender): 10
Anzahl Länder (Empfänger): 10
Anzahl Länder (Sender): ['Brazil' 'China' 'UK' 'UAE' 'South Africa' 'Russia' 'Switzerland' 'India'
 'USA' 'Singapore']
Anzahl Länder (Empfänger): ['USA' 'South Africa' 'Switzerland' 'Russia' 'Brazil' 'UK' 'India' 'China'
 'Singapore' 'UAE']


In [87]:
# Feature preparation for the risk-score model.

# Location and industry columns are left out of the feature set entirely.
df = df.drop(columns=['Country', 'Destination Country', 'Industry', 'Tax Haven Country'])

# One-hot encode the remaining categorical columns in a single pass. Each new
# column is named '<original column>_<value>', the originals are removed, and
# the indicator columns are appended in the order listed here.
df = pd.get_dummies(
    df,
    columns=['Transaction Type', 'Source of Money', 'Reported by Authority'],
)

# Convert the timestamp to seconds since the Unix epoch (float).
# Subtracting the epoch and dividing by a one-second Timedelta does not depend
# on the platform's default integer width or on the datetime storage unit,
# unlike casting with astype(int) and dividing by 10**9.
transaction_time = pd.to_datetime(df['Date of Transaction'])
df['Date of Transaction'] = (transaction_time - pd.Timestamp('1970-01-01')) / pd.Timedelta(seconds=1)

df.head()





Unnamed: 0,Amount (USD),Date of Transaction,Money Laundering Risk Score,Shell Companies Involved,Transaction Type_Cash Withdrawal,Transaction Type_Cryptocurrency,Transaction Type_Offshore Transfer,Transaction Type_Property Purchase,Transaction Type_Stocks Transfer,Source of Money_Illegal,Source of Money_Legal,Reported by Authority_False,Reported by Authority_True
0,3267530.0,1356998000.0,6,1,False,False,True,False,False,True,False,False,True
1,4965767.0,1357002000.0,9,0,False,False,False,False,True,True,False,True,False
2,94167.5,1357006000.0,1,3,False,False,False,False,True,True,False,False,True
3,386420.1,1357009000.0,7,2,True,False,False,False,False,True,False,True,False
4,643378.4,1357013000.0,1,9,False,True,False,False,False,True,False,False,True


In [88]:
# Separate the prediction target from the feature matrix.
target_column = 'Money Laundering Risk Score'
y = df[target_column]
X = df.drop(columns=[target_column])

In [89]:
# Print the shapes of the feature matrix and the target side by side.
print(*(part.shape for part in (X, y)))

(10000, 12) (10000,)


In [90]:
from sklearn.model_selection import train_test_split

# Hold out 20 % of the rows for evaluation; the fixed seed makes the split repeatable.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [91]:
# Print feature shapes (train, test) on one line and target shapes on the next.
for train_part, test_part in ((X_train, X_test), (y_train, y_test)):
    print(train_part.shape, test_part.shape)

(8000, 12) (2000, 12)
(8000,) (2000,)


In [92]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Build a depth-limited decision tree and train it on the training split
# in one step (fit returns the estimator itself).
model = DecisionTreeClassifier(max_depth=5, random_state=42).fit(X_train, y_train)

# Predict the held-out rows and score them against the true labels.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# NOTE(review): the target takes ten distinct scores, so an accuracy close to
# 0.10 would be no better than uniform guessing - compare against a baseline
# before drawing conclusions from this model.

Accuracy: 0.10


In [94]:
# Pair every training column with the importance the fitted tree assigned to it.
importance = model.feature_importances_
for feature_name, weight in zip(X_train.columns, importance):
    print(f"Feature: {feature_name}, Importance: {weight:.2f}")

Feature: Amount (USD), Importance: 0.41
Feature: Date of Transaction, Importance: 0.21
Feature: Shell Companies Involved, Importance: 0.19
Feature: Transaction Type_Cash Withdrawal, Importance: 0.00
Feature: Transaction Type_Cryptocurrency, Importance: 0.00
Feature: Transaction Type_Offshore Transfer, Importance: 0.04
Feature: Transaction Type_Property Purchase, Importance: 0.12
Feature: Transaction Type_Stocks Transfer, Importance: 0.00
Feature: Source of Money_Illegal, Importance: 0.00
Feature: Source of Money_Legal, Importance: 0.00
Feature: Reported by Authority_False, Importance: 0.00
Feature: Reported by Authority_True, Importance: 0.02
