In [None]:
# Import necessary libraries
import pycountry
import numpy as np
import pandas as pd
import seaborn as sb
from tqdm.notebook import tqdm
from copy import deepcopy as copy
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.calibration import CalibratedClassifierCV as CCCV
from sklearn.model_selection import cross_validate, StratifiedKFold as SKF
from sklearn.metrics import precision_score, roc_auc_score, average_precision_score
import re
import warnings
import networkx as nx
from node2vec import Node2Vec

# Notebook-wide display configuration.
# Allow up to 500 columns when a DataFrame is rendered.
pd.set_option("display.max_columns", 500)
# Draw every matplotlib figure with the dark theme.
plt.style.use("dark_background")
# Register tqdm's progress-bar integration with pandas.
tqdm.pandas()
# Silence every Python warning for the rest of the session (this also hides
# deprecation warnings raised by the libraries used below).
warnings.filterwarnings("ignore")

In [None]:
# Define functions for model training and visualization
def make_results_df(X, y, n_folds=10, base_estimator=RFC(n_estimators=500, random_state=42, n_jobs=-1)):
    """Collect out-of-fold calibrated probabilities from stratified K-fold CV.

    For each outer fold a ``CalibratedClassifierCV`` wrapping ``base_estimator``
    is fitted on the training rows, and the probability of the second class
    (column 1 of ``predict_proba``) is recorded for the held-out rows.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix. Converted with ``np.asarray`` because rows are selected
        by position.
    y : array-like of shape (n_samples,)
        Class labels. Only column 1 of ``predict_proba`` is kept, so the
        function is meant for two-class problems.
    n_folds : int, default 10
        Number of outer stratified folds.
    base_estimator : estimator
        Classifier handed to ``CalibratedClassifierCV``. The default instance
        is created once, when the function is defined.

    Returns
    -------
    pandas.DataFrame
        One row per sample, in input order, with columns ``y`` (label as
        string), ``prob`` (held-out probability) and ``fold`` (1-based fold
        number as string).
    """
    # Positional indexing below requires plain arrays; this also guarantees the
    # result frame gets a 0..n-1 index regardless of the index `y` came with.
    X = np.asarray(X)
    y = np.asarray(y)

    skf = SKF(n_splits=n_folds, shuffle=True, random_state=42)
    prob = np.full(len(y), np.nan)
    fold = np.empty(len(y), dtype=object)

    # skf.split is a generator, so tqdm needs the total to draw a real bar.
    splits = tqdm(skf.split(X, y), total=n_folds, desc="folds")
    for fold_number, (train_index, test_index) in enumerate(splits, start=1):
        # The estimator is passed positionally: the keyword was renamed from
        # `base_estimator` to `estimator` in newer scikit-learn releases, while
        # the first positional slot is the estimator in both.
        # The same splitter is reused for the inner calibration folds.
        clf = CCCV(base_estimator, cv=skf, n_jobs=-1)
        clf.fit(X[train_index], y[train_index])
        prob[test_index] = clf.predict_proba(X[test_index])[:, 1]
        fold[test_index] = str(fold_number)

    results_df = pd.DataFrame({"y": pd.Series(y).astype(str)})
    results_df["prob"] = prob
    results_df["fold"] = fold
    return results_df

# Additional visualization functions

def plot_results(results_df, common_norm=True):
    """
    Draw a step histogram of the ``prob`` column, coloured by the ``y`` column.

    ``common_norm`` is forwarded unchanged to ``seaborn.histplot``.
    """
    histogram_options = {
        "x": "prob",
        "hue": "y",
        "element": "step",
        "common_norm": common_norm,
        "palette": "Set2",
        "bins": 20,
    }
    sb.histplot(data=results_df, **histogram_options)
    plt.title("Predicted probabilities per class")
    plt.show()

def plot_precisions(results_df):
    """
    Plot precision against the probability threshold, per fold and averaged.

    Parameters
    ----------
    results_df : pandas.DataFrame
        Needs the columns ``y`` (labels convertible to int 0/1), ``prob``
        (predicted probability of the positive class) and ``fold``.

    For every fold and every threshold in 0.0, 0.1, ..., 1.0 a sample is
    predicted positive when ``prob > threshold``. Thresholds with no positive
    prediction get precision 0.
    """
    # linspace avoids the float-step pitfalls of np.arange(0, 1.1, 0.1).
    thresholds = np.linspace(0.0, 1.0, 11)
    fold_labels = results_df["fold"].unique()

    precisions = []
    for fold in fold_labels:
        fold_results = results_df[results_df["fold"] == fold]
        # `y` is stored as strings; cast to int so that labels and the 0/1
        # predictions share one type (same cast as plot_aucs).
        y_true = fold_results["y"].astype(int)
        fold_precisions = [
            precision_score(
                y_true,
                (fold_results["prob"] > threshold).astype(int),
                zero_division=0,
            )
            for threshold in thresholds
        ]
        precisions.append(fold_precisions)
    precisions = np.array(precisions)
    avg_precisions = precisions.mean(axis=0)

    plt.plot(thresholds, avg_precisions, marker="o", label="Average precision")
    # unique() yields folds in order of first appearance, so label each curve
    # with its real fold id instead of its position in that order.
    for fold, fold_prec in zip(fold_labels, precisions):
        plt.plot(thresholds, fold_prec, alpha=0.5, linestyle="--", label=f"Fold {fold}")
    plt.xlabel("Probability threshold")
    plt.ylabel("Precision")
    plt.title("Precision by Threshold per Fold")
    plt.legend()
    plt.show()

def plot_aucs(results_df):
    """
    Plot the ROC AUC of each fold as a bar, with the mean AUC as a dashed line.

    Parameters
    ----------
    results_df : pandas.DataFrame
        Needs the columns ``y`` (labels convertible to int), ``prob``
        (predicted probability of the positive class) and ``fold``.
    """
    fold_labels = results_df["fold"].unique()

    aucs = []
    for fold in fold_labels:
        fold_results = results_df[results_df["fold"] == fold]
        auc = roc_auc_score(fold_results["y"].astype(int), fold_results["prob"])
        aucs.append(auc)

    positions = np.arange(1, len(aucs) + 1)
    plt.bar(positions, aucs, alpha=0.7, label="AUC per fold")
    # Folds are visited in order of first appearance, which need not be
    # 1, 2, 3, ...; tick each bar with the fold id it actually belongs to.
    plt.xticks(positions, [str(fold) for fold in fold_labels])
    plt.axhline(y=np.mean(aucs), color="r", linestyle="--", label="Average AUC")
    plt.xlabel("Fold")
    plt.ylabel("AUC")
    plt.title("AUC per Fold and Average AUC")
    plt.legend()
    plt.show()

In [None]:


# Load both CSV exports; the first one carries a leftover 'Unnamed: 0' column that is discarded.
df = pd.read_csv("../data/Douane_with_holmes_outcomes_unfolded_20230120.csv").drop(columns='Unnamed: 0')
dfn = pd.read_csv("../data/DMF_for_networks_and_undummied.csv")

# Build the EQP_NUMMER/AANKOMSTDATUM key and keep only the first row per key.
dfn['unique_key'] = dfn['EQP_NUMMER'] + '__' + dfn['AANKOMSTDATUM']
dfn = dfn.drop_duplicates(subset='unique_key', keep='first')

# Keep every column whose name matches at least one of these patterns.
good_cols = ['NAAM', 'unique_key', 'AANKOMSTDATUM']
keep_pattern = re.compile('|'.join(good_cols))
to_keep = [col for col in dfn.columns if keep_pattern.search(col)]
dfn = dfn[to_keep].copy()

# Replace missing names with a per-column placeholder.
# NOTE(review): NAAM_VV and NAAM_VM receive no placeholder here - confirm they never contain nulls.
missing_placeholders = {
    'NAAM_KG': 'MISSING_KG',
    'NAAM_VG': 'MISSING_VG',
    'NAAM_OG': 'MISSING_OG',
}
for name_col, placeholder in missing_placeholders.items():
    dfn[name_col] = dfn[name_col].fillna(placeholder)

# Rows whose NAAM_KG is the literal 'SAME AS CONSIGNEE' take the NAAM_OG value of the same row.
same_as_consignee = dfn['NAAM_KG'] == 'SAME AS CONSIGNEE'
dfn['NAAM_KG'] = dfn['NAAM_KG'].mask(same_as_consignee, dfn['NAAM_OG'])

# Build an undirected graph in which each row links its NAAM_VM value to the
# NAAM_VG, NAAM_OG, NAAM_KG and NAAM_VV values of that same row.
hub_col = 'NAAM_VM'
name_cols = ['NAAM_VG', 'NAAM_OG', 'NAAM_KG', 'NAAM_VV', 'NAAM_VM']
G = nx.from_pandas_edgelist(dfn[name_cols], source=hub_col, target='NAAM_VG')
list_ots = ['NAAM_OG', 'NAAM_KG', 'NAAM_VV']
for other_col in list_ots:
    G.add_edges_from(zip(dfn[hub_col], dfn[other_col]))

# Learn a 16-dimensional Node2Vec embedding for every node of the graph.
# NOTE(review): no seed is passed, so the embeddings presumably differ between runs - confirm whether that is acceptable.
walk_options = dict(dimensions=16, walk_length=128, num_walks=10, workers=4, p=1, q=0.5)
node2vec = Node2Vec(G, **walk_options)
mdl = node2vec.fit(vector_size=16, window=10, min_count=1)

# One row per node (indexed by the node label); vectors are looked up under the stringified label.
node_labels = list(G.nodes())
node_vectors = [mdl.wv.get_vector(str(label)) for label in node_labels]
emb_df = pd.DataFrame(node_vectors, index=node_labels)

# Multiply embeddings for feature aggregation
# Each row's feature vector is the element-wise product of the embeddings of
# the five names appearing on that row.
emb_dict = emb_df.to_dict(orient='index')  # kept for any later cell that reads it
# Node label -> embedding as a numpy array. Multiplying `dict.values()` views
# directly (as emb_dict would give) is not supported by numpy.
emb_vectors = dict(zip(emb_df.index, emb_df.to_numpy()))
name_cols = ['NAAM_VG', 'NAAM_KG', 'NAAM_OG', 'NAAM_VM', 'NAAM_VV']
list_of_results = [
    np.prod([emb_vectors[name] for name in row_names], axis=0)
    for row_names in zip(*(dfn[col] for col in name_cols))
]

# Prepare final dataframe for model training
# dfn's index is whatever it was before this section (not necessarily 0..n-1),
# so give resdf that same index; otherwise the column assignment below would
# align rows on mismatching labels.
resdf = pd.DataFrame(list_of_results, index=dfn.index)
new_cols = ['emb_' + str(col) for col in resdf.columns]
resdf.columns = new_cols
for col in resdf:
    dfn[col] = resdf[col]

# Join both frames on the EQP_NUMMER/AANKOMSTDATUM key (inner join on the index).
dfn.set_index('unique_key', inplace=True)
df['unique_key'] = df['EQP_NUMMER'] + '__' + df['AANKOMSTDATUM']
df.set_index('unique_key', inplace=True)
df_final = pd.merge(df, dfn, left_index=True, right_index=True)
df_final.reset_index(inplace=True, drop=True)

# Process and encode trade type based on origin and destination:
# destination Netherlands -> "import", else origin Netherlands -> "export",
# anything else -> "transit". Columns are addressed by label; positional
# `row[0]` / `row[1]` on a label-indexed Series is deprecated in pandas.
origin = df["Prisma_incoming_country_origin"]
destination = df["Prisma_incoming_country_destination"]
df["trade_type"] = np.where(
    destination == "Netherlands",
    "import",
    np.where(origin == "Netherlands", "export", "transit"),
)

# Put "trade_type" second to last and "target" last, so that the positional
# slices below treat trade_type as a feature and target as the label without
# "target" ever appearing among the features.
columns = [col for col in df.columns if col not in ("trade_type", "target")]
columns += ["trade_type", "target"]
df = df[columns]

# Train and evaluate the model for non-compliance and compliance detection
# NOTE(review): the model is fitted on `df`; `df_final` (the frame that carries
# the emb_* columns) is not used here - confirm this is intended.
# NOTE(review): the first two columns are skipped, presumably identifier
# columns - verify against the CSV schema.
X_columns = df.columns[2:-1]
y_column = df.columns[-1]
# dtype=float keeps the dummy columns numeric so the matrix is not an object array.
X = pd.get_dummies(df[X_columns], dtype=float).to_numpy()
y = df[y_column].to_numpy()
results_df = make_results_df(X, y, n_folds=10)
plot_results(results_df, common_norm=False)
