In [None]:
import sys
import subprocess

def install_packages():
    """Install any third-party dependency of this notebook that is not importable.

    Each entry pairs the pip distribution name with the module name used to
    import it, because the two differ for several packages (for example the
    ``scikit-learn`` distribution is imported as ``sklearn``). Deriving the
    module name by replacing ``-`` with ``_`` made the import probe fail for
    those packages even when they were already installed, so pip was invoked
    for them on every call.

    Raises:
        subprocess.CalledProcessError: if a ``pip install`` exits non-zero.
    """
    import importlib  # local import: only sys/subprocess are imported above this function

    # (pip distribution name, importable module name)
    packages = [
        ('psycopg2-binary', 'psycopg2'),  # PostgreSQL connector
        ('umap-learn', 'umap'),           # UMAP for embeddings
        ('scikit-learn', 'sklearn'),      # ML utilities
        ('pandas', 'pandas'),             # Data manipulation
        ('numpy', 'numpy'),               # Numerical operations
        ('matplotlib', 'matplotlib'),     # Plotting
        ('seaborn', 'seaborn'),           # Enhanced plotting
    ]

    installed_any = False
    for package, module_name in packages:
        try:
            importlib.import_module(module_name)
        except ImportError:
            print(f"Installing {package}...")
            # sys.executable targets the interpreter running this kernel.
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
            installed_any = True

    if installed_any:
        # Let the running interpreter notice modules pip just wrote to disk.
        importlib.invalidate_caches()

# install_packages()

# Import all necessary libraries
import numpy as np
import pandas as pd
import psycopg2
import json
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA
import umap
from urllib.parse import urlparse
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress every warning: no category, message or module filter is given,
# so the 'ignore' action applies to all of them.
warnings.filterwarnings('ignore')
# NOTE(review): printed unconditionally once the imports above succeed; the
# install_packages() call earlier in this cell is commented out, so this
# message does not confirm that an install step ran.
print("✅ All dependencies installed and imported successfully!")


Installing psycopg2-binary...
Installing umap-learn...
Collecting umap-learn
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Downloading umap_learn-0.5.7-py3-none-any.whl (88 kB)
Downloading pynndescent-0.5.13-py3-none-any.whl (56 kB)
Installing collected packages: pynndescent, umap-learn
Successfully installed pynndescent-0.5.13 umap-learn-0.5.7
Installing scikit-learn...


  from .autonotebook import tqdm as notebook_tqdm


✅ All dependencies installed and imported successfully!


In [2]:
def connect_to_database():
    """Open a connection to the PostgreSQL database.

    Connection settings are read from the environment so credentials do not
    have to live in the notebook: ``VV8_DB_HOST``, ``VV8_DB_NAME``,
    ``VV8_DB_USER``, ``VV8_DB_PASSWORD`` and ``VV8_DB_PORT``. Any variable
    that is unset falls back to the local development value that was
    previously hard-coded here, so existing usage is unchanged.

    Returns:
        The object returned by ``psycopg2.connect`` on success, or ``None``
        when a setting is invalid or the connection attempt fails (the error
        is printed rather than raised).
    """
    import os  # local import: not among the imports visible at the top of the notebook

    try:
        conn = psycopg2.connect(
            host=os.environ.get("VV8_DB_HOST", "localhost"),
            database=os.environ.get("VV8_DB_NAME", "vv8_backend"),
            user=os.environ.get("VV8_DB_USER", "vv8"),
            password=os.environ.get("VV8_DB_PASSWORD", "vv8"),
            # Converted inside the try so a non-numeric port is reported
            # like any other connection failure instead of raising.
            port=int(os.environ.get("VV8_DB_PORT", "5434")),
        )
        print("✅ Successfully connected to database!")
        return conn
    except Exception as e:
        print(f"❌ Error connecting to database: {e}")
        return None

def load_data_from_db(query=None):
    """Load a query result from the PostgreSQL database into a DataFrame.

    Args:
        query: SQL text to run. When omitted, every row and column of
            ``multicore_static_info_known_companies`` is selected, which is
            what this function loaded before the parameter existed.

    Returns:
        The DataFrame produced by ``pd.read_sql_query``, or ``None`` if no
        connection could be opened or the query failed (the error is printed
        rather than raised).
    """
    conn = connect_to_database()
    if conn is None:
        return None

    if query is None:
        query = "SELECT * FROM multicore_static_info_known_companies;"

    try:
        print("Loading data from database...")
        df = pd.read_sql_query(query, conn)

        print(f"✅ Loaded {len(df)} records from database")
        print(f"Columns: {list(df.columns)}")
        return df
    except Exception as e:
        print(f"❌ Error loading data: {e}")
        return None
    finally:
        # Single cleanup path. It also runs when the cell is interrupted:
        # KeyboardInterrupt is not an Exception subclass, so the except
        # clause alone would leave the connection open in that case.
        conn.close()

# Pull the table into memory; df_raw is None when the load did not succeed.
df_raw = load_data_from_db()

if df_raw is not None:
    column_preview = df_raw.columns.tolist()[:10]
    print(f"\nDataset shape: {df_raw.shape}")
    print(f"Sample columns: {column_preview}")

    # Class balance, reported only when a 'label' column is present.
    if 'label' in df_raw.columns:
        label_counts = df_raw['label'].value_counts()
        print("\nLabel distribution:")
        for class_name, class_value in (("Negative", 0), ("Positive", 1)):
            print(f"  {class_name} ({class_value}): {label_counts.get(class_value, 0)}")


✅ Successfully connected to database!
Loading data from database...
✅ Loaded 2229 records from database
Columns: ['script_id', 'script_url', 'code', 'max_api_aggregation_score', 'behavioral_api_agg_count', 'fp_api_agg_count', 'max_aggregated_apis', 'max_behavioral_api_aggregation_score', 'aggregated_behavioral_apis', 'max_fingerprinting_api_aggregation_score', 'aggregated_fingerprinting_apis', 'attached_listeners', 'fingerprinting_source_apis', 'behavioral_source_apis', 'behavioral_source_api_count', 'fingerprinting_source_api_count', 'behavioral_apis_access_count', 'fingerprinting_api_access_count', 'graph_construction_failure', 'dataflow_to_sink', 'apis_going_to_sink', 'submission_url', 'label', 'vendor']

Dataset shape: (2229, 24)
Sample columns: ['script_id', 'script_url', 'code', 'max_api_aggregation_score', 'behavioral_api_agg_count', 'fp_api_agg_count', 'max_aggregated_apis', 'max_behavioral_api_aggregation_score', 'aggregated_behavioral_apis', 'max_fingerprinting_api_aggregatio