# EXPLORED EDA - NLP

## IMPORTS

In [28]:
# ---------------------------------------------------------
# GENERAL IMPORTS
# ---------------------------------------------------------
# CORE PYTHON & DATA MANIPULATION LIBRARIES
import numpy as np                # Numerical computations, arrays, math operations
import pandas as pd               # Data handling, DataFrames, time-series structures
import warnings                   # Warning control and suppression
import math                       # Math utilities (sqrt, log, floor, ceil, etc.)
# VISUALIZATION LIBRARIES
import matplotlib.pyplot as plt   # Main plotting library
import seaborn as sns             # Statistical and enhanced visualization tools
# DATA SPLIT UTILITIES
from sklearn.model_selection import train_test_split   # Split dataset into train / test subsets
# MODEL SELECTION
from sklearn.model_selection import GridSearchCV       # Hyperparameter optimization via grid search
from sklearn.tree import plot_tree                     # Visualization of decision tree structures
from pickle import dump                                # Save trained models to disk (serialization)

# ---------------------------------------------------------
# CLASSIFICATION ALGORITHMS
# ---------------------------------------------------------
# METRICS
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score   # Core classification metrics
# PREDICTION MODELS
from sklearn.linear_model import LogisticRegression     # Logistic regression classifier
from sklearn.tree import DecisionTreeClassifier         # Decision tree classifier
from sklearn.naive_bayes import GaussianNB              # Gaussian Naive Bayes for continuous inputs
from sklearn.naive_bayes import MultinomialNB           # Multinomial Naive Bayes (common for NLP)
from sklearn.naive_bayes import BernoulliNB             # Bernoulli Naive Bayes (binary features)
# BAGGING ENSEMBLE
from sklearn.ensemble import RandomForestClassifier     # Ensemble of decision trees (bagging)
# BOOSTING ENSEMBLE
from sklearn.ensemble import AdaBoostClassifier         # AdaBoost boosting algorithm
from sklearn.ensemble import GradientBoostingClassifier # Gradient boosting classifier
from xgboost import XGBClassifier                       # Extreme Gradient Boosting (high-performance)
from lightgbm import LGBMClassifier                     # LightGBM (optimized gradient boosting)

# ---------------------------------------------------------
# REGRESSION ALGORITHMS
# ---------------------------------------------------------
# METRICS
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score   # Regression performance metrics
# PREDICTION MODELS
from sklearn.linear_model import LinearRegression       # Linear regression model
from sklearn.linear_model import Lasso                  # L1 regularized regression
from sklearn.linear_model import Ridge                  # L2 regularized regression
from sklearn.tree import DecisionTreeRegressor          # Regression decision tree
# BAGGING ENSEMBLE
from sklearn.ensemble import RandomForestRegressor      # Ensemble of regression trees (bagging)
# BOOSTING ENSEMBLE
from sklearn.ensemble import AdaBoostRegressor          # Boosting algorithm for regression
from sklearn.ensemble import GradientBoostingRegressor  # Gradient boosting regressor
from xgboost import XGBRegressor                        # XGBoost regressor
from lightgbm import LGBMRegressor                      # LightGBM regressor

# ---------------------------------------------------------
# NLP DATASETS - EDA
# ---------------------------------------------------------
import regex as re


# STEP 13) VECTORIZATION
from sklearn.feature_extraction.text import CountVectorizer     # Convert text into token frequency counts
from sklearn.feature_extraction.text import TfidfVectorizer     # Convert text into TF-IDF weighted features
from sklearn.decomposition import PCA                           # Dimensionality reduction (e.g., for visualization)
from sklearn.metrics import silhouette_score                    # Clustering quality metric (optional for NLP clustering)

# ---------------------------------------------------------
# TIME-SERIES DATASETS - EDA
# ---------------------------------------------------------
# STEP 3) DECOMPOSING
from statsmodels.tsa.seasonal import seasonal_decompose         # Decompose time-series into trend, seasonal, and residual components
# STEP 4) STATIONARITY ANALYSIS
from statsmodels.tsa.stattools import adfuller                  # Dickey-Fuller test for stationarity evaluation
# STEP 5) VARIABILITY ANALYSIS
from statsmodels.stats.diagnostic import acorr_ljungbox         # Ljung-Box test for checking autocorrelation in residuals
# STEP 6) AUTOCORRELATION ANALYSIS
from statsmodels.tsa.stattools import acf                       # Compute autocorrelation values  
from statsmodels.tsa.stattools import pacf                      # Compute partial autocorrelation values
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf   # Plot ACF & PACF with confidence bands
# STEP 16) PREDICTION MODELS
from statsmodels.tsa.arima.model import ARIMA                   # ARIMA(p,d,q) model for forecasting
from pmdarima import auto_arima                                 # Automatic ARIMA/SARIMA parameter selection
from statsmodels.tools.sm_exceptions import ConvergenceWarning  # Warning raised when ARIMA optimizer fails
warnings.filterwarnings("ignore", category=ConvergenceWarning)  # Suppress convergence warnings globally (keeps logs clean)

# ---------------------------------------------------------
# TABULAR DATASETS - EDA
# ---------------------------------------------------------
# STEP 10) REMOVE NOISY ATTRIBUTES
from scipy.stats import chi2_contingency                # Chi-square test for categorical dependencies
# STEP 12) SCALLING
from sklearn.preprocessing import StandardScaler        # Standardization (mean=0, std=1)
from sklearn.preprocessing import MinMaxScaler          # Min-max scaling to [0,1]
# STEP 13) ENCODING
from sklearn.preprocessing import LabelEncoder          # Encode categories into integer labels
from sklearn.preprocessing import OneHotEncoder         # Encode categories into binary vectors
# STEP 14) FEATURE SELECTION
from sklearn.feature_selection import SelectKBest, f_classif   # Univariate feature selection for classification

## HELPER FUNCTIONS

In [20]:
# ---------------------------------------------------------
# COMMON
# ---------------------------------------------------------
# Log printer
def log(message: str, level: int = 1, type: str = "INFO", custom_icon: str = None, bold: bool = False):
    # Default icons according to message type
    icons = {
        "INFO": "‚ÑπÔ∏è",
        "FOUND": "üîç",
        "SUCCESS": "‚úÖ",
        "ERROR": "‚ùå",
        "WARNING": "‚ö†Ô∏è",
    }
    # Use custom icon if provided
    if custom_icon:
        icon = custom_icon
    else:
        icon = icons.get(type.upper(), "‚ÑπÔ∏è")
    # Bold wrapper (ANSI)
    if bold:
        message = f"\033[1m{message}\033[0m"
    # First level ‚Üí bullet
    if level == 1:
        prefix = "‚Ä¢"
    # Second level ‚Üí indent + hyphen
    elif level == 2:
        prefix = "   -"
    # Level 3 ‚Üí deeper indent + middle dot
    elif level == 3:
        prefix = "      ¬∑"
    # Fallback
    else:
        prefix = "-"
    # Final print
    print(f"{prefix} {icon} {message}")

# -------------------------------
# NLP - EDA
# -------------------------------
# Adaptive text preprocessing: for natural text, URLs, and mixed content
def preprocess_text(text,
                    mode="auto",                # "auto", "text", "url"
                    lowercase_text=True,
                    remove_urls=True,           # Only applies in text mode
                    remove_emails=True,
                    remove_html_tags=True,
                    remove_non_letters=True,    # Only applies in text mode
                    remove_single_char_tokens=True,
                    reduce_whitespace=True,
                    tokenize_output=True):
    # Convert to string safely
    if text is None:
        return {"output": [], "mode_used": "none"}
    text = str(text)
    # Automatic URL detection
    def looks_like_url(s):
        if s.startswith("http://") or s.startswith("https://"):
            return True
        if re.search(r"\.[a-z]{2,4}(/|$)", s):   # domain-like pattern
            return True
        return False
    # Decide mode
    if mode == "auto":
        is_url = looks_like_url(text)
    elif mode == "url":
        is_url = True
    else:
        is_url = False
    # URL MODE
    if is_url:
        mode_used = "url"
        if lowercase_text:
            text = text.lower()
        # Split URL into meaningful tokens
        text = re.sub(r'[:/\.\?\=\&\-\_#]+', ' ', text)
        if remove_single_char_tokens:
            text = re.sub(r'\b[a-z]\b', ' ', text)
        if reduce_whitespace:
            text = re.sub(r'\s+', ' ', text)
        text = text.strip()
        if tokenize_output:
            tokens = text.split()
            clean_tokens = []
            for tok in tokens:
                if tok:
                    clean_tokens.append(tok)
            return {"output": clean_tokens, "mode_used": mode_used}
        return {"output": text, "mode_used": mode_used}
    # TEXT MODE
    mode_used = "text"
    if lowercase_text:
        text = text.lower()
    if remove_urls:
        text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
    if remove_emails:
        text = re.sub(r'\S+@\S+\.\S+', ' ', text)
    if remove_html_tags:
        text = re.sub(r'<.*?>', ' ', text)
    if remove_non_letters:
        text = re.sub(r'[^a-z\s]', ' ', text)
    if remove_single_char_tokens:
        text = re.sub(r'\b[a-z]\b', ' ', text)
    if reduce_whitespace:
        text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    if tokenize_output:
        tokens = text.split()
        return {"output": tokens, "mode_used": mode_used}
    return {"output": text, "mode_used": mode_used}

# ---------------------------------------------------------
# TIME-SERIES DATASETS - EDA
# ---------------------------------------------------------
# Determines granularity given seconds
def determine_granularity(seconds: float):
    if seconds < 1:
        granularity = "sub-second"
    elif seconds >= 1 and seconds < 60:
        granularity = "second"
    elif seconds >= 60 and seconds < 3600:
        granularity = "minute"
    elif seconds >= 3600 and seconds < 86400:
        granularity = "hour"
    elif seconds == 86400:
        granularity = "day"
    elif seconds > 86400 and seconds <= 86400 * 7:
        granularity = "multi-day"
    elif seconds == 86400 * 7:
        granularity = "week"
    elif seconds > 86400 * 7 and seconds < 86400 * 28:
        granularity = "weekly-to-monthly"
    elif seconds >= 86400 * 28 and seconds <= 86400 * 31:
        granularity = "month"
    elif seconds > 86400 * 31 and seconds <= 86400 * 92:
        granularity = "quarter"
    else:
        granularity = "year-or-more"
    return granularity

# Detects whether the seasonal decomposition model should be
# 'additive' (constant amplitude) or 
# 'multiplicative'(amplitude grows with the trend)
def infer_seasonal_component_type(series: pd.Series, threshold: float) -> str:
    # Compute amplitude and mean
    amplitude = series.max() - series.min()
    mean_val = series.mean()
    # Edge case: zero-mean ‚Üí multiplicative impossible
    if mean_val == 0:
        return "additive"
    # Threshold for deciding multiplicative
    if amplitude / abs(mean_val) > threshold:
        return "multiplicative"
    else:
        return "additive"

#  Detects seasonality period automatically using the first significant ACF peak
def infer_period_from_acf(series: pd.Series, max_lag_ratio: float = 0.1):
    n = len(series)
    max_lag = max(5, int(n * max_lag_ratio))
    # Compute autocorrelation using FFT
    autocorr = acf(series, nlags=max_lag, fft=True, missing="drop")
    # Ignore lag 0
    autocorr[0] = 0
    # Find the highest correlation peak
    peak_lag = np.argmax(autocorr)
    # If the peak is too low ‚Üí no seasonality
    if autocorr[peak_lag] < 0.3:
        return None
    return peak_lag

#  Detects seasonality period automatically using Granularity (fallback when ACF fails or data is too noisy)
def infer_period_from_granularity(granularity: str):
    if granularity == "sub-second":
        return None
    if granularity == "second":
        return 60
    if granularity == "minute":
        return 60
    if granularity == "hour":
        return 24
    if granularity == "day":
        return 7  # Most common weekly cycle IF the series shows any seasonality
    if granularity == "multi-day":
        return 7
    if granularity == "week":
        return 52
    if granularity == "weekly-to-monthly":
        return 12
    if granularity == "month":
        return 12
    if granularity == "quarter":
        return 4
    return None # Yearly or undefined ‚Üí no decomposition

# 6) Evaluates how strong the seasonality is using:
# 1) Variance ratio: Var(seasonal) / Var(original)
# 2) ACF at the seasonal period
def assess_seasonality_strength(original: pd.Series, seasonal: pd.Series, period: int, acf_threshold: float, var_ratio: float):
    # Align indices and remove NaN values from the seasonal component
    valid_mask = seasonal.notna()
    original_valid = original[valid_mask]
    seasonal_valid = seasonal[valid_mask]
    # If there are not enough valid points ‚Üí cannot assess
    if len(original_valid) < max(10, period * 2):
        metrics = {
            "seasonal_var_ratio": np.nan,
            "acf_at_period": np.nan
        }
        return False, metrics
    # 1) Variance ratio
    total_var = np.var(original_valid)
    seasonal_var = np.var(seasonal_valid)
    if total_var == 0:
        seasonal_var_ratio = 0.0
    else:
        seasonal_var_ratio = seasonal_var / total_var
    # 2) ACF at seasonal period
    acf_values = acf(
        original_valid,
        nlags = period,
        fft = True,
        missing = "drop"
    )
    acf_at_period = acf_values[period]
    # 3) Decision rule
    strong_seasonality = ((seasonal_var_ratio >= var_ratio) and (acf_at_period >= acf_threshold))
    # 4) Metrics
    metrics = {
        "seasonal_var_ratio": seasonal_var_ratio,
        "acf_at_period": acf_at_period
    }
    return strong_seasonality, metrics

# Performs Dickey-Fuller test to determine if a series is stacionary or not
def test_stationarity(series):
    dftest = adfuller(series, autolag = "AIC")
    dfoutput = pd.Series(dftest[0:4], index = ["Test Statistic", "p-value", "#Lags Used", "Number of Observations Used"])
    for key,value in dftest[4].items():
        dfoutput["Critical Value (%s)"%key] = value
    return dfoutput

# Recursively differences the time-series until Dickey-Fuller test accepts stationarity (p < alpha)
def make_stationary_recursive(series, alpha: float = 0.05, max_diff: int = 5):
    current_series = series.copy()
    diff_count = 0
    while diff_count <= max_diff:
        test_results = test_stationarity(current_series)
        if test_results["p-value"] < alpha:
            return current_series, diff_count, test_results
        current_series = current_series.diff().dropna()
        diff_count += 1
    # If exceeded max_diff ‚Üí return last attempt
    return current_series, diff_count, test_results

# Function to get recommended lag based on granularity
def get_recommended_lag(granularity):
    if granularity == "sub-second" or granularity == "second":
        return 600   # Captures 10 minutes of autocorrelation
    if granularity == "minute":
        return 300   # Captures 5 hours
    if granularity == "hour":
        return 200   # Captures up to weekly cycles (168h)
    if granularity == "day":
        return 60    # Enough for weekly + monthly seasonality
    if granularity == "multi-day":
        return 60
    if granularity == "week":
        return 60    # Enough to detect annual cycle (52 weeks)
    if granularity == "month":
        return 48    # Captures 4 years of monthly pattern
    if granularity == "quarter":
        return 20
    return 10        # Yearly or undefined ‚Üí very small possible lags

# Function to get recommended cutoff for short-lag autocorrelation detection based on granularity
def get_short_lag_cutoff(granularity):
    if granularity == "sub-second" or granularity == "second":
        return 30   # 30 seconds of persistence
    if granularity == "minute":
        return 60   # 1 hour of short-term memory
    if granularity == "hour":
        return 24   # One day's worth of lags
    if granularity == "day":
        return 14   # Two weeks ‚Üí enough to detect trend
    if granularity == "multi-day":
        return 10
    if granularity == "week":
        return 8    # ~2 months of weekly persistence
    if granularity == "month":
        return 12   # 1 year of autocorrelation
    if granularity == "quarter":
        return 8
    return 3        # Very limited short-term interpretation

# Return a pandas-compatible frequency string based on granularity
def get_freq_from_granularity(granularity: str):
    freq_map = {
        "sub-second": None,             # No valid pandas freq for < 1 second
        "second": "S",                  # Second-level frequency
        "minute": "T",                  # Minute-level frequency
        "hour": "H",                    # Hourly frequency
        "day": "D",                     # Daily frequency
        "multi-day": "D",               # Best stable approximation
        "week": "W",                    # Weekly frequency
        "weekly-to-monthly": "W",       # Ambiguous ‚Üí weekly fits more stable
        "month": "M",                   # Monthly frequency
        "quarter": "Q",                 # Quarterly frequency
        "year-or-more": "A"             # Annual frequency
    }
    if granularity not in freq_map:
        return None
    return freq_map[granularity]

# Get seasonal period m for auto_arima
def get_auto_arima_m(period: int, seasonal_peaks: list):
    if period is None:
        return 1
    if seasonal_peaks is None:
        return 1
    if len(seasonal_peaks) == 0:
        return 1
    if period >= 2:
        return int(period)
    return 1

## COMMON INPUTS

In [3]:
# Plotting
num_values_to_plot = 40     # Max number of different values to plot (for CATEGORY_var)
num_bins = 100              # Num of bins (for NUMERIC_var plots)
figHeight_unit = 8
figWidth_unit = 12
plot_palette = "pastel"
plot_tick_font_size = 15
plot_label_font_size = 15
plot_text_font_size = 20
plot_title_font_size = 30

## STEP 0 - LOAD RAW DATAFRAME

In [4]:
# Raw dataset formatting
data_separator = ","
input_path = "../data/raw/internal-link.csv"
# Thresholds for dataset type proposal (NLP)
min_text_avg_length = 25      # Recommended: 20‚Äì30 chars ‚Üí typical minimum for real text
min_text_avg_words  = 3       # Recommended: >3 words ‚Üí avoids titles/labels
min_points_nlp = 70           # Min points to be considered NLP dataset (max point = 100)
# Thresholds for dataset type proposal (Time-Series)
ts_main_col_index = 0         # Index of the datetime column to be used as the primary time axis
min_rows_in_dataset = 75      # Recommended: > 75
max_numeric_var = 5           # More than 5 is odd for a time-series
freq_ratio_threshold = 0.7    # Recommended: > 0.7 (values from 0 to 1)
min_points_ts = 70            # Min points to be considered Time-Series dataset (max point = 100)

In [5]:
print("-------------------------------")
print("STEP 0) LOAD RAW DATAFRAME")
print("-------------------------------\n")

# Load raw DataFrame
df_raw = pd.read_csv(input_path, sep=data_separator)
log("DataFrame loaded successfully!", type="SUCCESS")

# ---------------------------------------------------------
# NLP CHECKING (probabilistic)
# ---------------------------------------------------------
log("NLP cheking:", custom_icon="üìù")
nlp_score = 0            # Final probability score (0‚Äì100)
# Evidence 1: dataset has at least one text-like column
object_cols = []
for col in df_raw.columns:
    if df_raw[col].dtype in ["object", "category"]:
        object_cols.append(col)
if len(object_cols) >= 1:
    nlp_score += 25
    log(f"Object-type columns found: {object_cols} (+25 points)", level=2, type="SUCCESS")
else:
    log("No object-type columns found", level=2, type="ERROR")
# Evidence 2: long text columns exist
text_cols = []
for col in object_cols:
    avg_len = df_raw[col].dropna().str.len().mean()
    if avg_len is not None and avg_len > min_text_avg_length:
        text_cols.append(col)
if len(text_cols) >= 1:
    nlp_score += 25
    log(f"Long text-like columns found: {text_cols} (+25 points)", level=2, type="SUCCESS")
else:
    log("No long text-like columns found", level=2, type="ERROR")
# Evidence 3: rich text structure (words per entry)
rich_text_cols = []
for col in text_cols:
    avg_words = df_raw[col].dropna().str.split().str.len().mean()
    if avg_words is not None and avg_words > min_text_avg_words:
        rich_text_cols.append(col)
if len(rich_text_cols) > 0:
    nlp_score += 25
    log(f"At least one column shows word-rich entries (+25 points)", level=2, type="SUCCESS")
else:
    log("No column shows word-rich entries", level=2, type="ERROR")
# Evidence 4: number of text columns realistic for NLP
if len(text_cols) == 1:
    nlp_score += 15
    log(f"There is only 1 long text-like column, it is typical for NLP (+15 points)", level=2, type="SUCCESS")
elif len(text_cols) > 1:
    nlp_score += 5
    log(f"There are more than 1 long text-like column, it could be possible for NLP (+5 points)", level=2, type="SUCCESS")
else:
    log("No long text-like columns found", level=2, type="ERROR")
# Evidence 5: proportion of non-empty text entries
if len(text_cols) >= 1:
    non_empty_ratio = df_raw[text_cols[0]].dropna().str.len().gt(10).mean()
    if non_empty_ratio >= 0.6:
        nlp_score += 10
        log(f"Majority of rows contain meaningful text (+10 points)", level=2, type="SUCCESS")
    else:
        log("Too many empty/short text entries", level=2, type="ERROR")
# Cap score at 100
nlp_score = min(nlp_score, 100)
log(f"Score to be a NLP Dataset: {nlp_score}/100 points", level=2, type="INFO", bold=True)

# ---------------------------------------------------------
# TIME-SERIES CHECKING (probabilistic)
# ---------------------------------------------------------
log("TIME-SERIES checking:", custom_icon="‚è±Ô∏è")
ts_score = 0                  # Final probability score (0‚Äì100)
ts_cols = []                  # List of detected datetime columns
# Evidence 1) Detect datetime columns
warnings.filterwarnings("ignore", message="Could not infer format", category=UserWarning) # Suppress warnings only related to datetime inference
for col in df_raw.columns:
    try:
        pd.to_datetime(df_raw[col], errors="raise")
        ts_cols.append(col)
    except:
        pass
# Case 1 ‚Üí no datetime columns
if len(ts_cols) == 0:
    log("No datetime columns detected", level=2, type="ERROR")
# Case 2 ‚Üí exactly one column
elif len(ts_cols) == 1:
    ts_score += 40
    log(f"Unique datetime column detected: {ts_cols[ts_main_col_index]} (+40 points)", level=2, type="SUCCESS")
# Case 3 ‚Üí multiple datetime columns
elif len(ts_cols) > 1:
    ts_score += 25
    log(f"Several datetime columns were detected: {ts_cols} (+25 points)", level=2, type="SUCCESS")
    # Check if all datetime columns share similar structure
    for col in ts_cols:
        try:
            dt_tmp = pd.to_datetime(df_raw[col], errors="coerce")
            missing_rate = dt_tmp.isna().mean()
            log(f"Column '{col}' parsed with missing rate: {missing_rate:.3f}", level=3, type="INFO")
        except:
            log(f"Column '{col}' failed advanced parsing", level=3, type="ERROR")
if len(ts_cols) > 0: # Evaluate time-series structure (only if datetime column exists)
    # Convert chosen datetime column
    serie_date_time_raw = pd.to_datetime(df_raw[ts_cols[ts_main_col_index]], errors="coerce")
    # Evidence 2) Chronologically sorted
    if serie_date_time_raw.is_monotonic_increasing:
        ts_score += 20
        log(f"Datetime column '{ts_cols[ts_main_col_index]}' is sorted (+20 points)", level=2, type="SUCCESS")
    else:
        log(f"Datetime column '{ts_cols[ts_main_col_index]}' is NOT sorted", level=2, type="ERROR")
    # Evidence 3) Detecting time-series frequency
    serie_date_time_diff_raw = serie_date_time_raw.diff().dropna()
    if len(serie_date_time_diff_raw) > 0:
        # Compute most common interval
        most_common_delta = serie_date_time_diff_raw.mode()[0]
        freq_ratio = (serie_date_time_diff_raw == most_common_delta).mean()
        if freq_ratio >= freq_ratio_threshold:
            ts_score += 20
            log(f"Regular frequency detected (+20 points)", level=2, type="SUCCESS")
            log(f"Frequency consistency ratio: {freq_ratio:.3f}", level=3, type="INFO")
        else:
            log("No regular frequency detected", level=2, type="ERROR")
    else:
        log("Not enough data to detect time-series frequency", level=2, type="ERROR")
    # Evidence 4) Numeric columns over time
    numeric_cols = df_raw.select_dtypes(include=["number"]).columns
    if 1 <= len(numeric_cols) <= max_numeric_var:
        ts_score += 10
        log(f"Numeric variables suitable for TS (+10 points)", level=2, type="SUCCESS")
    elif len(numeric_cols) < 1:
        log("There is not at least one numeric variable for time-series", level=2, type="ERROR")
    else:
        log("Too many numeric varaibles for time-series", level=2, type="ERROR")
    # Evidence 5) Dataset length
    if len(df_raw) >= min_rows_in_dataset:
        ts_score += 10
        log(f"Enough rows for time-series (+10 points)", level=2, type="SUCCESS")
    else:
        log("Dataset too short for time-series", level=2, type="ERROR")
# Cap score at 100
ts_score = min(ts_score, 100)
log(f"Score to be a TIME-SERIES Dataset: {ts_score}/100 points", level=2, type="INFO", bold=True)

# ---------------------------------------------------------
# DATASET TYPE PROPOSAL
# ---------------------------------------------------------
if ts_score >= nlp_score and ts_score >= min_points_ts:
    dataset_type_auto = "TIME-SERIES"
elif nlp_score >= ts_score and nlp_score >= min_points_nlp:
    dataset_type_auto = "NLP"
else:
    dataset_type_auto = "TABULAR"
print("\n")
log(f"Proposed dataset type: {dataset_type_auto}", type="INFO", bold=True)

-------------------------------
STEP 0) LOAD RAW DATAFRAME
-------------------------------

‚Ä¢ ‚úÖ DataFrame loaded successfully!
‚Ä¢ üìù NLP cheking:
   - ‚úÖ Object-type columns found: ['url'] (+25 points)
   - ‚úÖ Long text-like columns found: ['url'] (+25 points)
   - ‚ùå No column shows word-rich entries
   - ‚úÖ There is only 1 long text-like column, it is typical for NLP (+15 points)
   - ‚úÖ Majority of rows contain meaningful text (+10 points)
   - ‚ÑπÔ∏è [1mScore to be a NLP Dataset: 75/100 points[0m
‚Ä¢ ‚è±Ô∏è TIME-SERIES checking:
   - ‚ùå No datetime columns detected
   - ‚ÑπÔ∏è [1mScore to be a TIME-SERIES Dataset: 0/100 points[0m


‚Ä¢ ‚ÑπÔ∏è [1mProposed dataset type: NLP[0m


## STEP 1 - EXPLORE DATAFRAME

In [6]:
dataset_type = "NLP" # Confirm dataset type (TIME-SERIES, NLP or TABULAR)

In [7]:
# Copy previous step data
df_S1 = df_raw.copy()

# -------------------------------
# NLP DATASET
# -------------------------------
if dataset_type == "NLP":
    print("-------------------------------")
    print("STEP 1) EXPLORE DATAFRAME")
    print("-------------------------------\n")
    log("Dataset detected as NLP (long-text dataset)", custom_icon="üìù")
    # Print info
    log(f"Shape of the DataFrame: {df_S1.shape}", type="INFO")
    log("Content of the DataFrame:", type="INFO")
    display(df_S1.head(5))
    log("Sample of raw text entries:", type="INFO")
    display(df_S1[text_cols].head(5))
    # Print text length stats
    df_S1_lengths = df_S1[text_cols[0]].astype(str).str.len()
    log("Text length statistics:", type="INFO")
    log(f"Average lengths: {df_S1_lengths.mean():.1f} chars", level=2, custom_icon="üìä")
    log(f"Median lengths: {df_S1_lengths.median():.1f} chars", level=2, custom_icon="üìä")
    log(f"Max lengths: {df_S1_lengths.max():.1f} chars", level=2, custom_icon="üìä")
    # Compute word statistics
    df_S1_words = df_S1[text_cols[0]].astype(str).str.split().str.len()
    log("Word count statistics:", type="INFO")
    log(f"Average words: {df_S1_words.mean():.1f} chars", level=2, custom_icon="üìä")
    log(f"Median words: {df_S1_words.median():.1f} chars", level=2, custom_icon="üìä")
    log(f"Max words: {df_S1_words.max():.1f} chars", level=2, custom_icon="üìä")
# -------------------------------
# TIME-SERIES DATASET
# -------------------------------
elif dataset_type == "TIME-SERIES":
    print("-------------------------------")
    print("STEP 1) EXPLORE DATAFRAME")
    print("-------------------------------\n")
    log("Dataset detected as TIME-SERIES", custom_icon="‚è±Ô∏è")
    # Print info
    log(f"Shape of the DataFrame: {df_S1.shape}", type="INFO")
    log("Content of the DataFrame:", type="INFO")
    display(df_S1.head(5))
    # Try to parse every column
    df_S1_ts_cols = []
    for col in df_S1.columns:
        try:
            pd.to_datetime(df_S1[col], errors="raise")
            df_S1_ts_cols.append(col)
        except:
            pass
    # Time column
    df_S1_ts_main_col = df_S1_ts_cols[ts_main_col_index]
    serie_date_time_S1 = pd.to_datetime(df_S1[df_S1_ts_main_col], errors="coerce")
    log("Time index information:", type="INFO")
    log(f"Detected time column: '{df_S1_ts_main_col}'", level=2, custom_icon="üìÖ")
    log(f"Start date: {serie_date_time_S1.min()}", level=2, custom_icon="üìÖ")
    log(f"End date: {serie_date_time_S1.max()}", level=2, custom_icon="üìÖ")
    log(f"Total duration: {serie_date_time_S1.max() - serie_date_time_S1.min()}", level=2, custom_icon="üìÖ")
    # Estimate frequency and granularity
    serie_date_time_diff_S1 = serie_date_time_S1.diff().dropna()
    if len(serie_date_time_diff_S1) > 0:
        df_S1_most_common_delta = serie_date_time_diff_S1.mode()[0] # Most common interval
        df_S1_smallest_delta = serie_date_time_diff_S1.min() # Minimal interval
        df_S1_freq_ratio = (serie_date_time_diff_S1 == df_S1_most_common_delta).mean()
        # Determine granularity
        df_S1_seconds = df_S1_most_common_delta.total_seconds()
        granularity = determine_granularity(df_S1_seconds)
        log(f"Most common interval: {df_S1_most_common_delta} (granularity: {granularity})", level=2, custom_icon="üìÖ")
        log(f"Smallest interval: {df_S1_smallest_delta}", level=2, custom_icon="üìÖ")
        log(f"Frequency consistency ratio: {df_S1_freq_ratio:.3f}", level=2, custom_icon="üìÖ")
    else:
        log("Not enough data points to estimate frequency", type="WARNING")
    # Missing or irregular timestamps
    missing_ratio = 1 - (serie_date_time_diff_S1 == most_common_delta).mean() if len(serie_date_time_diff_S1) > 0 else None
    if missing_ratio is not None and missing_ratio > 0.10:
        log("Irregular timestamps detected (missing or uneven intervals)", type="WARNING")
        log(f"Irregularity ratio: {missing_ratio:.2f}", level=2, custom_icon="‚ö†Ô∏è")
    # Numeric metrics
    numeric_cols = df_S1.select_dtypes(include=["number"]).columns
    log("Numeric metrics detected:", type="INFO")
    for col in numeric_cols:
        log(f"{col}", level=2, custom_icon="üìà")
    # Statistics for each metric
    log("Basic statistics per numeric variable:", type="INFO")
    display(df_S1[numeric_cols].describe().T)

# -------------------------------
# TABULAR DATASET
# -------------------------------
elif dataset_type == "TABULAR":
    print("-------------------------------")
    print("STEP 1) EXPLORE DATAFRAME")
    print("-------------------------------\n")
    log("Dataset detected as TABULAR", custom_icon="üßÆ")
    # Print info
    log(f"Shape of the DataFrame: {df_S1.shape}", type="INFO")
    log("Content of the DataFrame:", type="INFO")
    display(df_S1.head(5))
    log("Info of the DataFrame (dataType and non-null values):", type="INFO")
    df_S1.info(verbose=True, show_counts=True)
    # Ordered info (fewest non-null first)
    ordered_info = pd.DataFrame({
        "Column": df_S1.columns,
        "Non-Null Count": df_S1.notnull().sum(),
        "Null Count": df_S1.isnull().sum(),
        "Dtype": df_S1.dtypes.astype(str)
    }).sort_values(by="Non-Null Count", ascending=True)
    log("Ordered info by number of non-null values:", type="INFO")
    display(ordered_info)
    # Count unique attributes (unsorted)
    df_S1_summary = pd.DataFrame({
        "Column": df_S1.columns,
        "Unique_Count": df_S1.nunique().values
    })
    log("DataFrame unique attributes (unsorted):", type="INFO")
    display(df_S1_summary)
    # Ordered summary (fewest unique first)
    df_S1_summary_ordered = df_S1_summary.sort_values(by="Unique_Count", ascending=True)
    log("Ordered unique attributes (fewest unique first):", type="INFO")
    display(df_S1_summary_ordered)
    # Automatic Warning for high-uniqueness columns
    unique_counts = df_S1.nunique()
    high_unique_cols = unique_counts[unique_counts == len(df_S1)].index.tolist()
    if len(high_unique_cols) > 0:
        log("Consider dropping the following columns for having UNIQUE values for EVERY row:", type="WARNING")
        for col in high_unique_cols:
            log(f"{col}", level=2, custom_icon="üóëÔ∏è")

-------------------------------
STEP 1) EXPLORE DATAFRAME
-------------------------------

‚Ä¢ üìù Dataset detected as NLP (long-text dataset)
‚Ä¢ ‚ÑπÔ∏è Shape of the DataFrame: (2999, 2)
‚Ä¢ ‚ÑπÔ∏è Content of the DataFrame:


Unnamed: 0,url,is_spam
0,https://briefingday.us8.list-manage.com/unsubs...,True
1,https://www.hvper.com/,True
2,https://briefingday.com/m/v4n3i4f3,True
3,https://briefingday.com/n/20200618/m#commentform,False
4,https://briefingday.com/fan,True


‚Ä¢ ‚ÑπÔ∏è Sample of raw text entries:


Unnamed: 0,url
0,https://briefingday.us8.list-manage.com/unsubs...
1,https://www.hvper.com/
2,https://briefingday.com/m/v4n3i4f3
3,https://briefingday.com/n/20200618/m#commentform
4,https://briefingday.com/fan


‚Ä¢ ‚ÑπÔ∏è Text length statistics:
   - üìä Average lengths: 73.5 chars
   - üìä Median lengths: 71.0 chars
   - üìä Max lengths: 269.0 chars
‚Ä¢ ‚ÑπÔ∏è Word count statistics:
   - üìä Average words: 1.0 chars
   - üìä Median words: 1.0 chars
   - üìä Max words: 1.0 chars


## STEP 2 - IDENTIFY TEXT COLUMN & METADATA

In [8]:
# -------------------------------
# INPUTS NEEDED IF dataset_type = "TABULAR"
# -------------------------------
cols_to_drop = []  # List of column names to drop

In [9]:
# Copy previous step data
df_S2 = df_S1.copy()
df_S2_ts_main_col = df_S1_ts_main_col if dataset_type == "TIME-SERIES" else None

# -------------------------------
# NLP DATASET
# -------------------------------
if dataset_type == "NLP":
    print("-------------------------------")
    print("STEP 2) IDENTIFY TEXT COLUMN & METADATA")
    print("-------------------------------\n")
    # Identify main text column
    main_text_col_S2 = text_cols[0]
    log(f"Main text column selected: '{main_text_col_S2}'", type="INFO")
    # Check for additional object columns
    extra_object_cols = [c for c in df_S2.columns if c not in text_cols]
    if len(extra_object_cols) > 0:
        log("Additional non-text object columns detected:", type="INFO")
        for col in extra_object_cols:
            log(f"Column: '{col}'", level=2, custom_icon="üìÑ")
    else:
        log("No additional metadata columns detected", type="INFO")
    # Warn if more than one text-like column exists
    if len(text_cols) > 1:
        log("Multiple text-like columns detected. Consider selecting only one for preprocessing", type="WARNING")
        for col in text_cols:
            log(f"Column: '{col}'", level=2, custom_icon="üìù")

# -------------------------------
# TIME-SERIES DATASET
# -------------------------------
elif dataset_type == "TIME-SERIES":
    print("-------------------------------")
    print("STEP 2) BUILD TIME-SERIES")
    print("-------------------------------\n")
    # Identify the temporal column
    if len(df_S2_ts_main_col) > 0:
        log(f"Detected temporal column: '{df_S2_ts_main_col}'", type="FOUND")
    else:
        log("No temporal column found ‚Üí cannot build time index", type="ERROR")
        df_S2_ts_main_col = None
    # Stop if no datetime column exists
    if df_S2_ts_main_col is None:
        raise ValueError("No datetime column found ‚Üí cannot build time index.")
    # Drop the column if still present as normal column
    if df_S2_ts_main_col in df_S2.columns:
        df_S2 = df_S2.drop(columns=[df_S2_ts_main_col])
    # Copy previous time-series
    serie_date_time_S2 = serie_date_time_S1.copy()
    # Sort by datetime just in case
    serie_date_time_S2 = serie_date_time_S2.sort_values()
    # Assign the datetime index
    df_S2.index = serie_date_time_S2
    # Make sure the index has a name
    df_S2.index.name = df_S2_ts_main_col
    # Show preview
    log(f"Indexed DataFrame by '{df_S2_ts_main_col}'", type="INFO")
    log("Preview of time-indexed DataFrame:", type="INFO")
    display(df_S2.head(5))
    # Extract numeric target series
    df_S2_numeric_cols = df_S2.select_dtypes(include=["number"]).columns
    if len(df_S2_numeric_cols) > 0:
        # Pick the first numeric column as the time-series
        df_S2_numeric_target_col = df_S2_numeric_cols[0]
        df_timeseries_S2 = df_S2[df_S2_numeric_target_col].dropna()
        log(f"Extracted target time-series '{df_S2_numeric_target_col}'", type="SUCCESS")
        display(df_timeseries_S2.head(5))
        fig, axis = plt.subplots(figsize = (figWidth_unit, figHeight_unit))
        sns.lineplot(data = df_timeseries_S2)
        plt.grid(True)
        plt.tight_layout()
        plt.show()
    else:
        log("No numeric metrics detected to extract as the main time-series", type="ERROR")

# -------------------------------
# TABULAR DATASET
# -------------------------------
elif dataset_type == "TABULAR":
    print("-------------------------------")
    print("STEP 2) SELECT RELEVANT ATTRIBUTES")
    print("-------------------------------\n")
     # Drop non-relevant attributes
    df_S2=df_S2.drop(labels=cols_to_drop, axis =1)
    # Print results
    log("Non-Relevant attributes have been dropped", type="SUCCESS")
    log(f"Previous df's columns: {len(df_S1.columns)}", level=2, type="INFO")
    log(f"Current df's columns: {len(df_S2.columns)}", level=2, type="INFO")
    log(f"Final DataFrame shape: {df_S2.shape}", level=2, type="INFO")
    display(df_S2.head())
    # Count attributes
    df_S2_summary = pd.DataFrame({
        "Column": df_S2.columns,
        "Unique_Count": df_S2.nunique().values
    })
    log("Final DataFrame unique attributes:", level=2, type="INFO")
    display(df_S2_summary)

-------------------------------
STEP 2) IDENTIFY TEXT COLUMN & METADATA
-------------------------------

‚Ä¢ ‚ÑπÔ∏è Main text column selected: 'url'
‚Ä¢ ‚ÑπÔ∏è Additional non-text object columns detected:
   - üìÑ Column: 'is_spam'


## STEP 3 - REMOVE DUPLICATES

In [10]:
# -------------------------------
# INPUTS NEEDED IF dataset_type = "TIME-SERIES"
# -------------------------------
# Threshold to determine seasonal component type (multiplicative or additive)
seasonal_component_type_threshold = 0.3 # if amplitude/abs(mean_val) > threshold -> "multiplicative"
# Thresholds to detect strong seasonal (both need to be higher than thresholds)
strong_seasonal_threshold_for_acf = 0.6 # ACF at the seasonal period
strong_seasonal_threshold_for_var_ratio = 0.5 # Variance ratio: Var(seasonal) / Var(original)

In [11]:
# Copy previous step data
df_S3 = df_S2.copy()
df_timeseries_S3 = df_timeseries_S2.copy() if dataset_type == "TIME-SERIES" else None
df_S3_freq_ratio = df_S1_freq_ratio if dataset_type == "TIME-SERIES" else None

# -------------------------------
# NLP DATASET
# -------------------------------
if dataset_type == "NLP":
    print("-------------------------------")
    print("STEP 3) REMOVE DUPLICATES")
    print("-------------------------------\n")
    num_duplicates=df_S3.duplicated().sum()
    if num_duplicates == 0:
        df_S3=df_S3
        log("Previous DataFrame does not contain duplicates:", type="SUCCESS")
        log(f"Previous DataFrame shape: {df_S2.shape}", level=2, type="INFO")
        log(f"Current DataFrame shape: {df_S3.shape}", level=2, type="INFO")
    else:
        df_S3_duplicates=df_S3[df_S3.duplicated()]
        df_S3=df_S3.drop_duplicates()
        log(f"Previous DataFrame contained " + str(num_duplicates) + " duplicates that have been dropped:", type="WARNING")
        log(f"Previous DataFrame shape: {df_S2.shape}", level=2, type="INFO")
        log(f"Current DataFrame shape: {df_S3.shape}", level=2, type="INFO")
        log("These are the dropped duplicates:", level=2, type="INFO")
        display(df_S3_duplicates)

# -------------------------------
# TIME-SERIES DATASET
# -------------------------------        
elif dataset_type == "TIME-SERIES":
    print("-------------------------------")
    print("STEP 3) DECOMPOSING")
    print("-------------------------------\n")
    # 1) Validate series regularity before decomposition
    if df_S3_freq_ratio < freq_ratio_threshold:
        raise ValueError(f"Decomposition skipped due to Low frequency regularity (freq_ratio={df_S3_freq_ratio:.3f})")
    log(f"Timestamp regularity OK (freq_ratio={df_S3_freq_ratio:.3f})", level=1, type="SUCCESS")
    # 2) Detect period using ACF (primary robust method)
    period_acf = infer_period_from_acf(df_timeseries_S3)
    if period_acf is not None:
        period_S3 = period_acf
        log(f"Seasonality detected via ACF ‚Üí period = {period_acf}", level=1, type="SUCCESS")
    else:
        log("No significant seasonality found via ACF", level=1, type="WARNING")
        # 3) Fallback based on granularity (if ACF failed)
        period_fallback = infer_period_from_granularity(granularity)
        if period_fallback is None:
            raise ValueError("Unable to infer any valid period. Decomposition skipped because it is impossible to infer any valid period")
        log(f"Fallback period inferred: {period_fallback} (granularity={granularity})", level=1, type="SUCCESS")
        period_S3 = period_fallback
    # 4) Determine model: additive or multiplicative
    seasonal_component_type_S3 = infer_seasonal_component_type(df_timeseries_S3, seasonal_component_type_threshold)
    log(f"Type of seasonal component selected: {seasonal_component_type_S3}", level=1, type="SUCCESS")
    # 5) Perform decomposition
    try:
        decomposition_S3 = seasonal_decompose(x=df_timeseries_S3, model=seasonal_component_type_S3, period=period_S3)
        trend_S3 = decomposition_S3.trend
        seasonal_S3 = decomposition_S3.seasonal
        residual_S3 = decomposition_S3.resid
        log("Decomposition completed successfully", level=1, type="SUCCESS")
    except Exception as e:
        raise ValueError(f"Decomposition failed: {e}")
    # 6) Compute seasonality strength metrics
    strong_seasonality_S3, seasonality_metrics_S3 = assess_seasonality_strength(
        original        = df_timeseries_S3,
        seasonal        = seasonal_S3,
        period          = period_S3,
        acf_threshold   = strong_seasonal_threshold_for_acf,
        var_ratio       = strong_seasonal_threshold_for_var_ratio
    )
    if strong_seasonality_S3:
        log(f"Strong seasonality detected (var_ratio={seasonality_metrics_S3["seasonal_var_ratio"]:.3f}, acf={seasonality_metrics_S3["acf_at_period"]:.3f})", level = 1, type  = "SUCCESS")
    else:
        log(f"Weak or no seasonality (var_ratio={seasonality_metrics_S3["seasonal_var_ratio"]:.3f}, acf={seasonality_metrics_S3["acf_at_period"]:.3f})", level = 1, type  = "WARNING")
    # 7) Plot decomposition
    fig, axis = plt.subplots(figsize = (figWidth_unit, figHeight_unit))
    sns.lineplot(data = df_timeseries_S3, color = "blue", label = "Original Time-series")
    sns.lineplot(data = trend_S3, color = "orange", label = "Trend", linestyle = "--")
    sns.lineplot(data = residual_S3, color = "red", label = "Residual")
    sns.lineplot(data = seasonal_S3, color = "green", label = "Seasonal", linestyle = "--")
    plt.grid(True)
    plt.tight_layout()
    plt.show()

# -------------------------------
# TABULAR DATASET
# -------------------------------
elif dataset_type == "TABULAR":
    print("-------------------------------")
    print("STEP 3) REMOVE DUPLICATES")
    print("-------------------------------\n")
    num_duplicates=df_S3.duplicated().sum()
    if num_duplicates == 0:
        df_S3=df_S3
        log("Previous DataFrame does not contain duplicates:", type="SUCCESS")
        log(f"Previous DataFrame shape: {df_S2.shape}", level=2, type="INFO")
        log(f"Current DataFrame shape: {df_S3.shape}", level=2, type="INFO")
    else:
        df_S3_duplicates=df_S3[df_S3.duplicated()]
        df_S3=df_S3.drop_duplicates()
        log(f"Previous DataFrame contained " + str(num_duplicates) + " duplicates that have been dropped:", type="WARNING")
        log(f"Previous DataFrame shape: {df_S2.shape}", level=2, type="INFO")
        log(f"Current DataFrame shape: {df_S3.shape}", level=2, type="INFO")
        log("These are the dropped duplicates:", level=2, type="INFO")
        display(df_S3_duplicates)

-------------------------------
STEP 3) REMOVE DUPLICATES
-------------------------------

‚Ä¢ ‚ö†Ô∏è Previous DataFrame contained 630 duplicates that have been dropped:
   - ‚ÑπÔ∏è Previous DataFrame shape: (2999, 2)
   - ‚ÑπÔ∏è Current DataFrame shape: (2369, 2)
   - ‚ÑπÔ∏è These are the dropped duplicates:


Unnamed: 0,url,is_spam
60,https://briefingday.us8.list-manage.com/unsubs...,True
61,https://www.hvper.com/,True
62,https://briefingday.com/m/v4n3i4f3,True
64,https://briefingday.com/fan,True
113,https://briefingday.com/fan,True
...,...,...
2971,https://www.cnbc.com/2020/06/29/stock-market-f...,False
2972,https://thehustle.co/account/,True
2973,https://thehustle.co/,True
2979,https://www.bloomberg.com/tosv2.html,True


## STEP 4 - PROPOSE TARGET VARIABLE

In [12]:
# -------------------------------
# INPUTS NEEDED IF dataset_type = "NLP"
# -------------------------------
# # Thresholds for target variable proposal
max_num_classes = 50                                            # Recommended: 50 classes ‚Üí max accepted number of different classes
min_points_yvar = 75                                            # Minimum points to be considered as target candidate (max = 100 points)
# -------------------------------
# INPUTS NEEDED IF dataset_type = "TIME-SERIES"
# -------------------------------
accepted_alpha_dickey_fuller = 0.05                             # Accepted error in the hypothesis
# -------------------------------
# INPUTS NEEDED IF dataset_type = "TABULAR"
# -------------------------------
var_type_proposal_threshold = 25.00                             # [%] Under this percentage of unique values, the attribute is proposed as CATEGORIC
float_discrete_threshold = min(30, round(0.02 * len(df_S3)))    # Dynamic threshold for FLOAT to be considered DISCRETE

In [None]:
# Copy previous step data
df_S4 = df_S3.copy()
main_text_col_S4 = main_text_col_S2 if dataset_type == "NLP" else None
df_timeseries_S4 = df_timeseries_S3.copy() if dataset_type == "TIME-SERIES" else None

# -------------------------------
# NLP DATASET
# -------------------------------
if dataset_type == "NLP":
    print("-------------------------------")
    print("STEP 4) PROPOSE TARGET VARIABLE")
    print("-------------------------------\n")
    # Safety check: make sure we have at least one text column
    if len(text_cols) == 0:
        log("No text-like columns were previously detected: cannot propose a target variable for NLP", type="ERROR")
        y_var_auto = None
    else:
        # Main text column
        log(f"Main text column assumed as: '{main_text_col_S4}'", type="INFO")
        # List of non-text columns (potential candidates for target variable)
        non_text_cols = []
        for col in df_S4.columns:
            if col != main_text_col_S4:
                non_text_cols.append(col)
        if len(non_text_cols) == 0:
            log("No additional columns apart from the main text: this is likely an unsupervised NLP task", type="WARNING")
            y_var_auto = None
        else:
            log("Columns to be proposed as target variable:", type="INFO")
            candidate_scores = {}   # Store total score per candidate
            # Iterate through candidate columns
            for col in non_text_cols:
                log(f"Column '{col}':", level=2, type="FOUND")
                # Skip empty columns
                if len(df_S6[col].dropna()) == 0:
                    continue
                n_unique = df_S4[col].dropna().nunique()
                avg_len = df_S4[col].dropna().astype(str).str.len().mean()
                class_distribution = df_S4[col].dropna().value_counts(normalize=True)
                score = 0
                # Evidence 1) Reasonable number of unique classes
                if n_unique <= max_num_classes:
                    score += 35
                    log(f"Acceptable number of classes: {n_unique} (+35 points)", level=3, type="SUCCESS")
                else:
                    log(f"Too many unique classes: {n_unique}", level=3, type="ERROR")
                # Evidence 2) Short label length (typical for target columns)
                if avg_len <= 20:
                    score += 30
                    log(f"Short label length (avg {avg_len:.1f} chars) (+30 points)", level=3, type="SUCCESS")
                else:
                    log(f"Labels are too long on average (avg {avg_len:.1f} chars)", level=3, type="ERROR")
                # Evidence 3) Categorical / label-like data type
                if str(df_S4[col].dtype) in ["object", "category"]:
                    score += 20
                    log(f"Object/Categorical dtype (+20 points)", level=3, type="SUCCESS")
                else:
                    log(f"Non-categorical dtype ({str(df_S4[col].dtype)})", level=3, type="ERROR")
                # Evidence 4) Class balance
                if class_distribution.min() >= 0.01:
                    score += 15
                    log(f"Reasonable class balance (min class ratio ‚â• 1%) (+15 points)", level=3, type="SUCCESS")
                else:
                    log(f"Some classes have less than 1% of samples", level=3, type="ERROR")
                # Store results
                candidate_scores[col] = score
                log(f"Total score: {candidate_scores[col]}/100 points", level=3, custom_icon="üìù", bold=True)
            # Select best candidate
            if len(candidate_scores) == 0:
                print("\n")
                log("No valid non-text columns found to be used as target: this may be an unsupervised NLP task", type="WARNING", bold=True)
                y_var_auto = None
            else:
                y_var_auto = max(candidate_scores, key=candidate_scores.get)
                best_score = candidate_scores[y_var_auto]
                if best_score >= min_points_yvar:
                    print("\n")
                    log(f"Proposed target variable: '{y_var_auto}' ({best_score}/100 points)", type="INFO", bold=True)
                else:
                    print("\n")
                    log("No column reached the minimum score to be confidently proposed as target", type="WARNING", bold=True)
                    log(f"Best candidate was '{y_var_auto}' with {best_score}/100 points (below threshold {min_points_yvar})", type="INFO", bold=True)
                    y_var_auto = None
    
# -------------------------------
# TIME-SERIES DATASET
# -------------------------------        
elif dataset_type == "TIME-SERIES":
    print("-------------------------------")
    print("STEP 4) STACIONARY ANAYSIS")
    print("-------------------------------\n")
    # Perform Dickey-Fuller test to check for stacionarity
    series_Dickey_Fuller_results = test_stationarity(series=df_timeseries_S4)

    if series_Dickey_Fuller_results["p-value"] >= accepted_alpha_dickey_fuller:
        log(f"Dickey-Fuller test's results:\n{series_Dickey_Fuller_results}\n", type="INFO")
        log(f"Hyphotesis rejected:Time-series IS NOT stationary, recursive differenciation is carried out\n", level=1, type="WARNING", bold = True)
        # Peform recursively Dickley-Fuller test until the time-series becomes stacionary
        df_stationary_timeseries_S4, diff_count_S4, series_recursive_Dickey_Fuller_results = make_stationary_recursive(
            series=df_timeseries_S4,
            alpha=accepted_alpha_dickey_fuller
            )
        log(f"Recursive differenciation ({diff_count_S4} step/s) -> Dickey-Fuller test's results:\n{series_recursive_Dickey_Fuller_results}\n", level=1, type="INFO")

        if series_recursive_Dickey_Fuller_results["p-value"] >= accepted_alpha_dickey_fuller:
            log(f"Time-series cannot become stationary (after {diff_count_S4} differencing step/s)", level=1, type="WARNING", bold = True)
        else:
            log(f"Time-series can become stationary (after {diff_count_S4} differencing step/s)", level=1, type="SUCCESS", bold = True)
    else:
        diff_count_S4= 0
        log(f"Hyphotesis accepted: time-series IS stationary, no need of differenciation", level=1, type="SUCCESS", bold = True)

# -------------------------------
# TABULAR DATASET
# -------------------------------
elif dataset_type == "TABULAR":
    print("-------------------------------")
    print("STEP 4) CLASSIFY ATTRIBUTES AND TARGET VARIABLE")
    print("-------------------------------\n")
    # List of columns
    columns = df_S4.columns.tolist()
    # Iterate through columns
    category_var_auto = []
    numeric_var_auto = []
    for col in df_S4.columns:
        total_rows = len(df_S4)
        # Skip empty columns
        if total_rows == 0:
            continue
        unique_count =  df_S4[col].dropna().nunique()
        unique_ratio = unique_count / total_rows * 100
        col_dtype = str(df_S4[col].dtype)
        # Case 1: text-based columns
        if col_dtype in ["object", "category"]:
            category_var_auto.append(col)
            continue
        # Case 2: integer columns
        if col_dtype.startswith("int"):
            if unique_ratio <= var_type_proposal_threshold:
                category_var_auto.append(col)
            else:
                numeric_var_auto.append(col)
            continue
        # Case 3: float columns
        if col_dtype.startswith("float"):
            if unique_ratio <= var_type_proposal_threshold:
                category_var_auto.append(col)
            else:
                numeric_var_auto.append(col)
            continue
    # Print proposed Data Types
    log(f"Proposed CATEGORY Attributes: {category_var_auto}", type="INFO", bold=True)
    log(f"Proposed NUMERIC Attributes: {numeric_var_auto}", type="INFO", bold=True)


-------------------------------
STEP 4) PROPOSE TARGET VARIABLE
-------------------------------

‚Ä¢ ‚ÑπÔ∏è Main text column assumed as: 'url'
‚Ä¢ ‚ÑπÔ∏è Columns to be proposed as target variable:
   - üîç Column 'is_spam':
      ¬∑ ‚úÖ Acceptable number of classes: 2 (+35 points)
      ¬∑ ‚úÖ Short label length (avg 4.9 chars) (+30 points)
      ¬∑ ‚ùå Non-categorical dtype (bool)
      ¬∑ ‚úÖ Reasonable class balance (min class ratio ‚â• 1%) (+15 points)
      ¬∑ üìù [1mTotal score: 80/100 points[0m


‚Ä¢ ‚ÑπÔ∏è [1mProposed target variable: 'is_spam' (80/100 points)[0m


## STEP 5 - VARIABILITY ANAYSIS

In [14]:
y_var = "is_spam"                               # Confirm target variable
if_target_is_binary_treat_as_categoric = True   # Confirm treatment for target variable

# -------------------------------
# INPUTS NEEDED IF dataset_type = "TABULAR"
# -------------------------------
make_plots_UNIVARIANT = True                    # Draw plots?
y_var_highlighting_color = "green"              # Color to highlight target variable

In [15]:
# Copy previous step data
df_S5 = df_S4.copy()
residual_S5 = residual_S3 if dataset_type == "TIME-SERIES" else None
seasonal_component_type_S5 = seasonal_component_type_S3 if dataset_type == "TIME-SERIES" else None

# -------------------------------
# TARGET VARIABLE
# -------------------------------
print("-------------------------------")
print("TARGET VARIABLE")
print("-------------------------------")
# Basic stats
y_unique_values = df_S5[y_var].nunique()
y_unique_ratio = y_unique_values / len(df_S5) * 100
y_dtype_kind = df_S5[y_var].dtype.kind
y_var_type = None
requires_formating_nlp = False
# ----------------------------------------------
# BASE TYPE DETECTION (dtype-driven)
# ----------------------------------------------
# Case 1: text-based or boolean columns ‚Üí categorical
if y_dtype_kind in ['O','b']:
    y_var_type = "CATEGORIC"
# Case 2: numeric columns (int, uint, float) ‚Üí decide later by cardinality
elif y_dtype_kind in ['i','u','f']:
    y_var_type = "NUMERIC"
# ----------------------------------------------
# CARDINALITY RULE (only for numeric dtypes)
# ----------------------------------------------
if y_var_type == "NUMERIC":
    if y_unique_ratio <= var_type_proposal_threshold:
        y_var_type = "CATEGORIC"
# ----------------------------------------------
# OPTIONAL RULE ‚Äî binary numeric treated as categorical
# ----------------------------------------------
if y_var_type == "NUMERIC" and y_unique_values == 2 and if_target_is_binary_treat_as_categoric:
    y_var_type = "CATEGORIC"
# ----------------------------------------------
# ASSIGN SUBTYPE
# ----------------------------------------------
if y_var_type == "CATEGORIC":
    requires_formating_nlp = True
    if y_unique_values == 2:
        y_var_subtype = "BINARY"
    elif y_unique_values > 2:
        y_var_subtype = "MULTICLASS"
    else:
        y_var_subtype = "CONSTANT"
    log("Confirmed TARGET Variable: " + y_var + " -> " + y_var_type + " and " + y_var_subtype, type="INFO", bold=True)
else:
    if y_dtype_kind in ['i','u']:
        y_var_subtype = "DISCRETE"
    elif y_dtype_kind == 'f' and y_unique_values < float_discrete_threshold:
        y_var_subtype = "DISCRETE"
    else:
        y_var_subtype = "CONTINUOUS"
    log("Confirmed TARGET Variable: " + y_var + " -> NUMERIC and " + y_var_subtype, type="INFO", bold=True)
print("\n")

# -------------------------------
# NLP DATASET
# -------------------------------
if dataset_type == "NLP":
    print("-------------------------------")
    print("STEP 5) TRANSFORMATION OF TARGET VARIABLE")
    print("-------------------------------\n")
    # Transform y_var from CATEGORIC to NUMERIC format
    if requires_formating_nlp:
        # Instance encoder
        y_var_encoder = LabelEncoder()
        # Train encoder                       
        y_var_encoder.fit(df_S5[y_var])
        # Apply encoder                     
        df_S5[y_var] = y_var_encoder.transform(df_S5[y_var])    
        log(f"Target variable '{y_var}' has been transformed with LabelEncoder():", type="WARNING")
        for i in range(len(y_var_encoder.classes_)):
            log(f"{y_var_encoder.classes_[i]} -> {i}", level=2, type="INFO")
    else:
        log(f"Target variable '{y_var}' does not need to be transformed with LabelEncoder():", type="INFO")
        
# -------------------------------
# TIME-SERIES DATASET
# -------------------------------        
elif dataset_type == "TIME-SERIES":
    print("-------------------------------")
    print("STEP 5) VARIABILITY ANALYSIS")
    print("-------------------------------\n")
    # Drop NaN values in residuals
    residual_S5 = residual_S5.dropna()
    # -------------------------------------------
    # RULE A: Check for visible trend in residuals
    # -------------------------------------------
    # Compute simple linear regression on residual vs time index
    x_index = np.arange(len(residual_S5))
    # Fit linear regression slope
    slope, intercept = np.polyfit(x_index, residual_S5.values, 1)
    # Compute residual standard deviation
    residual_std = np.std(residual_S5)
    # If residuals are almost constant ‚Üí no trend by definition
    if residual_std < 1e-8:
        log("Residual's slope analysis: Residuals are almost constant ‚Üí no visible trend (good).", type="SUCCESS")
    else:
        # Define a tolerance based on std and a minimum absolute tolerance
        if abs(slope) < max(residual_std * 0.01, 1e-6):
            log("Residual's slope analysis: No visible trend detected (good).", type="SUCCESS")
        else:
            log("Residual's slope analysis: Trend detected in residuals (bad).", type="WARNING", bold=True)
    # -------------------------------------------
    # RULE B: Check periodicity using ACF
    # -------------------------------------------
    # If residuals are almost constant ‚Üí ACF cannot detect periodicity, assume GOOD
    if residual_std < 1e-8:
        log("Residual's ACF analysis: Residuals are almost constant ‚Üí no periodicity possible (good).", level=1, type="SUCCESS")
    else:
        # Compute ACF up to 40 lags safely
        nlags = min(40, len(residual_S5) - 2)
        acf_res = acf(residual_S5, nlags=nlags, fft=True, missing="drop")
        # Detect highest non-zero lag correlation
        acf_res_no0 = acf_res[1:]
        max_acf_lag = np.argmax(np.abs(acf_res_no0)) + 1
        max_acf_value = acf_res[max_acf_lag]
        # Log ACF
        log(f"Residual ACF strongest lag={max_acf_lag}, value={max_acf_value:.3f}", level=1, type="INFO")
        # If ACF is NaN ‚Üí cannot infer periodicity ‚Üí assume GOOD
        if np.isnan(max_acf_value):
            log("Residual's ACF analysis: ACF cannot be computed reliably (likely constant residuals) ‚Üí no periodicity detected (good).", level=1, type="SUCCESS")
        # Periodicity rule: if max ACF < 0.3 ‚Üí no meaningful periodicity
        elif abs(max_acf_value) < 0.3:
            log("Residual's ACF analysis: No periodic patterns detected (good).", level=1, type="SUCCESS")
        else:
            log("Residual's ACF analysis: Residuals show periodic patterns (bad).", level=1, type="WARNING", bold=True)
    # -------------------------------------------
    # 4) RULE C: Check that residuals are centered
    # -------------------------------------------
    # Set theoretical center depending on decomposition model
    residual_center = 1.0 if seasonal_component_type_S5 == "multiplicative" else 0.0
    # Compute mean and standard deviation of cleaned residuals
    mean_res = residual_S5.mean()
    sd_res = residual_S5.std()
    # Define relative and absolute tolerances
    threshold_center = max(sd_res * 0.05, 1e-6)
    # Center rule: mean close to the expected center (0 additive, 1 multiplicative)
    if abs(mean_res - residual_center) < threshold_center:
        log(f"Residual's center analysis: Residuals centered around expected center ({residual_center}) (good).", level=1, type="SUCCESS")
    else:
        log(f"Residual's center analysis: Residuals not centered around expected center ({residual_center}) (bad).", level=1, type="WARNING")
    # -------------------------------------------
    # 5) RULE D: Check for randomness using Ljung‚ÄìBox test
    # -------------------------------------------
    # Center residuals for randomness tests
    residual_S5_centered = residual_S5 - residual_center
    # Compute standard deviation
    sd_res_centered = residual_S5_centered.std()
    # If residuals are almost constant ‚Üí cannot test randomness, but constant noise = GOOD
    if sd_res_centered < 1e-8:
        log("Residual's randomness analysis: Residuals are almost constant ‚Üí randomness cannot be tested, assumed random (good).", level=1, type="SUCCESS")
    else:
        # Define safe number of lags
        safe_lag = min(10, len(residual_S5_centered) - 2)
        # Compute Ljung‚ÄìBox p-value
        ljung_box_p = acorr_ljungbox(residual_S5_centered, lags=[safe_lag], return_df=True)["lb_pvalue"].iloc[0]
        # Log p-value
        log(f"Ljung‚ÄìBox p-value (lag {safe_lag}) = {ljung_box_p:.4f}", type="INFO")
        # Randomness rule: if p > 0.05 ‚Üí residuals behave like white noise
        if ljung_box_p > 0.05:
            log("Residual's randomness analysis: Residuals behave as random noise (good).", type="SUCCESS")
        else:
            log("Residual's randomness analysis: Residuals show correlation ‚Üí not white noise (bad).", type="WARNING")

# -------------------------------
# TABULAR DATASET
# -------------------------------
elif dataset_type == "TABULAR":
    print("-------------------------------")
    print("STEP 5 - UNIVARIABLE ANALYSIS")
    print("-------------------------------\n")
    # Confirm attribute types
    category_att = []
    numeric_att = []
    for att in category_var_auto:
        if att != y_var:
            category_att.append(att)
    for att in numeric_var_auto:
        if att != y_var:
            numeric_att.append(att)
    # Checking CATEGORY attributes
    binary_att = []
    multiclass_att = []
    constant_att = []
    for att in category_att:
        att_unique_values = df_S4[att].nunique()
        if att_unique_values == 2:
            binary_att.append(att)
        elif att_unique_values > 2:
            multiclass_att.append(att)
        else:
            constant_att.append(att)
    # Checking NUMERIC attributes
    discrete_att = []
    continuos_att = []
    for att in numeric_att:
        att_dtype = df_S4[att].dtype.kind
        unique_count = df_S4[att].nunique()
        if att_dtype in ['i', 'u']:
            discrete_att.append(att)
        elif att_dtype == 'f' and unique_count < float_discrete_threshold:
            discrete_att.append(att)
        else:
            continuos_att.append(att)
    # Print results
    log("Confirmed CATEGORY Attributes:", type="INFO")
    log(f"BINARY: {binary_att}", level=2, type="INFO")
    log(f"MULTICLASS: {multiclass_att}", level=2, type="INFO")
    log(f"CONSTANT: {constant_att}", level=2, type="INFO")
    log("Confirmed NUMERIC Attributes:", type="INFO")
    log(f"DISCRETE: {discrete_att}", level=2, type="INFO")
    log(f"CONTINUOUS: {continuos_att}", level=2, type="INFO")

    if not make_plots_UNIVARIANT:
        log("UNIVARIABLE ANALYSIS is not printed, set make_plots_UNIVARIANT = True", type="WARNING")
    else:
        # -------------------------------------------
        # CATEGORY VARIABLES (including target if categorical)
        # -------------------------------------------
        print("üè∑Ô∏è CATEGORY VARIABLES")
        if not category_att and y_var_type == "NUMERIC":
            log("This type of plot is non applicable because there are not CATEGORIC variables in the DataFrame", type="WARNING")
        else:    
            var_to_plot = category_att.copy()
            if y_var_type == "CATEGORIC" and y_var not in var_to_plot:
                var_to_plot.insert(0, y_var)
            # Figure
            num_cols = 2
            num_rows = math.ceil(len(var_to_plot) / num_cols)
            fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(figWidth_unit * num_cols, figHeight_unit * num_rows))
            axes = axes.flatten()
            idx = 0
            for col in var_to_plot:
                unique_count = df_S5[col].nunique()
                if unique_count > num_values_to_plot:
                    order = df_S5[col].value_counts().head(num_values_to_plot).index
                else:
                    order = df_S5[col].value_counts().index
                # Countplot
                sns.countplot(ax=axes[idx], data=df_S5, x=col, hue=col, palette=plot_palette, order=order, legend=False)
                axes[idx].tick_params(axis='x', rotation=90, labelsize=plot_tick_font_size)
                axes[idx].set_xlabel("")
                # Highlight target
                target_box_style = dict(facecolor='none', edgecolor=y_var_highlighting_color, linewidth=5)
                target_title_style = dict(fontsize= plot_title_font_size, color=y_var_highlighting_color, fontweight='bold')
                if col == y_var:
                    axes[idx].set_title(col, **target_title_style)
                    axes[idx].add_patch(plt.Rectangle((0, 0), 1, 1, transform=axes[idx].transAxes, **target_box_style))
                else:
                    axes[idx].set_title(col, fontdict = {"fontsize": plot_title_font_size})
                # Add truncated info
                if unique_count > num_values_to_plot:
                    msg = f"There are {unique_count} values,\nbut only {num_values_to_plot} have been plotted"
                    axes[idx].text(0.5, 0.9, msg, transform=axes[idx].transAxes, fontsize=plot_text_font_size, color="red", ha="center", va="top", bbox=dict(facecolor="grey", alpha=0.25, edgecolor="red"))
                idx += 1
            # Hide unused axes
            for j in range(idx, len(axes)):
                axes[j].set_visible(False)
            plt.tight_layout()
            plt.show()
        # -------------------------------------------
        # NUMERIC VARIABLES (including target if numeric)
        # -------------------------------------------
        print("üî¢ NUMERIC VARIABLES")
        if not numeric_att and y_var_type == "CATEGORIC":
            log("This type of plot is non applicable because there are not NUMERIC variables in the DataFrame", type="WARNING")
        else: 
            var_to_plot = numeric_att.copy()
            if y_var_type == "NUMERIC" and y_var not in var_to_plot:
                var_to_plot.insert(0, y_var)
            # Figure
            num_cols = 2 
            num_rows = math.ceil(len(var_to_plot) / num_cols)
            fig, axes = plt.subplots(nrows=num_rows * 2, ncols=num_cols, figsize=(figWidth_unit * num_cols, figHeight_unit * num_rows), gridspec_kw={'height_ratios': [4, 0.5] * num_rows})
            var_idx = 0
            for row in range(num_rows):
                for col in range(num_cols):
                    if var_idx >= len(var_to_plot):
                        axes[row * 2, col].set_visible(False)
                        axes[row * 2 + 1, col].set_visible(False)
                        continue
                    colname = var_to_plot[var_idx]
                    # Histogram
                    sns.histplot(ax=axes[row * 2, col], data=df_S5, x=colname, bins=num_bins)
                    axes[row * 2, col].set_xlabel("")
                    # Boxplot
                    sns.boxplot(ax=axes[row * 2 + 1, col], data=df_S5, x=colname)
                    axes[row * 2 + 1, col].set_xlabel("")
                    # Highlight target
                    if colname == y_var:
                        axes[row * 2, col].set_title(colname, **target_title_style)
                        axes[row * 2 + 1, col].set_title(colname, **target_title_style)
                        axes[row * 2, col].add_patch(plt.Rectangle((0, 0), 1, 1, transform=axes[row * 2, col].transAxes, **target_box_style))
                        axes[row * 2 + 1, col].add_patch(plt.Rectangle((0, 0), 1, 1, transform=axes[row * 2 + 1, col].transAxes, **target_box_style))
                    else:
                        axes[row * 2, col].set_title(colname, fontdict = {"fontsize": plot_title_font_size})
                    var_idx += 1
            plt.tight_layout()
            plt.show()


-------------------------------
TARGET VARIABLE
-------------------------------
‚Ä¢ ‚ÑπÔ∏è [1mConfirmed TARGET Variable: is_spam -> CATEGORIC and BINARY[0m


-------------------------------
STEP 5) TRANSFORMATION OF TARGET VARIABLE
-------------------------------

‚Ä¢ ‚ö†Ô∏è Target variable 'is_spam' has been transformed with LabelEncoder():
   - ‚ÑπÔ∏è False -> 0
   - ‚ÑπÔ∏è True -> 1


## STEP 6 - TEXT PROCESSING

In [25]:
# -------------------------------
# INPUTS NEEDED IF dataset_type = "NLP"
# -------------------------------
# Default settings for text preprocessing:
lowercase_text = True                   # Convert all text to lowercase
remove_urls = False                     # Remove http/https/www links
remove_emails = False                   # Remove email addresses
remove_html_tags = False                # Remove HTML tags
remove_non_letters = False              # Remove digits, punctuation, symbols, emojis, etc.
remove_single_char_tokens = True       # Remove isolated 1-letter tokens (e.g., "a", "b")
reduce_whitespace = True                # Replace multiple spaces with a single one
tokenize_output = True                  # Output is a list of tokens after cleaning
# -------------------------------
# INPUTS NEEDED IF dataset_type = "TABULAR"
# -------------------------------
make_plots_MULTIVARIANT_TARGET = True       # Draw plots?
make_plots_MULTIVARIANT_ATTRIBUTES = True   # Draw plots?
category_combi_att = ""                     # Combination attribute for multivariant analysis (must be a CATEGORIC attribute)

In [26]:
# Copy previous step data
df_S6 = df_S5.copy()
main_text_col_S6 = main_text_col_S4 if dataset_type == "NLP" else None
df_timeseries_S6 = df_timeseries_S3.copy() if dataset_type == "TIME-SERIES" else None
period_S6 = period_S3 if dataset_type == "TIME-SERIES" else None
suggested_d = diff_count_S4 if dataset_type == "TIME-SERIES" else None
df_stacionary_timeseries_for_pacf_S6 = df_stationary_timeseries_S4.copy() if dataset_type == "TIME-SERIES" else None

# -------------------------------
# NLP DATASET
# -------------------------------
if dataset_type == "NLP":
    print("-------------------------------")
    print("STEP 6) TEXT PROCESSING")
    print("-------------------------------\n")

    # 1) Store original text before any modification
    df_S6["original_url_tmp"] = df_S6[main_text_col_S6].astype(str)
    # 2) Compute clean tokens from preprocess_text
    df_S6["clean_url"] = df_S6["original_url_tmp"].apply(
        lambda txt: preprocess_text(
            txt,
            mode="auto",
            lowercase_text=lowercase_text,
            remove_urls=remove_urls,
            remove_emails=remove_emails,
            remove_html_tags=remove_html_tags,
            remove_non_letters=remove_non_letters,
            remove_single_char_tokens=remove_single_char_tokens,
            reduce_whitespace=reduce_whitespace,
            tokenize_output=tokenize_output
        )["output"])
    # 3) Compute processing mode (url / text)
    df_S6["processing_mode"] = df_S6["original_url_tmp"].apply(
        lambda txt: preprocess_text(
            txt,
            mode="auto",
            lowercase_text=lowercase_text,
            remove_urls=remove_urls,
            remove_emails=remove_emails,
            remove_html_tags=remove_html_tags,
            remove_non_letters=remove_non_letters,
            remove_single_char_tokens=remove_single_char_tokens,
            reduce_whitespace=reduce_whitespace,
            tokenize_output=tokenize_output
        )["mode_used"])
    # 4) Build the inspection DataFrame
    df_inspection_S6 = pd.DataFrame({
        "original_url": df_S6["original_url_tmp"],
        "processing_mode_used": df_S6["processing_mode"]
    })
    # 5) Clean df_S6 for downstream pipeline
    df_S6=df_S6.drop(labels=[main_text_col_S6], axis =1)
    df_S6 = df_S6.rename(columns={"clean_url": main_text_col_S6})
    df_S6 = df_S6[[main_text_col_S6, y_var]]  # keep only clean text + target

    log("Text preprocessing applied successfully:", type="SUCCESS")
    log("Processing mode used:", type="INFO")
    display(df_inspection_S6)
    log("Current DataFrame content:", type="INFO")
    display(df_S6)

# -------------------------------
# TIME-SERIES DATASET
# -------------------------------        
elif dataset_type == "TIME-SERIES":
    print("-------------------------------")
    print("STEP 6) AUTOCORRELATION ANALYSIS")
    print("-------------------------------\n")
    # Set frequency for ARIMA based on granularity
    freq = get_freq_from_granularity(granularity)
    if freq is not None:
        try:
            df_timeseries_S6 = df_timeseries_S6.asfreq(freq)
            log(f"Applied pandas frequency '{freq}' to series for ARIMA modeling.", level=1, type="INFO")
        except Exception as e:
            log(f"Could not apply frequency '{freq}': {e}", level=1, type="WARNING")
    else:
        log(f"No valid pandas freq for granularity='{granularity}', ARIMA frequency skipped.", level=1, type="WARNING")
    # If series is almost constant ‚Üí no meaningful ACF/PACF
    if df_timeseries_S6.std() < 1e-8:
        log("ACF/PACF analysis: Time-series is almost constant ‚Üí no meaningful autocorrelation.", level=1, type="WARNING")
    else:
        # Get recommended lag based on granularity
        recommended_lag = get_recommended_lag(granularity)
        # Limit by available data
        safe_lag = min(recommended_lag, len(df_timeseries_S6) - 2)
        # Log chosen lag
        log(f"Used safe_lag = {safe_lag} (recommended={recommended_lag}, granularity={granularity})", level=1, type="INFO")
        # Compute confidence (95%) limit for significance bands
        conf_limit = 1.96 / np.sqrt(len(df_timeseries_S6))

        # -------------------------------------------
        # ACF BEHAVIOUR
        # -------------------------------------------
        # Compute numerical ACF values
        acf_vals = acf(df_timeseries_S6, nlags=safe_lag, fft=True, missing="drop")
        # Build list of significant ACF lags
        significant_acf_lags = []
        seasonal_peaks_S6 = []
        for lag in range(1, len(acf_vals)):
            val = acf_vals[lag]
            if abs(val) > conf_limit:
                significant_acf_lags.append(lag)
        if len(significant_acf_lags) == 0:
            log("ACF analysis: No significant autocorrelation detected ‚Üí series close to white noise.", level=1, type="INFO")
        else:
            log(f"ACF analysis: Significant autocorrelation at lags {significant_acf_lags}.", level=1, type="INFO")
            # Check short-lag ACF persistence (trend indicator)
            short_lags = []
            for lag in significant_acf_lags:
                if lag <= min(get_short_lag_cutoff(granularity), safe_lag):
                    short_lags.append(lag)
            if len(short_lags) > 0:
                log("ACF analysis: High short-lag autocorrelation ‚Üí possible trend or strong persistence.", level=1, type="INFO")
            # Check for seasonal multiples
            if (period_S6 is not None) and (period_S6 <= safe_lag):
                seasonal_peaks = []
                max_k = safe_lag // period_S6
                for k in range(1, max_k + 1):
                    lag = k * period_S6
                    if lag in significant_acf_lags:
                        seasonal_peaks.append(lag)
                seasonal_peaks_clean_print = []        
                for l in seasonal_peaks:
                    seasonal_peaks_clean_print.append(int(l))
                if len(seasonal_peaks) > 0:
                    seasonal_peaks_S6 = seasonal_peaks.copy()
                    log(f"ACF analysis: Significant seasonal peaks at lags {seasonal_peaks_clean_print} ‚Üí strong seasonality.", level=1, type="SUCCESS")

        # -------------------------------------------
        # PACF BEHAVIOUR
        # -------------------------------------------
        # Compute numerical PACF values
        pacf_vals = pacf(df_stacionary_timeseries_for_pacf_S6, nlags=safe_lag, method="ywm")
        # Build list of significant PACF lags
        significant_pacf_lags = []
        for lag in range(1, len(pacf_vals)):
            val = pacf_vals[lag]
            if abs(val) > conf_limit:
                significant_pacf_lags.append(lag)
        if len(significant_pacf_lags) == 0:
            suggested_p = 0
            log("PACF analysis: No significant partial autocorrelation detected.", level=1, type="INFO")
        else:
            log(f"PACF analysis: Significant PACF lags detected {significant_pacf_lags}.", level=1, type="INFO")
            # Keep non-seasonal PACF lags
            non_seasonal_pacf = []
            for lag in significant_pacf_lags:
                if (period_S6 is None) or (lag % period_S6 != 0):
                    non_seasonal_pacf.append(lag)
            if len(non_seasonal_pacf) > 0:
                suggested_p = non_seasonal_pacf[0]
                log(f"PACF analysis: First significant non-seasonal lag = {suggested_p} ‚Üí candidate AR order p ‚âà {suggested_p}.", level=1, type="INFO")
            else:
                suggested_p = 0
        # -------------------------------------------
        # MODEL ORDER SUGGESTION (AR / MA)
        # -------------------------------------------
        if len(significant_acf_lags) > 0:
            suggested_q = significant_acf_lags[0]
        else:
            suggested_q = 0
        # Start from suggested values
        candidate_orders = [(suggested_p, suggested_d, suggested_q)]
        # If suggested_p = 0, also try p = 1 as alternative
        if suggested_p == 0:
            candidate_orders.append((1, suggested_d, suggested_q))
        best_aic = np.inf
        best_order = None
        for (p_try, d_try, q_try) in candidate_orders:
            try:
                model_try = ARIMA(df_timeseries_S6, order=(p_try, d_try, q_try))
                result_try = model_try.fit()
                if result_try.aic < best_aic:
                    best_aic = result_try.aic
                    best_order = (p_try, d_try, q_try)
            except Exception as e:
                log(f"ARIMA({p_try},{d_try},{q_try}) could not be fitted: {e}", type="WARNING")

        if best_order is not None:
            suggested_p, suggested_d, suggested_q = best_order
            log(f"Final ARIMA order suggestion: (p,d,q)=({suggested_p},{suggested_d},{suggested_q}) after AIC-checked refinement (best AIC={best_aic:.2f}).", level=1, type="INFO", bold=True)

        # -------------------------------------------
        # PLOT ACF
        # -------------------------------------------
        fig_acf, ax_acf = plt.subplots(nrows=1, ncols=1, figsize=(2 * figWidth_unit, 1 * figHeight_unit))
        plot_acf(df_timeseries_S6, lags=safe_lag, ax=ax_acf)
        ax_acf.set_title(label="Autocorrelation Function (ACF)", fontsize=plot_title_font_size)
        ax_acf.set_xlabel(xlabel="Lag", fontsize=plot_label_font_size)
        ax_acf.set_ylabel(ylabel="Autocorrelation", fontsize=plot_label_font_size)
        ax_acf.tick_params(labelsize=plot_tick_font_size)
        ax_acf.grid(True, linestyle="dotted", linewidth=0.5, color="black")
        # Build manual legend to avoid statsmodels overwriting handles
        handles, labels = [], []
        # Highlight short-lag zone (trend indicator)
        short_cutoff = min(get_short_lag_cutoff(granularity), safe_lag)
        ax_acf.axvspan(
            xmin=1,
            xmax=short_cutoff,
            color="lightblue",
            alpha=0.35,
            label=f"Short-lag zone (‚â§ {short_cutoff})"
        )
        handles.append(plt.Line2D([0], [0], color="lightblue", linewidth=10, alpha=0.35))
        labels.append(f"Short-lag zone (‚â§ {short_cutoff})")
        # Mark seasonal period (if applicable)
        if (period_S6 is not None) and (period_S6 <= safe_lag):
            ax_acf.axvline(
                x=period_S6,
                color="orange",
                linestyle="-",
                linewidth=4.0,
                alpha=0.8,
                label=f"Seasonal period (lag={period_S6})"
            )
            handles.append(plt.Line2D([0], [0], color="orange", linewidth=4))
            labels.append(f"Seasonal period (lag={period_S6})")
        # Mark significant ACF lags (points)
        if len(significant_acf_lags) > 0:
            ax_acf.scatter(
                significant_acf_lags,
                [acf_vals[lag] for lag in significant_acf_lags],
                color="blue",
                s=100,
                label="Significant lags"
            )
            handles.append(plt.Line2D([0], [0], marker="o", color="blue", linestyle="None"))
            labels.append("Significant lags")
        # Info text box
        ax_acf.text(
            0.98, 0.02,
            f"safe_lag = {safe_lag}\n"
            f"granularity = {granularity}",
            ha='right',
            va='bottom',
            transform=ax_acf.transAxes,
            fontsize=plot_text_font_size,
            bbox=dict(boxstyle="round", fc="white", alpha=0.6)
        )
        # Show legend and plot
        ax_acf.legend(handles, labels, loc="upper right", fontsize=plot_text_font_size)
        plt.tight_layout()
        plt.show()

        # -------------------------------------------
        # PLOT PACF
        # -------------------------------------------
        fig_pacf, ax_pacf = plt.subplots(nrows=1, ncols=1, figsize=(2 * figWidth_unit, 1 * figHeight_unit))
        plot_pacf(df_timeseries_S6, lags=safe_lag, ax=ax_pacf)
        ax_pacf.set_title(label="Partial Autocorrelation Function (PACF)", fontsize=plot_title_font_size)
        ax_pacf.set_xlabel(xlabel="Lag", fontsize=plot_label_font_size)
        ax_pacf.set_ylabel(ylabel="Partial autocorrelation", fontsize=plot_label_font_size)
        ax_pacf.tick_params(labelsize=plot_tick_font_size)
        ax_pacf.grid(True, linestyle="dotted", linewidth=0.5, color="black")
        # Build manual legend to avoid statsmodels overwriting handles
        handles, labels = [], []
        # Mark significant PACF lags (points)
        if len(significant_pacf_lags) > 0:
            ax_pacf.scatter(
                significant_pacf_lags,
                [pacf_vals[lag] for lag in significant_pacf_lags],
                color="blue",
                s=100,
                label="Significant lags"
            )
            handles.append(plt.Line2D([0], [0], marker="o", color="blue", linestyle="None"))
            labels.append("Significant lags")
        # Info text box
        ax_pacf.text(
            0.98, 0.02,
            f"safe_lag = {safe_lag}",
            ha='right',
            va='bottom',
            transform=ax_pacf.transAxes,
            fontsize=plot_text_font_size,
            bbox=dict(boxstyle="round", fc="white", alpha=0.6)
        )
        # Show legend and plot
        ax_pacf.legend(handles, labels, loc="upper right", fontsize=plot_text_font_size)
        plt.tight_layout()
        plt.show()

# -------------------------------
# TABULAR DATASET
# -------------------------------
elif dataset_type == "TABULAR":
    print("-------------------------------")
    print("STEP 6 - MULTIVARIANT ANALYSIS")
    print("-------------------------------\n")
    if not make_plots_MULTIVARIANT_TARGET:
        log("MULTIVARIANT ANALYSIS - ATTRIBUTES VS TARGET is not printed, set make_plots_MULTIVARIANT_TARGET = True", type="WARNING")
    else:
        # Validation
        if not category_att:
            log("There are no CATEGORIC attributes in the DataFrame", type="INFO")
        elif category_combi_att in category_att:
            log(F"Sucessfull verification: combination attribute {category_combi_att} is CATEGORIC", type="SUCCESS")
        elif category_combi_att in numeric_att:
            raise ValueError(f"‚ùå Combination attribute {category_combi_att} for multivariant analysis must be a CATEGORY attribute!")
        else:
            raise ValueError(f"‚ùå Combination attribute {category_combi_att} does not exist in the DataFrame")
        # -------------------------------------------
        # NUMERIC Attributes VS CATEGORY Target
        # -------------------------------------------
        print("\n üî¢ NUMERIC Attributes VS üè∑Ô∏è CATEGORY Target")
        if not numeric_att:
            log("This type of plot is non applicable because there are not NUMERIC attributes in the DataFrame", type="WARNING")
        elif y_var_type == "NUMERIC":
            log("This type of plot is non applicable because Target variable is NUMERIC", type="WARNING")
        else:
            # Set plotting variables
            var_to_plot=numeric_att
            # Figure
            num_cols = 2
            num_rows = math.ceil(len(var_to_plot) / num_cols) # Number of rows for the figure
            fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols*2, figsize=(figWidth_unit*num_cols, figHeight_unit*num_rows), gridspec_kw={'width_ratios': [3, 1] * num_cols})
            var_idx = 0
            for row in range(num_rows):
                for col in range(num_cols):
                    if var_idx >= len(var_to_plot):
                        # Hide unused subplots
                        axes[row, col * 2].set_visible(False)
                        axes[row , col* 2 + 1].set_visible(False)
                        continue
                    # Stripplot (left)
                    sns.stripplot(ax=axes[row, col*2], data=df_S6, x=y_var, y=var_to_plot[var_idx], hue=y_var, alpha = 0.3, legend=False)
                    axes[row, col * 2].set_ylabel(var_to_plot[var_idx],fontdict = {"fontsize": plot_label_font_size})
                    axes[row, col * 2].grid(True)
                    # Boxplot (right)
                    sns.boxplot(ax=axes[row, col*2 + 1], data=df_S6, x=y_var, y=var_to_plot[var_idx], hue=y_var, palette=plot_palette, legend=False)
                    axes[row, col * 2 + 1].set_ylabel("")
                    axes[row, col * 2 + 1].grid(True)
                    axes[row, col * 2 + 1].set_yticklabels([])
                    var_idx += 1
            # Adjust layout
            plt.tight_layout()
            plt.show()
        # -------------------------------------------
        # NUMERIC Attributes VS NUMERIC Target
        # -------------------------------------------    
        print("\n üî¢ NUMERIC Attributes VS üî¢ NUMERIC Target")
        if not numeric_att:
            log("This type of plot is non applicable because there are not NUMERIC attributes in the DataFrame", type="WARNING")
        elif y_var_type == "CATEGORIC":
            log("This type of plot is non applicable because Target variable is CATEGORIC", type="WARNING")
        else:
            # Set plotting variables
            var_to_plot=numeric_att
            # Figure
            num_cols = 2
            num_rows = math.ceil(len(var_to_plot) / num_cols) # Number of rows for the figure
            fig, axes = plt.subplots(nrows=num_rows*2, ncols=num_cols, figsize=(figWidth_unit*num_cols, figHeight_unit*num_rows), gridspec_kw={'height_ratios': [4, 1] * num_rows})
            var_idx = 0
            for row in range(num_rows):
                for col in range(num_cols):
                    if var_idx >= len(var_to_plot):
                        # Hide unused subplots
                        axes[row * 2, col].set_visible(False)
                        axes[row * 2 + 1, col].set_visible(False)
                        continue
                    # Regplot (top)
                    sns.regplot(ax=axes[row*2, col], data=df_S6, x=var_to_plot[var_idx], y=y_var, scatter_kws={'s': plot_text_font_size, 'alpha': 0.6}, line_kws={'color': 'red'})
                    # Boxplot (bottom)
                    sns.heatmap(ax=axes[row*2 + 1, col], data=df_S6[[var_to_plot[var_idx], y_var]].corr(), annot=True, fmt=".2f", cbar=False)
                    var_idx += 1
            # Adjust layout
            plt.tight_layout()
            plt.show()
        # -------------------------------------------
        # CATEGORY Attributes VS NUMERIC Target
        # -------------------------------------------  
        print("\n üè∑Ô∏è CATEGORY Attributes VS üî¢ NUMERIC Target")
        if not category_att:
            log("This type of plot is non applicable because there are not CATEGORIC attributes in the DataFrame", type="WARNING")
        elif y_var_type == "CATEGORIC":
            log("This type of plot is non applicable because Target variable is CATEGORIC", type="WARNING")
        else:        
            # Set plotting variables
            var_to_plot=category_att
            # Figure
            num_cols = 2
            num_rows = math.ceil(len(var_to_plot) / num_cols) # Number of rows for the figure
            fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(figWidth_unit*num_cols, figHeight_unit*num_rows))
            axes = axes.flatten()
            idx = 0
            for col in var_to_plot:
                # Count unique values
                unique_count = df_S6[col].nunique()
                # Limit the number of plotted categories if there are more than num_values_to_plot
                if unique_count > num_values_to_plot:
                    order = df_S6[col].value_counts().head(num_values_to_plot).index
                else:
                    order = df_S6[col].value_counts().index
                # Barplot
                sns.barplot(ax=axes[idx], data=df_S6, x=col, y=y_var, hue=category_combi_att, order=order)
                axes[idx].tick_params(axis='x', rotation=90, labelsize=10)
                # Add text box if truncated
                if unique_count > num_values_to_plot:
                    msg = "There are " + str(unique_count) + " different values,\nbut only " + str(num_values_to_plot) + " have been plotted"
                    axes[idx].text(0.5, 0.9, s=msg, transform=axes[idx].transAxes, fontsize=plot_text_font_size, color='red', ha='center', va='top', bbox=dict(facecolor='grey', alpha=0.5, edgecolor='red'))
                idx += 1
            # Turn off unused axes if there are any
            for j in range(idx, len(axes)):
                axes[j].set_visible(False)
            # Adjust layout and display
            plt.tight_layout()
            plt.show()
        # -------------------------------------------
        # CATEGORY Attributes VS Combined CATEGORY Target
        # -------------------------------------------  
        print("\n üè∑Ô∏è CATEGORY Attributes with üè∑Ô∏è Combined CATEGORY Target")
        if not category_att:
            log("This type of plot is non applicable because there are not CATEGORIC attributes in the DataFrame", type="WARNING")
        elif y_var_type == "NUMERIC":
            log("This type of plot is non applicable because Target variable is NUMERIC", type="WARNING")
        else:
            # Set plotting variables
            var_to_plot=category_att
            # Determine hue order dynamically
            hue_order = sorted(df_S6[y_var].dropna().unique().tolist()) 
            # Figure
            num_cols = 2
            num_rows = math.ceil(len(var_to_plot) / num_cols) # Number of rows for the figure
            fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(figWidth_unit*num_cols, figHeight_unit*num_rows))
            axes = axes.flatten()
            idx = 0
            for col in var_to_plot:
                # Count unique values
                unique_count = df_S6[col].nunique()
                # Limit the number of plotted categories if there are more than num_values_to_plot
                if unique_count > num_values_to_plot:
                    order = df_S6[col].value_counts().head(num_values_to_plot).index
                else:
                    order = df_S6[col].value_counts().index
                # Countplot
                sns.countplot(ax=axes[idx], data=df_S6, x=col, hue=y_var, hue_order=hue_order, palette=plot_palette, order=order)
                axes[idx].tick_params(axis='x', rotation=90, labelsize=plot_tick_font_size)
                # Add text box if truncated
                if unique_count > num_values_to_plot:
                    msg = "There are " + str(unique_count) + " different values,\nbut only " + str(num_values_to_plot) + " have been plotted"
                    axes[idx].text(0.5, 0.9, s=msg, transform=axes[idx].transAxes, fontsize=plot_text_font_size, color='red', ha='center', va='top', bbox=dict(facecolor='grey', alpha=0.25, edgecolor='red'))
                idx += 1
            # Turn off unused axes if there are any
            for j in range(idx, len(axes)):
                axes[j].set_visible(False)
            # Adjust layout and display
            plt.tight_layout()
            plt.show()
    if not make_plots_MULTIVARIANT_ATTRIBUTES:
        log("MULTIVARIANT ANALYSIS - ATTRIBUTES VS ATTRIBUTES is not printed, set make_plots_MULTIVARIANT_ATTRIBUTES = True", type="WARNING")
    else:
        # -------------------------------------------
        # NUMERIC Attributes VS NUMERIC Attributes
        # -------------------------------------------  
        print("\n üî¢ NUMERIC Attributes VS üî¢ NUMERIC Attributes")
        if not category_att:
            log("This type of plot is non applicable because there are not CATEGORIC attributes in the DataFrame", type="WARNING")
        else:
            # Set plotting variables
            var_to_plot = numeric_att
            # Figure
            num_cols = 2
            num_rows = len(var_to_plot) - 1  # Number of rows (one less than number of variables)
            fig, axes = plt.subplots(nrows = num_rows*2, ncols=len(var_to_plot) - 1, figsize=(figWidth_unit*(len(var_to_plot) - 1), figHeight_unit*num_rows), gridspec_kw={'height_ratios': [4, 1] * num_rows})
            axes = np.array(axes)
            # Track subplot usage
            for row in range(num_rows):
                n_cols = len(var_to_plot) - row - 1  # Decreasing number of columns each row
                for col in range(n_cols):
                    # Regplot (top)
                    sns.regplot(ax = axes[row*2, col], data = df_S6, x = var_to_plot[row + col + 1], y = var_to_plot[row], scatter_kws = {'s': plot_text_font_size, 'alpha': 0.6}, line_kws = {'color': 'red'})
                    axes[row * 2, col].set_xlabel(var_to_plot[row + col + 1], fontsize=20)
                    axes[row * 2, col].set_ylabel(var_to_plot[row], fontsize=20)
                    # Show Y label only for first plot in row
                    if col == 0:
                        axes[row * 2, col].set_ylabel(var_to_plot[row], fontsize=plot_label_font_size)
                    else:
                        axes[row * 2, col].set_ylabel("")
                    # Heatmap (bottom)
                    sns.heatmap(ax = axes[row*2 + 1, col], data = df_S6[[var_to_plot[row + col + 1], var_to_plot[row]]].corr(), annot=True, fmt = ".2f", cbar=False, annot_kws = {"size": 20})
                # Hide unused subplots on the right for this row
                for col in range(n_cols, len(var_to_plot) - 1):
                    axes[row * 2, col].set_visible(False)
                    axes[row * 2 + 1, col].set_visible(False)
            # Adjust layout and show
            plt.tight_layout()
            plt.show()

        print("\n üè∑Ô∏èüî¢ ALL Attributes VS üè∑Ô∏èüî¢ ALL Attributes")
        # Encode categorical variables using the Series.factorize() method
        for col in category_att:
            codes, uniques = df_S6[col].factorize()
            df_S6[col] = codes  # replace column with integer codes
        # Heatmap (CATEGORIC Attributes)
        if len(category_att) > 1:
            corr_cat = df_S6[category_att].corr()
            fig = plt.figure(figsize=(figWidth_unit, figHeight_unit))
            plt.title("ONLY CATEGORIC ATTRIBUTES", fontsize=plot_title_font_size, fontweight="bold")
            sns.heatmap(data=corr_cat, annot=True, vmin=-1, vmax=1, fmt=".2f", annot_kws={"size": plot_text_font_size})
            plt.tight_layout()
            plt.show()
        else:
            log("Not enough CATEGORIC attributes to plot a heatmap from a correlation matrix", type="WARNING")
        # Heatmap (NUMERIC Attributes)
        if len(numeric_att) > 1:
            corr_num = df_S6[numeric_att].corr()
            fig = plt.figure(figsize=(figWidth_unit, figHeight_unit))
            plt.title("ONLY NUMERIC ATTRIBUTES", fontsize=plot_title_font_size + 2, fontweight="bold")
            sns.heatmap(data=corr_num, annot=True, vmin=-1, vmax=1, fmt=".2f", annot_kws={"size": plot_text_font_size})
            plt.tight_layout()
            plt.show()
        else:
            log("Not enough NUMERIC attributes to plot a heatmap from a correlation matrix", type="WARNING")
        # Heatmap (CATEGORIC + NUMERIC Attributes)
        corr_matrix = df_S6[numeric_att + category_att].corr()
        corr_order = corr_matrix.mean().sort_values(ascending=False).index
        corr_matrix = corr_matrix.loc[corr_order, corr_order]
        fig = plt.figure(figsize=(2 * figWidth_unit, 2 * figHeight_unit))
        plt.title("CATEGORIC AND NUMERIC ATTRIBUTES", fontsize=plot_title_font_size + 2, fontweight="bold")
        sns.heatmap(data=corr_matrix, annot=True, vmin=-1, vmax=1, fmt=".2f", annot_kws={"size": plot_text_font_size})
        plt.tight_layout()
        plt.show()
        # Pairplot (sorted by correlation order)
        fig = plt.figure(figsize=(figWidth_unit, figHeight_unit))
        sns.pairplot(data=df_S6[corr_order])
        plt.show()

-------------------------------
STEP 6) TEXT PROCESSING
-------------------------------

‚Ä¢ ‚úÖ Text preprocessing applied successfully:
‚Ä¢ ‚ÑπÔ∏è Processing mode used:


Unnamed: 0,original_url,processing_mode_used
0,https://briefingday.us8.list-manage.com/unsubs...,url
1,https://www.hvper.com/,url
2,https://briefingday.com/m/v4n3i4f3,url
3,https://briefingday.com/n/20200618/m#commentform,url
4,https://briefingday.com/fan,url
...,...,...
2993,https://www.theverge.com/2020/6/29/21306889/di...,url
2994,https://www.smartcitiesworld.net/news/news/dee...,url
2996,https://techcrunch.com/2019/07/04/an-optimisti...,url
2997,https://www.technologyreview.com/2019/12/20/13...,url


‚Ä¢ ‚ÑπÔ∏è Current DataFrame content:


Unnamed: 0,url,is_spam
0,"[https, briefingday, us8, list, manage, com, u...",1
1,"[https, www, hvper, com]",1
2,"[https, briefingday, com, v4n3i4f3]",1
3,"[https, briefingday, com, 20200618, commentform]",0
4,"[https, briefingday, com, fan]",1
...,...,...
2993,"[https, www, theverge, com, 2020, 6, 29, 21306...",0
2994,"[https, www, smartcitiesworld, net, news, news...",0
2996,"[https, techcrunch, com, 2019, 07, 04, an, opt...",0
2997,"[https, www, technologyreview, com, 2019, 12, ...",0


## STEP 7 - TEXT LEMMATIZATION

In [None]:
# -------------------------------
# INPUTS NEEDED IF dataset_type = "TABULAR"
# -------------------------------
zero_to_nan = []                    # List of attributes where zero should be considered missing
filling_threshold = 5.0             # [%] If missing perc > filling_threshold ‚Üí fill values, otherwise drop rows
grouping_max_unique = 6             # Max number of unique values for a categorical attribute to be usable as keys for grouped median
make_missing_values_plots = True    # Make plots?

In [30]:
# Copy previous step data
df_S7 = df_S6.copy()

# -------------------------------
# NLP DATASET
# -------------------------------
if dataset_type == "NLP":
    print("-------------------------------")
    print("STEP 7) TEXT LEMMATIZATION")
    print("-------------------------------\n")
    # Remove missing values from target variable
    missing_y = df_S7[y_var].isnull().sum()
    if missing_y > 0:
        log(f"Target variable '{y_var}' contains {missing_y} missing values ‚Üí rows will be dropped", type="WARNING")
        df_S7 = df_S7.dropna(subset=[y_var])
    else:
        log(f"Target variable '{y_var}' has no missing values", type="SUCCESS")

# -------------------------------
# TABULAR DATASET
# -------------------------------
elif dataset_type == "TABULAR":
    print("-------------------------------")
    print("STEP 7 - MISSING VALUES")
    print("-------------------------------\n")
    # Remove missing values from target variable
    missing_y = df_S7[y_var].isnull().sum()
    if missing_y > 0:
        log(f"Target variable '{y_var}' contains {missing_y} missing values ‚Üí rows will be dropped", type="WARNING")
        df_S7 = df_S7.dropna(subset=[y_var])
    else:
        log(f"Target variable '{y_var}' has no missing values", type="SUCCESS")
    # Replace zeros by NaN for selected columns
    for col in zero_to_nan:
        if col in df_S7.columns:
            df_S7[col] = df_S7[col].replace(0, np.nan)
            log(f"Values equal to 0 in '{col}' have been replaced by NaN", type="WARNING")
    # Identify categorical variables usable as grouping keys for numeric imputation
    group_vars = []
    # Normal categorical attributes
    for col in category_att:
        if df_S7[col].nunique() <= grouping_max_unique:
            group_vars.append(col)
    # Add target as grouping variable if it is CATEGORICAL and has few unique values
    if y_var_type == "CATEGORIC":
        if df_S7[y_var].nunique() <= grouping_max_unique:
            group_vars.append(y_var)
            log(f"Target variable '{y_var}' added to grouping keys for numeric imputation", type="INFO")
    # Calculate missing percentages per column
    missing_pct = (df_S7.isnull().sum() / len(df_S7)) * 100
    missing_pct = missing_pct[missing_pct > 0].sort_values(ascending=False)
    if len(missing_pct) == 0:
        log("DataFrame has no missing values at all (excluding target variable already handled)", type="SUCCESS")
    else:
        # Process each column with missing values
        for col in missing_pct.index:
            pct = missing_pct[col]
            log(f"Column: {col} ‚Üí {pct:.2f}% missing", type="WARNING")
            # CASE 1: NUMERIC ATTRIBUTE
            if col in numeric_att:
                # CASE 1A: grouped median
                if pct > filling_threshold and len(group_vars) > 0:
                    medians = df_S7.groupby(group_vars)[col].median().reset_index()
                    medians = medians.rename(columns={col: f"median_{col}"})
                    df_S7 = pd.merge(df_S7, medians, on=group_vars, how="left")
                    df_S7[col] = df_S7[col].fillna(df_S7[f"median_{col}"])
                    df_S7 = df_S7.drop(columns=[f"median_{col}"])
                    log(f"FILLED missing numeric values in {col} using grouped median by {group_vars}", level=2, type="WARNING")
                # CASE 1B: global median
                elif pct > filling_threshold and len(group_vars) == 0:
                    df_S7[col] = df_S7[col].fillna(df_S7[col].median())
                    log(f"FILLED missing numeric values in {col} using global median (no grouping columns)", level=2, type="WARNING")
                # CASE 1C: drop rows
                elif pct <= filling_threshold:
                    df_S7 = df_S7.dropna(subset=[col])
                    log(f"DROPPED rows with missing values in {col} ({pct:.2f}% ‚â§ {filling_threshold}%)", level=2, type="WARNING")
            # CASE 2: CATEGORICAL ATTRIBUTE ‚Üí mode imputation
            elif col in category_att:
                mode_value = df_S7[col].mode().iloc[0]
                df_S7[col] = df_S7[col].fillna(mode_value)
                log(f"FILLED missing categorical values in {col} using mode (most frequent value)", level=2, type="WARNING")
            # CASE 3: unsupported
            else:
                df_S7 = df_S7.dropna(subset=[col])
                log(f"DROPPED rows for missing values in {col} because it has unsupported type for imputation", level=2, type="INFO")
    # Print results
    log(f"Previous df's rows: {len(df_S6)}", type="INFO")
    log(f"Current df's rows: {len(df_S7)}", type="INFO")
    log(f"Current df's shape: {df_S7.shape}", type="INFO")
    log(f"Remaining missing values per column:\n{df_S7.isnull().sum()}", type="INFO")
    if make_missing_values_plots:
        # BEFORE vs AFTER missing values handling
        print("\nüìä VISUAL CHECK - BEFORE vs AFTER missing values handling")
        df_S7_before = df_S6.copy()   # Before missing-value handling
        df_S7_after = df_S7.copy()    # After missing-value handling
        if not numeric_att:
            log("This type of plot is non applicable because there are not NUMERIC variables in the DataFrame", type="WARNING")
        else:
            var_to_plot = numeric_att.copy()
            if y_var_type == "NUMERIC" and y_var not in var_to_plot:
                var_to_plot.insert(0, y_var)
            # Figure
            num_cols = 2
            num_rows = len(var_to_plot)
            fig, axes = plt.subplots(nrows = num_rows * 2, ncols = num_cols, figsize = (figWidth_unit * num_cols, figHeight_unit * num_rows), gridspec_kw={'height_ratios': [4, 0.5] * num_rows})
            for i, colname in enumerate(var_to_plot):
                # Row indices for histogram and boxplot of this variable
                hist_row  = i * 2
                box_row   = i * 2 + 1
                # Common bins (syncronize BEFORE and AFTER)
                xmin = min(df_S7_before[colname].min(), df_S7_after[colname].min())
                xmax = max(df_S7_before[colname].max(), df_S7_after[colname].max())
                common_bins = np.linspace(xmin, xmax, num_bins + 1)
                # ================
                # BEFORE PLOTS
                # ================
                before_hist_ax = axes[hist_row, 0]
                before_box_ax  = axes[box_row, 0]
                sns.histplot(ax = before_hist_ax, data = df_S7_before, x = colname, bins = num_bins, color = "gray", alpha = 0.35)
                before_hist_ax.set_title(colname + " - BEFORE")
                before_hist_ax.set_xlabel("")
                sns.boxplot(ax = before_box_ax, data = df_S7_before, x = colname, color = "lightgray")
                before_box_ax.set_xlabel("")
                # Store BEFORE axis limits
                xlim_hist_before = before_hist_ax.get_xlim()
                ylim_hist_before = before_hist_ax.get_ylim()
                xlim_box_before  = before_box_ax.get_xlim()
                # ================
                # AFTER PLOTS
                # ================
                after_hist_ax = axes[hist_row, 1]
                after_box_ax  = axes[box_row, 1]
                sns.histplot( ax = after_hist_ax, data = df_S7_after, x = colname, bins = common_bins)
                after_hist_ax.set_title(colname + " - AFTER")
                after_hist_ax.set_xlabel("")
                sns.boxplot(ax = after_box_ax, data = df_S7_after, x = colname)
                after_box_ax.set_xlabel("")
                # Syncronize axes limits
                after_hist_ax.set_xlim(xlim_hist_before)
                after_hist_ax.set_ylim(ylim_hist_before)
                after_box_ax.set_xlim(xlim_box_before)
            plt.tight_layout()
            plt.show()

# -------------------------------
# TIME-SERIES DATASET
# -------------------------------        
elif dataset_type == "TIME-SERIES":
    print("-------------------------------")
    print("STEP 7 - MISSING VALUES")
    print("-------------------------------\n")
    # Remove missing values from target variable
    missing_y = df_S7[y_var].isnull().sum()
    if missing_y > 0:
        log(f"Target variable '{y_var}' contains {missing_y} missing values ‚Üí rows will be dropped", type="WARNING")
        df_S7 = df_S7.dropna(subset=[y_var])
    else:
        log(f"Target variable '{y_var}' has no missing values", type="SUCCESS")
    # Print results
    log(f"Previous df's rows: {len(df_S6)}", type="INFO")
    log(f"Current df's rows: {len(df_S7)}", type="INFO")
    log(f"Current df's shape: {df_S7.shape}", type="INFO")
    log(f"Remaining missing values per column:\n{df_S7.isnull().sum()}", type="INFO")

-------------------------------
STEP 7) TEXT LEMMATIZATION
-------------------------------

‚Ä¢ ‚úÖ Target variable 'is_spam' has no missing values


## STEP 16 - PREDICTION MODELS

In [18]:
prediction_horizon = 360

In [27]:
print("STEP 16) PREDICTION MODELS")

# Copy previous time-series
df_timeseries_S7 = df_timeseries_S6.copy()
period_S7 = period_S6
seasonal_peaks_S7 = seasonal_peaks_S6

# ======================================================
#  ARIMA MODEL WITH SUGGESTED VALUES
# ======================================================
# Instance ARIMA model
arima_model = ARIMA(endog = df_timeseries_S7, order = (suggested_p, suggested_d, suggested_q))
# Train ARIMA model
arima_result = arima_model.fit()
# Predict with trained ARIMA model
prediction_ARIMA = arima_result.predict(start  = len(df_timeseries_S7), end= len(df_timeseries_S7) + prediction_horizon)

# ======================================================
#  AUTO ARIMA
# ======================================================
# Decide seasonal behaviour for auto_arima
auto_arima_m = get_auto_arima_m(period=period_S7,seasonal_peaks = seasonal_peaks_S6)
if auto_arima_m > 1:
    auto_arima_seasonal = True
else:
    auto_arima_seasonal = False
# Instance AUTO ARIMA model
auto_arima_model = auto_arima(y=df_timeseries_S7, seasonal = auto_arima_seasonal, trace = False, m = auto_arima_m)
# Retrieve orders
auto_p, auto_d, auto_q = auto_arima_model.order
# Predict with trained AUTO ARIMA model
prediction_AUTO_ARIMA = auto_arima_model.predict(prediction_horizon)

# ======================================================
#  COMPARISON
# ======================================================
log(f"ARIMA (manual): selected order (p,d,q)=({suggested_p},{suggested_d},{suggested_q})", level=1, type="INFO")
log(f"AUTO-ARIMA: selected order (p,d,q)=({auto_p},{auto_d},{auto_q}) with seasonal={auto_arima_seasonal}, m={auto_arima_m}", level=1, type="INFO")

if (suggested_p == auto_p) and (suggested_d == auto_d) and (suggested_q == auto_q):
    log(f"ARIMA vs AUTO-ARIMA: Orders MATCH", level=1, type="SUCCESS", bold=True)
else:
    log(f"ARIMA vs AUTO-ARIMA: Orders DO NOT MATCH", level=1, type="WARNING", bold=True)

# ======================================================
#  FORECAST PLOT ‚Äî ARIMA
# ======================================================
fig_arima, ax_arima = plt.subplots(nrows=1, ncols=1, figsize=(2 * figWidth_unit, 1 * figHeight_unit))
# Plot original series
ax_arima.plot(df_timeseries_S7, label="Original Time-series")
# Plot ARIMA forecast
ax_arima.plot(prediction_ARIMA, label="Forecast (ARIMA)", color="red", linewidth=5, linestyle="dashed")
# Title, labels, ticks and legend
ax_arima.set_title(label=f"Forecast with ARIMA(p={suggested_p}, d={suggested_d}, q={suggested_q})", fontsize=plot_title_font_size)
ax_arima.set_xlabel(xlabel=df_timeseries_S7.index.name, fontsize=plot_label_font_size)
ax_arima.set_ylabel(ylabel=df_timeseries_S7.name, fontsize=plot_label_font_size)
ax_arima.tick_params(labelsize=plot_tick_font_size)
ax_arima.legend(fontsize=plot_text_font_size)
ax_arima.grid(True, linestyle="dotted", linewidth=0.5, color="black")
# Show plot
plt.tight_layout()
plt.show()

# ======================================================
#  FORECAST PLOT ‚Äî AUTO ARIMA
# ======================================================
fig_auto, ax_auto_arima = plt.subplots(nrows=1, ncols=1, figsize=(2 * figWidth_unit, 1 * figHeight_unit))
# Plot original series
ax_auto_arima.plot(df_timeseries_S7, label="Original Time-series")
# Plot AUTO-ARIMA forecast
ax_auto_arima.plot(prediction_AUTO_ARIMA, label="Forecast (AUTO-ARIMA)", color="green", linewidth=5, linestyle="dashed")
# Title, labels, ticks and legend
ax_auto_arima.set_title(label=f"Forecast with AUTO-ARIMA(p={auto_p}, d={auto_d}, q={auto_q})", fontsize=plot_title_font_size)
ax_auto_arima.set_xlabel(xlabel=df_timeseries_S7.index.name, fontsize=plot_label_font_size)
ax_auto_arima.set_ylabel(ylabel=df_timeseries_S7.name, fontsize=plot_label_font_size)
ax_auto_arima.tick_params(labelsize=plot_tick_font_size)
ax_auto_arima.legend(fontsize=plot_text_font_size)
ax_auto_arima.grid(True, linestyle="dotted", linewidth=0.5, color="black")
# Show plot
plt.tight_layout()
plt.show()

STEP 16) PREDICTION MODELS


AttributeError: 'NoneType' object has no attribute 'copy'

## STEP 8 - SAVE MODELS

In [None]:
models_output_path = "../models/"   # Folder where models will be saved
rev_to_use = 0

In [None]:
print("STEP 8) SAVE MODELS")

arima_filename = models_output_path + f"ARIMA_rev{rev_to_use}.sav"
dump(arima_result, open(arima_filename, "wb"))
log(f"Model saved: {arima_filename}", level=1, type="SUCCESS")

auto_arima_filename = models_output_path + f"AUTO_MANUAL_rev{rev_to_use}.sav"
dump(auto_arima_model, open(auto_arima_filename, "wb"))
log(f"Model saved: {auto_arima_filename}", level=1, type="SUCCESS")