In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_curve, 
    auc, 
    roc_auc_score,
    precision_recall_curve, 
    average_precision_score
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier 
import xgboost as xgb 
import lightgbm as lgb 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from catboost import CatBoostClassifier


import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots


import warnings
warnings.filterwarnings('ignore')

#Balance methods 
from imblearn.over_sampling import SMOTE, ADASYN 
from imblearn.under_sampling import RandomUnderSampler 
from imblearn.combine import SMOTETomek, SMOTEENN

# Global plotting style. sns.set() is called after plt.style.use(), and the
# explicit rcParams overrides are applied last.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set(font_scale=1.2)
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.family': 'DejaVu Sans',
})

In [None]:
# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="./Dataset/Dataset.csv")
# Preview the first rows as the cell output.
df.head()

In [None]:
# Data structure: the dtype pandas assigned to each column of the loaded frame
df.dtypes

| Column | Значение | Meaning | Note |
|---------|----------|------------|------------|
| PIPE_NO | Серийный номер трубы | Pipe serial number | string |
| DV_R | Напряжение правой стороны | Right side voltage | int |
| DA_R | Ток правой стороны | Right side current | int |
| AV_R | Среднее напряжение | Average voltage | int |
| AA_R | Средний ток | Average current | int |
| PM_R | Код режима сварки | Welding mode code | int |
| FIN_JGMT | Целевая метка: 1 — норма, 0 — дефект | Target label: 1 = normal, 0 = defect | int |

In [None]:
# Descriptive summary statistics of the dataset, as computed by DataFrame.describe()
df.describe()

In [None]:
# Number of missing values in each column
df.isna().sum()

In [None]:
# Class distribution (imbalance) of the target column
class_counts = df['FIN_JGMT'].value_counts()
display(class_counts)

# FIN_JGMT labels: 1 = normal, 0 = defect
n_normal = class_counts[1]
n_defect = class_counts[0]
print(f"Class ratio (normal:defect): {n_normal / n_defect:.2f}:1")

#### <span style="color:green">Visualization of class distribution</span>

In [None]:
# Bar chart of the class counts; each bar is annotated with its share of all rows.
fig, ax = plt.subplots(figsize=[8, 5])
sns.countplot(x='FIN_JGMT', data=df, palette=['red', 'green'], ax=ax)
ax.set_title("Class Distribution (0 - defect, 1 - normal)")
ax.set_xlabel('Class')
ax.set_ylabel('Count')
ax.set_xticks([0, 1])
ax.set_xticklabels(['Defect (0)', 'normal (1)'])

total = len(df)
for p in ax.patches:
    height = p.get_height()
    percentage = 100 * height / total
    # Anchor the label at the top-centre of the bar
    bar_top_center = (p.get_x() + p.get_width() / 2., height)
    ax.annotate(
        f'{percentage:.1f}%',
        bar_top_center,
        ha='center',
        va='bottom',
        fontsize=12,
    )

plt.show()

### <span style='color:green'>Data Quality Analysis</span>

In [None]:
# 1.1 Data completeness: percentage of non-missing values in each column
missing_fraction = df.isnull().sum() / len(df)
completeness = (1 - missing_fraction) * 100
display(completeness)

#### <span style="color:red">As we can see, the dataset has no missing values; this confirms the earlier `isnull().sum()` check.</span>

In [None]:
# 1.2 Data uniqueness: distinct values per column as a percentage of the row count
uniqueness = {
    col: (df[col].nunique() / len(df)) * 100
    for col in df.columns
}
display(pd.Series(uniqueness))

### <span style='color:red'>Outlier detection</span>

#### with IQR

In [None]:
# IQR rule: a value is an outlier when it lies more than 1.5 * IQR below Q1
# or above Q3. One summary dict is collected per numeric column.
iqr_columns = ['DV_R', 'DA_R', 'AV_R', 'AA_R', 'PM_R']
outliers_info = {}
for col in iqr_columns:
    Q1, Q3 = (df[col].quantile(q) for q in (0.25, 0.75))
    IQR = Q3 - Q1
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

    is_outlier = (df[col] < lower_bound) | (df[col] > upper_bound)
    outliers = df[is_outlier]
    n_outliers = len(outliers)

    outliers_info[col] = dict(
        total_outliers=n_outliers,
        percentage=(n_outliers / len(df)) * 100,
        min_value=df[col].min(),
        max_value=df[col].max(),
        lower_bound=lower_bound,
        upper_bound=upper_bound,
    )

In [None]:
# Outlier summary table: one row per column, one table column per statistic
outliers_df = pd.DataFrame(outliers_info).T
outliers_df = outliers_df.assign(
    total_outliers=outliers_df['total_outliers'].astype(int),
    percentage=outliers_df['percentage'].round(2),
)
display(outliers_df)

In [None]:
# Histograms of the numeric variables, one panel per variable in a single row
vars_list = ['DV_R', 'DA_R', 'AV_R', 'AA_R', 'PM_R']

# squeeze=False keeps `hist_axes` two-dimensional even for a single variable
hist_fig, hist_axes = plt.subplots(1, len(vars_list), figsize=[20, 4], squeeze=False)
for hist_ax, col in zip(hist_axes[0], vars_list):
    hist_ax.hist(df[col], bins=10)
    hist_ax.set_title(col)

# Panel spacing belongs to the whole figure, so it is set once instead of per panel
hist_fig.subplots_adjust(wspace=0.4, hspace=0.4)
plt.show()

In [None]:
# Correlation analysis: pairwise correlations (DataFrame.corr defaults)
# between the features and the target, shown as an annotated heatmap
corr_columns = ['DV_R', 'DA_R', 'AV_R', 'AA_R', 'PM_R', 'FIN_JGMT']
corr = df[corr_columns].corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Feature matrix (the five numeric process variables) and target vector
feature_columns = ['DV_R', 'DA_R', 'AV_R', 'AA_R', 'PM_R']
target_column = 'FIN_JGMT'
X = df[feature_columns]
y = df[target_column]

In [None]:
# 80/20 hold-out split; stratify=y keeps the class proportions in both parts
split_options = dict(test_size=0.2, random_state=0, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

X_train.shape, X_test.shape

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, Normalizer, QuantileTransformer 
from sklearn.pipeline import Pipeline

In [None]:
# Candidate feature-scaling methods, keyed by the label used in later printouts and plots.
# NOTE: the key 'QuantileTranformer' is misspelled, but it is kept unchanged because the
# keys are reused as lookup keys for `scaled_data` in the following cells.
normalizers = {
    'StandardScaler': StandardScaler(),
    'MinMaxScaler': MinMaxScaler(),
    'RobustScaler': RobustScaler(),
    'Normalizer': Normalizer(),
    # random_state fixes the internal subsampling used to estimate quantiles, so the
    # transform is reproducible like the other seeded steps (random_state=0 elsewhere).
    'QuantileTranformer': QuantileTransformer(output_distribution='normal', random_state=0)
}

In [None]:
# Filled by the next cell: scaler name -> {'X_train', 'X_test', 'normalizer'}
scaled_data={}

In [None]:
for name, normalizer in normalizers.items():
    # Fit on the training split only, then reuse the fitted scaler on the test split
    X_train_normalized = normalizer.fit_transform(X_train)
    X_test_normalized = normalizer.transform(X_test)

    # Keep the transformed arrays together with the fitted scaler
    scaled_data[name] = dict(
        X_train=X_train_normalized,
        X_test=X_test_normalized,
        normalizer=normalizer,
    )
    print(f"Applied {name}")

In [None]:
# Plot at most 1000 training rows. The sample is drawn with a seeded generator so the
# figure is identical on every run (the seed matches random_state=0 used elsewhere).
sample_size = min(1000, len(X_train))
sample_rng = np.random.default_rng(0)
sample_indices = sample_rng.choice(len(X_train), size=sample_size, replace=False)

# Grid layout: one row per feature; first column is the original data,
# followed by one column per scaling method.
n_features = X_train.shape[1]
n_normalizers = len(normalizers) + 1  # +1 for the original data
fig_height = 4 * n_features
fig_width = 3 * n_normalizers
dist_fig, dist_axes = plt.subplots(
    n_features, n_normalizers, figsize=(fig_width, fig_height), squeeze=False
)

for feature_idx, feature_name in enumerate(X_train.columns):
    # Original (unscaled) distribution
    original_ax = dist_axes[feature_idx, 0]
    sns.histplot(X_train.iloc[sample_indices, feature_idx], kde=True, color='navy', ax=original_ax)
    original_ax.set_title(f'Original: {feature_name}', fontsize=10)
    original_ax.set_xlabel('')

    # The same rows after each scaling method
    for j, (name, data) in enumerate(scaled_data.items(), start=1):
        scaled_ax = dist_axes[feature_idx, j]
        sns.histplot(data['X_train'][sample_indices, feature_idx], kde=True, color='darkgreen', ax=scaled_ax)
        scaled_ax.set_title(f'{name}: {feature_name}', fontsize=10)
        scaled_ax.set_xlabel('')

dist_fig.tight_layout()
plt.show()

In [None]:
print("\n Comparing normalization methods using baseline model:")

normalization_results = {}

for name, data in scaled_data.items():
    # Identical logistic-regression settings for every scaler, so only the scaling differs
    model = LogisticRegression(random_state=0, max_iter=1000, C=1.0, solver='liblinear')
    model.fit(data['X_train'], y_train)

    y_pred = model.predict(data['X_test'])
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)

    normalization_results[name] = dict(accuracy=accuracy, report=report, model=model)

    print(f"{name}: Accuracy = {accuracy:.4f}")

# Bar chart comparing the test accuracy of each normalization method
method_names = list(normalization_results)
accuracies = [normalization_results[method]['accuracy'] for method in method_names]

fig, ax = plt.subplots(figsize=[12, 6])
sns.barplot(x=method_names, y=accuracies, ax=ax)
ax.set_title('Comparison of Normalization Methods')
ax.set_xlabel("Normalization Method")
ax.set_ylabel('Accuracy')
plt.xticks(rotation=45)

# Write the accuracy value just above each bar
for i, acc in enumerate(accuracies):
    ax.text(i, acc + 0.01, f'{acc:.4f}', ha='center')

plt.tight_layout()
plt.show()

In [None]:
# Pick the scaling method with the highest accuracy and keep its data for the next steps.
# NOTE(review): the accuracies compared here were measured on the test split, so the
# scaler choice is tuned to the test data; consider selecting it with cross-validation
# on the training split instead.
accuracy_by_method = {
    method: result['accuracy'] for method, result in normalization_results.items()
}
best_method = max(accuracy_by_method, key=accuracy_by_method.get)

best_scaled = scaled_data[best_method]
best_normalizer = best_scaled['normalizer']
X_train_scaled = best_scaled['X_train']
X_test_scaled = best_scaled['X_test']

print(f"\nBest normalization method: {best_method} with accuracy {accuracy_by_method[best_method]:.4f}")
print("Using this method for further analysis")

In [None]:
# Summary of the train/test split: sizes and per-class counts of each part
n_train_rows = X_train.shape[0]
n_test_rows = X_test.shape[0]
train_class_counts = pd.Series(y_train).value_counts().to_dict()
test_class_counts = pd.Series(y_test).value_counts().to_dict()

print(f'Training set size: {n_train_rows} rows')
print(f"Test set size: {n_test_rows} rows")
print(f"\nClass distribution in training set: {train_class_counts}")
print(f"Class distribution in test set: {test_class_counts}")

### Baseline Model (Before balancing)

#### Training and evaluating baseline model on imbalanced data

In [None]:
# Class weights for the baseline model: the minority class counts 5x, the majority class 1x
minor_weight, major_weight = 5, 1

In [None]:
# Weighted logistic regression: class 0 gets `minor_weight`, class 1 gets `major_weight`
baseline_class_weight = {0: minor_weight, 1: major_weight}
baseline_model = LogisticRegression(
    random_state=42,
    max_iter=1000,
    C=1.0,
    solver='liblinear',
    class_weight=baseline_class_weight,
)
print(f"Baseline model with class weigh: minor={minor_weight}, major={major_weight}")
baseline_model.fit(X_train_scaled, y_train)

In [None]:
# Hard-label predictions on the test split and the metrics derived from them
y_pred_baseline = baseline_model.predict(X_test_scaled)
baseline_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_baseline)
baseline_report = classification_report(y_true=y_test, y_pred=y_pred_baseline, output_dict=True)
baseline_conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred_baseline)

In [None]:
# Column 1 of predict_proba is used as the score for the ROC-AUC on the test split
test_proba_all = baseline_model.predict_proba(X_test_scaled)
y_test_proba = test_proba_all[:, 1]
test_roc_auc = roc_auc_score(y_true=y_test, y_score=y_test_proba)

In [None]:
# Training-split scoring, to compare against the test-split metrics
train_pred = baseline_model.predict(X_train_scaled)
train_accuracy = accuracy_score(y_true=y_train, y_pred=train_pred)
train_report = classification_report(y_true=y_train, y_pred=train_pred, output_dict=True)

In [None]:
# Training-split ROC-AUC: area under the ROC curve built from the column-1 probabilities
train_proba_all = baseline_model.predict_proba(X_train_scaled)
train_proba = train_proba_all[:, 1]
train_fpr, train_tpr, _ = roc_curve(y_true=y_train, y_score=train_proba)
train_roc_auc = auc(x=train_fpr, y=train_tpr)

In [None]:
# Print the test-split metrics, one per line, with four decimals
print("\nTestset results:")
testset_metrics = [
    ("Accuracy", baseline_accuracy),
    ("F1 (class 0)", baseline_report['0']['f1-score']),
    ("F1 (class 1)", baseline_report['1']['f1-score']),
    ("ROC-AUC", test_roc_auc),
]
for metric_label, metric_value in testset_metrics:
    print(f"{metric_label}: {metric_value:.4f}")

In [None]:
# ROC curve of the baseline model on the test split, with the chance diagonal for reference
fpr, tpr, _ = roc_curve(y_test, y_test_proba)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f'ROC-AUC = {test_roc_auc:.4f}')
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC-AUC baseline model')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Side-by-side table of the training-split and test-split metrics
metric_names = ['Accuracy', 'F1 (class 0)', 'F1 (class 1)', 'ROC-AUC']
train_scores = [
    train_accuracy,
    train_report['0']['f1-score'],
    train_report['1']['f1-score'],
    train_roc_auc,
]
test_scores = [
    baseline_accuracy,
    baseline_report['0']['f1-score'],
    baseline_report['1']['f1-score'],
    test_roc_auc,
]
results_comparison = pd.DataFrame({
    "Metrics": metric_names,
    'Trainset': train_scores,
    'Testset': test_scores,
})

In [None]:
# Plain-text dump of the comparison table: heading line, then the table
print("\nTrainset nd Testset results:", results_comparison, sep="\n")

In [None]:
# Grouped bar chart of the train vs. test metrics.
# DataFrame.plot opens its own figure unless it is given an Axes, which previously left
# the 10x6 figure empty and drew the bars on a second, default-size figure. Passing
# ax=ax draws on the figure created here.
fig, ax = plt.subplots(figsize=(10, 6))
results_comparison.set_index('Metrics').plot(kind='bar', ax=ax)
ax.set_title('Metrix Trainset and Testset')
ax.set_ylabel('Mean')
ax.set_ylim(0, 1.0)
ax.grid(axis='y')
ax.legend(title='')
fig.tight_layout()
plt.show()

In [None]:
# Confusion matrix of the baseline model on the test split (rows: true, columns: predicted)
class_labels = ['Defect (0)', 'Normal (1)']

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    baseline_conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title('Confusion Matrix: Baseline Model (No Balancing)')
ax.set_xlabel('Predicted Class')
ax.set_ylabel('True Class')
plt.show()

In [None]:
# PR curve for baseline model.
# `y_proba_baseline` was never defined in the notebook, so this cell raised NameError.
# It is computed here as the column-1 probability on the test split, the same score
# the ROC cells use.
y_proba_baseline = baseline_model.predict_proba(X_test_scaled)[:, 1]
precision_baseline, recall_baseline, _ = precision_recall_curve(y_test, y_proba_baseline)
# Average precision summarises the PR curve in a single number
pr_auc_baseline = average_precision_score(y_test, y_proba_baseline)

In [None]:
# Precision-recall curve of the baseline model; the legend shows the average precision
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(recall_baseline, precision_baseline, label=f'Baseline PR (AP = {pr_auc_baseline:.4f})')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('PR Curve for Baseline Model (No Balancing)')
ax.legend()
plt.show()

In [None]:
# Logistic-regression coefficients of the baseline model, sorted from largest to smallest
baseline_coef = baseline_model.coef_[0]
coef_feature_names = ['DV_R', 'DA_R', 'AV_R', 'AA_R', 'PM_R']
baseline_feature_importance = (
    pd.DataFrame({'Feature': coef_feature_names, 'Coefficient': baseline_coef})
    .sort_values('Coefficient', ascending=False)
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Coefficient', y='Feature', data=baseline_feature_importance, ax=ax)
ax.set_title('Baseline Model (Logistic Regression) Coefficients')
# Zero line separates positive from negative coefficients
ax.axvline(x=0, color='gray', linestyle='--')
plt.tight_layout()
plt.show()

In [None]:
# Print the sign-based reading of each coefficient; a value > 0 is reported as
# "positively", anything else (including exactly 0) as "negatively"
print("\nInterpretation of baseline model coefficients:")
coef_rows = baseline_feature_importance[['Feature', 'Coefficient']].itertuples(index=False, name=None)
for feature, coef_value in coef_rows:
    if coef_value > 0:
        effect = "positively"
    else:
        effect = "negatively"
    print(f"- {feature}: {coef_value:.4f} - {effect} affects the probability of normal welding quality")


In [None]:
# Store baseline results for later comparison.
# `fpr_baseline`, `tpr_baseline` and `roc_auc_baseline` were never defined, so this cell
# raised NameError. They are computed here from the test-split scores (`y_test_proba`),
# so the stored ROC data does not depend on the ROC plotting cell having been run.
fpr_baseline, tpr_baseline, _ = roc_curve(y_test, y_test_proba)
roc_auc_baseline = auc(fpr_baseline, tpr_baseline)

baseline_results = {
    'accuracy': baseline_accuracy,
    'report': baseline_report,
    'conf_matrix': baseline_conf_matrix,
    'fpr': fpr_baseline,
    'tpr': tpr_baseline,
    'roc_auc': roc_auc_baseline,
    'precision': precision_baseline,
    'recall': recall_baseline,
    'pr_auc': pr_auc_baseline,
    'model': baseline_model
}
