In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import glob
from dotenv import load_dotenv, find_dotenv

# Load the .env file
load_dotenv(find_dotenv())

package_path = os.getenv('PACKAGE_PATH')
# package_path = '/home/dwna/projects/domain_class'
sys.path.append(package_path)


import pandas as pd
from openpyxl import load_workbook
import warnings
# warnings.filterwarnings('ignore')

from src.features.build_features import BuildFeatures


In [32]:
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier


In [4]:

# Load data
data = pd.read_csv(package_path + '/data/processed/profiles/1/ver_1_len_1000_rate_0.01.cvs')


In [6]:
data.columns

Index(['col_name', 'yn', 'date_time', 'number', 'integer', 'bunho', 'email',
       'url', 'part_num', 'part_text', 'part_discriminator', 'part_mask',
       'part_minus', 'len_purity', 'value_nunique', 'value_distr', 'datatype',
       'BUNHO', 'NALJJA', 'MYEONG', 'JUSO', 'YEOBU', 'CODE', 'ID', 'SURYANG',
       'GEUMAEK', 'NAEYOUNG', 'YUL', 'ETC', 'domain'],
      dtype='object')

In [7]:
data.duplicated(subset=[ 'yn', 'date_time', 'number', 'integer', 'bunho', 'email',
       'url', 'part_num', 'part_text', 'part_discriminator', 'part_mask',
       'part_minus', 'len_purity', 'value_nunique', 'value_distr', 'datatype',
       'BUNHO', 'NALJJA', 'MYEONG', 'JUSO', 'YEOBU', 'CODE', 'ID', 'SURYANG',
       'GEUMAEK', 'NAEYOUNG', 'YUL', 'ETC' ]).sum()

190

In [11]:
data.domain.value_counts()

domain
번호    2971
날짜    2165
코드    1382
수량    1197
여부     784
금액      78
율       55
Name: count, dtype: int64

In [12]:
# For demonstration
# np.random.seed(0)
# data = pd.DataFrame({
#     'continuous1': np.random.rand(100),
#     'continuous2': np.random.rand(100),
#     'binary1': np.random.randint(0, 2, 100),
#     'binary2': np.random.randint(0, 2, 100),
#     'target': np.random.randint(0, 2, 100)
# })

data = data.dropna()
# Split data into features and target
X = data.drop(columns=['col_name', 'datatype', 'domain' ])
y = data['domain']

# Define continuous and binary columns
continuous_cols = [col for col in X.columns if col[0].islower()]
binary_cols = [col for col in X.columns if col[0].isupper()]


In [13]:
continuous_cols

['yn',
 'date_time',
 'number',
 'integer',
 'bunho',
 'email',
 'url',
 'part_num',
 'part_text',
 'part_discriminator',
 'part_mask',
 'part_minus',
 'len_purity',
 'value_nunique',
 'value_distr']

In [22]:
binary_cols

['BUNHO',
 'NALJJA',
 'MYEONG',
 'JUSO',
 'YEOBU',
 'CODE',
 'ID',
 'SURYANG',
 'GEUMAEK',
 'NAEYOUNG',
 'YUL',
 'ETC']

In [34]:

# Create a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), continuous_cols),
        # ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), binary_cols)
    ],
    remainder='passthrough'
)

# Define classifiers to compare
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'LightGBM': LGBMClassifier(),
    'CatBoost': CatBoostClassifier(verbose=0)  # verbose=0 to prevent lots of output
}

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Compare classifiers
for name, classifier in classifiers.items():
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', classifier)]
    )
    # Train the model on the training set
    # pipeline.fit(X_train, y_train)

    # Make predictions on the test set
    # y_pred = pipeline.predict(X_test)

    # Calculate the accuracy on the test set
    # accuracy = accuracy_score(y_test, y_pred)
    # print(f'{name} Test Accuracy: {accuracy:.2f}')


    # Cross-validation
    cv_scores = cross_val_score(pipeline, X, y, cv=5)
    print(cv_scores)
    print(f'{name} accuracy: {np.mean(cv_scores):.2f} +/- {np.std(cv_scores):.2f}')
# 
# analyze, tune, or select the model based on the output.

[0.79791546 0.69947887 0.78621089 0.80880649 0.87949015]
Logistic Regression accuracy: 0.79 +/- 0.06
[0.83902721 0.78401853 0.82850521 0.82792584 0.84994206]
Random Forest accuracy: 0.83 +/- 0.02
[0.80775912 0.72611465 0.80011587 0.82734647 0.90266512]
SVM accuracy: 0.81 +/- 0.06
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000443 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2340
[LightGBM] [Info] Number of data points in the train set: 6905, number of used features: 21
[LightGBM] [Info] Start training from score -4.712867
[LightGBM] [Info] Start training from score -1.382969
[LightGBM] [Info] Start training from score -1.066827
[LightGBM] [Info] Start training from score -1.975153
[LightGBM] [Info] Start training from score -2.399055
[LightGBM] [Info] Start training from score -5.055811
[LightGBM] [Info] Start training from score -1.831496
[LightGBM] [Info] Auto-choosing row-wise multi-thread

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
name, classifier = 'lgbm', 

    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', classifier)]
    )

In [52]:
pipeline.predict?

[0;31mSignature:[0m [0mpipeline[0m[0;34m.[0m[0mpredict[0m[0;34m([0m[0mX[0m[0;34m,[0m [0;34m**[0m[0mpredict_params[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Transform the data, and apply `predict` with the final estimator.

Call `transform` of each transformer in the pipeline. The transformed
data are finally passed to the final estimator that calls `predict`
method. Only valid if the final estimator implements `predict`.

Parameters
----------
X : iterable
    Data to predict on. Must fulfill input requirements of first step
    of the pipeline.

**predict_params : dict of string -> object
    Parameters to the ``predict`` called at the end of all
    transformations in the pipeline. Note that while this may be
    used to return uncertainties from some models with return_std
    or return_cov, uncertainties that are generated by the
    transformations in the pipeline are not propagated to the
    final estimator.

    .. versionadded:: 0.20

Ret

In [29]:
[X[col].value_counts() for col in X[binary_cols]]  

[BUNHO
 0    7528
 1    1104
 Name: count, dtype: int64,
 NALJJA
 0    7707
 1     925
 Name: count, dtype: int64,
 MYEONG
 0    8631
 1       1
 Name: count, dtype: int64,
 JUSO
 0    8629
 1       3
 Name: count, dtype: int64,
 YEOBU
 0    7956
 1     676
 Name: count, dtype: int64,
 CODE
 0    7769
 1     863
 Name: count, dtype: int64,
 ID
 0    7477
 1    1155
 Name: count, dtype: int64,
 SURYANG
 0    8466
 1     166
 Name: count, dtype: int64,
 GEUMAEK
 0    8503
 1     129
 Name: count, dtype: int64,
 NAEYOUNG
 0    8627
 1       5
 Name: count, dtype: int64,
 YUL
 0    8592
 1      40
 Name: count, dtype: int64,
 ETC
 0    5067
 1    3565
 Name: count, dtype: int64]

In [1]:
from sklearn.metrics import classification_report

In [2]:
classification_report?

[0;31mSignature:[0m
[0mclassification_report[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0my_true[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0my_pred[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlabels[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtarget_names[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msample_weight[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdigits[0m[0;34m=[0m[0;36m2[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0moutput_dict[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mzero_division[0m[0;34m=[0m[0;34m'warn'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Build a text report showing the main classification metrics.

Read more in the :ref:`User Guide <classification_report>`.

Parameters
----------
y_true : 1d array-like, or label indicator arr

In [4]:
import mlflow


* 'schema_extra' has been renamed to 'json_schema_extra'


In [6]:
mlflow.log_artifacts?

[0;31mSignature:[0m
[0mmlflow[0m[0;34m.[0m[0mlog_artifacts[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mlocal_dir[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0martifact_path[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0;32mNone[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Log all the contents of a local directory as artifacts of the run. If no run is active,
this method will create a new active run.

:param local_dir: Path to the directory of files to write.
:param artifact_path: If provided, the directory in ``artifact_uri`` to write to.

.. testcode:: python
    :caption: Example

    import json
    import os
    import mlflow

    # Create some files to preserve as artifacts
    features = "rooms, zipcode, median_price, school_rating, transport"
    data = {"state": "TX", "Available": 25, "Type": "Detach