In [4]:
# connection
import os
from dotenv import load_dotenv
from shapely import Polygon, Point
import glob
import psycopg2
import pandas as pd
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib
# Load environment variables from the .env file two directories up.
# The path is built with os.path.join instead of a "..\..\.env" literal:
# the backslash form contains the invalid escape "\." and only resolves on
# Windows, so on other platforms load_dotenv would quietly load nothing.
load_dotenv(os.path.join("..", "..", ".env"))

True

In [7]:
# Connect to the production database.
# All credentials are read from environment variables; nothing is hardcoded.
HOST = os.getenv('host_production')
DB = os.getenv('DB_NAME')
PORT = 5432
USER = os.getenv('username_production')
PWD = os.getenv('password_production')
# PORT is passed explicitly: it was previously defined but never used, which
# left the port to libpq's defaults (including any PGPORT override).
conn0 = psycopg2.connect(host=HOST, port=PORT, database=DB, user=USER, password=PWD)

# Connect to the V3 POI database (separate host and credentials).
DB_NAME_V3_POI = os.getenv("DB_NAME_V3_POI")
DB_USERNAME_V3_POI = os.getenv("DB_USERNAME_V3_POI")
DB_PASSWORD_V3_POI = os.getenv("DB_PASSWORD_V3_POI")
DB_HOST_V3 = os.getenv("DB_HOST_V3")
conn_poi = psycopg2.connect(
    host=DB_HOST_V3,
    database=DB_NAME_V3_POI,
    user=DB_USERNAME_V3_POI,
    password=DB_PASSWORD_V3_POI,
)



In [None]:
# City whose POIs are pulled as training data. Kept as a constant and bound
# as a query parameter so it can be changed without editing the SQL text.
TARGET_CITY = 'JAKARTA PUSAT'

# One row per POI with its brand / category / group / industry labels.
# Inner joins mean POIs lacking any level of that hierarchy are excluded.
# NOTE(review): status = 'T' is presumably the "active" flag and
# id_source = 6 a specific admin-boundary source -- confirm against the schema.
sql = """
SELECT
    a.poi_id,
    a.poi_name,
    b.brand_name,
    c.category_name,
    d.group_name,
    e.industry_name
FROM v3_tbl_poi a
JOIN v3_tbl_brand b on a.brand_id = b.brand_id
JOIN v3_tbl_category c on b.category_id = c.category_id
JOIN v3_tbl_group d on c.group_id = d.group_id
JOIN v3_tbl_industry e on d.industry_id = e.industry_id
JOIN v3_admin f on a.kode_desa = f.kode_desa
WHERE a.status = 'T' and b.status = 'T' and c.status = 'T' and d.status = 'T' and e.status = 'T'
    and f.status = 'T' and f.id_source = 6 and nama_kota = %(city)s
"""
# psycopg2 uses the pyformat paramstyle, hence the %(city)s placeholder.
df = pd.read_sql(sql, conn_poi, params={'city': TARGET_CITY})

In [None]:
# Step 1: Load Data
# `df` is the query result loaded earlier in the notebook.
# TARGET_COLUMNS fixes the order of the model outputs; the evaluation loop
# below reuses it so report labels always line up with prediction columns.
TARGET_COLUMNS = ['brand_name', 'category_name', 'group_name', 'industry_name']
X = df['poi_name']  # Input: POI name
y = df[TARGET_COLUMNS]  # Targets

# Step 2: Data Splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Text Vectorization using TF-IDF
# NOTE(review): stop_words='english' removes English stop words only; if the
# POI names are mostly non-English this has little effect -- confirm it is intended.
# NOTE(review): the vectorizer is fitted on all of X_train before the grid
# search, so each CV validation fold contributes to the vocabulary/IDF weights.
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Step 4: Define Model and Hyperparameter Tuning
base_model = RandomForestClassifier(random_state=42)

# Hyperparameter grid; the `estimator__` prefix routes each parameter to the
# RandomForestClassifier wrapped by MultiOutputClassifier.
param_grid = {
    'estimator__n_estimators': [50, 100, 150],
    'estimator__max_depth': [None, 10, 20],
    'estimator__min_samples_split': [2, 5, 10]
}

multi_output_model = MultiOutputClassifier(base_model)

# Use GridSearchCV for hyperparameter tuning.
# scoring=None falls back to MultiOutputClassifier.score, i.e. the fraction of
# samples where every target is predicted correctly. The 'accuracy' scorer
# cannot be used here: accuracy_score rejects multiclass-multioutput targets,
# so candidates could not be ranked meaningfully.
grid_search = GridSearchCV(
    estimator=multi_output_model,
    param_grid=param_grid,
    scoring=None,
    cv=3,  # 3-fold cross-validation
    verbose=1,
    n_jobs=-1
)

# Train the model with hyperparameter tuning
print("Starting model tuning...")
grid_search.fit(X_train_tfidf, y_train)

# Best model after tuning (refit on the full training split by GridSearchCV)
best_model = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")

# Step 5: Model Testing
y_pred = best_model.predict(X_test_tfidf)

# Evaluate the model for each target.
# Column i of y_pred corresponds to TARGET_COLUMNS[i]; iterating over the same
# list keeps each report under its own label and covers all four targets.
for i, target in enumerate(TARGET_COLUMNS):
    print(f"\nClassification Report for {target}:")
    print(classification_report(y_test[target], y_pred[:, i]))

# Step 6: Save the Best Model and TF-IDF Vectorizer
# Both artifacts are needed at inference time: new names must be transformed
# with this fitted vectorizer before being passed to the model.
joblib.dump(best_model, 'poi_best_multioutput_model.pkl')
joblib.dump(vectorizer, 'poi_tfidf_vectorizer.pkl')
print("Model and vectorizer saved successfully!")

