In [1]:
import os
import joblib
import pandas as pd
import numpy as np
import time
from scipy.sparse import load_npz
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import RidgeClassifier
# Modeller
from sklearn.feature_selection import SelectPercentile, f_classif 
from sklearn.svm import LinearSVC 

# --- FOLDER SETTINGS ---
# Input folder (project root that holds the prepared data files).
# It can be overridden with the DATACOPS_BASE_DIR environment variable so the
# notebook also runs on other machines; when the variable is not set, the
# original hard-coded location is used unchanged.
BASE_DIR = os.environ.get("DATACOPS_BASE_DIR", r"C:\Users\goktu\workspace\datacops")
# Output folder (everything this notebook writes goes here).
OUTPUT_DIR = os.path.join(BASE_DIR, "model_goktug")

# Make sure the output folder exists (it is created when missing).
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"Okuma Yeri: {BASE_DIR}")
print(f"Kaydetme Yeri: {OUTPUT_DIR}")
print("Ayarlar tamamlandÄ±. ")

Okuma Yeri: C:\Users\goktu\workspace\datacops
Kaydetme Yeri: C:\Users\goktu\workspace\datacops\model_goktug
Ayarlar tamamlandÄ±. 


In [2]:
print("Veriler yÃ¼kleniyor...")

# Sparse feature matrices stored as .npz files under BASE_DIR.
X_train_full = load_npz(os.path.join(BASE_DIR, "X_train_ready.npz"))
X_test_ready = load_npz(os.path.join(BASE_DIR, "X_test_ready.npz"))
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code. Only load feature_names.pkl from a trusted source.
feature_names = joblib.load(os.path.join(BASE_DIR, "feature_names.pkl"))

# The CSV files provide the target column ("Crm Cd"); labels are compared as
# strings on both the train and test side.
df_train = pd.read_csv(os.path.join(BASE_DIR, "train_scaled.csv"))
df_test = pd.read_csv(os.path.join(BASE_DIR, "test_scaled.csv"))
y_train_full = df_train["Crm Cd"].astype(str)
y_test = df_test["Crm Cd"].astype(str)

# Features and labels come from separate files, so fail fast with a clear
# message when they are out of sync instead of erroring much later during
# sampling, transformation or scoring.
if X_train_full.shape[0] != len(df_train):
    raise ValueError(
        f"Row mismatch: X_train_ready.npz has {X_train_full.shape[0]} rows, "
        f"train_scaled.csv has {len(df_train)}"
    )
if X_test_ready.shape[0] != len(df_test):
    raise ValueError(
        f"Row mismatch: X_test_ready.npz has {X_test_ready.shape[0]} rows, "
        f"test_scaled.csv has {len(df_test)}"
    )
if X_train_full.shape[1] != X_test_ready.shape[1]:
    raise ValueError(
        f"Column mismatch: train has {X_train_full.shape[1]} features, "
        f"test has {X_test_ready.shape[1]}"
    )

print(f"Veri HazÄ±r. Boyut: {X_train_full.shape}")

Veriler yÃ¼kleniyor...
Veri HazÄ±r. Boyut: (848024, 1786)


In [3]:
# Number of training rows to keep for faster feature selection and training.
SAMPLE_SIZE = 500000

n_rows_total = X_train_full.shape[0]

if SAMPLE_SIZE >= n_rows_total:
    # train_test_split rejects an integer train_size that is not strictly
    # smaller than the number of samples. Previously that ValueError was
    # reported as a stratification problem and the fallback then raised the
    # very same error. With nothing to subsample, use the whole training set.
    print(f"! SAMPLE_SIZE ({SAMPLE_SIZE}) >= satir sayisi ({n_rows_total}); tum veri kullaniliyor.")
    X_train_bireysel, y_train_bireysel = X_train_full, y_train_full
else:
    print(f"HÄ±z iÃ§in {SAMPLE_SIZE} satÄ±rlÄ±k Ã¶rneklem alÄ±nÄ±yor...")

    try:
        # Preferred: a stratified sample that keeps the class proportions.
        X_train_bireysel, _, y_train_bireysel, _ = train_test_split(
            X_train_full, y_train_full, train_size=SAMPLE_SIZE,
            stratify=y_train_full, random_state=42
        )
    except ValueError:
        # Stratification is not possible (train_test_split raised ValueError),
        # so fall back to a plain random sample of the same size.
        print("! Stratify olmadÄ±, rastgele alÄ±nÄ±yor...")
        X_train_bireysel, _, y_train_bireysel, _ = train_test_split(
            X_train_full, y_train_full, train_size=SAMPLE_SIZE, random_state=42
        )

print("Ã–rneklem hazÄ±r.")

HÄ±z iÃ§in 500000 satÄ±rlÄ±k Ã¶rneklem alÄ±nÄ±yor...
Ã–rneklem hazÄ±r.


In [4]:
print("1. Ã–zellik SeÃ§imi YapÄ±lÄ±yor (Tree-Based / Embedded Method)...")

# Embedded feature selection: a shallow decision tree is fitted and its
# feature importances are used to rank the columns.
selection_model = DecisionTreeClassifier(random_state=42, max_depth=10)

# threshold=-np.inf switches off the importance cut-off, so the selection is
# driven by max_features alone and the 200 top-ranked features are kept.
# NOTE(review): a depth-10 tree may give non-zero importance to fewer than 200
# features; the remainder would then be filled from zero-importance columns.
# Worth confirming on the fitted selector.
selector_2 = SelectFromModel(
    estimator=selection_model,
    threshold=-np.inf,
    max_features=200,
)

# Learn the selection on the sampled training rows, then keep only the chosen
# columns in both the sampled training matrix and the test matrix.
X_train_sel_2 = selector_2.fit(X_train_bireysel, y_train_bireysel).transform(X_train_bireysel)
X_test_sel_2 = selector_2.transform(X_test_ready)

print("SeÃ§im Bitti âœ…")
print(f"SeÃ§ilen Ã–zellik SayÄ±sÄ±: {X_train_sel_2.shape[1]}")

1. Ã–zellik SeÃ§imi YapÄ±lÄ±yor (Tree-Based / Embedded Method)...
SeÃ§im Bitti âœ…
SeÃ§ilen Ã–zellik SayÄ±sÄ±: 200


In [5]:
print("\n2. Model EÄŸitiliyor (Ridge Classifier)...")

# RidgeClassifier: a linear classifier with L2 regularisation (alpha sets the
# regularisation strength).
clf_2 = RidgeClassifier(
    alpha=1.0,
    random_state=42
)

# Measure the training duration with perf_counter(): it is monotonic and
# high-resolution, whereas time.time() follows the wall clock and can jump if
# the system clock is adjusted during the fit.
start_time = time.perf_counter()
clf_2.fit(X_train_sel_2, y_train_bireysel)
end_time = time.perf_counter()

print(f"EÄŸitim TamamlandÄ± ({end_time - start_time:.2f} saniye) âœ…")


2. Model EÄŸitiliyor (Ridge Classifier)...
EÄŸitim TamamlandÄ± (152.32 saniye) âœ…


In [6]:
print("\n3. Tahmin YapÄ±lÄ±yor ve Dosya OluÅŸturuluyor...")

# Predict labels for the test matrix reduced to the selected features.
y_pred_2 = clf_2.predict(X_test_sel_2)

# Accuracy against the test labels (reported so it can be compared with model 1).
acc_2 = accuracy_score(y_true=y_test, y_pred=y_pred_2)
print(f"\n 2. MODEL (RIDGE) SKORU: {acc_2 * 100:.2f}%")

# Destination of the submission file inside OUTPUT_DIR.
filename_2 = "submission_goktug_model2.csv"
save_path_2 = os.path.join(OUTPUT_DIR, filename_2)

# One row per test record: its DR_NO identifier and the predicted "Crm Cd".
submission_df_2 = pd.DataFrame(
    data={"DR_NO": df_test["DR_NO"], "Crm Cd": y_pred_2}
)
submission_df_2.to_csv(save_path_2, index=False)

print(f"2. Dosya BaÅŸarÄ±yla Kaydedildi: {filename_2}")
print(f"Konum: {save_path_2}")


3. Tahmin YapÄ±lÄ±yor ve Dosya OluÅŸturuluyor...

ğŸ”¥ 2. MODEL (RIDGE) SKORU: 33.74%
âœ… 2. Dosya BaÅŸarÄ±yla Kaydedildi: submission_goktug_model2.csv
ğŸ“‚ Konum: C:\Users\goktu\workspace\datacops\model_goktug\submission_goktug_model2.csv
