In [1]:
import os, pandas as pd

BASE_DIR = r"C:\Users\goktu\workspace\datacops"  # çıktıları burada tutuyoruz
TRAIN_SCALED_PATH = os.path.join(BASE_DIR, "train_scaled.csv")
TEST_SCALED_PATH  = os.path.join(BASE_DIR, "test_scaled.csv")

df_train_scaled = pd.read_csv(TRAIN_SCALED_PATH)
df_test_scaled  = pd.read_csv(TEST_SCALED_PATH)

df_train_scaled.shape, df_test_scaled.shape


((848024, 27), (147207, 27))

In [2]:
date_candidates = ["DATE OCC", "Date Rptd", "DATE_REPORTED", "DATE_OCC", "Occur Date"]
date_col = next((c for c in date_candidates if c in df_train_scaled.columns), None)

target_col = "Crm Cd"

exclude = {
    # Zaman yardımcıları
    "ym", "year", "month",
    
    # Kategorik Sızıntılar
    "Crm Cd Desc",
    "Premis Desc",
    "Weapon Desc",
    "Status Desc",
    "LOCATION",
    "Mocodes",
    "Status",
    
    # Sayısal Sızıntılar
    "Crm Cd 1",
    "Premis Cd",
    "Weapon Used Cd"
}

if date_col:
    exclude.add(date_col)

cat_cols = df_train_scaled.select_dtypes(include=["object"]).columns.tolist()

likely_cats = ["AREA", "AREA NAME", "Vict Sex"]
cat_cols += [c for c in likely_cats if c in df_train_scaled.columns and c not in cat_cols]

cat_cols = [c for c in cat_cols if c not in exclude and c != target_col]

print(f"Kategorik kolon sayısı (sızdıranlar hariç): {len(cat_cols)}")
print(cat_cols[:20])

Kategorik kolon sayısı (sızdıranlar hariç): 5
['Date Rptd', 'AREA NAME', 'Vict Sex', 'Vict Descent', 'AREA']


In [3]:
from sklearn.preprocessing import OneHotEncoder
from scipy import sparse
import numpy as np

# sklearn sürümüne göre parametre adı değişebiliyor
try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=True)

ohe.fit(df_train_scaled[cat_cols])

Xtr_cat = ohe.transform(df_train_scaled[cat_cols])
Xte_cat = ohe.transform(df_test_scaled[cat_cols])

print("OHE train/test:", Xtr_cat.shape, Xte_cat.shape)


OHE train/test: (848024, 1779) (147207, 1779)


In [4]:
from scipy.sparse import csr_matrix, hstack

target_col = "Crm Cd" 

num_cols_all = df_train_scaled.select_dtypes(include=["number"]).columns.tolist()

# Hem cat_cols, hem target_col, hem de exclude listesindekileri çıkar
numeric_cols_to_pass = [c for c in num_cols_all if c not in cat_cols and c != target_col and c not in exclude]

print(f"Sayısal özellik (feature) sayısı: {len(numeric_cols_to_pass)}")
print(f"Sayısal özelliklere örnek: {numeric_cols_to_pass[:10]}")

Xtr_num = csr_matrix(df_train_scaled[numeric_cols_to_pass].to_numpy())
Xte_num = csr_matrix(df_test_scaled[numeric_cols_to_pass].to_numpy())

X_train_ready = hstack([Xtr_num, Xtr_cat]).tocsr()
X_test_ready = hstack([Xte_num, Xte_cat]).tocsr()

num_names = list(numeric_cols_to_pass)
cat_names = ohe.get_feature_names_out(cat_cols).tolist()
feature_names = num_names + cat_names

X_train_ready.shape, X_test_ready.shape, len(feature_names)

Sayısal özellik (feature) sayısı: 7
Sayısal özelliklere örnek: ['DR_NO', 'TIME OCC', 'Rpt Dist No', 'Part 1-2', 'Vict Age', 'LAT', 'LON']


((848024, 1786), (147207, 1786), 1786)

In [5]:
import joblib
from scipy.sparse import save_npz

joblib.dump(ohe, os.path.join(BASE_DIR, "onehot_encoder.pkl"))
joblib.dump(feature_names, os.path.join(BASE_DIR, "feature_names.pkl"))
joblib.dump(numeric_cols_to_pass, os.path.join(BASE_DIR, "numeric_cols_to_pass.pkl"))
joblib.dump(cat_cols, os.path.join(BASE_DIR, "categorical_cols.pkl"))
save_npz(os.path.join(BASE_DIR, "X_train_ready.npz"), X_train_ready)
save_npz(os.path.join(BASE_DIR, "X_test_ready.npz"),  X_test_ready)

"Encoding TAMAMLANDI ✅"


'Encoding TAMAMLANDI ✅'

In [6]:
print("X_train_ready:", X_train_ready.shape)
print("X_test_ready :", X_test_ready.shape)
print("Özellik sayısı:", len(feature_names))
print("Örnek ilk 10 isim:", feature_names[:10])


X_train_ready: (848024, 1786)
X_test_ready : (147207, 1786)
Özellik sayısı: 1786
Örnek ilk 10 isim: ['DR_NO', 'TIME OCC', 'Rpt Dist No', 'Part 1-2', 'Vict Age', 'LAT', 'LON', 'Date Rptd_2020-01-01', 'Date Rptd_2020-01-02', 'Date Rptd_2020-01-03']
