In [1]:
import os

import numpy as np
import pandas as pd

# Location of the cleaned crime CSV. Set the CRIME_DATA_CSV environment variable
# to point at your own copy; when it is unset (or empty) the original author's
# local path is used, so existing behaviour is unchanged.
DATA_PATH = os.environ.get("CRIME_DATA_CSV") or (
    r"C:\Users\goktu\workspace\datacops\crime_data_clean_azra.csv"
)
df = pd.read_csv(DATA_PATH)

# Possible names of the date column, in order of preference.
date_candidates = ["DATE OCC", "Date Rptd", "DATE_REPORTED", "DATE_OCC", "Occur Date"]

# Take the first candidate that is actually present in the file (None if none is).
date_col = next((col for col in date_candidates if col in df.columns), None)

if date_col is None:
    raise ValueError("Tarih kolonu bulunamadı! Lütfen dosyadaki tarih sütununu bana yaz.")

# Parse the dates; values that cannot be parsed become NaT instead of raising.
df[date_col] = pd.to_datetime(df[date_col], errors="coerce")

# Drop rows without a usable date, then sort chronologically with a fresh 0..n-1 index.
df = df.dropna(subset=[date_col]).sort_values(date_col).reset_index(drop=True)

# Calendar year and month of each record.
df["year"]  = df[date_col].dt.year
df["month"] = df[date_col].dt.month

# Combined year-month code, e.g. 202310 for October 2023.
df["ym"] = df["year"]*100 + df["month"]

df.head()


Unnamed: 0,DR_NO,Date Rptd,DATE OCC,TIME OCC,AREA,AREA NAME,Rpt Dist No,Part 1-2,Crm Cd,Crm Cd Desc,...,Weapon Desc,Status,Status Desc,Crm Cd 1,LOCATION,LAT,LON,year,month,ym
0,200504125,2020-01-03,2020-01-01,1330,5,Harbor,587,2,624,BATTERY - SIMPLE ASSAULT,...,"STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)",AA,Adult Arrest,624.0,700 W 38TH ST,33.7096,-118.2916,2020,1,202001
1,200405039,2020-01-27,2020-01-01,1200,4,Hollenbeck,449,1,420,THEFT FROM MOTOR VEHICLE - PETTY ($950 & UNDER),...,UNKNOWN,IC,Invest Cont,420.0,2700 POMEROY AV,34.0533,-118.1977,2020,1,202001
2,201504361,2020-01-02,2020-01-01,1900,15,N Hollywood,1525,1,330,BURGLARY FROM VEHICLE,...,UNKNOWN,IC,Invest Cont,330.0,11800 GILMORE ST,34.1875,-118.3899,2020,1,202001
3,200404021,2020-01-01,2020-01-01,2000,4,Hollenbeck,461,1,230,"ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT",...,HAND GUN,IC,Invest Cont,230.0,100 S PECAN ST,34.0474,-118.2217,2020,1,202001
4,200600508,2020-01-04,2020-01-01,1600,6,Hollywood,646,2,624,BATTERY - SIMPLE ASSAULT,...,"STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)",IC,Invest Cont,624.0,HOLLYWOOD,34.1016,-118.3267,2020,1,202001


In [2]:
# Hold out (roughly) the most recent 15% of records as the test set, cutting on
# a year-month boundary so that a single month never straddles both splits.
#
# interpolation="lower" makes the cut-off an actually observed year-month code.
# Linear interpolation can land between two codes (e.g. between 202312 and
# 202401) and, once truncated by int(), report a month that does not exist.
# Which rows fall on each side of the cut is the same either way, because no
# observed code lies strictly between the two neighbouring values.
cut_ym = int(df["ym"].quantile(0.85, interpolation="lower"))

# Everything up to and including the cut-off month trains; later months test.
train_mask = df["ym"] <= cut_ym
test_mask  = df["ym"] >  cut_ym

# Independent copies so later column assignments do not touch `df`.
df_train = df.loc[train_mask].copy()
df_test  = df.loc[test_mask].copy()

print("TRAIN boyutu:", len(df_train))
print("TEST boyutu :", len(df_test))
print("Kesim noktası (ym):", cut_ym)


TRAIN boyutu: 848024
TEST boyutu : 147207
Kesim noktası (ym): 202311


In [3]:
# Columns that are excluded from scaling. Names that do not exist in the frame
# are harmless here: the set is only used as a filter below.
do_not_scale = set().union(
    (date_col, "ym", "year", "month"),
    ("Crm Cd", "Crm Cd Desc", "AREA"),
    ("Weapon Desc", "Premis Desc", "Vict Sex"),
    ("LAT", "LON", "Longitude", "Latitude"),
    ("count", "y_next", "hotspot"),
)

# Every numeric column of the training split.
num_cols = df_train.select_dtypes(include=[np.number]).columns

# Keep, in their original order, the numeric columns that are not excluded.
# NOTE(review): anything numeric that is not listed above gets scaled, which
# includes columns that look like identifiers or category codes (e.g. DR_NO,
# "Crm Cd 1", "Premis Cd") even though "Crm Cd" itself is excluded — confirm
# that this is intended.
num_cols_to_scale = []
for name in num_cols:
    if name not in do_not_scale:
        num_cols_to_scale.append(name)

num_cols_to_scale


['DR_NO',
 'TIME OCC',
 'Rpt Dist No',
 'Part 1-2',
 'Vict Age',
 'Premis Cd',
 'Weapon Used Cd',
 'Crm Cd 1']

In [4]:
from sklearn.preprocessing import StandardScaler

# Raw feature matrices for both splits, restricted to the columns to be scaled.
Xtr = df_train.loc[:, num_cols_to_scale].values
Xte = df_test.loc[:, num_cols_to_scale].values

# Learn the statistics from TRAIN only, then apply that same transformation to
# both splits, so nothing about the test period leaks into the scaler.
scaler = StandardScaler().fit(Xtr)
Xtr_scaled = scaler.transform(Xtr)
Xte_scaled = scaler.transform(Xte)

# Write the scaled values back into copies; df_train / df_test stay unscaled.
df_train_scaled = df_train.copy()
df_train_scaled[num_cols_to_scale] = Xtr_scaled

df_test_scaled = df_test.copy()
df_test_scaled[num_cols_to_scale] = Xte_scaled

df_train_scaled.head()


Unnamed: 0,DR_NO,Date Rptd,DATE OCC,TIME OCC,AREA,AREA NAME,Rpt Dist No,Part 1-2,Crm Cd,Crm Cd Desc,...,Weapon Desc,Status,Status Desc,Crm Cd 1,LOCATION,LAT,LON,year,month,ym
0,-1.467307,2020-01-03,2020-01-01,-0.011269,5,Harbor,-0.869397,1.198725,624,BATTERY - SIMPLE ASSAULT,...,"STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)",AA,Adult Arrest,0.598359,700 W 38TH ST,33.7096,-118.2916,2020,1,202001
1,-1.476255,2020-01-27,2020-01-01,-0.210139,4,Hollenbeck,-1.095741,-0.83422,420,THEFT FROM MOTOR VEHICLE - PETTY ($950 & UNDER),...,UNKNOWN,IC,Invest Cont,-0.386279,2700 POMEROY AV,34.0533,-118.1977,2020,1,202001
2,-1.376981,2020-01-02,2020-01-01,0.860697,15,N Hollywood,0.669085,-0.83422,330,BURGLARY FROM VEHICLE,...,UNKNOWN,IC,Invest Cont,-0.820678,11800 GILMORE ST,34.1875,-118.3899,2020,1,202001
3,-1.476347,2020-01-01,2020-01-01,1.013674,4,Hollenbeck,-1.076059,-0.83422,230,"ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT",...,HAND GUN,IC,Invest Cont,-1.303343,100 S PECAN ST,34.0474,-118.2217,2020,1,202001
4,-1.458603,2020-01-04,2020-01-01,0.401768,6,Hollywood,-0.772627,1.198725,624,BATTERY - SIMPLE ASSAULT,...,"STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)",IC,Invest Cont,0.598359,HOLLYWOOD,34.1016,-118.3267,2020,1,202001


In [5]:
# Sanity check on the scaled training split: means should be ~0 and standard
# deviations ~1. Only the first five scaled columns are printed (.head()).
# StandardScaler divides by the population standard deviation, so the check
# passes ddof=0; pandas' .std() would otherwise default to ddof=1 and measure a
# slightly different quantity from the one the scaler normalises.
print("Train Means ~ 0:\n", df_train_scaled[num_cols_to_scale].mean().round(3).head())
print("\nTrain STDs ~ 1:\n", df_train_scaled[num_cols_to_scale].std(ddof=0).round(3).head())


Train Means ~ 0:
 DR_NO          0.0
TIME OCC      -0.0
Rpt Dist No   -0.0
Part 1-2       0.0
Vict Age      -0.0
dtype: float64

Train STDs ~ 1:
 DR_NO          1.0
TIME OCC       1.0
Rpt Dist No    1.0
Part 1-2       1.0
Vict Age       1.0
dtype: float64


In [6]:
import joblib, os
# Make sure the artifact directory exists before writing into it.
os.makedirs("artifacts", exist_ok=True)

# Persist the fitted scaler so the same transformation can be re-applied later.
joblib.dump(scaler, "artifacts/scaler.pkl")

# Write both scaled splits to the working directory, without the index column.
for frame, csv_name in (
    (df_train_scaled, "train_scaled.csv"),
    (df_test_scaled, "test_scaled.csv"),
):
    frame.to_csv(csv_name, index=False)

"Feature Scaling TAMAMLANDI ✅"


'Feature Scaling TAMAMLANDI ✅'