In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

In [2]:
df = pd.read_csv("crime_data_clean_azra.csv")

print("Orijinal satır sayısı:", df.shape[0])

# Model 1'deki ile benzer boyutta çalıştım, bilgisayarım kaldırmadığı için.
df = df.sample(n=80000, random_state=42)

print("Kullanılan satır sayısı (sample):", df.shape[0])
df.head()

Orijinal satır sayısı: 995231
Kullanılan satır sayısı (sample): 80000


Unnamed: 0,DR_NO,Date Rptd,DATE OCC,TIME OCC,AREA,AREA NAME,Rpt Dist No,Part 1-2,Crm Cd,Crm Cd Desc,...,Premis Cd,Premis Desc,Weapon Used Cd,Weapon Desc,Status,Status Desc,Crm Cd 1,LOCATION,LAT,LON
75319,201408407,2020-03-17,2020-03-16,700,14,Pacific,1416,1,310,BURGLARY,...,501.0,SINGLE FAMILY DWELLING,500.0,UNKNOWN WEAPON/OTHER WEAPON,IC,Invest Cont,310.0,3300 MOORE ST,34.01,-118.4497
655426,230125961,2023-12-09,2023-12-09,1155,1,Central,138,1,330,BURGLARY FROM VEHICLE,...,122.0,"VEHICLE, PASSENGER/TRUCK",-1.0,UNKNOWN,IC,Invest Cont,330.0,600 E 3RD ST,34.0461,-118.2395
652607,230808162,2023-04-11,2023-04-11,2051,8,West LA,897,2,745,VANDALISM - MISDEAMEANOR ($399 OR UNDER),...,502.0,"MULTI-UNIT DWELLING (APARTMENT, DUPLEX, ETC)",-1.0,UNKNOWN,IC,Invest Cont,745.0,3300 S BEVERLY DR,34.0314,-118.3988
80531,201311531,2020-05-27,2020-05-27,1800,13,Newton,1375,2,624,BATTERY - SIMPLE ASSAULT,...,104.0,DRIVEWAY,400.0,"STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)",IC,Invest Cont,624.0,1400 E 55TH ST,33.993,-118.2499
858965,230114610,2023-06-21,2023-06-21,800,1,Central,192,2,745,VANDALISM - MISDEAMEANOR ($399 OR UNDER),...,122.0,"VEHICLE, PASSENGER/TRUCK",-1.0,UNKNOWN,IC,Invest Cont,745.0,200 VENICE BL,34.0347,-118.2646


In [3]:
# Hücre 3: Tarih ve saat kolonlarından ek feature üretimi

# --- Tarih kolonu arıyoruz ---
date_col_candidates = ['DATE OCC', 'Date', 'OCC_DATE']
date_col = None

for c in date_col_candidates:
    if c in df.columns:
        date_col = c
        break

if date_col is not None:
    df['DATE_OCC_DT'] = pd.to_datetime(df[date_col], errors='coerce')
    df['year'] = df['DATE_OCC_DT'].dt.year
    df['month'] = df['DATE_OCC_DT'].dt.month
    df['dayofweek'] = df['DATE_OCC_DT'].dt.dayofweek  # 0 = Pazartesi
    df['is_weekend'] = df['dayofweek'].isin([5, 6]).astype(int)
else:
    print("Uyarı: Tarih kolonu bulunamadı, year/month/dayofweek/is_weekend üretilmedi.")

# --- Saat kolonu arıyoruz ---
time_col_candidates = ['TIME OCC', 'Time', 'OCC_TIME']
time_col = None

for c in time_col_candidates:
    if c in df.columns:
        time_col = c
        break

if time_col is not None:
    time_num = pd.to_numeric(df[time_col], errors='coerce')
    df['hour'] = (time_num // 100).astype('Int64')  # nullable integer
else:
    print("Uyarı: Saat kolonu bulunamadı, hour üretilmedi.")

df.head()

Unnamed: 0,DR_NO,Date Rptd,DATE OCC,TIME OCC,AREA,AREA NAME,Rpt Dist No,Part 1-2,Crm Cd,Crm Cd Desc,...,Crm Cd 1,LOCATION,LAT,LON,DATE_OCC_DT,year,month,dayofweek,is_weekend,hour
75319,201408407,2020-03-17,2020-03-16,700,14,Pacific,1416,1,310,BURGLARY,...,310.0,3300 MOORE ST,34.01,-118.4497,2020-03-16,2020,3,0,0,7
655426,230125961,2023-12-09,2023-12-09,1155,1,Central,138,1,330,BURGLARY FROM VEHICLE,...,330.0,600 E 3RD ST,34.0461,-118.2395,2023-12-09,2023,12,5,1,11
652607,230808162,2023-04-11,2023-04-11,2051,8,West LA,897,2,745,VANDALISM - MISDEAMEANOR ($399 OR UNDER),...,745.0,3300 S BEVERLY DR,34.0314,-118.3988,2023-04-11,2023,4,1,0,20
80531,201311531,2020-05-27,2020-05-27,1800,13,Newton,1375,2,624,BATTERY - SIMPLE ASSAULT,...,624.0,1400 E 55TH ST,33.993,-118.2499,2020-05-27,2020,5,2,0,18
858965,230114610,2023-06-21,2023-06-21,800,1,Central,192,2,745,VANDALISM - MISDEAMEANOR ($399 OR UNDER),...,745.0,200 VENICE BL,34.0347,-118.2646,2023-06-21,2023,6,2,0,8


In [4]:
# Hücre 4: gereksiz kolonları atıyoruz
drop_cols = [
    'DR_NO',
    'DATE_OCC_DT',
    'LOCATION',
    'Crm Cd Desc',
    'Weapon Desc',
    'Status Desc',
    'AREA NAME',
    'Date Rptd',
    'Mocodes'
]

for c in drop_cols:
    if c in df.columns:
        df = df.drop(columns=[c])

print("Drop sonrası kolon sayısı:", df.shape[1])
df.head()

Drop sonrası kolon sayısı: 21


Unnamed: 0,DATE OCC,TIME OCC,AREA,Rpt Dist No,Part 1-2,Crm Cd,Vict Age,Vict Sex,Vict Descent,Premis Cd,...,Weapon Used Cd,Status,Crm Cd 1,LAT,LON,year,month,dayofweek,is_weekend,hour
75319,2020-03-16,700,14,1416,1,310,33.0,M,W,501.0,...,500.0,IC,310.0,34.01,-118.4497,2020,3,0,0,7
655426,2023-12-09,1155,1,138,1,330,24.0,F,W,122.0,...,-1.0,IC,330.0,34.0461,-118.2395,2023,12,5,1,11
652607,2023-04-11,2051,8,897,2,745,68.0,F,B,502.0,...,-1.0,IC,745.0,34.0314,-118.3988,2023,4,1,0,20
80531,2020-05-27,1800,13,1375,2,624,52.0,F,H,104.0,...,400.0,IC,624.0,33.993,-118.2499,2020,5,2,0,18
858965,2023-06-21,800,1,192,2,745,46.0,M,O,122.0,...,-1.0,IC,745.0,34.0347,-118.2646,2023,6,2,0,8


In [5]:
# Hücre 5: Target seçimi ve one-hot encoding

TARGET_COL_ADI = 'Part 1-2'

if TARGET_COL_ADI not in df.columns:
    raise ValueError(f"Target kolonu bulunamadı: {TARGET_COL_ADI}")

print("Target değer dağılımı (NaN dahil):")
print(df[TARGET_COL_ADI].value_counts(dropna=False))

# Target NaN olan satırları at
df = df.dropna(subset=[TARGET_COL_ADI])
print("Target NaN temizlendikten sonra satır sayısı:", df.shape[0])

y = df[TARGET_COL_ADI]

X = df.drop(columns=[TARGET_COL_ADI])
print("Feature sayısı (target hariç):", X.shape[1])

# Kategorik kolonlar için one-hot encoding
X = pd.get_dummies(X, drop_first=True)

print("One-hot sonrası feature sayısı:", X.shape[1])
X.head()

Target değer dağılımı (NaN dahil):
Part 1-2
1    48283
2    31717
Name: count, dtype: int64
Target NaN temizlendikten sonra satır sayısı: 80000
Feature sayısı (target hariç): 20
One-hot sonrası feature sayısı: 2146


Unnamed: 0,TIME OCC,AREA,Rpt Dist No,Crm Cd,Vict Age,Premis Cd,Weapon Used Cd,Crm Cd 1,LAT,LON,...,"Premis Desc_VEHICLE, PASSENGER/TRUCK",Premis Desc_VETERINARIAN/ANIMAL HOSPITAL,Premis Desc_VISION CARE FACILITY*,Premis Desc_WAREHOUSE,Premis Desc_WEBSITE,Premis Desc_YARD (RESIDENTIAL/BUSINESS),Status_AO,Status_IC,Status_JA,Status_JO
75319,700,14,1416,310,33.0,501.0,500.0,310.0,34.01,-118.4497,...,False,False,False,False,False,False,False,True,False,False
655426,1155,1,138,330,24.0,122.0,-1.0,330.0,34.0461,-118.2395,...,True,False,False,False,False,False,False,True,False,False
652607,2051,8,897,745,68.0,502.0,-1.0,745.0,34.0314,-118.3988,...,False,False,False,False,False,False,False,True,False,False
80531,1800,13,1375,624,52.0,104.0,400.0,624.0,33.993,-118.2499,...,False,False,False,False,False,False,False,True,False,False
858965,800,1,192,745,46.0,122.0,-1.0,745.0,34.0347,-118.2646,...,True,False,False,False,False,False,False,True,False,False


In [6]:
# Hücre 6: Train-test split

print("Target sınıf dağılımı:")
print(y.value_counts())

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

Target sınıf dağılımı:
Part 1-2
1    48283
2    31717
Name: count, dtype: int64
X_train shape: (64000, 2146)
y_train shape: (64000,)
X_test shape: (16000, 2146)
y_test shape: (16000,)


In [7]:
# Hücre 7: NaN imputing

print("NaN sayısı (train, önce):", X_train.isna().sum().sum())
print("NaN sayısı (test,  önce):", X_test.isna().sum().sum())

imputer = SimpleImputer(strategy="median")

X_train_imp = pd.DataFrame(
    imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index
)

X_test_imp = pd.DataFrame(
    imputer.transform(X_test),
    columns=X_test.columns,
    index=X_test.index
)

print("NaN sayısı (train, sonra):", X_train_imp.isna().sum().sum())
print("NaN sayısı (test,  sonra):", X_test_imp.isna().sum().sum())

NaN sayısı (train, önce): 1
NaN sayısı (test,  önce): 0
NaN sayısı (train, sonra): 0
NaN sayısı (test,  sonra): 0


In [8]:
# Hücre 8: SelectKBest ile feature azaltma

k_features = min(300, X_train_imp.shape[1])
print("k_features:", k_features)

select_kbest = SelectKBest(score_func=mutual_info_classif, k=k_features)

X_train_k = select_kbest.fit_transform(X_train_imp, y_train)
X_test_k = select_kbest.transform(X_test_imp)

kbest_mask = select_kbest.get_support()
kbest_features = X_train_imp.columns[kbest_mask]

print("SelectKBest sonrası feature sayısı:", X_train_k.shape[1])
print("İlk 20 feature:", kbest_features[:20].tolist())

k_features: 300
SelectKBest sonrası feature sayısı: 300
İlk 20 feature: ['TIME OCC', 'AREA', 'Rpt Dist No', 'Crm Cd', 'Vict Age', 'Premis Cd', 'Weapon Used Cd', 'Crm Cd 1', 'LAT', 'LON', 'year', 'month', 'dayofweek', 'is_weekend', 'hour', 'DATE OCC_2020-01-02', 'DATE OCC_2020-01-05', 'DATE OCC_2020-01-18', 'DATE OCC_2020-01-20', 'DATE OCC_2020-01-27']


In [9]:
# Hücre 9: StandardScaler ile ölçekleme

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train_k)
X_test_scaled = scaler.transform(X_test_k)

print("X_train_scaled shape:", X_train_scaled.shape)
print("X_test_scaled shape:", X_test_scaled.shape)

X_train_scaled shape: (64000, 300)
X_test_scaled shape: (16000, 300)


In [10]:
# Hücre 10: Linear SVC eğitim ve değerlendirme

svc_model = LinearSVC(
    C=1.0,
    class_weight='balanced',  # sınıf dengesizliği varsa yardımcı olur
    max_iter=5000,
    random_state=42
)

svc_model.fit(X_train_scaled, y_train)

y_pred_svc = svc_model.predict(X_test_scaled)

acc_svc = accuracy_score(y_test, y_pred_svc)
f1_macro_svc = f1_score(y_test, y_pred_svc, average='macro')
f1_weighted_svc = f1_score(y_test, y_pred_svc, average='weighted')

print("Linear SVC Accuracy:", acc_svc)
print("Linear SVC F1 (macro):", f1_macro_svc)
print("Linear SVC F1 (weighted):", f1_weighted_svc)

print("\nClassification Report (Linear SVC):")
print(classification_report(y_test, y_pred_svc))

print("Confusion Matrix (Linear SVC):")
print(confusion_matrix(y_test, y_pred_svc))

Linear SVC Accuracy: 0.908625
Linear SVC F1 (macro): 0.9038544943650645
Linear SVC F1 (weighted): 0.9082903685721847

Classification Report (Linear SVC):
              precision    recall  f1-score   support

           1       0.91      0.94      0.93      9657
           2       0.90      0.87      0.88      6343

    accuracy                           0.91     16000
   macro avg       0.91      0.90      0.90     16000
weighted avg       0.91      0.91      0.91     16000

Confusion Matrix (Linear SVC):
[[9051  606]
 [ 856 5487]]
