In [11]:
import re
import json
import gensim
import argparse
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler

### Feature Engineering

In [12]:
def drop_columns(train_df, test_df, drop_cols = ['Descript', 'Resolution']):
    for col in drop_cols:
        if col in test_df.columns: test_df.drop(col, axis=1, inplace=True)
        if col in train_df.columns: train_df.drop(col, axis=1, inplace=True)
    return train_df, test_df

def basic_time_features(df):
    df["Dates"] = pd.to_datetime(df["Dates"], infer_datetime_format=True)    
    df["Day"] = df["Dates"].dt.day
    df["Hour"] = df["Dates"].dt.hour
    df["Year"] = df["Dates"].dt.year
    df["Month"] = df["Dates"].dt.month
    df["Minute"] = df["Dates"].dt.minute
    df["DayOfWeek"] = df["DayOfWeek"].astype(str)
    df['Night'] = df['Hour'].apply(lambda x: 1 if x > 6 and x < 18 else 0)
    df["Is_weekend"] = df["DayOfWeek"].isin(["Saturday", "Sunday"]).astype(int)
    
    df.drop('Dates', axis=1, inplace=True)
    return df

In [3]:
def remove_coord_outliers(train_df, test_df):
    # removes outlier datapoints like north pole or out of SF area. SF bounding box: lon [-123, -121], lat [37, 38]
    test_df.replace({'X': -120.5, 'Y': 90.0}, np.NaN, inplace=True)
    train_df.replace({'X': -120.5, 'Y': 90.0}, np.NaN, inplace=True)
    
    imputer = SimpleImputer(strategy='mean')
    for district in train_df['PdDistrict'].unique():
        train_df.loc[train_df['PdDistrict'] == district, ['X', 'Y']] = imputer.fit_transform(train_df.loc[train_df['PdDistrict'] == district, ['X', 'Y']])
        test_df.loc[test_df['PdDistrict'] == district, ['X', 'Y']] = imputer.transform(test_df.loc[test_df['PdDistrict'] == district, ['X', 'Y']])

    return train_df, test_df

def add_geo_clusters(train, test, num_clusters=150):

    test["X+Y"], test["X-Y"] = test["X"] + test["Y"], test["X"] - test["Y"]
    train["X+Y"], train["X-Y"] = train["X"] + train["Y"], train["X"] - train["Y"]

    test["XY_rad"] = np.sqrt(np.power(test['Y'], 2) + np.power(test['X'], 2))
    train["XY_rad"] = np.sqrt(np.power(train['Y'], 2) + np.power(train['X'], 2))

    # combine all coordinates
    coords = np.vstack([
        train[["X", "Y"]].values,
        test[["X", "Y"]].values,
    ])
    
    # impute missing values with median coordinates (better than 0)
    imputer = SimpleImputer(strategy='median')
    coords_imputed = imputer.fit_transform(coords)
    
    # impute and predict for both datasets
    test_coords_imputed = imputer.transform(test[["X", "Y"]].values)
    train_coords_imputed = imputer.transform(train[["X", "Y"]].values)


    # Adding PCA component
    from sklearn.decomposition import PCA
    pca = PCA(n_components=2).fit(coords_imputed)
    test_pca = pca.transform(test_coords_imputed)
    train_pca = pca.transform(train_coords_imputed)
    test["XYpca1"], test["XYpca2"] = test_pca[:, 0], test_pca[:, 1]
    train["XYpca1"], train["XYpca2"] = train_pca[:, 0], train_pca[:, 1] 
    
    from sklearn.mixture import GaussianMixture
    clf = GaussianMixture(n_components=num_clusters, covariance_type="diag", random_state=0).fit(coords_imputed)
    test["GeoCluster"] = clf.predict(test_coords_imputed)
    train["GeoCluster"] = clf.predict(train_coords_imputed)

    return train, test

In [4]:
# parsing address: street, block, intersection
def parse_address(addr):
    if pd.isna(addr):
        return "UNKNOWN", "UNKNOWN"
    if "/" in addr:
        parts = [p.strip() for p in addr.split("/")]
        return "INTERSECTION", "_".join(sorted(parts))
    m = re.search(r"(\d+)\s+Block\s+of\s+(.+)", addr, flags=re.IGNORECASE)
    if m:
        return "BLOCK", m.group(2).strip()
    return "STREET", addr.strip()

# address embeddings
def address_encoding(train, test):
    # add intersection binary feature
    test['Intersection'] = test['Address'].apply(lambda x: 1 if '/' in x else 0)
    train['Intersection'] = train['Address'].apply(lambda x: 1 if '/' in x else 0)
    print(f"after intersection, train shape: {train.shape}, test shape: {test.shape}")


    train_length = len(train)

    combined = pd.concat([train, test], ignore_index=True)
    address_list = [address.split(' ') for address in combined['Address']]
    address_model = gensim.models.Word2Vec(address_list, min_count=1)

    address_embeddings = np.zeros((combined.shape[0], 100))
    for i in range(len(address_list)):
        for j in range(len(address_list[i])):
            address_embeddings[i] += address_model.wv[address_list[i][j]]
        address_embeddings[i] /= len(address_list[i])

    encoding_cols = []
    for i in range(address_embeddings.shape[1]):
        encoding_cols.append("EncodedAddress{}".format(i))
    
    encoding_address_df = pd.DataFrame(address_embeddings, columns=encoding_cols)
    combined = pd.concat([combined, encoding_address_df], axis=1, sort=False)
    combined.drop('Address', axis=1, inplace=True)

    train = combined[:train_length]
    test = combined[train_length:]
    
    return train, test, encoding_cols


In [5]:
def add_freq_encodings(train, test, cols):
    for c in cols:
        vc = train[c].fillna("__NA__").value_counts(dropna=False)
        train[f"{c}_freq"] = train[c].fillna("__NA__").map(vc).astype(int)
        test[f"{c}_freq"] = test[c].fillna("__NA__").map(vc).fillna(0).astype(int)


def fit_label_encoders(train, test, cat_cols):
    encoders = {}
    for c in cat_cols:
        le = LabelEncoder()
        le.fit(pd.concat([train[c].astype(str), test[c].astype(str)], axis=0).astype(str))
        train[c + "_le"] = le.transform(train[c].astype(str))
        test[c + "_le"] = le.transform(test[c].astype(str))
        encoders[c] = le
    return encoders

### Processing

In [6]:
SEED = 42
num_clusters = 150
out_dir = "./dataset/"
test = "./dataset/test.csv"
train = "./dataset/train.csv"

print("loading dataset in csv format ...")
test = pd.read_csv(test, parse_dates=["Dates"])   # expects Kaggle format
train = pd.read_csv(train, parse_dates=["Dates"])  # expects Kaggle format

# drop duplicates
train.drop_duplicates(inplace=True)

debug = False
if debug:
    train = train.sample(8000, random_state=SEED).reset_index(drop=True)
    test = test.sample(2000, random_state=SEED).reset_index(drop=True)
print(f"train shape: {train.shape}, test shape: {test.shape}")

loading dataset in csv format ...
train shape: (875726, 9), test shape: (884262, 7)


In [7]:
# basic cleaning
train, test = remove_coord_outliers(train, test)
print(f"after basic cleaning, train shape: {train.shape}, test shape: {test.shape}")

# time features
test = basic_time_features(test)
train = basic_time_features(train)
print(f"after time feature engineering, train shape: {train.shape}, test shape: {test.shape}")

# drop columns: ['Descript', 'Resolution', 'Id']
train, test = drop_columns(train, test, drop_cols = ['Descript', 'Resolution'])
print(f"after dropping columns, train shape: {train.shape}, test shape: {test.shape}")

after basic cleaning, train shape: (875726, 9), test shape: (884262, 7)
after time feature engineering, train shape: (875726, 15), test shape: (884262, 13)
after dropping columns, train shape: (875726, 13), test shape: (884262, 13)


In [8]:
# address parsing
for df in (train, test):
    df[["AddrType", "StreetName"]] = df["Address"].apply(lambda x: pd.Series(parse_address(x)))
print(f"after address parsing, train shape: {train.shape}, test shape: {test.shape}")

# geo clustering (its not that useful to do)
train, test = add_geo_clusters(train, test, num_clusters=num_clusters)
print(f"after geo-clustering, train shape: {train.shape}, test shape: {test.shape}")

# frequency encodings
add_freq_encodings(train, test, ["StreetName", "AddrType", "PdDistrict", "DayOfWeek", "GeoCluster"])
print(f"after frequencry encoding, train shape: {train.shape}, test shape: {test.shape}")

# address embeddings
train, test, encoding_cols = address_encoding(train, test)
print(f"after address encoding, train shape: {train.shape}, test shape: {test.shape}")

after address parsing, train shape: (875726, 15), test shape: (884262, 15)
after geo-clustering, train shape: (875726, 21), test shape: (884262, 21)
after frequencry encoding, train shape: (875726, 26), test shape: (884262, 26)
after intersection, train shape: (875726, 27), test shape: (884262, 27)
after address encoding, train shape: (875726, 127), test shape: (884262, 127)


In [9]:
# label encoders
cat_cols = ["DayOfWeek", "PdDistrict", "AddrType", "GeoCluster"]
encoders = fit_label_encoders(train, test, cat_cols)

# numeric features and fill missing
bin_cols = ["Night", "Is_weekend", "Intersection"]
num_cols = ["X", "Y", "X+Y", "X-Y", "XY_rad", "XYpca1", "XYpca2", "Hour", "Month", "Year", "Day", 
            "Minute", "StreetName_freq", "AddrType_freq", "PdDistrict_freq", "DayOfWeek_freq", "GeoCluster_freq"]

for c in num_cols:
    if c not in train.columns:
        train[c] = 0
        test[c] = 0
    train[c] = train[c].fillna(-999)
    test[c] = test[c].fillna(-999)

# scale numerics
scaler = StandardScaler()
train[num_cols] = scaler.fit_transform(train[num_cols])
test[num_cols] = scaler.transform(test[num_cols])


# target encoding label
target_col = "Category"
le_target = LabelEncoder()
train["target"] = le_target.fit_transform(train[target_col].astype(str))

print(f"after processing, train shape: {train.shape}, test shape: {test.shape}")
feature_list = num_cols + bin_cols + encoding_cols + [c + "_le" for c in cat_cols]

after processing, train shape: (875726, 132), test shape: (884262, 131)


In [10]:
# save processed files
meta = {
    "bin_cols": bin_cols,
    "num_cols": num_cols,
    "cat_cols": cat_cols,
    "cat_cols_le": [c + "_le" for c in cat_cols],
    "encodings" : encoding_cols,
    "features": feature_list,
    "n_classes": int(train["target"].nunique()),
    "class_names": list(le_target.classes_),
    "gaussian_n_clusters": int(num_clusters)
}

meta_out = "./dataset/meta.json"
test_out = "./dataset/test_processed.csv"
train_out = "./dataset/train_processed.csv"


print("saving processed data to", out_dir)
train.to_csv(train_out, index=False)
test.to_csv(test_out, index=False)
with open(meta_out, "w") as f:
    json.dump(meta, f, indent=2)

print("processing done and files saved :")
for p in [meta_out, test_out,train_out ]:
    print(" -", p)

saving processed data to ./dataset/
processing done and files saved :
 - ./dataset/meta.json
 - ./dataset/test_processed.csv
 - ./dataset/train_processed.csv
