In [5]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler


In [6]:
from sklearn.preprocessing import StandardScaler
import pandas as pd

def preprocess_data(traindata, testdata, target_col='gname', geo_cols=('longitude', 'latitude')):
    """Build unscaled and scaled feature tables for one train/test split.

    The target column is set aside, the coordinate columns are fused into a
    single tuple per row and one-hot encoded, and every remaining column is
    standardized with a ``StandardScaler`` fitted on the training rows only.

    Parameters
    ----------
    traindata, testdata : pandas.DataFrame
        Frames that contain ``target_col`` and every column in ``geo_cols``.
        All other columns are handed to ``StandardScaler`` as-is, so they
        must be values it can process.
    target_col : str, default 'gname'
        Name of the label column. It is removed from the features and
        appended again as the last column of every returned frame.
    geo_cols : str or sequence of str, default ('longitude', 'latitude')
        Columns whose per-row values are combined (in this order) into one
        tuple that is then one-hot encoded under the ``longlat`` prefix.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(unscaled_train, unscaled_test, scaled_train, scaled_test)``.
        Each frame holds the non-geo feature columns (raw or standardized),
        the ``longlat_*`` indicator columns, and finally ``target_col``.

    Raises
    ------
    KeyError
        If ``target_col`` or one of ``geo_cols`` is missing from a frame.
    """
    # Accept a single column name as well as a sequence of names.
    geo_cols = [geo_cols] if isinstance(geo_cols, str) else list(geo_cols)

    y_train = traindata[target_col]
    y_test = testdata[target_col]

    train_features = traindata.drop(columns=[target_col])
    test_features = testdata.drop(columns=[target_col])

    # Everything that is neither the target nor a coordinate goes to the scaler.
    numeric_cols = [col for col in train_features.columns if col not in geo_cols]

    # Fuse the coordinates of each row into one tuple and one-hot encode it.
    # Train and test are stacked first so that both halves end up with the
    # same set of indicator columns.
    combined_geo = pd.concat([train_features[geo_cols], test_features[geo_cols]])
    combined_geo['longlat'] = list(zip(*(combined_geo[col] for col in geo_cols)))
    combined_geo = combined_geo.drop(columns=geo_cols)
    geo_onehot = pd.get_dummies(combined_geo, columns=['longlat'])

    # The stacked frame lists the training rows first, so a positional split
    # at len(train_features) recovers the two halves.
    n_train = len(train_features)
    train_geo = geo_onehot.iloc[:n_train]
    test_geo = geo_onehot.iloc[n_train:]

    # Keep unscaled numeric features
    train_num_unscaled = train_features[numeric_cols]
    test_num_unscaled = test_features[numeric_cols]

    # Scale numeric features: statistics come from the training rows only and
    # are then applied unchanged to the test rows.
    scaler = StandardScaler()
    train_num_scaled = pd.DataFrame(
        scaler.fit_transform(train_num_unscaled),
        columns=numeric_cols,
        index=train_features.index,
    )
    test_num_scaled = pd.DataFrame(
        scaler.transform(test_num_unscaled),
        columns=numeric_cols,
        index=test_features.index,
    )

    # Combine scaled features with geo one-hot features
    X_train_scaled = pd.concat([train_num_scaled, train_geo], axis=1)
    X_test_scaled = pd.concat([test_num_scaled, test_geo], axis=1)

    # Combine unscaled features with geo one-hot features
    X_train_unscaled = pd.concat([train_num_unscaled, train_geo], axis=1)
    X_test_unscaled = pd.concat([test_num_unscaled, test_geo], axis=1)

    # Add target column back as the last column
    scaled_train = pd.concat([X_train_scaled, y_train], axis=1)
    scaled_test = pd.concat([X_test_scaled, y_test], axis=1)

    unscaled_train = pd.concat([X_train_unscaled, y_train], axis=1)
    unscaled_test = pd.concat([X_test_unscaled, y_test], axis=1)

    return unscaled_train, unscaled_test, scaled_train, scaled_test



In [7]:
# Output folders for the scaled, unscaled and combined CSV files that the
# notebook writes further down.
for out_dir in ("scaledtrain1", "scaledtest1", "train1", "test1", "combined"):
    # exist_ok=True keeps this cell safe to re-run: an already existing
    # directory is left untouched instead of raising FileExistsError.
    os.makedirs(out_dir, exist_ok=True)

In [8]:
# Folders holding the raw per-partition train/test CSV files.
trainpath = '../traindata'
testpath = '../testdata'

# Partition identifiers; each one selects a train{n}.csv / test{n}.csv pair.
partitions = [100, 200, 300, 478]

# Columns removed from every split before preprocessing (same list for all partitions).
cols_to_drop = ['Unnamed: 0', 'country', 'city', 'region', 'provstate', 'natlty1', 'specificity', 'iyear', 'imonth', 'iday']

for partition in partitions:
    # Load the raw split for this partition.
    traindata = pd.read_csv(f'{trainpath}/train{partition}.csv', encoding='ISO-8859-1')
    testdata = pd.read_csv(f'{testpath}/test{partition}.csv', encoding='ISO-8859-1')

    # Strip the unused columns from both halves.
    traindata, testdata = (frame.drop(columns=cols_to_drop) for frame in (traindata, testdata))

    # preprocess_data returns (unscaled train, unscaled test, scaled train, scaled test);
    # the unscaled frames replace the raw ones from here on.
    traindata, testdata, scaledtrain, scaledtest = preprocess_data(traindata, testdata)

    # Unscaled train and test rows stacked into a single table.
    combined = pd.concat([traindata, testdata])

    print(f'Size train: {len(traindata)}, Shape train: {traindata.shape}')
    print(f'Size test: {len(testdata)}, Shape test: {testdata.shape}')

    # Write every table to its own folder, without the index column.
    exports = (
        (traindata, f'train1/train{partition}.csv'),
        (testdata, f'test1/test{partition}.csv'),
        (combined, f'combined/combined{partition}.csv'),
        (scaledtrain, f'scaledtrain1/train{partition}.csv'),
        (scaledtest, f'scaledtest1/test{partition}.csv'),
    )
    for table, destination in exports:
        table.to_csv(destination, index=False)

Size train: 2100, Shape train: (2100, 1804)
Size test: 900, Shape test: (900, 1804)
Size train: 4200, Shape train: (4200, 3174)
Size test: 1800, Shape test: (1800, 3174)
Size train: 6300, Shape train: (6300, 4393)
Size test: 2700, Shape test: (2700, 4393)
Size train: 10020, Shape train: (10020, 6310)
Size test: 4320, Shape test: (4320, 6310)
