In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler


In [2]:
from sklearn.preprocessing import StandardScaler
import pandas as pd

def preprocess_data(traindata, testdata):
    """Build unscaled and standardized versions of a train/test pair.

    The 'gname' column is treated as the target: it is set aside, every
    remaining column is used as a feature, and the target is appended back
    as the last column of each returned frame.

    Parameters
    ----------
    traindata, testdata : pandas.DataFrame
        Frames that each contain a 'gname' column. All other columns are
        passed to StandardScaler, so they are expected to be numeric.

    Returns
    -------
    tuple of pandas.DataFrame
        (unscaled_train, unscaled_test, scaled_train, scaled_test).
        The scaler is fitted on the training features only and then applied
        to the test features.
    """
    target = 'gname'

    def _split(frame):
        # Target first, then the frame with the target removed.
        return frame[target], frame.drop(columns=[target])

    def _with_target(features, labels):
        # Put the target back as the right-most column.
        return pd.concat([features, labels], axis=1)

    train_labels, train_X = _split(traindata)
    test_labels, test_X = _split(testdata)

    feature_names = train_X.columns.tolist()

    # Fit on train only; the test split reuses the fitted statistics.
    standardizer = StandardScaler()
    train_X_std = pd.DataFrame(
        standardizer.fit_transform(train_X),
        columns=feature_names,
        index=train_X.index,
    )
    test_X_std = pd.DataFrame(
        standardizer.transform(test_X),
        columns=feature_names,
        index=test_X.index,
    )

    return (
        _with_target(train_X.copy(), train_labels),
        _with_target(test_X.copy(), test_labels),
        _with_target(train_X_std, train_labels),
        _with_target(test_X_std, test_labels),
    )


In [3]:
# Make sure every output folder used by the export loop exists.
# os.makedirs(..., exist_ok=True) does nothing when the folder is already
# there, so the separate isdir() check (and the gap between checking and
# creating) is no longer needed.
for folder in ("scaledtrain1", "scaledtest1", "train1", "test1", "combined"):
    os.makedirs(folder, exist_ok=True)

In [4]:
# Folders holding the input train/test CSV files
trainpath = '../traindata'
testpath = '../testdata'

# Partition identifiers as they appear in the input file names
partitions = [100, 200, 300, 478]

# Columns removed from both splits before preprocessing
cols_to_drop = ['Unnamed: 0', 'country', 'city', 'region', 'provstate', 'natlty1', 'specificity', 'iyear', 'imonth', 'iday']

for partition in partitions:
    # Load the two splits for this partition
    train_csv = f'{trainpath}/train{partition}.csv'
    test_csv = f'{testpath}/test{partition}.csv'
    traindata = pd.read_csv(train_csv, encoding='ISO-8859-1')
    testdata = pd.read_csv(test_csv, encoding='ISO-8859-1')

    # Drop the unused columns, then build the unscaled and scaled variants
    traindata, testdata, scaledtrain, scaledtest = preprocess_data(
        traindata.drop(columns=cols_to_drop),
        testdata.drop(columns=cols_to_drop),
    )

    # Unscaled train rows followed by unscaled test rows
    combined = pd.concat([traindata, testdata])

    print(f'Size train: {len(traindata)}, Shape train: {traindata.shape}')
    print(f'Size test: {len(testdata)}, Shape test: {testdata.shape}')

    # Destination file -> frame, written in this order
    exports = {
        f'train1/train{partition}.csv': traindata,
        f'test1/test{partition}.csv': testdata,
        f'combined/combined{partition}.csv': combined,
        f'scaledtrain1/train{partition}.csv': scaledtrain,
        f'scaledtest1/test{partition}.csv': scaledtest,
    }
    for out_path, frame in exports.items():
        frame.to_csv(out_path, index=False)

Size train: 2100, Shape train: (2100, 16)
Size test: 900, Shape test: (900, 16)
Size train: 4200, Shape train: (4200, 16)
Size test: 1800, Shape test: (1800, 16)
Size train: 6300, Shape train: (6300, 16)
Size test: 2700, Shape test: (2700, 16)
Size train: 10020, Shape train: (10020, 16)
Size test: 4320, Shape test: (4320, 16)
