In [1]:
import numpy as np
import pandas as pd
from scipy.stats import zscore
import matplotlib.pyplot as plt
import seaborn as sns
# Seed NumPy's global RNG so code that draws from np.random (e.g. the
# permutation in split_data) gives the same result on every fresh run.
np.random.seed(0)

In [2]:
# Read the comma-separated input file into the working DataFrame `us_ds`.
file_path = 'modified_dataset/us_project_outliers.csv'
us_ds = pd.read_csv(file_path)  # ',' is already read_csv's default separator

In [3]:
# Add sine/cosine encodings of 'year' and 'month' as new columns
# (sin_year, cos_year, sin_month, cos_month), each computed with a period of 12.
# NOTE(review): 'year' is wrapped with the same period of 12 as 'month', so
# years that are 12 apart receive identical encodings -- confirm this is intended.
for part in ('year', 'month'):
    angle = 2 * np.pi * us_ds[part] / 12
    us_ds[f'sin_{part}'] = np.sin(angle)
    us_ds[f'cos_{part}'] = np.cos(angle)

In [4]:
# Preview the first 100 rows to check the sin/cos columns created in the previous cell.
us_ds.head(100)

Unnamed: 0,period_begin,period_end,period_duration,region_type,region_type_id,table_id,is_seasonally_adjusted,region,city,state,...,price_increased_mom,price_increased_yoy,inventory_turnover,sale_to_list_ppsf_ratio,supply_demand_balance,fast_selling,sin_year,cos_year,sin_month,cos_month
0,2017-09-01,2017-09-30,30,place,6,29470,f,"Chicago, IL",Chicago,Illinois,...,1,1,0.215805,0.517397,55.0,0,0.500000,8.660254e-01,-1.000000e+00,-1.836970e-16
1,2020-07-01,2020-07-31,30,place,6,37598,f,"Parsippany, NJ",Parsippany,New Jersey,...,0,0,0.700000,0.975611,3.0,0,0.866025,-5.000000e-01,-5.000000e-01,-8.660254e-01
2,2021-08-01,2021-08-31,30,place,6,24993,f,"Oakbrook, KY",Oakbrook,Kentucky,...,0,1,1.700000,0.925068,3.0,1,0.500000,-8.660254e-01,-8.660254e-01,-5.000000e-01
3,2018-08-01,2018-08-31,30,place,6,29754,f,"Dunstable, MA",Dunstable,Massachusetts,...,1,0,0.400000,1.158641,0.0,0,0.866025,5.000000e-01,-8.660254e-01,-5.000000e-01
4,2023-01-01,2023-01-31,30,place,6,10728,f,"Kalamazoo, MI",Kalamazoo,Michigan,...,1,1,0.415842,0.996442,-8.0,1,-0.500000,-8.660254e-01,5.000000e-01,8.660254e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2024-07-01,2024-07-31,30,place,6,16583,f,"Saco, ME",Saco,Maine,...,0,0,0.500000,1.153662,-2.0,1,-0.866025,-5.000000e-01,-5.000000e-01,-8.660254e-01
96,2020-01-01,2020-01-31,30,place,6,10233,f,"Lancaster, CA",Lancaster,California,...,0,1,0.496795,1.003815,-37.0,0,0.866025,-5.000000e-01,5.000000e-01,8.660254e-01
97,2013-09-01,2013-09-30,30,place,6,24533,f,"Ocean Bluff-Brant Rock, MA",Ocean Bluff-Brant Rock,Massachusetts,...,0,0,0.146341,0.866614,-3.0,0,-1.000000,7.792901e-14,-1.000000e+00,-1.836970e-16
98,2024-06-01,2024-06-30,30,place,6,15188,f,"Ogden, IA",Ogden,Iowa,...,0,1,0.857143,1.678848,-2.0,0,-0.866025,-5.000000e-01,1.224647e-16,-1.000000e+00


In [5]:
def add_intercept(us_ds):
    """Prepend a column of ones (the bias/intercept term) to the data.

    Parameters
    ----------
    us_ds : array-like with a ``shape`` attribute (ndarray or DataFrame)
        Data with one row per sample.

    Returns
    -------
    numpy.ndarray
        Array with one extra leading column filled with 1.0. Column labels of
        a DataFrame input are not carried over, because the result is a plain
        NumPy array.
    """
    print("In add_intercept function...")
    n_rows = us_ds.shape[0]
    bias_column = np.ones(n_rows)
    # column_stack turns the 1-D ones vector into a column and joins it on the left.
    return np.column_stack((bias_column, us_ds))

def normalize(us_ds, mean=None, std=None):
    """Standardize the data as ``(us_ds - mean) / std``.

    Every column passed in is scaled; callers are responsible for selecting
    the columns they want normalized beforehand.

    Parameters
    ----------
    us_ds : pandas.DataFrame or numpy.ndarray
        Data to scale. It is not modified; a new object is returned.
    mean : Series / array-like / scalar, optional
        Per-column mean to subtract. When omitted it is computed from
        ``us_ds`` itself (previously the ``None`` default raised a TypeError).
    std : Series / array-like / scalar, optional
        Per-column standard deviation to divide by. When omitted it is
        computed from ``us_ds`` with ``np.std`` (population std, ddof=0).

    Returns
    -------
    Same type as the arithmetic on ``us_ds`` produces
        The standardized data. Columns whose std is exactly 0 are divided by 1
        instead, so constant columns become 0 rather than NaN/inf.
    """
    print("In normalize function...")

    if mean is None:
        mean = np.mean(us_ds, axis=0)
    if std is None:
        std = np.std(us_ds, axis=0)

    # Guard against division by zero for constant columns. Keep pandas objects
    # as pandas objects so label alignment with us_ds' columns is preserved.
    if isinstance(std, (pd.Series, pd.DataFrame)):
        safe_std = std.where(std != 0, 1.0)
    else:
        safe_std = np.where(np.asarray(std) == 0, 1.0, std)

    return (us_ds - mean) / safe_std

def split_data(us_ds, state_column, ratio=0.8):
    """Split the data into train/test sets separately within each state.

    Rows are grouped by ``state_column``; inside every group the rows are
    shuffled with ``np.random.permutation`` (NumPy's global RNG, so seed it
    beforehand for reproducibility) and the first ``int(len(group) * ratio)``
    shuffled rows go to the training set, the rest to the test set.

    Note: ``DataFrame.groupby`` skips rows whose key is NaN by default, so
    rows with a missing ``state_column`` value end up in neither split.

    Parameters
    ----------
    us_ds : pandas.DataFrame
        Full dataset.
    state_column : str
        Name of the column to group by.
    ratio : float, default 0.8
        Fraction of each group assigned to the training set; must be in [0, 1].

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Train and test frames, each with a fresh 0..n-1 index. If there are no
        groups (e.g. empty input) two empty frames with the original columns
        are returned.

    Raises
    ------
    ValueError
        If ``ratio`` is not within [0, 1].
    """
    # A ratio outside [0, 1] would silently yield a wrong split through
    # negative / over-long slicing, so reject it explicitly.
    if not 0 <= ratio <= 1:
        raise ValueError(f"ratio must be between 0 and 1, got {ratio}")

    train_data = []
    test_data = []

    # grouped by the state column
    for _, group in us_ds.groupby(state_column):
        num_samples = len(group)
        n_train = int(num_samples * ratio)

        # shuffle positional indices for randomization
        indices = np.random.permutation(num_samples)

        # split into train and test for this state and collect the pieces
        train_data.append(group.iloc[indices[:n_train]])
        test_data.append(group.iloc[indices[n_train:]])

    print("In split function...")
    print(f"train_data len: {len(train_data)}")
    print(f"test_data len: {len(test_data)}")

    # Nothing to concatenate: pd.concat([]) raises and train_data[0] would be
    # an IndexError, so return empty frames that keep the input's columns.
    if not train_data:
        empty = us_ds.iloc[0:0].reset_index(drop=True)
        return empty, empty.copy()

    print("Train shape")
    print(train_data[0].shape)
    print("Test shape")
    print(test_data[0].shape)

    # concatenate all groups to form the final train and test datasets
    print("Start concatenation of train data frames...")
    train_data = pd.concat(train_data).reset_index(drop=True)
    print("Start concatenation of test data frames...")
    test_data = pd.concat(test_data).reset_index(drop=True)

    return train_data, test_data

# def split_data_ca(us_ds, ratio=0.8):
#     us_ds_ca = us_ds[us_ds['state_code'] == 'CA']
#     
#     num_samples = us_ds_ca.shape[0]
#     n_train = int(num_samples * ratio)
#     indices = np.random.permutation(num_samples)
#     
#     train_indices = indices[:n_train]
#     test_indices = indices[n_train:]
#     
#     us_ds_ca_train = us_ds_ca.iloc[train_indices]
#     us_ds_ca_test = us_ds_ca.iloc[test_indices]
#     
#     return us_ds_ca_train, us_ds_ca_test

def preprocess_data(us_ds, state_column, ratio=0.8):
    """Split, standardize and add an intercept column to the dataset.

    Steps, in order:
      1. Split ``us_ds`` into train/test with ``split_data``.
      2. Compute mean and std on the *training* split only, over the numeric
         columns that are not in the exclusion list below.
      3. Run ``preprocess_entire_dataset`` on the full data with those
         training statistics.
      4. Normalize the selected columns of both splits with the same
         statistics and prepend an intercept column via ``add_intercept``.

    Parameters
    ----------
    us_ds : pandas.DataFrame
        Full dataset.
    state_column : str
        Column used by ``split_data`` for the per-group split.
    ratio : float, default 0.8
        Training fraction forwarded to ``split_data``.

    Returns
    -------
    tuple
        ``(train, test, full)`` -- the outputs of ``add_intercept`` for the
        training split and the test split, and the output of
        ``preprocess_entire_dataset`` for the whole dataset.
    """
    # split the data
    print("Start splitting...")
    train_df, test_df = split_data(us_ds, state_column, ratio)

    print("In process_data function...")
    print("Train shape")
    print(train_df.shape)
    print("Test shape")
    print(test_df.shape)

    # Numeric columns that must keep their raw values (not standardized).
    skipped = (
        'period_duration', 'region_type_id', 'table_id', 'property_type_id',
        'parent_metro_region_metro_code', 'year', 'month',
        'price_increased_mom', 'price_increased_yoy', 'fast_selling',
    )

    # statistics come from the training split only
    print("Start computing mean and std...")
    numeric_cols = train_df.select_dtypes(include=['number']).columns
    scaled_cols = [name for name in numeric_cols if name not in skipped]

    mean = np.mean(train_df[scaled_cols], axis=0)
    std = np.std(train_df[scaled_cols], axis=0)

    print("Preprocessing the entire data set...")
    us_ds_final = preprocess_entire_dataset(us_ds, mean, std)

    print("Continuing with the split data...")
    print("Start normalizing...")
    # Scale train first, then test, both with the training statistics.
    scaled_frames = []
    for frame in (train_df, test_df):
        scaled = frame.copy()
        scaled[scaled_cols] = normalize(frame[scaled_cols], mean, std)
        scaled_frames.append(scaled)

    # add intercept to both training and testing data
    print("Start adding intercept...")
    us_ds_train = add_intercept(scaled_frames[0])
    us_ds_test = add_intercept(scaled_frames[1])

    print("Finishing...")
    return us_ds_train, us_ds_test, us_ds_final

def preprocess_entire_dataset(us_ds, mean, std):
    """Standardize the whole dataset with given statistics and add an intercept.

    Parameters
    ----------
    us_ds : pandas.DataFrame
        Full dataset.
    mean, std
        Statistics passed straight to ``normalize`` for the selected columns.

    Returns
    -------
    The output of ``add_intercept`` applied to a copy of ``us_ds`` in which
    every numeric column outside the exclusion list has been normalized.
    """
    # Numeric columns that must keep their raw values (not standardized).
    skipped = (
        'period_duration', 'region_type_id', 'table_id', 'property_type_id',
        'parent_metro_region_metro_code', 'year', 'month',
        'price_increased_mom', 'price_increased_yoy', 'fast_selling',
    )
    columns_to_normalize = [
        name
        for name in us_ds.select_dtypes(include=['number']).columns
        if name not in skipped
    ]

    # normalize a copy so the caller's frame stays untouched
    print("Start normalizing...")
    print("In preprocess entire...")
    scaled = us_ds.copy()
    scaled[columns_to_normalize] = normalize(us_ds[columns_to_normalize], mean, std)

    # add intercept
    print("Start adding intercept...")
    us_ds_final = add_intercept(scaled)

    # finishing
    print("Finishing...")
    return us_ds_final

In [6]:
# Run the full pipeline, splitting per value of the 'state_code' column.
state_column = 'state_code'
us_ds_train, us_ds_test, us_ds_final = preprocess_data(us_ds, state_column)

# Report the resulting shapes (label and shape on separate lines).
print("The shape of the training set is:", us_ds_train.shape, sep="\n")
print("The shape of the test set is:", us_ds_test.shape, sep="\n")

Start splitting...
In split function...
train_data len: 48
test_data len: 48
Train shape
(942, 72)
Test shape
(236, 72)
Start concatenation of train data frames...
Start concatenation of test data frames...
In process_data function...
Train shape
(1431347, 72)
Test shape
(357865, 72)
Start computing mean and std...
Preprocessing the entire data set...
Start normalizing...
In preprocess entire...
In normalize function...
Start adding intercept...
In add_intercept function...
Finishing...
Continuing with the split data...
Start normalizing...
In normalize function...
In normalize function...
Start adding intercept...
In add_intercept function...
In add_intercept function...
Finishing...
The shape of the training set is:
(1431347, 73)
The shape of the test set is:
(357865, 73)


In [7]:
# Label the training array returned by preprocess_data (intercept column first,
# then the original dataset columns) and write it to CSV without the index.
column_names = ['intercept', *us_ds.columns]
output_file = 'modified_dataset/us_project_normalize_train.csv'

us_ds_train = pd.DataFrame(data=us_ds_train, columns=column_names)
us_ds_train.to_csv(output_file, index=False)

In [8]:
# Label the test array with the same column names used for the training export
# (`column_names` is defined in the previous cell) and write it to CSV.
output_file = 'modified_dataset/us_project_normalize_test.csv'

us_ds_test = pd.DataFrame(data=us_ds_test, columns=column_names)
us_ds_test.to_csv(output_file, index=False)

In [9]:
# def group_by_state(us_ds, state_column):
#     # grouped by the state column
#     grouped = us_ds.groupby(state_column)
#     print("What states are in the dataset")
#     cnt = 0
#     for state, group in grouped:
#         print(f"{state}: {len(group)}")
#         cnt+=1
#     print("Total number of states:", cnt)

In [10]:
# Check the size of the fully preprocessed dataset returned by preprocess_data.
us_ds_final.shape

(1789212, 73)

In [11]:
# Label the fully preprocessed array (intercept column first, then the original
# dataset columns) and write it to CSV without the index.
column_names = ['intercept', *us_ds.columns]
output_file = 'modified_dataset/us_project_normalize.csv'

us_ds_final = pd.DataFrame(data=us_ds_final, columns=column_names)
us_ds_final.to_csv(output_file, index=False)