### Los Angeles: data preprocessing and train/validation/test split

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Read the Los Angeles listings CSV and keep only rows with no missing value in any column.
df = pd.read_csv('datasets/LosAngeles/listings_with_IncEduCri_LA_2.csv').dropna(how='any')

In [2]:
# One-hot encode the two categorical columns, then replace the originals
# with their indicator columns in a new frame (df1).
categorical_cols = ['property_type', 'room_type']
parsed_cols = pd.get_dummies(df[categorical_cols])
df = df.drop(columns=categorical_cols)
df1 = pd.concat([df, parsed_cols], axis=1)
df1.head()

Unnamed: 0,id,price,zipcode,geometry,latitude,longitude,accommodates,bathrooms,bedrooms,beds,...,property_type_Tiny house,property_type_Townhouse,property_type_Treehouse,property_type_Vacation home,property_type_Villa,property_type_Yurt,room_type_Entire home/apt,room_type_Hotel room,room_type_Private room,room_type_Shared room
0,2708,79,90046,POINT (-118.34602 34.09768),34.09768,-118.34602,1,1.5,1.0,1.0,...,0,0,0,0,0,0,0,0,1,0
1,6931,99,90046,POINT (-118.34801 34.09520999999999),34.09521,-118.34801,1,1.5,1.0,1.0,...,0,0,0,0,0,0,0,0,1,0
2,50005,130,90046,POINT (-118.34985 34.09596),34.09596,-118.34985,2,1.0,1.0,1.0,...,0,0,0,0,0,0,1,0,0,0
3,61153,140,90046,POINT (-118.38127 34.11046),34.11046,-118.38127,2,1.5,1.0,1.0,...,0,0,0,0,0,0,0,0,1,0
4,132908,1050,90046,POINT (-118.37002 34.10456),34.10456,-118.37002,10,4.0,5.0,5.0,...,0,0,0,0,1,0,1,0,0,0


In [3]:
# Show (rows, columns) of df1, the frame that includes the one-hot indicator columns.
print(df1.shape)

(17975, 60)


In [4]:
# Build the numeric target column `price_y` from the raw price column.
#
# The raw column is named 'price ' (with a trailing space); it is addressed by
# name here so the column that is parsed is the same one that is dropped below.
# The cleaned strings are kept and parsed: previously the result of
# strip()/replace() was discarded, so the cleanup had no effect.
# NOTE(review): removing '-' assumes it only occurs as a placeholder or stray
# sign, never as a range separator such as "100-200" — confirm against the CSV.
price_text = (
    df1['price ']
    .astype(str)                       # keep values that are already numeric
    .str.strip()
    .str.replace('-', '', regex=False)
)
df1['price_y'] = pd.to_numeric(price_text, errors='coerce')

# Drop the raw column, and drop rows whose price could not be parsed so the
# target contains no NaN (the earlier dropna ran before this coercion).
df1 = df1.drop(columns=['price ']).dropna(subset=['price_y'])

In [5]:
import random
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
# Seeds Python's built-in `random` module only. The train_test_split calls in
# this notebook pass random_state=1 explicitly, so the splits do not depend on it.
random.seed(13)

def normalize(X_train, X_val, X_test):
    """Min-max scale the three feature splits using training statistics only.

    A single ``MinMaxScaler`` is fitted on ``X_train`` and then applied,
    unchanged, to ``X_val`` and ``X_test``. Fitting a separate scaler on each
    split (the previous behaviour) maps the same raw value to different scaled
    values in different splits and leaks validation/test statistics into the
    preprocessing step.

    Parameters
    ----------
    X_train, X_val, X_test : pandas.DataFrame
        Feature frames with the same columns, whose values can be cast to float.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, val, test)`` scaled frames. Each keeps its input's column
        names and gets a fresh default integer index. Training values lie in
        [0, 1]; validation/test values may fall slightly outside that range
        when they exceed the training minimum or maximum.
    """
    scaler = preprocessing.MinMaxScaler()

    # Learn per-column min/max from the training rows only.
    train_scaled = scaler.fit_transform(X_train.values.astype(float))

    # Reuse the fitted scaler so all splits share one transformation.
    val_scaled = scaler.transform(X_val.values.astype(float))
    test_scaled = scaler.transform(X_test.values.astype(float))

    x = pd.DataFrame(train_scaled, columns=X_train.columns)
    x_v = pd.DataFrame(val_scaled, columns=X_val.columns)
    x_t = pd.DataFrame(test_scaled, columns=X_test.columns)
    return x, x_v, x_t

In [6]:
def split(dataset, val_frac=0.10, test_frac=0.10):
    """Separate features from the ``price_y`` target and split into train/val/test.

    Identifier and location columns are excluded from the features. The data
    is split twice with ``random_state=1``: first train versus a hold-out of
    size ``val_frac + test_frac``, then the hold-out into test and validation.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing a ``price_y`` column.
    val_frac, test_frac : float
        Fractions of the full dataset reserved for validation and test.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_val, y_val, X_test, y_test)``.
    """
    non_feature_cols = ['price_y', 'id', 'zipcode', 'geometry', 'latitude', 'longitude']
    features = dataset.loc[:, ~dataset.columns.isin(non_feature_cols)]
    target = dataset['price_y']

    holdout_frac = val_frac + test_frac
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        features, target, test_size=holdout_frac, random_state=1
    )
    # Within the hold-out, validation takes its proportional share; the rest is test.
    X_test, X_val, y_test, y_val = train_test_split(
        X_holdout, y_holdout, test_size=val_frac / holdout_frac, random_state=1
    )

    return X_train, y_train, X_val, y_val, X_test, y_test

In [7]:
# Split the LA frame, scale the feature splits, and write each split to CSV.
X_train, y_train, X_val, y_val, X_test, y_test = split(df1)
X_train, X_val, X_test = normalize(X_train, X_val, X_test)

_la_outputs = [
    (X_train, './datasets/LosAngeles/cleaned/data_cleaned_train_X.csv'),
    (y_train, './datasets/LosAngeles/cleaned/data_cleaned_train_y.csv'),
    (X_val, './datasets/LosAngeles/cleaned/data_cleaned_val_X.csv'),
    (y_val, './datasets/LosAngeles/cleaned/data_cleaned_val_y.csv'),
    (X_test, './datasets/LosAngeles/cleaned/data_cleaned_test_X.csv'),
    (y_test, './datasets/LosAngeles/cleaned/data_cleaned_test_y.csv'),
]
for _frame, _csv_path in _la_outputs:
    # Header row kept, index omitted.
    _frame.to_csv(_csv_path, header=True, index=False)

### New York City: data preprocessing and train/validation/test split

In [10]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Read the New York listings CSV and keep only rows with no missing value in any column.
df_nyc = pd.read_csv('datasets/NewYork/listings_with_IncEduCri_NYC_2.csv').dropna(how='any')

In [11]:
# One-hot encode the two categorical columns, then replace the originals
# with their indicator columns in a new frame (df_nyc1).
categorical_cols = ['property_type', 'room_type']
parsed_cols = pd.get_dummies(df_nyc[categorical_cols])
df_nyc = df_nyc.drop(columns=categorical_cols)
df_nyc1 = pd.concat([df_nyc, parsed_cols], axis=1)
df_nyc1.head()

Unnamed: 0,id,price,Zipcode,latitude,longitude,accommodates,bathrooms,bedrooms,beds,minimum_nights,...,property_type_Timeshare,property_type_Tiny house,property_type_Townhouse,property_type_Treehouse,property_type_Villa,property_type_Yurt,room_type_Entire home/apt,room_type_Hotel room,room_type_Private room,room_type_Shared room
0,15396.0,375.0,10001,40.74623,-73.9953,4.0,2.0,2.0,2.0,180.0,...,0,0,0,0,0,0,1,0,0,0
1,37042.0,150.0,10001,40.75295,-73.99801,4.0,1.0,2.0,2.0,10.0,...,0,0,0,0,0,0,1,0,0,0
2,51572.0,123.0,10001,40.74859,-73.99671,2.0,1.0,1.0,1.0,1.0,...,0,0,0,0,0,0,0,0,1,0
3,98663.0,130.0,10001,40.74893,-73.99544,2.0,1.0,1.0,1.0,30.0,...,0,0,0,0,0,0,1,0,0,0
4,110739.0,149.0,10001,40.74503,-73.98876,2.0,1.0,0.0,2.0,30.0,...,0,0,0,0,0,0,1,0,0,0


In [12]:
# Show (rows, columns) of df_nyc1, the frame that includes the one-hot indicator columns.
print(df_nyc1.shape)

(39491, 59)


In [13]:
import random
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
# Seeds Python's built-in `random` module only. The train_test_split calls in
# this notebook pass random_state=1 explicitly, so the splits do not depend on it.
random.seed(13)

def normalize(X_train, X_val, X_test):
    """Min-max scale the three feature splits using training statistics only.

    A single ``MinMaxScaler`` is fitted on ``X_train`` and then applied,
    unchanged, to ``X_val`` and ``X_test``. Fitting a separate scaler on each
    split (the previous behaviour) maps the same raw value to different scaled
    values in different splits and leaks validation/test statistics into the
    preprocessing step.

    Parameters
    ----------
    X_train, X_val, X_test : pandas.DataFrame
        Feature frames with the same columns, whose values can be cast to float.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, val, test)`` scaled frames. Each keeps its input's column
        names and gets a fresh default integer index. Training values lie in
        [0, 1]; validation/test values may fall slightly outside that range
        when they exceed the training minimum or maximum.
    """
    scaler = preprocessing.MinMaxScaler()

    # Learn per-column min/max from the training rows only.
    train_scaled = scaler.fit_transform(X_train.values.astype(float))

    # Reuse the fitted scaler so all splits share one transformation.
    val_scaled = scaler.transform(X_val.values.astype(float))
    test_scaled = scaler.transform(X_test.values.astype(float))

    x = pd.DataFrame(train_scaled, columns=X_train.columns)
    x_v = pd.DataFrame(val_scaled, columns=X_val.columns)
    x_t = pd.DataFrame(test_scaled, columns=X_test.columns)
    return x, x_v, x_t

In [14]:
def splitNYC(dataset, val_frac=0.10, test_frac=0.10):
    """Separate features from the ``price`` target and split into train/val/test.

    Identifier and location columns are excluded from the features. The data
    is split twice with ``random_state=1``: first train versus a hold-out of
    size ``val_frac + test_frac``, then the hold-out into test and validation.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing a ``price`` column.
    val_frac, test_frac : float
        Fractions of the full dataset reserved for validation and test.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_val, y_val, X_test, y_test)``.
    """
    non_feature_cols = ['price', 'id', 'Zipcode', 'latitude', 'longitude']
    features = dataset.loc[:, ~dataset.columns.isin(non_feature_cols)]
    target = dataset['price']

    holdout_frac = val_frac + test_frac
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        features, target, test_size=holdout_frac, random_state=1
    )
    # Within the hold-out, validation takes its proportional share; the rest is test.
    X_test, X_val, y_test, y_val = train_test_split(
        X_holdout, y_holdout, test_size=val_frac / holdout_frac, random_state=1
    )

    return X_train, y_train, X_val, y_val, X_test, y_test

In [15]:
# Split the NYC frame, scale the feature splits, and write each split to CSV.
X_train, y_train, X_val, y_val, X_test, y_test = splitNYC(df_nyc1)
X_train, X_val, X_test = normalize(X_train, X_val, X_test)

_nyc_outputs = [
    (X_train, './datasets/NewYork/cleaned/data_cleaned_train_X.csv'),
    (y_train, './datasets/NewYork/cleaned/data_cleaned_train_y.csv'),
    (X_val, './datasets/NewYork/cleaned/data_cleaned_val_X.csv'),
    (y_val, './datasets/NewYork/cleaned/data_cleaned_val_y.csv'),
    (X_test, './datasets/NewYork/cleaned/data_cleaned_test_X.csv'),
    (y_test, './datasets/NewYork/cleaned/data_cleaned_test_y.csv'),
]
for _frame, _csv_path in _nyc_outputs:
    # Header row kept, index omitted.
    _frame.to_csv(_csv_path, header=True, index=False)