# Importación de dependencias y de los archivos parquet

In [2]:
import pandas as pd
import dask.dataframe as dd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [3]:
train_dask = dd.read_parquet('../data/raw/train')
test_dask = dd.read_parquet('../data/raw/test')
train_dataset = train_dask.compute()
test_dataset = test_dask.compute()

#### Limpieza de campos

In [4]:
train_dataset.drop(['id','url','region_url','image_url','description'],axis=1,inplace=True)
test_dataset.drop(['id','url','region_url','image_url','description'],axis=1,inplace=True)
train_dataset.baths = train_dataset.baths.astype(np.int64)
test_dataset.baths = test_dataset.baths.astype(np.int64)

#### Codificación de variables categoricas

In [5]:
oho = OneHotEncoder(dtype=np.int64,handle_unknown='ignore',sparse=False)
X = train_dataset[['laundry_options','parking_options']]
oho_df = pd.DataFrame(oho.fit_transform(X))
oho_df.index = train_dataset.index
oho_df.columns = oho.get_feature_names_out()
train_dataset.drop(['laundry_options','parking_options'], axis=1, inplace=True)
train_dataset = pd.concat([train_dataset, oho_df], axis=1)




In [6]:
oho = OneHotEncoder(dtype=np.int64,handle_unknown='ignore',sparse=False)
X = test_dataset[['laundry_options','parking_options']]
oho_df = pd.DataFrame(oho.fit_transform(X))
oho_df.index = test_dataset.index
oho_df.columns = oho.get_feature_names_out()
test_dataset.drop(['laundry_options','parking_options'], axis=1, inplace=True)
test_dataset = pd.concat([test_dataset, oho_df], axis=1)



In [7]:
le = LabelEncoder()
le_df = pd.DataFrame(le.fit_transform(train_dataset[['state']]))
le_df.columns = ['stateid']
train_dataset = pd.concat([train_dataset, le_df], axis=1)
le_df = pd.DataFrame(le.fit_transform(train_dataset[['type']]))
le_df.columns = ['typeid']
train_dataset = pd.concat([train_dataset, le_df], axis=1)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [8]:
le = LabelEncoder()
le_df = pd.DataFrame(le.fit_transform(test_dataset[['state']]))
le_df.columns = ['stateid']
test_dataset = pd.concat([test_dataset, le_df], axis=1)
le_df = pd.DataFrame(le.fit_transform(test_dataset[['type']]))
le_df.columns = ['typeid']
test_dataset = pd.concat([test_dataset, le_df], axis=1)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [9]:
pd.set_option('display.max_rows', 0)
pd.set_option('display.max_columns', 30)

test_dataset[test_dataset.isnull().any(axis=1)].head()

Unnamed: 0,region,type,sqfeet,beds,baths,cats_allowed,dogs_allowed,smoking_allowed,wheelchair_access,electric_vehicle_charge,comes_furnished,lat,long,state,laundry_options_laundry in bldg,laundry_options_laundry on site,laundry_options_no laundry on site,laundry_options_w/d hookups,laundry_options_w/d in unit,laundry_options_None,parking_options_attached garage,parking_options_carport,parking_options_detached garage,parking_options_no parking,parking_options_off-street parking,parking_options_street parking,parking_options_valet parking,parking_options_None,stateid,typeid
252,austin,apartment,428,0,1,1,1,1,0,0,0,,,tx,1,0,0,0,0,0,0,0,1,0,0,0,0,0,43,0
320,ames,townhouse,1123,3,1,1,0,0,0,0,1,,,ia,0,0,0,0,1,0,0,0,0,0,1,0,0,0,12,9
524,toledo,apartment,1020,2,1,0,1,1,0,0,0,,,oh,0,0,0,0,0,1,1,0,0,0,0,0,0,0,35,0
626,knoxville,apartment,550,1,1,1,1,1,0,0,0,,,tn,0,1,0,0,0,0,0,0,0,0,1,0,0,0,42,0
784,wichita,house,2800,3,2,1,1,0,0,0,0,,,ks,0,0,0,1,0,0,1,0,0,0,0,0,0,0,16,5


#### Limpieza de datos

In [10]:
train_dataset.drop(train_dataset[train_dataset.baths > 8].index,inplace=True)
train_dataset.drop(train_dataset[(train_dataset.baths > 4) & (train_dataset.type == 'apartment')].index,inplace=True)

In [11]:
train_dataset.drop(train_dataset[train_dataset.beds > 8].index,inplace=True)

In [12]:
upperlimit = (train_dataset['sqfeet'].std() * 1.5) + train_dataset['sqfeet'].median()
train_dataset.drop(train_dataset[train_dataset.sqfeet >= upperlimit].index,inplace=True)

In [13]:
train_dataset.drop(train_dataset[train_dataset.lat > 100].index,inplace=True)
train_dataset.drop(train_dataset[train_dataset.lat < -15].index,inplace=True)
train_dataset.drop(train_dataset[train_dataset.long > -30].index,inplace=True)

In [14]:
train_dataset.reset_index(drop=True, inplace=True)

In [15]:
train_dataset[train_dataset.isnull().any(axis=1)].head()

Unnamed: 0,region,price,type,sqfeet,beds,baths,cats_allowed,dogs_allowed,smoking_allowed,wheelchair_access,electric_vehicle_charge,comes_furnished,lat,long,state,...,laundry_options_laundry on site,laundry_options_no laundry on site,laundry_options_w/d hookups,laundry_options_w/d in unit,laundry_options_None,parking_options_attached garage,parking_options_carport,parking_options_detached garage,parking_options_no parking,parking_options_off-street parking,parking_options_street parking,parking_options_valet parking,parking_options_None,stateid,typeid
62,worcester / central MA,1450,townhouse,1100,2,2,0,0,1,0,0,0,,,ma,...,0,0,0,1,0,1,0,0,0,0,0,0,0,19,11
369,grand rapids,1165,apartment,892,2,1,1,1,1,0,0,0,,,mi,...,0,0,1,0,0,0,0,0,0,0,0,0,1,22,0
501,denver,2456,apartment,932,2,2,1,1,0,0,0,0,,,co,...,0,0,0,1,0,1,0,0,0,0,0,0,0,5,0
840,columbia,975,apartment,964,2,2,1,1,0,1,0,0,,,sc,...,0,0,0,1,0,0,0,0,0,0,0,0,1,40,0
975,sarasota-bradenton,1800,house,2300,2,1,0,0,0,0,0,1,,,fl,...,0,0,1,0,0,1,0,0,0,0,0,0,0,9,6


In [16]:
train_dask = dd.from_pandas(train_dataset,npartitions=4)
train_dask.to_parquet('../data/processed/train',name_function=lambda x: f"train-{x}.parquet")
test_dask = dd.from_pandas(test_dataset,npartitions=2)
test_dask.to_parquet('../data/processed/test',name_function=lambda x: f"test-{x}.parquet")
