<a href="https://colab.research.google.com/github/aceballosGitHub/eda_couse/blob/main/Airbnb_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Objetivo: Predecir el precio (columna `log_price`) de los alojamientos publicados en Airbnb

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
import pandas as pd

# Read the Airbnb dataset CSV directly from its Google Drive download link
url = 'https://drive.google.com/uc?id=1rdcdB90FtHXCYN-Pz0tg8jIgP_LcNi8k'
df = pd.read_csv(filepath_or_buffer=url)

In [None]:
# Print the dataset's size plus each column's non-null count and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74111 entries, 0 to 74110
Data columns (total 29 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      74111 non-null  int64  
 1   log_price               74111 non-null  float64
 2   property_type           74111 non-null  object 
 3   room_type               74111 non-null  object 
 4   amenities               74111 non-null  object 
 5   accommodates            74111 non-null  int64  
 6   bathrooms               73911 non-null  float64
 7   bed_type                74111 non-null  object 
 8   cancellation_policy     74111 non-null  object 
 9   cleaning_fee            74111 non-null  bool   
 10  city                    74111 non-null  object 
 11  description             74111 non-null  object 
 12  first_review            58247 non-null  object 
 13  host_has_profile_pic    73923 non-null  object 
 14  host_identity_verified  73923 non-null

In [None]:
# Drop the `id` column from the frame.
# errors='ignore' keeps this cell idempotent: on a re-run `id` is already gone and
# a plain drop() would raise KeyError instead of leaving the frame as it is.
df = df.drop(columns=['id'], errors='ignore')
df.head()

Unnamed: 0,log_price,property_type,room_type,amenities,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,city,...,latitude,longitude,name,neighbourhood,number_of_reviews,review_scores_rating,thumbnail_url,zipcode,bedrooms,beds
0,5.010635,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",3,1.0,Real Bed,strict,True,NYC,...,40.696524,-73.991617,Beautiful brownstone 1-bedroom,Brooklyn Heights,2,100.0,https://a0.muscache.com/im/pictures/6d7cbbf7-c...,11201.0,1.0,1.0
1,5.129899,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",7,1.0,Real Bed,strict,True,NYC,...,40.766115,-73.98904,Superb 3BR Apt Located Near Times Square,Hell's Kitchen,6,93.0,https://a0.muscache.com/im/pictures/348a55fe-4...,10019.0,3.0,3.0
2,4.976734,Apartment,Entire home/apt,"{TV,""Cable TV"",""Wireless Internet"",""Air condit...",5,1.0,Real Bed,moderate,True,NYC,...,40.80811,-73.943756,The Garden Oasis,Harlem,10,92.0,https://a0.muscache.com/im/pictures/6fae5362-9...,10027.0,1.0,3.0
3,6.620073,House,Entire home/apt,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",4,1.0,Real Bed,flexible,True,SF,...,37.772004,-122.431619,Beautiful Flat in the Heart of SF!,Lower Haight,0,,https://a0.muscache.com/im/pictures/72208dad-9...,94117.0,2.0,2.0
4,4.744932,Apartment,Entire home/apt,"{TV,Internet,""Wireless Internet"",""Air conditio...",2,1.0,Real Bed,moderate,True,DC,...,38.925627,-77.034596,Great studio in midtown DC,Columbia Heights,4,40.0,,20009.0,0.0,1.0


In [None]:
# Name of the column to predict
target = 'log_price'
# NOTE(review): `features` is not referenced by any cell visible here; X below keeps
# every column except the target. Confirm whether a narrower feature set was intended.
features = ['amenities', 'accommodates', 'bathrooms']

# Separate the target series from the predictor columns
y = df[target]
X = df.drop(columns=target)

print(X.shape, y.shape)

(74111, 27) (74111,)


In [None]:
# Group the predictor columns by dtype so each group can get its own preprocessing.
# NOTE(review): the object columns appear to include free-text fields (e.g. description,
# name, thumbnail_url); as categorical features they will be one-hot encoded - confirm
# that this is intended.
numeric_dtypes = ['int64', 'float64']
categorical_dtypes = ['object', 'bool']

numeric_features = X.select_dtypes(include=numeric_dtypes).columns
categorical_features = X.select_dtypes(include=categorical_dtypes).columns

In [None]:
# Preprocessing: one branch per kind of column.

# Numeric columns: fill missing values with the column mean, then standardize.
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent value, then one-hot encode.
# handle_unknown='ignore' keeps transform() from raising on categories not seen during fit.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its own branch.
preparation = ColumnTransformer(transformers=[
    ('numerical', numeric_pipeline, numeric_features),
    ('categorical', categorical_pipeline, categorical_features),
])

In [None]:
# Build the full model: preprocessing -> truncated SVD (20 components) -> linear regression.
# TruncatedSVD's default solver is randomized, so a fixed random_state is passed to make
# the reduced features (and the score computed from them) repeatable across runs.
# verbose=True makes fit() log the elapsed time of each step.
pipeline = Pipeline([
    ('preparation', preparation),
    ('dimensionality_reduction', TruncatedSVD(n_components=20, random_state=42)),
    ('regressor', LinearRegression())
  ], verbose = True
)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the partition repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit every step of the pipeline on the training portion
pipeline.fit(X_train, y_train)

[Pipeline] ....... (step 1 of 3) Processing preparation, total=   2.1s
[Pipeline]  (step 2 of 3) Processing dimensionality_reduction, total=   4.5s
[Pipeline] ......... (step 3 of 3) Processing regressor, total=   0.0s


In [None]:
# Score the fitted pipeline on the held-out test set.
# The final step is a LinearRegression, so score() returns the coefficient of
# determination (R^2), not a classification accuracy; it is named and labelled as such.
r2_test = pipeline.score(X_test, y_test)
accuracy = r2_test  # old name kept in case another cell still reads `accuracy`
print("R^2:", r2_test)

Accuracy: 0.5216490456577121


### Se obtiene un mejor R² (el valor que devuelve `pipeline.score` para un regresor) sin reducir las dimensiones del dataset

In [None]:
# Build the same model without the dimensionality-reduction step
pipeline = Pipeline([
    ('preparation', preparation),
    ('regressor', LinearRegression())
  ]
)

# Split into train and test sets (80/20, fixed seed so the partition is repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the pipeline on the training portion
pipeline.fit(X_train, y_train)

# Score on the held-out test set.
# The final step is a LinearRegression, so score() returns the coefficient of
# determination (R^2), not a classification accuracy; it is named and labelled as such.
r2_test = pipeline.score(X_test, y_test)
accuracy = r2_test  # old name kept in case another cell still reads `accuracy`
print("R^2:", r2_test)

Accuracy: 0.6736489624685099
