In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree lazily and print the full path of every file found
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_files:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Predicting Apartment Prices in Mexico City 🇲🇽

In [None]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import make_pipeline
from sklearn.utils.validation import check_is_fitted
from sklearn.metrics import mean_absolute_error
from category_encoders import OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge

# Prepare Data

In [None]:
def wrangle(path, skip_files=1):
    """Load the Excel files found in ``path`` and return one cleaned DataFrame.

    Parameters
    ----------
    path : str
        Directory whose entries are read with ``pd.read_excel``.
    skip_files : int, default 1
        Number of leading entries (in sorted name order) that are not loaded.
        The default mirrors the original code, which ignored the first
        directory entry.
        NOTE(review): confirm that the first file really should be excluded.

    Returns
    -------
    pandas.DataFrame
        Rows of every loaded file, stacked, with:
        * the columns listed below as high-null, low/high-cardinality or
          leaky removed,
        * rows containing any remaining missing value removed,
        * ``lat`` / ``lon`` float columns parsed from ``lat-lon``,
        * ``borough`` taken from the second ``|``-separated piece of
          ``place_with_parent_names``.

    Raises
    ------
    ValueError
        If no file is left to load after skipping.
    """
    # os.listdir returns entries in arbitrary order; sorting makes both the
    # skipped file(s) and the resulting row order reproducible.
    files = sorted(os.listdir(path))
    frames = [pd.read_excel(os.path.join(path, file)) for file in files[skip_files:]]
    if not frames:
        raise ValueError(
            f"No files to load from {path!r} after skipping {skip_files} entries"
        )

    # Concatenate once so that every file contributes rows (re-assigning
    # inside the loop kept only the last file that was read).
    all_data = pd.concat(frames, ignore_index=True)

    # Features with a high null count
    high_null_cols = ["surface_total_in_m2", "price_usd_per_m2", "floor", "rooms", "expenses"]
    # Low- and high-cardinality categorical variables
    cardinality_cols = ["operation", "property_type", "currency", "properati_url"]
    # Leaky columns
    leaky_cols = ["price", "price_aprox_local_currency", "price_per_m2"]

    # Remove these columns BEFORE dropping incomplete rows; otherwise rows are
    # discarded because of nulls in columns that are thrown away anyway.
    all_data = all_data.drop(columns=high_null_cols + cardinality_cols + leaky_cols)
    all_data = all_data.dropna()

    # Split "lat,lon" strings into two float columns and extract the borough
    lat_lon = all_data["lat-lon"].str.split(",", expand=True).astype(float)
    place_parts = all_data["place_with_parent_names"].str.split("|", expand=True)
    all_data = all_data.assign(lat=lat_lon[0], lon=lat_lon[1], borough=place_parts[1])

    # The source columns are no longer needed once parsed
    return all_data.drop(columns=["lat-lon", "place_with_parent_names"])

In [None]:
# Directory of the Kaggle dataset that holds the raw listing files
path = "/kaggle/input/bonou-real"

In [None]:
# Load and clean the listings from the dataset directory
df = wrangle(path)

In [None]:
# Preview the first rows instead of rendering the whole frame
df.head()

# Explore

In [None]:
# Draw the price distribution on an explicit Axes
fig, ax = plt.subplots()
ax.hist(df['price_aprox_usd'])

# Axis labels
ax.set_xlabel('Price [$]')
ax.set_ylabel('Count')

# Title
ax.set_title('Distribution of Apartment Prices')

In [None]:
# Plot price against covered area on an explicit Axes
fig, ax = plt.subplots()
ax.scatter(x=df['surface_covered_in_m2'], y=df['price_aprox_usd'])

# Axis labels
ax.set_xlabel('Area [sq meters]')
ax.set_ylabel('Price [USD]')

# Title
ax.set_title('Mexico City: Price vs. Area')

In [None]:
# Target vector is the approximate USD price; every other column is a feature
y = df['price_aprox_usd']
X = df.drop(columns='price_aprox_usd')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible
split = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)
X_train, X_test, y_train, y_test = split

In [None]:
# Naive baseline: predict the training-set mean price for every training row
y_mean = y_train.mean()
y_pred_baseline = [y_mean for _ in range(len(y_train))]
baseline_mae = mean_absolute_error(y_train, y_pred_baseline)

print("Mean apt price:", y_mean)
print("Baseline MAE:", baseline_mae)

# Build Model

In [None]:
from sklearn.impute import SimpleImputer

# Pipeline stages: one-hot encode categoricals, impute missing values, ridge regression
encoder = OneHotEncoder(use_cat_names=True)
imputer = SimpleImputer()
regressor = Ridge()
model = make_pipeline(encoder, imputer, regressor)

# Train on the training split
model.fit(X_train, y_train)

In [None]:
# Predict on the held-out rows and score them so the result can be compared
# with the baseline MAE
pred = model.predict(X_test)
print("Test MAE:", mean_absolute_error(y_test, pred))

In [None]:
# Fitted pipeline steps
ridge = model.named_steps["ridge"]
encoder = model.named_steps["onehotencoder"]

# retrieve intercept
intercept = ridge.intercept_

# retrieve coefficients
coefficients = ridge.coef_

# retrieve names: newer category_encoders releases expose only
# get_feature_names_out(); fall back to get_feature_names() on older ones
if hasattr(encoder, "get_feature_names_out"):
    features = encoder.get_feature_names_out()
else:
    features = encoder.get_feature_names()

# create a series of names and values
feat_imp = pd.Series(coefficients, index=features)
feat_imp

In [None]:
# Keep the ten coefficients with the largest magnitude and draw them as horizontal bars
top_features = feat_imp.sort_values(key=abs).tail(10)
ax = top_features.plot.barh()

# Axis labels
ax.set_xlabel('Importance [USD]')
ax.set_ylabel('Feature')

# Title
ax.set_title('Feature Importances for Apartment Price')