**Please upvote my notebook if you like it! :)**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file in it.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install reverse_geocoder

**Loading data**

In [None]:
# Read the competition files. The `id` column is removed from the training frame
# right away; test_df and the submission template are kept exactly as read.
train_df = pd.read_csv('/kaggle/input/playground-series-s3e1/train.csv').drop(columns='id')
test_df = pd.read_csv('/kaggle/input/playground-series-s3e1/test.csv')
submission = pd.read_csv('/kaggle/input/playground-series-s3e1/sample_submission.csv')
train_df

In [None]:
# Append the California housing data shipped with scikit-learn to the competition training data.
extra_data = fetch_california_housing()
train_data2 = pd.DataFrame(extra_data['data'])
train_data2['MedHouseVal'] = extra_data['target']
# Column names are copied from train_df by position, so this line has to run
# before any extra column (such as `generated`) is added to train_df.
train_data2.columns = train_df.columns

# Origin flag: 1 for rows that come from the competition files, 0 for the scikit-learn rows.
train_df['generated'] = 1
test_df['generated'] = 1
train_data2['generated'] = 0

# Stack both sources and drop exact duplicate rows. The index is rebuilt afterwards:
# plain concatenation keeps each frame's own labels, which leaves duplicate index
# values and makes any later label-based alignment (e.g. between X and y) unsafe.
train_df = (
    pd.concat([train_df, train_data2], axis=0)
    .drop_duplicates()
    .reset_index(drop=True)
)
print(train_df.shape)
train_df.head()

Thanks to @dmitryuarov for the coordinate-based feature engineering ideas! Please upvote his notebook: https://www.kaggle.com/code/dmitryuarov/ps-s3e1-coordinates-key-to-victory

In [None]:
def crt_crds(df):
    """Add angle-based combinations of the coordinates to ``df`` and return it.

    The frame is modified in place; the returned object is the same ``df``.
    For each angle ``a`` the columns are computed as

        rot_<a>_x = cos(a) * Longitude + sin(a) * Latitude
        rot_<a>_y = cos(a) * Latitude  + sin(a) * Longitude

    Both columns are written for 15 and 30 degrees; for 45 degrees only
    ``rot_45_x`` is written.

    NOTE(review): the ``_y`` formula adds the sine term, whereas a standard
    2-D rotation would subtract it. Kept exactly as written -- confirm whether
    this is intended before changing it.
    """
    # (angle in degrees, x column name, y column name or None when no y column is produced)
    specs = (
        (15, 'rot_15_x', 'rot_15_y'),
        (30, 'rot_30_x', 'rot_30_y'),
        (45, 'rot_45_x', None),
    )
    lon, lat = df['Longitude'], df['Latitude']
    for angle, x_col, y_col in specs:
        cos_a = np.cos(np.radians(angle))
        sin_a = np.sin(np.radians(angle))
        df[x_col] = (cos_a * lon) + (sin_a * lat)
        if y_col is not None:
            df[y_col] = (cos_a * lat) + (sin_a * lon)
    return df

# crt_crds writes the new columns into the frame it receives and returns that same
# frame, so `train` / `test` are further names for train_df / test_df, not copies.
train, test = crt_crds(train_df), crt_crds(test_df)

In [None]:
import reverse_geocoder as rg
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

def geocoder(df):
    """Reverse-geocode every row of ``df`` in a single ``rg.search`` call.

    The ``Latitude`` and ``Longitude`` columns are paired row by row into
    ``(latitude, longitude)`` tuples, and the complete list is handed to
    ``rg.search``. Whatever ``rg.search`` returns is passed back unchanged.
    """
    lat_lon_pairs = [(lat, lon) for lat, lon in zip(df['Latitude'], df['Longitude'])]
    return rg.search(lat_lon_pairs)

# Store the 'admin2' field of each geocoder hit as a new `place` column,
# first on the training frame and then on the test frame.
for frame in (train_df, test_df):
    frame['place'] = [hit['admin2'] for hit in geocoder(frame)]

# Place names that keep their own category; anything else is pooled by `replace`.
places = [
    'Los Angeles County', 'Orange County', 'Kern County',
    'Alameda County', 'San Francisco County', 'Ventura County',
    'Santa Clara County', 'Fresno County', 'Santa Barbara County',
    'Contra Costa County', 'Yolo County', 'Monterey County',
    'Riverside County', 'Napa County',
]


def replace(x):
    """Return ``x`` unchanged when it is listed in ``places``, otherwise ``'Other'``."""
    return x if x in places else 'Other'
    
# Pool every place that is not listed in `places` into the single category 'Other'.
train_df['place'] = train_df['place'].apply(replace)
test_df['place'] = test_df['place'].apply(replace)

# One-hot encode `place`. Both frames share one categorical dtype with a fixed category
# list, so get_dummies emits the same dummy columns in the same order for each of them,
# even when a place occurs in only one frame. Encoding the plain strings separately
# would create columns only for the values present in that frame, and the train and
# test feature matrices could then differ.
place_dtype = pd.CategoricalDtype(categories=sorted(places + ['Other']))
train_df['place'] = train_df['place'].astype(place_dtype)
test_df['place'] = test_df['place'].astype(place_dtype)
test_df = pd.get_dummies(test_df)
train_df = pd.get_dummies(train_df)

In [None]:
# Display the training frame as it stands after the get_dummies call in the previous cell.
train_df

In [None]:
# Display the test frame as it stands after the get_dummies call (it still contains `id`).
test_df

No missing data in our datasets

In [None]:
# One boolean per column: True if that column of train_df contains at least one missing value.
# Only train_df is inspected here; test_df is not checked by this cell.
train_df.isna().any()

In [None]:
# Target column on one side, all remaining train_df columns as features on the other.
y = train_df['MedHouseVal']
X = train_df.drop(columns='MedHouseVal')
# The test frame still carries `id` (it was only dropped from the training data on load).
X_test = test_df.drop(columns='id')

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit the min-max scaler on the training features only, then apply that same
# fitted transformation to the test features.
scaler = MinMaxScaler()
train_sc = scaler.fit_transform(X)
test_sc = scaler.transform(X_test)

# Wrap the scaled values in DataFrames again, reusing the original column names.
# These new frames get a fresh default index.
X = pd.DataFrame(train_sc, columns=X.columns)
X_test = pd.DataFrame(test_sc, columns=X_test.columns)

In [None]:
# X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, random_state=42)

**Keras model**

In [None]:
from tensorflow import keras
from tensorflow.keras import layers

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout, Input, Lambda, Concatenate, Add, BatchNormalization, LeakyReLU

from sklearn.model_selection import KFold

def build_model():
    """Create and compile a fresh, untrained regression network.

    Layer stack (unchanged from the original cell):
    Dense 128/64/32/16, each followed by LeakyReLU(0.3) and Dropout(0.3);
    Dense 8 and Dense 4, each followed by BatchNormalization and Dropout(0.3);
    Dense 2 followed by LeakyReLU(0.3); and a single-unit Dense output.

    The model is compiled with Adam on an exponentially decaying learning rate
    (start 1e-4, multiplied by 0.9 every 1000 steps), MSE loss and an RMSE metric.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    model = keras.Sequential([
        layers.Dense(128),
        layers.LeakyReLU(alpha=0.3),
        layers.Dropout(rate=0.3),
        layers.Dense(64),
        layers.LeakyReLU(alpha=0.3),
        layers.Dropout(rate=0.3),
        layers.Dense(32),
        layers.LeakyReLU(alpha=0.3),
        layers.Dropout(rate=0.3),
        layers.Dense(16),
        layers.LeakyReLU(alpha=0.3),
        layers.Dropout(rate=0.3),
        layers.Dense(8),
        layers.BatchNormalization(),
        layers.Dropout(rate=0.3),
        layers.Dense(4),
        layers.BatchNormalization(),
        layers.Dropout(rate=0.3),
        layers.Dense(2),
        layers.LeakyReLU(alpha=0.3),
        layers.Dense(1),
    ])

    lr_schedule = keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=0.0001,
        decay_steps=1000,
        decay_rate=0.9,
    )
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=lr_schedule),
        loss="MSE",
        metrics=[keras.metrics.RootMeanSquaredError()],
    )
    return model


# Test-set predictions, one array per fold; reset here so re-running the cell starts clean.
preds = []

n_folds = 10

k_fold = KFold(n_splits=n_folds, random_state=42, shuffle=True)

for train_index, test_index in k_fold.split(X, y):
    # KFold yields positional indices, hence .iloc on both features and target.
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]

    # A new model is built for every fold. Clearing the Keras session first releases
    # the global state left behind by the previous fold's model, which would otherwise
    # accumulate in memory across the folds.
    keras.backend.clear_session()
    model = build_model()

    # Stop once the monitored value (the callback's default) has not improved by at
    # least 0.01 for 30 epochs, and roll back to the best weights seen.
    early_stopping = keras.callbacks.EarlyStopping(
        patience=30,
        min_delta=0.01,
        restore_best_weights=True,
    )
    history = model.fit(
        X_train, y_train,
        validation_data=(X_valid, y_valid),
        batch_size=512,
        epochs=500,
        callbacks=[early_stopping],
        verbose=1,
    )

    preds.append(model.predict(X_test))



In [None]:
# history = model.fit(
#           X_train, y_train,
#           validation_data=(X_valid, y_valid),
#           batch_size=128,
#           epochs=300,
#           callbacks=[early_stopping],
#           verbose=1,
#          )


In [None]:
# history_df = pd.DataFrame(history.history)
# history_df.loc[1:, ['loss', 'val_loss']].plot()
# history_df.loc[1:, ['mean_squared_error', 'val_mean_squared_error']].plot()

In [None]:
# pred = model.predict(X_test)

In [None]:
# Stack the per-fold test predictions and take their unweighted mean across folds.
pred = np.mean(np.asarray(preds), axis=0)
pred

**Making submission**

In [None]:
# Write the fold-averaged predictions into the MedHouseVal column of the submission frame.
# NOTE(review): assumes `pred` holds exactly one value per submission row, in the same
# row order as test.csv -- confirm.
submission['MedHouseVal'] = pred
submission

In [None]:
# Save the submission to the current working directory; index=False leaves the
# DataFrame index out of the file.
submission.to_csv('submission.csv', index=False)