In [None]:

#https://www.kaggle.com/competitions/playground-series-s3e1
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from pathlib import Path
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler,LabelEncoder
from sklearn.datasets import fetch_california_housing

In [None]:
# Competition input directory (Kaggle layout).
DATAPATH = Path('../input/playground-series-s3e1')

# Read the three competition CSVs in one pass; names are bound in file order.
train_set, test_set, sample_sub = (
    pd.read_csv(DATAPATH / csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# Report the (rows, columns) of both competition frames, then preview the training data.
shape_report = (('train_set shape: ', train_set), ('test_set shape: ', test_set))
for label, frame in shape_report:
    print(label, frame.shape)
train_set.head()

In [None]:
# Column names shared by the competition data and scikit-learn's California housing dataset.
features = [
    'MedInc', 'HouseAge', 'AveRooms', 'AveBedrms',
    'Population', 'AveOccup', 'Latitude', 'Longitude',
]
target = ['MedHouseVal']

original = fetch_california_housing()
# Fail fast if the scikit-learn dataset's column names differ from the ones used here.
assert list(features) == original['feature_names']
assert target[0] == original['target_names'][0]

# Build a frame with the feature columns followed by the target column.
df_original = pd.DataFrame(original['data'], columns=features).assign(
    **{target[0]: original['target']}
)

In [None]:
import lightgbm as lgbm
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm.sklearn import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor


In [None]:
# Flag the data source: 0 for the original scikit-learn rows, 1 for the competition rows.
for frame, generated_flag in ((df_original, 0), (train_set, 1), (test_set, 1)):
    frame['is_generated'] = generated_flag
def rt_crds(df, angles=(15, 30, 45)):
    """Add rotated-coordinate features to ``df`` for every angle in ``angles``.

    For each angle ``a`` (in degrees) two columns are written:

    * ``rot_<a>_x = cos(a) * Longitude + sin(a) * Latitude``
    * ``rot_<a>_y = cos(a) * Latitude  - sin(a) * Longitude``

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Longitude`` and ``Latitude`` columns. Modified in place.
    angles : iterable of numbers, optional
        Rotation angles in degrees. The default ``(15, 30, 45)`` produces the
        six columns ``rot_15_x`` ... ``rot_45_y``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns appended.

    Raises
    ------
    KeyError
        If ``df`` lacks a ``Longitude`` or ``Latitude`` column.
    """
    lon, lat = df['Longitude'], df['Latitude']
    for angle in angles:
        theta = np.radians(angle)
        cos_t, sin_t = np.cos(theta), np.sin(theta)
        df[f'rot_{angle}_x'] = (cos_t * lon) + (sin_t * lat)
        df[f'rot_{angle}_y'] = (cos_t * lat) - (sin_t * lon)
    return df




# Add the rotated-coordinate columns to all three frames.
df_original, train_set, test_set = map(rt_crds, (df_original, train_set, test_set))




In [None]:
# Install reverse_geocoder into the kernel environment; it is imported as `rg` in the next cell.
%pip install reverse_geocoder

In [None]:
# Stack the competition training rows (without `id`) on top of the original dataset
# and renumber the rows 0..n-1.
train_concat = pd.concat(
    [train_set.drop(columns='id'), df_original],
    ignore_index=True,
)


import reverse_geocoder as rg
from haversine import haversine
def add_more(df):
    """Add county and distance features derived from ``Latitude``/``Longitude``.

    Columns written to ``df`` (in place):

    * ``place`` - integer code of the county (``admin2``) returned by
      ``reverse_geocoder`` for each row; counties outside the fixed list below
      are grouped as ``'Other'``.
    * ``dist_SC``, ``dist_SF``, ``dist_SJ``, ``dist_LA``, ``dist_SD`` -
      haversine distance to each reference point, requested with ``unit='ft'``.
    * ``dist_nearest_city`` - row-wise minimum of the five distances above.
    * ``dist_to_coast`` - shapely distance from the row's point to a polyline
      of coastal points.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Latitude`` and ``Longitude`` columns. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns appended.
    """
    from shapely.geometry import LineString, Point

    places = ['Los Angeles County', 'Orange County', 'Kern County',
              'Alameda County', 'San Francisco County', 'Ventura County',
              'Santa Clara County', 'Fresno County', 'Santa Barbara County',
              'Contra Costa County', 'Yolo County', 'Monterey County',
              'Riverside County', 'Napa County']

    # Fixed category -> code table. Fitting a fresh LabelEncoder on every call
    # gives train and test different codes whenever one frame lacks a category.
    # Sorted order reproduces LabelEncoder's codes when every category is present.
    place_codes = {name: code for code, name in enumerate(sorted(places + ['Other']))}

    # (lat, lon) tuples are reused for geocoding and for every distance below.
    coordinates = list(zip(df['Latitude'], df['Longitude']))
    results = rg.search(coordinates)
    df['place'] = [place_codes.get(hit['admin2'], place_codes['Other'])
                   for hit in results]

    # Reference points as (lat, lon); the keys become the `dist_<key>` column suffixes.
    cities = {
        'SC': (38.576931, -121.494949),
        'SF': (37.780080, -122.420160),
        'SJ': (37.334789, -121.888138),
        'LA': (34.052235, -118.243683),
        'SD': (32.715759, -117.163818),
    }
    # Iterating the tuples avoids building a Series per row as df.apply(axis=1) does.
    for code, city in cities.items():
        df[f'dist_{code}'] = [haversine(point, city, unit='ft')
                              for point in coordinates]
    df['dist_nearest_city'] = df[[f'dist_{code}' for code in cities]].min(axis=1)

    coast_line = LineString([(32.664, -117.161), (33.206, -117.383),
                             (33.777, -118.202), (34.463, -120.014),
                             (35.427, -120.881), (35.928, -121.489),
                             (36.982, -122.028), (37.611, -122.491),
                             (38.355, -123.060), (39.792, -123.821),
                             (40.799, -124.188), (41.755, -124.197)])

    # NOTE(review): both the points and the polyline are given as (lat, lon) and the
    # distance is taken on those raw coordinates, so this feature is presumably in
    # degrees rather than feet like the dist_* columns - confirm that is intended.
    df['dist_to_coast'] = [Point(lat, lon).distance(coast_line)
                           for lat, lon in coordinates]

    return df
# Plain aliases (no copy, add_more() is not called): the underscore names used
# by the scaling cell refer to the same frames.
train_concat_ = train_concat
test_set_ = test_set

In [None]:

# Model inputs: raw columns, the source flag and the rotated coordinates.
features = [
    'MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
    'Latitude', 'Longitude', 'is_generated',
    'rot_15_x', 'rot_15_y', 'rot_30_x', 'rot_30_y', 'rot_45_x', 'rot_45_y',
    # Currently disabled (columns produced by add_more):
    # 'place', 'dist_SC', 'dist_SF', 'dist_SJ', 'dist_LA', 'dist_SD',
    # 'dist_nearest_city', 'dist_to_coast',
]
target = ['MedHouseVal']

# Fit the min-max scaler on the combined training frame only, then apply the
# same transform to both the training and the test features.
scaler = MinMaxScaler()
scaler.fit(train_concat_[features])
train_scaled = scaler.transform(train_concat_[features])
test_scaled = scaler.transform(test_set_[features])

train_concat.head(10)




In [None]:

import warnings
# NOTE(review): blanket suppression also hides library warnings about
# misspelled or deprecated parameters - consider narrowing this filter.
warnings.filterwarnings('ignore')

# Shared by all three model cells: the fold splitter and the list of fitted
# models (XGBoost first, then LightGBM, then CatBoost, one model per fold).
kf = KFold(n_splits=15, random_state=1, shuffle=True)
clfs = []
err = []

for i, (train_index, val_index) in enumerate(kf.split(train_concat)):
    X_train, X_val = train_scaled[train_index, :], train_scaled[val_index, :]
    # KFold yields positional indices, so select the target rows with .iloc.
    y_train = train_concat['MedHouseVal'].iloc[train_index]
    y_val = train_concat['MedHouseVal'].iloc[val_index]

    # NOTE(review): tree_method='gpu_hist' and passing early_stopping_rounds to
    # fit() are deprecated in newer XGBoost releases - confirm against the
    # installed version before upgrading.
    clf = XGBRegressor(n_estimators=20000,
                       max_depth=7,
                       learning_rate=0.01,
                       colsample_bytree=0.66,
                       subsample=0.9,
                       min_child_weight=22,
                       reg_lambda=16,
                       tree_method='gpu_hist',
                       seed=1,
                       n_jobs=-1)  # was `job=-1`, which is not an XGBRegressor parameter

    clf.fit(X_train, y_train,
            early_stopping_rounds=100,
            eval_set=[(X_val, y_val)],
            verbose=1000)

    preds = clf.predict(X_val)

    # sqrt(MSE) instead of `squared=False`, which newer scikit-learn no longer accepts.
    rmse = np.sqrt(mean_squared_error(y_val, preds))
    err.append(rmse)
    clfs.append(clf)
    print(f'RMSE on fold {i+1}: {rmse}')
    print('-'*50)

# Same summary line as the LightGBM and CatBoost cells.
print(f'Average RMSE ({len(err)} fold): {sum(err)/len(err)}')

In [None]:
# LightGBM cross-validation. Fitted models are appended to the shared `clfs`
# list after the XGBoost ones; re-running this cell alone appends duplicates.
err = []

for i, (train_index, val_index) in enumerate(kf.split(train_concat)):
    X_train, X_val = train_scaled[train_index, :], train_scaled[val_index, :]
    # KFold yields positional indices, so select the target rows with .iloc.
    y_train = train_concat['MedHouseVal'].iloc[train_index]
    y_val = train_concat['MedHouseVal'].iloc[val_index]

    clf = lgbm.LGBMRegressor(learning_rate=0.01,
                             max_depth=7,
                             num_leaves=90,
                             colsample_bytree=0.8,
                             subsample=0.9,
                             subsample_freq=5,
                             min_child_samples=36,
                             reg_lambda=28,
                             n_estimators=20000,
                             metric='rmse',
                             random_state=1)

    clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[lgbm.early_stopping(100, verbose=True)])
    preds = clf.predict(X_val)

    # sqrt(MSE) instead of `squared=False`, which newer scikit-learn no longer accepts.
    rmse = np.sqrt(mean_squared_error(y_val, preds))
    err.append(rmse)
    clfs.append(clf)
    print(f'RMSE on fold {i+1}: {rmse}')
    print('-'*50)

# Divide by the actual number of folds (the old code divided by 10 with 15 folds).
print(f'Average RMSE ({len(err)} fold): {sum(err)/len(err)}')

In [None]:
# CatBoost cross-validation. Fitted models are appended to the shared `clfs`
# list after the XGBoost and LightGBM ones; re-running this cell alone appends
# duplicates.
err = []

for i, (train_index, val_index) in enumerate(kf.split(train_concat)):
    X_train, X_val = train_scaled[train_index, :], train_scaled[val_index, :]
    # KFold yields positional indices, so select the target rows with .iloc.
    y_train = train_concat['MedHouseVal'].iloc[train_index]
    y_val = train_concat['MedHouseVal'].iloc[val_index]

    clf = CatBoostRegressor(iterations=20000,
                            depth=7,
                            learning_rate=0.01,
                            rsm=0.88,
                            subsample=0.795,
                            min_data_in_leaf=35,
                            l2_leaf_reg=8,
                            random_strength=0.63,
                            bootstrap_type='Bernoulli',
                            grow_policy='SymmetricTree',
                            loss_function='RMSE',
                            eval_metric='RMSE',
                            task_type="CPU",
                            random_state=1)

    clf.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=100, verbose=1000)
    preds = clf.predict(X_val)

    # sqrt(MSE) instead of `squared=False`, which newer scikit-learn no longer accepts.
    rmse = np.sqrt(mean_squared_error(y_val, preds))
    err.append(rmse)
    clfs.append(clf)
    print(f'RMSE on fold {i+1}: {rmse}')
    print('-'*50)

# Divide by the actual number of folds (the old code divided by 10 with 15 folds).
print(f'Average RMSE ({len(err)} fold): {sum(err)/len(err)}')

In [None]:
# `clfs` holds the fold models in training order: XGBoost, LightGBM, CatBoost,
# one model per KFold split each. Slice by the real fold count; the previous
# hard-coded 10/20 boundaries mixed models from different libraries once the
# splitter was set to 15 folds.
n_folds = kf.get_n_splits()
assert len(clfs) == 3 * n_folds, (
    f'expected {3 * n_folds} fitted models (3 libraries x {n_folds} folds), '
    f'found {len(clfs)}; re-run the three training cells in order'
)

# Per-fold test predictions for each library.
test_preds1 = [clf.predict(test_scaled) for clf in clfs[:n_folds]]             # XGBoost
test_preds2 = [clf.predict(test_scaled) for clf in clfs[n_folds:2 * n_folds]]  # LightGBM
test_preds3 = [clf.predict(test_scaled) for clf in clfs[2 * n_folds:]]         # CatBoost

In [None]:
# Collapse each list of per-model predictions into one averaged vector.
# The names are rebound from lists to arrays, so this cell must not be re-run
# without first re-running the cell that builds the lists.
test_preds1 = np.mean(np.stack(test_preds1), axis=0)
test_preds2 = np.mean(np.stack(test_preds2), axis=0)
test_preds3 = np.mean(np.stack(test_preds3), axis=0)

# Weighted blend of the three averaged predictions (weights sum to 1).
test_preds = 0.45 * test_preds1 + 0.35 * test_preds2 + 0.2 * test_preds3

In [None]:
# Pair every test-row id with its blended prediction.
submission = pd.DataFrame({
    'id': test_set['id'],
    'MedHouseVal': test_preds,
})
submission.head()

In [None]:
# Write the submission file to the working directory, without the DataFrame index column.
submission.to_csv(path_or_buf='submission.csv', index=False)