In [17]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor

from sklearn.metrics import r2_score
from scipy.stats import zscore

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings('ignore')

In [18]:
# Read the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [19]:
# Keep the test identifiers for the submission file, then strip the ID column
# from both frames so it is not used as a feature.
testID = test['ID']
for frame in (train, test):
    frame.drop(columns=['ID'], inplace=True)

In [20]:
# Discard every training row that has at least one missing value (in place).
train.dropna(axis=0, how='any', inplace=True)

In [21]:
# Converting string columns to numeric
labelencoder = LabelEncoder()
for feature in ['HISPDAD', 'HISPMOM']:
    # Impute BEFORE casting to str: astype(str) turns NaN into the literal
    # string 'nan', after which fillna has nothing left to fill.
    # mode() returns a Series, so take its first entry to get a scalar fill
    # value (passing the Series itself would only fill matching index labels).
    most_common = train[feature].mode().iloc[0]
    train[feature] = train[feature].fillna(most_common).astype(str)
    test[feature] = test[feature].fillna(most_common).astype(str)

    # Fit on the union of both frames so that categories occurring only in
    # the test set can still be transformed. pd.concat replaces
    # Series.append, which was removed in pandas 2.0.
    labelencoder.fit(pd.concat([train[feature], test[feature]]))

    train[feature] = labelencoder.transform(train[feature])
    test[feature] = labelencoder.transform(test[feature])

In [22]:
# fill missing values with mean column values
# The training-set means are applied to BOTH frames: leaving NaNs in `test`
# would make the later z-scoring turn each affected column entirely into NaN.
# fillna only touches columns that exist in the frame being filled, so means
# of columns absent from `test` are simply ignored there.
# numeric_only=True keeps this working if a non-numeric column is still present.
column_means = train.mean(numeric_only=True)
train.fillna(column_means, inplace=True)
test.fillna(column_means, inplace=True)

In [23]:
# Separate the target from the features: pop() returns the BWEIGHT column
# and removes it from `train` in one step.
y = train.pop('BWEIGHT')

In [24]:
# zscaling
# Standardise both frames with statistics computed on the training data only.
# Scaling `test` with its own mean/std would put its features on a different
# scale from the one the model is fitted on.
feature_mean = train.mean()
feature_std = train.std(ddof=0)  # population std, matching scipy.stats.zscore's default ddof=0
# Select columns in training order; this raises a KeyError if `test` lacks a training feature.
test = (test[train.columns] - feature_mean) / feature_std
train = (train - feature_mean) / feature_std

In [25]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    y,
    test_size=0.2,
    random_state=42,
)

In [26]:
# Define models
# Hyper-parameters for the single active candidate, gathered in one place.
# NOTE(review): od_type/od_wait presumably only take effect when an eval_set
# is passed to fit() — confirm against the CatBoost docs.
catboost_params = {
    'verbose': 10,
    'iterations': 1000,
    'learning_rate': 0.1,
    'depth': 6,
    'l2_leaf_reg': 10,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'random_seed': 42,
    'od_type': 'Iter',
    'od_wait': 50,
    'allow_writing_files': False,
}

# Candidates currently disabled:
#   LinearRegression(), MLPRegressor(verbose=10), RandomForestRegressor(),
#   ExtraTreesRegressor(), GradientBoostingRegressor()
models = [
    CatBoostRegressor(**catboost_params),
]

In [None]:
# Evaluate models
# Fit each candidate on the training split and report its R^2 on the held-out split.
for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    model_name = model.__class__.__name__
    holdout_r2 = r2_score(y_test, y_pred)
    print("Model: %s" % model_name)
    print("r2 score: %.2f" % holdout_r2)
    print("\n")

In [None]:
# # RNN model
# from keras.models import Sequential
# from keras.layers import Dense, LSTM, Dropout


# # define model
# model = Sequential()
# model.add(LSTM(50, input_shape=(1, 35)))
# model.add(Dropout(0.2))
# model.add(Dense(1))
# model.compile(loss='mean_squared_error', optimizer='adam')

# # reshape input to be [samples, time steps, features]
# X_train = np.reshape(X_train.values, (X_train.shape[0], 1, X_train.shape[1]))
# X_test = np.reshape(X_test.values, (X_test.shape[0], 1, X_test.shape[1]))

# # fit network
# history = model.fit(X_train, y_train, epochs=100, batch_size=72, validation_data=(X_test, y_test), verbose=2, shuffle=False)

# # plot history
# plt.plot(history.history['loss'], label='train')
# plt.plot(history.history['val_loss'], label='test')
# plt.legend()
# plt.show()

In [None]:
# Refit the first configured model on all training rows (no hold-out),
# then predict the target for the test frame.
model = models[0]
model.fit(train, y=y)
res = model.predict(test)

In [40]:
# save results to file
# np.ravel flattens the predictions to 1-D whether the model returned shape
# (n,) or (n, 1). Indexing a 1-D array with `.T[0]` yields a single scalar,
# which pandas would broadcast to every row of the submission.
results = pd.DataFrame({'ID': testID, 'BWEIGHT': np.ravel(res)})
filename = "submission.csv"
# Remove any previous submission before writing the new one.
if os.path.exists(filename):
    os.remove(filename)
results.to_csv(filename, index=False, header=True, mode='w')