In [37]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor

from sklearn.metrics import mean_squared_log_error
from scipy.stats import zscore

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings('ignore')

In [38]:
# Read the labelled (train) and unlabelled (test) tables from the working directory
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [39]:
# The ID column is not a feature: discard it from train, and set the test IDs
# aside (they are needed again when the submission file is written).
del train['ID']
testID = test.pop('ID')

In [36]:
# train.dropna(inplace=True)  # disabled: rows with missing values are kept and imputed in the following cells instead

In [40]:
# Converting string columns to numeric
labelencoder = LabelEncoder()
for feature in ['HISPDAD', 'HISPMOM']:
    # Impute BEFORE casting to str: astype(str) turns NaN into the literal
    # string 'nan', after which fillna() has nothing left to fill.
    # mode() returns a Series (there can be ties), so take its first entry
    # as a scalar; passing the Series itself would fill by index alignment.
    train_modes = train[feature].mode()
    if not train_modes.empty:
        most_common = train_modes.iloc[0]
        train[feature] = train[feature].fillna(most_common)
        test[feature] = test[feature].fillna(most_common)

    train[feature] = train[feature].astype(str)
    test[feature] = test[feature].astype(str)

    # Fit on the union of both frames so categories that only occur in test
    # are known to the encoder. pd.concat replaces Series.append, which was
    # removed in pandas 2.0.
    labelencoder.fit(pd.concat([train[feature], test[feature]]))

    train[feature] = labelencoder.transform(train[feature])
    test[feature] = labelencoder.transform(test[feature])

In [41]:
# fill missing values with mean column values
# The means are computed on train only and reused for test, so both frames are
# imputed with the same values and the test set cannot influence the statistics.
train_means = train.mean(numeric_only=True)
train = train.fillna(train_means)

# test has no target column, so restrict the means to the columns it actually has.
shared_columns = train_means.index.intersection(test.columns)
test = test.fillna(train_means[shared_columns])

In [42]:
# split into X and y
y = train['BWEIGHT']
train.drop(['BWEIGHT'], axis=1, inplace=True)

In [43]:
# zscaling
# Standardise BOTH frames with the training mean/std. Scaling test with its own
# statistics would put it in a different feature space from the one the model
# is fitted on. ddof=0 matches the default of scipy.stats.zscore.
feature_mean = train.mean()
feature_std = train.std(ddof=0)
# A constant column has std 0; divide by 1 instead so it becomes all zeros
# rather than NaN.
feature_std = feature_std.replace(0, 1)

# assumes test contains the same feature columns as train (required by
# model.predict as well) - TODO confirm against the data files
test = (test[train.columns] - feature_mean) / feature_std
train = (train - feature_mean) / feature_std

In [47]:
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(train, y, test_size=0.2, random_state=42)

In [48]:
# Define models
models = [
    LinearRegression(),
    MLPRegressor(verbose=10),
    RandomForestRegressor(),
    ExtraTreesRegressor(),
    GradientBoostingRegressor(),
    CatBoostRegressor(eval_metric = 'RMSE',verbose = 10)

]

In [49]:
# Smallest value among the current predictions in `y_pred`.
# NOTE(review): in the code shown, `y_pred` is first assigned by the
# model-evaluation loop in a LATER cell, so on a fresh kernel
# (Restart & Run All) this cell raises NameError. It only works off leftover
# kernel state; consider moving it below the evaluation cell.
min(y_pred)

-1.2830745884642256

In [50]:
# Evaluate models
for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("Model: %s" % model.__class__.__name__)
    print("score: %.2f" % mean_squared_log_error(y_test, y_pred))
    print("\n")

Model: LinearRegression
score: 0.02


Iteration 1, loss = 4.91548529
Iteration 2, loss = 0.75388528
Iteration 3, loss = 0.58822774
Iteration 4, loss = 0.53930196
Iteration 5, loss = 0.52058706
Iteration 6, loss = 0.51045914
Iteration 7, loss = 0.50421848
Iteration 8, loss = 0.50032930
Iteration 9, loss = 0.49763952
Iteration 10, loss = 0.49475167
Iteration 11, loss = 0.49202311
Iteration 12, loss = 0.49016841
Iteration 13, loss = 0.48941550
Iteration 14, loss = 0.48911395
Iteration 15, loss = 0.48788558
Iteration 16, loss = 0.48676754
Iteration 17, loss = 0.48512560
Iteration 18, loss = 0.48503729
Iteration 19, loss = 0.48593938
Iteration 20, loss = 0.48249473
Iteration 21, loss = 0.48189793
Iteration 22, loss = 0.48232422
Iteration 23, loss = 0.48131759
Iteration 24, loss = 0.47999033
Iteration 25, loss = 0.48031975
Iteration 26, loss = 0.47983622
Iteration 27, loss = 0.48054395
Iteration 28, loss = 0.47919114


In [None]:
# Refit the first entry of `models` on every labelled row (fit() returns the
# estimator itself), then predict the unlabelled test frame.
model = models[0].fit(train, y)
res = model.predict(test)

In [None]:
# save results to file
results = pd.DataFrame({'ID': testID, 'BWEIGHT': res})
filename = "submission.csv"
if os.path.exists(filename):
  os.remove(filename)
results.to_csv(filename, index=False,header=True, mode='w')