In [6]:
# linear: https://stackabuse.com/linear-regression-in-python-with-scikit-learn/
# KNN: https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html
# MLP: https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html

# we are predicting BIRTH WEIGHT (DBWT)

# Setup and imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# CROP DOWN THE DATA FOR TESTING...
# Only a small random subsample of each CSV is kept so the notebook runs
# quickly while iterating. The RNG is seeded so that Restart & Run All draws
# the same rows (and therefore reports the same metrics) every time.
from numpy.random import RandomState

SEED = 42           # fixed seed for reproducible subsampling
SAMPLE_FRAC = 0.01  # fraction of rows kept from each file

rng = RandomState(SEED)

train = pd.read_csv("../data/us_births_train.csv").sample(frac=SAMPLE_FRAC, random_state=rng)
print(train.shape)
test = pd.read_csv("../data/us_births_test.csv").sample(frac=SAMPLE_FRAC, random_state=rng)
print(test.shape)

(3041227, 16)
(760307, 16)


In [7]:
# Separate the train/test frames into model inputs (X) and the target (y).
# The target column is DBWT; every other column is used as a feature.
TARGET = 'DBWT'
X_train, y_train = train.drop(columns=TARGET), train[TARGET]
X_test, y_test = test.drop(columns=TARGET), test[TARGET]

In [None]:
# TRAIN the model
# KNeighborsRegressor is the supervised k-NN estimator: it learns from
# y_train and exposes predict(). NearestNeighbors is only an unsupervised
# neighbour search -- it ignores y and has no predict() method, so it cannot
# be used to produce target predictions.
from sklearn.neighbors import KNeighborsRegressor
neigh = KNeighborsRegressor(n_neighbors=2)
neigh.fit(X_train, y_train)

In [None]:
# Run the fitted model on the held-out features, then put the predictions
# next to the true target values and preview the first few rows.
y_pred = neigh.predict(X_test)
results = pd.DataFrame(dict(Actual=y_test, Predicted=y_pred))
results.head()

In [None]:
# Error metrics on the held-out set: each one is computed once, then reported.
from sklearn import metrics
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', rmse)

# Scatter of predicted vs. measured values. The dashed black diagonal marks
# where predicted == measured across the observed range of y_test.
lo, hi = y_test.min(), y_test.max()
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
ax.plot([lo, hi], [lo, hi], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()