In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

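First we tried to predict the mean of the distribution from our small training data set using linear regression. We split the data into 90% training and 10% test sets and fit a linear regression model on the training portion. On the test set we got an $R^2$ of less than 0, meaning the model did worse than simply predicting the mean of the target, and a very high RMSE.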

In [28]:
from timeit import default_timer as timer

def create_test_train(data_set_path, test_size=0.10, target_col='y',
                      shuffle_seed=0, split_seed=300):
    """Load a CSV file and split it into training and test sets.

    The file is read with pandas, its summary statistics are printed, the
    rows are shuffled, and the result is handed to ``train_test_split``.

    Args:
        data_set_path: Path (or anything ``pd.read_csv`` accepts) of the CSV.
        test_size: Passed through to ``train_test_split``; defaults to 0.10.
        target_col: Name of the column to predict. Defaults to ``'y'``.
        shuffle_seed: ``random_state`` used when shuffling the rows.
        split_seed: ``random_state`` passed to ``train_test_split``.

    Returns:
        Whatever ``train_test_split`` returns for the feature frame and the
        target frame, i.e. ``X_train, X_test, y_train, y_test``. The target
        is kept as a single-column DataFrame, not a Series.

    Raises:
        KeyError: If ``target_col`` is not a column of the CSV file.
    """
    data_set = pd.read_csv(data_set_path)

    # Fail early with a message that names the file and the missing column
    if target_col not in data_set.columns:
        raise KeyError(
            f"target column {target_col!r} not found in {data_set_path!r}; "
            f"available columns: {list(data_set.columns)}"
        )

    print(data_set.describe())

    # Shuffle the rows (deterministically) before splitting
    data_set = data_set.sample(frac=1, random_state=shuffle_seed)

    # Features: every column except the target
    data_set_X = data_set.drop(columns=[target_col])
    # Target: double brackets keep it as a one-column DataFrame
    data_set_Y = data_set[[target_col]]

    # Split into training and test data
    return train_test_split(data_set_X,
                            data_set_Y,
                            test_size=test_size,
                            random_state=split_seed)

def evaluate_model(model, x, y):
    """Score a fitted model on ``(x, y)`` and compute its squared-error metrics.

    Args:
        model: Object exposing ``score(x, y)`` and ``predict(x)``.
        x: Inputs passed to both ``score`` and ``predict``.
        y: True target values compared against the predictions.

    Returns:
        A tuple ``(score, mse, rmse)``: the value of ``model.score(x, y)``
        (the notebook reports it as R2), the mean squared error of the
        predictions, and the square root of that error.
    """
    model_score = model.score(x, y)

    predictions = model.predict(x)
    squared_error = mean_squared_error(y, predictions)
    root_error = np.sqrt(squared_error)

    return model_score, squared_error, root_error

def time_preds(model, x, count=10):
    """Measure the average wall-clock time of one ``model.predict(x)`` call.

    ``model.predict(x)`` is run ``count`` times; each run is timed on its own
    and the mean duration is returned.

    Args:
        model: Object exposing ``predict(x)``.
        x: Inputs passed unchanged to ``model.predict`` on every run.
        count: Number of timed runs to average over. Must be at least 1.

    Returns:
        Average duration of a single ``model.predict(x)`` call, in seconds.

    Raises:
        ValueError: If ``count`` is less than 1 (there would be nothing to
            average, and the division below would fail or return nonsense).
    """
    if count < 1:
        raise ValueError(f"count must be a positive integer, got {count!r}")

    total_runtime = 0.0
    for _ in range(count):
        start = timer()
        model.predict(x)
        # Only the predict call sits between the two clock reads
        total_runtime += timer() - start
    return total_runtime / count  # Time in seconds


In [32]:
# Load the small data set and split it into training and test portions
X_train, X_test, y_train, y_test = create_test_train("dust_training_data_small.csv")

# Fit a linear regression on the training set
fit_linear = LinearRegression()
fit_linear.fit(X_train, y_train)

# Check the fit on the held-out test set; evaluate_model returns (R2, MSE, RMSE)
eval_linear = evaluate_model(fit_linear, X_test, y_test)
print("R2:{}, MSE:{}, RMSE:{}".format(*eval_linear))

                   R     Mstar          alpha            d2g          sigma  \
count  800132.000000  800132.0  800132.000000  800132.000000  800132.000000   
mean       69.942664       1.0       0.016301       0.185592     698.750248   
std       108.697391       0.0       0.034412       0.362841    1657.814296   
min         0.316228       1.0       0.000010       0.000100       0.152053   
25%         2.792938       1.0       0.000100       0.001000       2.110976   
50%        17.693273       1.0       0.001000       0.010000      22.842133   
75%        86.558651       1.0       0.010000       0.100000     364.214276   
max       500.000000       1.0       0.100000       1.000000    9559.802528   

                Tgas         Bin_0         Bin_1         Bin_2         Bin_3  \
count  800132.000000  8.001320e+05  8.001320e+05  8.001320e+05  8.001320e+05   
mean       42.074224  2.225405e+54  2.723183e+54  3.065637e+54  3.373757e+54   
std        42.523518  1.990615e+57  2.435877e+57

Here we measure the runtime of making predictions of the mean with the linear regression model. Averaged over 10 passes through the test set, a single prediction takes about 2.1E-07 seconds.

In [31]:
# Average time for one full pass over the test set, then the per-row cost
runtime = time_preds(fit_linear, X_test)
print("{} seconds for {} preds. {} seconds per prediction".format(
    runtime, len(X_test), runtime / len(X_test)))

0.01680419290205464 seconds for 80014 preds. 2.1001565853543927e-07 seconds per prediction


Next we switched to a version of the data set in which all of the density bins and the average y have been log-transformed, and fit the same linear regression model to it.

In [5]:
# Load the log-transformed data set; this rebinds the X_*/y_* split variables
X_train, X_test, y_train, y_test = create_test_train("dust_training_data_log_v2.csv")

# Fit a linear regression on the log-transformed training set
fit_log = LinearRegression()
fit_log.fit(X_train, y_train)

# Check the fit on the held-out test set; evaluate_model returns (R2, MSE, RMSE)
eval_linear_log = evaluate_model(fit_log, X_test, y_test)
print("R2:{}, MSE:{}, RMSE:{}".format(*eval_linear_log))

                   R     Mstar          alpha            d2g          sigma  \
count  800338.000000  800338.0  800338.000000  800338.000000  800338.000000   
mean       69.939112       1.0       0.016297       0.185802     698.571593   
std       108.683875       0.0       0.034409       0.363030    1657.638323   
min         0.316228       1.0       0.000010       0.000100       0.152053   
25%         2.792938       1.0       0.000100       0.001000       2.110976   
50%        17.693273       1.0       0.001000       0.010000      22.842133   
75%        86.558651       1.0       0.010000       0.100000     364.214276   
max       500.000000       1.0       0.100000       1.000000    9559.802528   

                Tgas          Bin_0          Bin_1          Bin_2  \
count  800338.000000  800338.000000  800338.000000  800338.000000   
mean       42.066924     -65.815133     -63.954152     -62.047036   
std        42.520490      94.410072      91.939246      89.493226   
min         

We next tried to fit an elastic net model on the log-transformed data, using 10-fold cross-validation to choose the regularization strength (alpha) and the L1 ratio. It performed about the same as linear regression.

In [10]:
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.utils.validation import column_or_1d
from sklearn.metrics import mean_squared_error
import numpy as np

# Candidate hyper-parameters; cross-validation selects the best alpha / l1_ratio pair
alphas = [0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 1]
l1_ratio = [0.01, .1, .5, .7, .9, .95, 1]

# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this call presumably needs an older scikit-learn -- confirm the pinned version.
fit_elastic_net = ElasticNetCV(
    alphas=alphas,
    l1_ratio=l1_ratio,
    cv=10,
    normalize=True,
    max_iter=100000,
    random_state=0,
)

# Fit the model; y_train is a one-column frame, so flatten it to 1-D first
fit_elastic_net.fit(X_train, column_or_1d(y_train))

# Report the selected hyper-parameters alongside the test-set metrics
eval_elastic_net = evaluate_model(fit_elastic_net, X_test, y_test)
print("Alpha:{} L1: {} R2:{}, MSE:{}, RMSE:{}".format(
    fit_elastic_net.alpha_, fit_elastic_net.l1_ratio_, *eval_elastic_net))




Alpha:0.001 L1: 1.0 R2:0.7345378901326389, MSE:190.53321265752945, RMSE:13.803376857042245
