In [None]:
# Import.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Setup plotting params.
plt.rcParams['figure.figsize'] = (14, 6)
sns.set(style="whitegrid")

%matplotlib inline

In [None]:
# Constants: where the inputs live and how the spectrum files are laid out.
DATA_PATH = "../data/"
SPECTRA_PATH = "../data/spectra/"
ASTN_FILE = DATA_PATH + "table1.csv"

# Column names assigned to the header-less spectrum files when they are read.
COL_NAMES = ["wavelength", "flux", "flux_err"]

# NOTE. The wing ranges below are picked by hand: one [low, high] wavelength pair
#       per wing. In the future we might want to estimate them in some ingenious way.
W_LIMS = [[6555, 6561], [6567, 6573]]

# File list: later cells read its "filename" and "temperature" columns.
# The table is comma separated, which is what read_csv assumes by default.
data_astn = pd.read_csv(ASTN_FILE)

In [None]:
# --+ Fit function.
def get_fits(df, w_lims=None, degree=2):
    """Fit one weighted polynomial to each wing of a spectrum.

    Parameters
    ----------
    df : pandas.DataFrame
        Spectrum with the columns "wavelength", "flux" and "flux_err", where
        "flux_err" is the standard deviation of "flux".
    w_lims : sequence of (low, high) pairs, optional
        Inclusive wavelength limits of each wing. When omitted, the notebook-level
        W_LIMS constant is used.
    degree : int, optional
        Degree of the fitted polynomials. Defaults to 2.

    Returns
    -------
    list of numpy.poly1d
        One polynomial per wing, in the order of `w_lims`. Note that indexing a
        poly1d with ``fit[k]`` yields the coefficient of ``x**k`` (so ``fit[0]``
        is the constant term), while ``fit.coeffs`` is ordered from the highest
        power down.

    Raises
    ------
    ValueError
        If no row of `df` falls inside one of the wings.

    NOTE. Currently does not return anything about the fit quality.
    """
    if w_lims is None:
        w_lims = W_LIMS

    fits = []
    for low, high in w_lims:
        in_wing = (df["wavelength"] >= low) & (df["wavelength"] <= high)
        wing = df.loc[in_wing, ["wavelength", "flux", "flux_err"]]
        if wing.empty:
            raise ValueError(
                "no data points with wavelength between %s and %s" % (low, high)
            )

        # numpy's polyfit multiplies each *unsquared* residual by its weight, so
        # inverse-variance weighting means w = 1 / sigma (not sigma**2, which
        # would make the noisiest points dominate the fit).
        sigma = wing["flux_err"].to_numpy(dtype=float)
        usable = np.isfinite(sigma) & (sigma > 0)
        # Points without a usable uncertainty get zero weight instead of an
        # infinite one, so they cannot influence the fit.
        weights = np.zeros_like(sigma)
        weights[usable] = 1.0 / sigma[usable]

        coefs_low_to_high = np.polynomial.polynomial.polyfit(
            wing["wavelength"].to_numpy(dtype=float),  # x.
            wing["flux"].to_numpy(dtype=float),        # y.
            degree,
            w=weights,
        )
        # polyfit returns the lowest power first; poly1d wants the highest first.
        fits.append(np.poly1d(coefs_low_to_high[::-1]))

    return fits

In [None]:
# --+ Fit one random file and plot.
# Extract a random filename from the csv. No random_state is passed, so a
# different spectrum may be drawn on every run.
TESTFILE = SPECTRA_PATH + data_astn["filename"].sample(n=1).values[0]

# Create a dataframe with the txt file. The separator is a regular expression, so
# it is written as a raw string: "\s" is not a valid escape in a normal string.
df = pd.read_csv(TESTFILE, sep=r"\s+", header=None, names=COL_NAMES)

# Perform fits.
fits = get_fits(df)

# Keep only the rows that fall inside one of the two wings.
in_wing_1 = (df["wavelength"] >= W_LIMS[0][0]) & (df["wavelength"] <= W_LIMS[0][1])
in_wing_2 = (df["wavelength"] >= W_LIMS[1][0]) & (df["wavelength"] <= W_LIMS[1][1])
df = df[in_wing_1 | in_wing_2]

# Plot the measured points with their error bars.
plt.errorbar(
    df["wavelength"], df["flux"], yerr=df["flux_err"],
    fmt = 'o', markersize=0.4, capsize=0.2, color="gray"
)

# Evaluate each fit on a 100-point grid spanning its own wing and overplot it.
for wi in range(2):
    x_vals = np.linspace(W_LIMS[wi][0], W_LIMS[wi][1], 100)
    y_vals = fits[wi](x_vals)
    plt.plot(x_vals, y_vals)

plt.xlabel("wavelength")
plt.ylabel("flux")

plt.show()

In [None]:
# --+ Get and store the fits and the target temperature for all files.
# Columns of the output dataframe: per wing (w1, w2) the quadratic, linear and
# constant coefficient of its fit, plus the target temperature.
columns = ["filename", "w1_x2", "w1_x1", "w1_c", "w2_x2", "w2_x1", "w2_c", "temperature"]

# Collect one record per file and build the dataframe once at the end. This keeps
# the numeric columns numeric and avoids growing a dataframe row by row.
out_rows = []
for _, in_row in data_astn.iterrows():
    # Get the pandas dataframe. The separator is a regex, hence the raw string.
    in_df = pd.read_csv(
        SPECTRA_PATH + in_row["filename"], sep=r"\s+", header=None, names=COL_NAMES
    )

    # Extract the fits.
    fits = get_fits(in_df)

    # get_fits returns np.poly1d objects, and poly1d[k] is the coefficient of
    # x**k: index 2 is the quadratic term and index 0 is the constant term.
    out_rows.append({
        "filename"    : in_row["filename"],
        "w1_x2"       : fits[0][2],
        "w1_x1"       : fits[0][1],
        "w1_c"        : fits[0][0],
        "w2_x2"       : fits[1][2],
        "w2_x1"       : fits[1][1],
        "w2_c"        : fits[1][0],
        "temperature" : in_row["temperature"]
    })

# Same row labels as the input table.
out_df = pd.DataFrame(out_rows, columns=columns, index=data_astn.index)

In [None]:
# --+ Study the data a bit.
# Pairwise scatter plots of the columns of out_df.
sns.pairplot(data=out_df)

# The fit parameters come out very highly correlated with each other. This doesn't
#     look normal, so other fitting methods are worth a try. Maybe Chebyshev
#     polynomials?
# NOTE(review): the fits use the raw wavelengths (W_LIMS is around 6560) as x,
#     which may by itself explain part of the correlation; centering x could be
#     tried as well -- to be confirmed.

In [None]:
# --+ Prepare X_train, X_test, y_train, y_test.
# Cast to float so the arithmetic below and the models downstream never see
# object-dtype columns.
X = out_df[["w1_x2", "w1_x1", "w1_c", "w2_x2", "w2_x1", "w2_c"]].astype(float)
y = out_df["temperature"].astype(float)

# Split of 80:20, taken in row order (no shuffling).
# NOTE(review): if the input table is sorted (e.g. by temperature) the test rows
#     are not representative of the training rows -- confirm the row order.
cutoff = int(.80 * X.shape[0])

# Scale X and y. This is not necessary for linear regression, but might as well
#     do it now instead of forgetting to do it if we try more complex models in
#     the future. Also this scaling method is pretty bad, should use MinMax.
# The scale factors are computed from the training rows only, so nothing about
#     the test rows leaks into the preprocessing. Each feature is divided by its
#     largest absolute training value; a zero or undefined factor falls back to 1
#     to avoid dividing by zero.
X_scalers = [s if s > 0 else 1.0 for s in X[:cutoff].abs().max().tolist()]
y_scaler  = y[:cutoff].max() # y is strictly positive.

X_scaled = X / X_scalers
y_scaled = y / y_scaler

X_train = X_scaled[:cutoff]
X_test  = X_scaled[cutoff:]
y_train = y_scaled[:cutoff]
y_test  = y_scaled[cutoff:]

In [None]:
# Train lr models with different number of polynomial features.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures

# Both dictionaries are keyed by the degree of the polynomial features:
#     r2_scores[d] is the R2 on the test rows, y_preds[d] the scaled predictions.
r2_scores = {}
y_preds   = {}

for d in range(1, 11):
    # Get polynomial features. The expansion is fitted on the training rows and
    # then applied unchanged to the test rows.
    p2r = PolynomialFeatures(degree = d)
    X_train_poly = p2r.fit_transform(X_train)
    X_test_poly  = p2r.transform(X_test)

    # Fit and predict w/ linear regression.
    lr = LinearRegression()
    lr.fit(X_train_poly, y_train)

    y_preds[d]   = lr.predict(X_test_poly)
    # r2_score takes (y_true, y_pred) in that order. R2 is not symmetric in its
    # arguments, so the true values must come first.
    r2_scores[d] = r2_score(y_test, y_preds[d])

In [None]:
# Display the R2 score obtained on the test rows for each polynomial degree.
# Negative R2 scores make sense considering the high degree of the polynomial
#     features. Suggests that a dense network might give good results, but we'll
#     need more data.
# NOTE(review): sklearn's r2_score expects (y_true, y_pred) in that order and is
#     not symmetric; confirm the training cell passes them that way before
#     reading too much into the sign of these scores.
r2_scores

In [None]:
# One figure per polynomial degree: true temperatures against the predictions.
palette = sns.color_palette("pastel")
real_color, fit_color = palette[0], palette[1]

# The true values are the same in every figure, so undo their scaling only once.
t_real = y_scaler * y_test.values

for d in range(1, 11):
    # Bring the predictions for this degree back to the original scale.
    t_fit = y_scaler * y_preds[d]

    sns.lineplot(t_real, color = real_color, label = "Real values")
    sns.lineplot(t_fit,  color = fit_color,  label = "Polyfit (%d)" % d)

    # Same fixed vertical window for every degree, so the figures are comparable.
    plt.ylim(5000, 7500)

    # Labels and such.
    plt.ylabel("T_eff")
    plt.legend(loc = "upper right")

    plt.show()