In [124]:
import os
import numpy as np 
import pandas as pd

In [125]:
X_train = pd.read_csv("data/train.genotype.txt", sep = " ", header=None).values
y_train = pd.read_csv("data/train.phenotype.txt", sep = " ", header=None).values
X_test = pd.read_csv("data/test.genotype.txt", sep = " ", header=None).values

In [126]:
X_train

array([[0, 2, 1, ..., 0, 0, 2],
       [0, 1, 0, ..., 0, 0, 2],
       [0, 1, 2, ..., 0, 0, 1],
       ...,
       [0, 2, 1, ..., 0, 0, 1],
       [0, 1, 2, ..., 0, 0, 1],
       [0, 2, 2, ..., 1, 0, 2]])

In [127]:
y_train

array([[-1.445386],
       [-0.627935],
       [-1.013429],
       ...,
       [ 0.349022],
       [ 0.789777],
       [-0.25892 ]])

In [128]:
X_test

array([[0, 2, 1, ..., 0, 0, 1],
       [0, 2, 1, ..., 0, 0, 2],
       [0, 1, 1, ..., 1, 0, 2],
       ...,
       [0, 2, 2, ..., 0, 0, 2],
       [0, 1, 2, ..., 1, 0, 1],
       [0, 2, 1, ..., 0, 0, 2]])

# cost function

In [129]:
def cost(y_true, y_hat):
    mse = -np.log10(np.mean((y_true-y_hat)**2)+1e-5)
    return(mse)

# baseline

In [130]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import statsmodels.api as sm

# X_train = sm.add_constant(X_train)
# X_test = sm.add_constant(X_test)

# Find significant features that fit linear model
model = sm.OLS(y_train, X_train)
results_linear = model.fit()
significant_features_linear = results_linear.pvalues < 0.005
indices_of_significant_features_linear = np.where(significant_features_linear)[0]

print (indices_of_significant_features_linear)

# Transform X data in preparation for quadratic regression
# X_train_quad = np.column_stack((X_train, X_train ** 2))
# X_test_quad = np.column_stack((X_test, X_test ** 2))
X_train_quad = X_train ** 3
X_test_quad = X_test ** 3

# Find significant features that fit quadratic model
model = sm.OLS(y_train, X_train_quad)
results_quad = model.fit()
significant_features_quad = results_quad.pvalues < 0.005
indices_of_significant_features_quad = np.where(significant_features_quad)[0]

print (indices_of_significant_features_quad)

# # Filter out features that are both significant in linear and quadratic model
# new_indices_of_significant_features_linear = indices_of_significant_features_linear
# for val in indices_of_significant_features_linear:
#     if val in indices_of_significant_features_quad:
#         new_indices_of_significant_features_linear = new_indices_of_significant_features_linear[new_indices_of_significant_features_linear != val]
# indices_of_significant_features_linear = new_indices_of_significant_features_linear

# print (indices_of_significant_features_linear)

# Filter out features that are both significant in linear and quadratic model
new_indices_of_significant_features_quad = indices_of_significant_features_quad
for val in indices_of_significant_features_quad:
    if val in indices_of_significant_features_linear:
        new_indices_of_significant_features_quad = new_indices_of_significant_features_quad[new_indices_of_significant_features_quad != val]
indices_of_significant_features_quad = new_indices_of_significant_features_quad

print (indices_of_significant_features_quad)

# Ensure no overlapping features
assert len(np.intersect1d(indices_of_significant_features_linear, indices_of_significant_features_quad)) == 0

# Get features based on significance in linear model
X_train_significant_linear = X_train[:, indices_of_significant_features_linear]
X_test_significant_linear = X_test[:, indices_of_significant_features_linear]

# Get features based on significance in quadratic model
X_train_significant_quad = X_train_quad[:, indices_of_significant_features_quad]
X_test_significant_quad = X_test_quad[:, indices_of_significant_features_quad]


[  0   6  22  54  59 131 148 155 177]
[  6  22  31  54  59 131 177]
[31]


In [131]:
"""Testing code block."""

X = np.array([[1, 2, 3], [4, 5, 6]])  # Linear term
X_squared = X**2        # Quadratic term

# Using np.column_stack to combine linear and quadratic terms
X_combined = X_squared
print (X_combined)

# print(X_combined)

[[ 1  4  9]
 [16 25 36]]


In [132]:
# # Go through nonsignificant features and transform to include interaction term
# indices_of_nonsignificant_features = [i for i in range(len(X_train[0])) if i not in indices_of_significant_features_quad and i not in indices_of_significant_features_linear]
# indices_of_nonsignificant_features = [i for i in range(len(X_train[0]))]
# X_train_nonsignificant = X_train[:, indices_of_nonsignificant_features]
# X_test_nonsignificant = X_test[:, indices_of_nonsignificant_features]

# poly = PolynomialFeatures(interaction_only=True, include_bias=False)
# X_train_nonsignificant_interaction = poly.fit_transform(X_train_nonsignificant)
# X_test_nonsignificant_interaction = poly.fit_transform(X_test_nonsignificant)

# print (X_train_nonsignificant_interaction[0])

# model = sm.OLS(y_train, X_train_nonsignificant_interaction)
# results_interaction = model.fit()
# significant_features_interaction = results_interaction.pvalues < 0.020
# indices_of_significant_features_interaction = np.where(significant_features_interaction)[0]

# X_train_significant_interaction = X_train_nonsignificant_interaction[:, indices_of_significant_features_interaction]
# X_test_significant_interaction = X_test_nonsignificant_interaction[:, indices_of_significant_features_interaction]


In [133]:
# Combine features in preparation for retraining
# X_train_significant = np.column_stack((X_train_significant_linear, X_train_significant_quad, X_train_significant_interaction))
# X_test_significant = np.column_stack((X_test_significant_linear, X_test_significant_quad, X_test_significant_interaction))

X_train_significant = sm.add_constant(X_train_significant)
X_test_significant = sm.add_constant(X_test_significant)

X_train_significant = np.column_stack((X_train_significant_linear, X_train_significant_quad))
X_test_significant = np.column_stack((X_test_significant_linear, X_test_significant_quad))

# Train model based on features significant in both linear and quadratic model
model_significant = sm.OLS(y_train, X_train_significant)
results_significant = model_significant.fit()

# Make predictions using these new significant features
y_pred_significant = results_significant.predict(X_test_significant)

y_test = y_pred_significant

# save and zip the file

In [134]:
pd.DataFrame(y_test).to_csv(f"predictions.csv", sep = " ", header = None, index = None)
os.system("zip -r predictions.zip predictions.csv")

updating: predictions.csv (deflated 69%)


0