In [25]:
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

# Assuming X_train, X_test, y_train, y_test are your training and testing data

df = pd.read_csv("assets/train.csv", index_col=0) # First column as row index
columns = df.columns
imputer = SimpleImputer(missing_values=np.nan, strategy='mean') # other popular choices: "median", "most_frequent"
imputer.fit(df.values)
df = imputer.transform(df.values)
df = pd.DataFrame(df, columns=columns)

y = df['Edible']
X = df.drop('Edible', axis=1) 

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train linear regression model
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)

# Get coefficients
coefficients = linear_reg.coef_

# Train Lasso regression model (for feature selection)
lasso_reg = Lasso(alpha=0.1)  # Adjust alpha as needed for regularization strength
lasso_reg.fit(X_train, y_train)

# Get coefficients (some will be zero)
lasso_coefficients = lasso_reg.coef_

# Permutation feature importance
from sklearn.inspection import permutation_importance

perm_importance = permutation_importance(linear_reg, X_test, y_test, n_repeats=30)

# Get feature importances
importances = perm_importance.importances_mean

# Now, analyze 'coefficients', 'lasso_coefficients', and 'importances' to identify less important features


for i, index in enumerate(lasso_coefficients):
    if index == 0:
        print(df.columns[i])

Color Intensity (a.u.)
Length (mm)
Luminescence Intensity (a.u.)
Seed Count
Skin Thickness (mm)
Soil pH where Grown
pH
