In [7]:
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.preprocessing import OneHotEncoder

# Load the scaled dataset and keep only the columns whose name does not
# start with "Unnamed" (presumably a stray index column from an earlier
# export -- note that this removes *every* such column, not just the first).
df = pd.read_csv('scaled.csv').loc[
    :, lambda frame: ~frame.columns.str.contains('^Unnamed')
]

# One-hot encode every object-dtype column so the regression below
# receives a purely numeric/boolean design matrix.
categorical_cols = list(df.select_dtypes(include=['object']).columns)
df = pd.get_dummies(df, columns=categorical_cols)

# Split into the regression target and the predictor matrix.
y = df['avg_salary']
X = df.drop(columns='avg_salary')

# Fit an L1-penalised linear model; alpha sets the penalty strength and
# can be tuned as needed.
lasso = Lasso(alpha=0.001).fit(X, y)
print(lasso)

# Boolean mask over X's columns: True where Lasso kept a nonzero weight.
nonzero_coef_indices = lasso.coef_ != 0

# Names of the predictors that survived the L1 penalty.
selected_features = X.columns[nonzero_coef_indices]

# Predictor matrix restricted to the surviving features.
lasso_features_df = X.loc[:, selected_features]

# Report how many predictors were retained.
print("Number of features with nonzero coefficients:", len(selected_features))


Lasso(alpha=0.001)
Number of features with nonzero coefficients: 56


In [8]:
# Persist the Lasso-selected feature matrix to disk as CSV, leaving out
# the row index so only the feature columns are written.
lasso_features_df.to_csv(path_or_buf='LASSO_features.csv', index=False)