In [3]:
#Load the data from winequality.csv
import pandas as pd

# Load the data from the specified CSV file into a pandas DataFrame
df = pd.read_csv('/content/winequality.csv')

# Display the first few rows of the DataFrame to confirm the data is loaded
print(df.head())


   fixed_acidity  volatile_acidity  citric_acid  residual_sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free_sulfur_dioxide  total_sulfur_dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0      9.4        5  
1      9.8        5  
2      9.8        5 

In [6]:
# Lasso regression with an output and an interpretation of the results

from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

# Assume 'quality' is the target variable and the rest are features
X = df.drop('quality', axis=1)
y = df['quality']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Lasso Regressor
# You can experiment with different alpha values (regularization strength)
# A smaller alpha means less regularization
lasso = Lasso(alpha=0.1)

# Fit the model to the training data
lasso.fit(X_train, y_train)

# Make predictions on the test data
y_pred = lasso.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")

# Interpretation of the results
print("\nLasso Regression Coefficients:")
# Get the feature names
feature_names = X.columns

# Get the coefficients from the trained model
coefficients = lasso.coef_

# Create a DataFrame to display coefficients
coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

# Sort by absolute coefficient value to see which features have the most impact
coef_df['Absolute Coefficient'] = abs(coef_df['Coefficient'])
coef_df = coef_df.sort_values(by='Absolute Coefficient', ascending=False)

print(coef_df)

print("\nInterpretation:")
print("Lasso Regression is used to perform both variable selection and regularization.")
print("It shrinks the coefficients of less important features towards zero.")
print(f"The Mean Squared Error ({mse:.2f}) and Root Mean Squared Error ({rmse:.2f}) indicate the average squared difference and average difference between the predicted and actual quality scores, respectively.")
print("Lower values for MSE and RMSE indicate a better fit of the model to the data.")
print("The coefficients in the table above represent the impact of each feature on the 'quality' variable.")
print("A positive coefficient means that as the value of that feature increases, the predicted quality is expected to increase.")
print("A negative coefficient means that as the value of that feature increases, the predicted quality is expected to decrease.")
print("Coefficients that are exactly zero mean that Lasso has effectively removed that feature from the model, considering it not significant for prediction given the chosen alpha.")
print("The magnitude of the coefficient indicates the strength of the relationship.")
print(f"With alpha = 0.1, some coefficients have been shrunk towards or exactly to zero, indicating that those features have less influence on the wine quality according to this model.")
print("Features with larger absolute coefficients are considered more important predictors of wine quality by this Lasso model.")


Mean Squared Error: 0.50
Root Mean Squared Error: 0.71

Lasso Regression Coefficients:
                 Feature  Coefficient  Absolute Coefficient
10               alcohol     0.250518              0.250518
0          fixed_acidity     0.029804              0.029804
5    free_sulfur_dioxide     0.006538              0.006538
6   total_sulfur_dioxide    -0.004184              0.004184
1       volatile_acidity    -0.000000              0.000000
2            citric_acid     0.000000              0.000000
4              chlorides    -0.000000              0.000000
3         residual_sugar    -0.000000              0.000000
7                density    -0.000000              0.000000
8                     pH    -0.000000              0.000000
9              sulphates     0.000000              0.000000

Interpretation:
Lasso Regression is used to perform both variable selection and regularization.
It shrinks the coefficients of less important features towards zero.
The Mean Squared Error (0.5