# Splitting data

# Training the model


# Predicting and evaluating

In [None]:
from ucimlrepo import fetch_ucirepo
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Download dataset id 186 from the UCI Machine Learning Repository
wine_quality = fetch_ucirepo(id=186)

# Show the repository metadata and the per-variable description table
print(wine_quality.metadata)
print(wine_quality.variables)

# Keep the feature table and the target table under their own names
X = wine_quality.data.features
y = wine_quality.data.targets

# Join features and targets column-wise; the last column is then named
# 'quality' so later code can address the target by that label.
df = pd.concat([X, y], axis=1)
df = df.rename(columns={df.columns[-1]: 'quality'})
    

# Print a textual overview of the dataset
def wine_data(df):
    """Print shape, column names, head, summary statistics and null counts.

    Args:
        df: pandas DataFrame to summarize.

    Returns:
        None. Everything is written to standard output.
    """
    print("Dataset Shape:", df.shape)
    print("\nColumns:", df.columns.tolist())

    # Each remaining section is a heading followed by a lazily built table,
    # so the tables are computed in the same order in which they are printed.
    sections = (
        ("\nFirst few rows:", df.head),
        ("\nBasic Statistics:", df.describe),
        ("\nMissing Values:", lambda: df.isnull().sum()),
    )
    for heading, build_table in sections:
        print(heading)
        print(build_table())

wine_data(df)

# Correlation matrix visualization
plt.figure(figsize=(12, 8))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix of Wine Features')
plt.tight_layout()
plt.show()

# Distribution of every column: one histogram with a KDE overlay per subplot.
# The number of grid rows is derived from the column count (ceiling division)
# so a frame with more than 12 columns does not overflow a fixed 4x3 grid,
# which would make plt.subplot raise ValueError. At least 4 rows are kept so
# the layout for 12 or fewer columns is unchanged.
grid_cols = 3
grid_rows = max(4, -(-len(df.columns) // grid_cols))
plt.figure(figsize=(15, 10))
for i, column in enumerate(df.columns, 1):
    plt.subplot(grid_rows, grid_cols, i)
    sns.histplot(df[column], kde=True)
    plt.title(f'Distribution of {column}')
plt.tight_layout()
plt.show()

# Perform regression to predict wine quality
def linear_regression_analysis(df):
    """Fit a linear regression on 'quality' and report its test-set error.

    The frame is split 80/20 into train and test sets (random_state=42 for
    reproducibility), a LinearRegression is fit on the training part, the
    test-set mean squared error is printed, and a scatter plot of actual vs.
    predicted quality with a first-degree trend line is shown.

    Args:
        df: DataFrame with a 'quality' column used as the target; every
            other column is used as a feature.

    Returns:
        None. Results are printed and plotted.

    Raises:
        KeyError: if ``df`` has no 'quality' column.
    """
    # Select features and target
    X = df.drop('quality', axis=1)
    y = df['quality']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    print("\nTraining set shape:", X_train.shape)
    print("Test set shape:", X_test.shape)

    model = LinearRegression()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print("Mean Squared Error on Test Data:", mse)

    # Plot predictions vs actual values with trend line
    plt.figure(figsize=(8, 6))
    plt.scatter(y_test, y_pred, alpha=0.6, color='red')

    # Flatten both to 1-D so np.polyfit accepts them, then fit and draw a
    # straight line through the (actual, predicted) points.
    y_test_array = np.asarray(y_test).reshape(-1)
    y_pred_array = np.asarray(y_pred).reshape(-1)
    trend = np.poly1d(np.polyfit(y_test_array, y_pred_array, 1))
    plt.plot(y_test_array, trend(y_test_array), color='blue')

    plt.xlabel('Actual Quality')
    plt.ylabel('Predicted Quality')
    plt.title('Actual vs Predicted Wine Quality')
    plt.show()


linear_regression_analysis(df)