In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

![Image](https://www.realsimple.com/thmb/5GPNGPLNH228wa7jUJ2HPpaRBao=/750x0/filters:no_upscale():max_bytes(150000):strip_icc():format(webp)/red-wine-health-benefits-ce3be96b730b41cc82f128abb75c2395.jpg)

# Introduction

**What to Expect?**
** **
In this notebook, I will explore the Spanish Wine Quality Dataset and fit a regression model on the price column, using Scikit-Learn, Pandas, NumPy, Seaborn, and Matplotlib.



**Attribute Information**
** **
* winery: Winery name
* wine: Name of the wine
* year: Year in which the grapes were harvested
* rating: Average rating given to the wine by the users [from 1-5]
* num_reviews: Number of users that reviewed the wine
* country: Country of origin [Spain]
* region: Region of the wine
* price: Price in euros [€]
* type: Wine variety
* body: Body score, defined as the richness and weight of the wine in your mouth [from 1-5]
* acidity: Acidity score, defined as the wine's “pucker” or tartness; it's what makes a wine refreshing and makes your tongue salivate and want another sip [from 1-5]

**If you do not understand a line of my code, paste it into ChatGPT to get a detailed explanation of that line.**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Data Preprocessing

In [None]:
# Location of the wine CSV, relative to the notebook's working directory.
WINES_CSV_PATH = '../input/spanish-wine-quality-dataset/wines_SPA.csv'

# Read the raw dataset and preview its first rows.
df = pd.read_csv(WINES_CSV_PATH)
df.head()

In [None]:
df.shape

**so we have**
* **rows-7500**
* **columns-11**

In [None]:
df.info()

As we can see, some columns contain null values that need handling, and some dtypes need to be corrected as well.

Statistical Analysis

In [None]:
df.describe()

In [None]:
# Percentage of missing values per column, rounded to two decimals.
missing_fraction = df.isnull().sum() / df.shape[0]
(100 * missing_fraction).round(2)


**Let's first look at our columns in detail**

In [None]:
def val(x):
    """Print every column of ``x`` together with its unique values.

    For each column one line of the form ``name:unique_values,`` is printed,
    followed by two whitespace-only spacer lines.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    None
    """
    spacer = "        "
    for name in x.columns:
        distinct_values = x[name].unique()
        print(f"{name}:{distinct_values},")
        print(spacer)
        print(spacer)


In [None]:
val(df)

**As you can see, we can now view every column in detail along with its unique values.
So now we can perform operations as required.**

In [None]:
# First drop every row that has a missing value in any column
# (dropna() with its default arguments) and rebind df to the reduced frame.
df=df.dropna()

In [None]:
df.info()

In [None]:
# The year column contains the placeholder string "N.V." next to the real harvest years.
# Replace it with NaN, drop the affected rows, then convert the remaining year strings
# to integers.
# np.nan is used here because the np.NaN alias is no longer available in NumPy 2.0+.
df["year"] = df["year"].replace("N.V.", np.nan)
df = df.dropna()
df["year"] = df["year"].astype(np.int64)
print(df.year.unique())

**As we saw in the step above, we first converted the "N.V." value to NaN and then dropped it.
After that, we converted the column from str to int.**

In [None]:
# The country column is not important for this analysis, so remove it.
df = df.drop(columns=["country"])
df.head()

# Data Visualization

**Lets see how the attributes are correlated with price**

In [None]:
# Leave out the text columns so that only numeric attributes remain for the correlation plot.
text_columns = ["winery", "wine", "region", "type"]
df2 = df.drop(columns=text_columns)
df2.head()

In [None]:
# Pairwise correlations of the numeric columns, drawn as an annotated heatmap.
numeric_corr = df2.corr()
sns.heatmap(numeric_corr, annot=True, cmap="Blues")

**It seems that most of our numerical variables do not have much correlation with the price column, except for rating, which has a weak-to-moderate positive correlation. A positive correlation between price and rating means that when the rating is high, the price is more likely to be high as well, which makes sense (though not in every case).**

**Does the type of the wine affects the wines price?**

In [None]:
# Compare wine types by their average price.
# Passing every row to plt.bar draws one bar per wine at the same x position, so the bars
# overlap and only the tallest one per type stays visible. Aggregating first gives a
# single, meaningful bar per type.
mean_price_by_type = df.groupby("type")["price"].mean().sort_values(ascending=False)

plt.bar(mean_price_by_type.index, mean_price_by_type.values)
plt.xticks(rotation=90)
plt.xlabel("Type of wine")
plt.ylabel("Mean price in euros [€]")
plt.title("price VS Type ")
plt.show()

**As we can see, some types are priced high and others are not, which means that a wine's price varies depending on its type.**

# Data Modeling

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace every non-numeric column of df with integer codes so the models can consume it.
# select_dtypes(exclude="number") picks up plain object columns as well as dedicated
# string dtypes, which a `dtype == "object"` comparison would miss.
# NOTE: the integer codes imply an arbitrary ordering of the categories; for linear models
# one-hot encoding is usually the safer choice.
print("categorical Variables:")
for col in df.select_dtypes(exclude="number").columns:
    print(str(col))
    # Fit and transform on the same string view of the column so the learned classes
    # always match the values being encoded.
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))

In [None]:
df.info()

In [None]:
df.head()

**Now everything is converted into numerical values, so we can standardize the data here**

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every column of df with a single StandardScaler.
# NOTE(review): the scaler is fitted on the whole frame, price included, and before the
# train/test split performed in a later cell - confirm this is intended.
scaler = StandardScaler()
scaler.fit(df)
df_standardized = scaler.transform(df)

# Wrap the scaled array in a DataFrame again, reusing the original column labels.
df_std = pd.DataFrame(data=df_standardized, columns=df.columns)
df_std.head()


# Training Model

In [None]:
# Features: every standardized column except price.
X = df_std.drop(columns=["price"])
# Target: the standardized price as a flat 1-D array.
y = df_std["price"].to_numpy()


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
df_std.info()

In [None]:
print(X.shape)
print(y.shape)

In [None]:
y.shape

In [None]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import LinearSVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Candidate regressors, each paired with the label used in the results table.
# Every estimator with a random component gets a fixed seed so that re-running the
# notebook reproduces the same scores (LinearSVR was previously left unseeded).
models = [
    ("Linear Regression", LinearRegression()),
    ("Lasso Regression", Lasso(alpha=0.01)),
    ("Ridge Regression", Ridge(alpha=1.0)),
    ("Bayesian Ridge", BayesianRidge()),
    ("Decision Tree Regressor", DecisionTreeRegressor(random_state=42)),
    ("Linear SVR", LinearSVR(random_state=42)),
    ("K-Nearest Neighbors Regressor", KNeighborsRegressor(n_neighbors=5)),
    ("Random Forest Regressor", RandomForestRegressor(n_estimators=100, random_state=42))
]

# One [name, mse, r2] row per model, scored on the held-out test split.
results = []

# Fit each model on the training split and evaluate it on the test split.
for name, model in models:
    model.fit(X_train, y_train)
    # NOTE: y_pred stays at notebook scope; once the loop ends it holds the predictions
    # of the last model in `models`, which the prediction plot further down reuses.
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results.append([name, mse, r2])

# Collect the scores in a DataFrame for display and lookup.
results_df = pd.DataFrame(results, columns=["Model", "Mean Squared Error", "R-squared"])

# Rows of the models with the lowest MSE and with the highest R-squared.
best_mse_model = results_df.loc[results_df["Mean Squared Error"].idxmin()]
best_r2_model = results_df.loc[results_df["R-squared"].idxmax()]

print("Model Evaluation Results:")
print(results_df)
print("\nBest model based on MSE:")
print(best_mse_model)
print("\nBest model based on R-squared:")
print(best_r2_model)


**As you can see above, the output reports the best model based on MSE and the best model based on R-squared.**

**we can pick as per our requirement**

In [None]:
# Overlay the predicted and the actual standardized prices for the test rows.
# y_pred is whatever the model-evaluation loop assigned last.
# The figure used to be 110x50 inches, which renders as an enormous image; a conventional
# size plus a title and axis labels keeps the plot readable on its own.
plt.figure(figsize=(16, 6))
plt.plot(y_pred, label='predicted')
plt.plot(y_test, label='Actual')
plt.xlabel("Test sample")
plt.ylabel("Standardized price")
plt.title("Predicted vs actual price (test set)")
plt.legend(fontsize='large')
plt.show()

In [None]:
# Correlation heatmap over all columns of the standardized frame.
std_corr = df_std.corr()
sns.heatmap(std_corr, annot=True, cmap="Blues")

**As you can see, the predicted values are close to the actual values in most places; where they are not, it is because our attributes are not strongly correlated with the price.**

# Conclusion

**The model does its job, but it is neither particularly good nor bad. Personally, I find it fairly predictable that our model would struggle, since we saw that most columns have little to no relationship with the wine prices.**

**Surprisingly, though, our model did all right at predicting low-priced wines but did terribly on high-priced wines. Based on our EDA earlier, I think the cause is that our dataset contains far more low-priced wines than high-priced ones.**

# Author's Message
* If you find this helpful, I would really appreciate an upvote!
* If you see something wrong, please let me know.
* And lastly, I'm happy to hear your thoughts about the notebook so that I can improve, too!