In [17]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [None]:
wine_data = pd.read_csv('wine_quality.csv')

wine_data.info()
wine_data_new = wine_data


In [32]:
winequality_mean = round(np.mean(wine_data_new['quality']),3)
print("Mean ",winequality_mean)

Mean  5.657


In [33]:
winequality_median = round(np.median(wine_data['quality']),3)
print("Median ",winequality_median)

Median  6.0


In [None]:
#variance
print(round(np.var(wine_data_new['quality']),3))

#The moderate variance value implies that while there is some variability in wine quality ratings, the values are not extremely dispersed.
#Understanding the variance helps in grasping how spread out the wine quality ratings are.
print(round(wine_data_new['quality'].var(),4))

#standard deviation
print(round(np.std(wine_data_new["quality"], ddof=1), 4))
#A standard deviation of 0.8058 suggests that the quality ratings of wines in this dataset typically vary by about 0.8058 points from the average wine quality rating.


0.649
0.6494
0.8058


In [None]:
#range
print(np.max(wine_data_new['quality']))
print(np.min(wine_data_new['quality']))
#The maximum value of the ‘quality’ variable is 8, indicating the highest rating given to any wine in the dataset.
#The minimum value is 3, showing the lowest rating in the dataset.

8
3


In [None]:
#quartiles
Q1 = np.quantile(wine_data_new['quality'],0.25)
Q3 = np.quantile(wine_data_new['quality'],0.75)

IQR = Q3-Q1

print(Q1,Q3,IQR)

#Q1: This is the value below which 25% of the data falls. In this case, 25% of the wines have a quality rating of 5.0 or lower.
#Q3: This value indicates that 75% of the wines have a quality rating of 6.0 or lower. It represents the upper quartile of the distribution.
#The IQR of 1.0 suggests that the middle 50% of the wine quality ratings are within a narrow range (5.0 to 6.0), indicating a lack of wide variability in the middle half of the dataset.

5.0 6.0 1.0


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

#correlation maxtrix
correlation_matrix = wine_data_new.corr()


#Plotting heatmap
plt.figure(figsize=(12, 8))  # Set the size of the figure
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)

plt.title('Correlation Matrix for Wine Dataset')
plt.show()

In [44]:
#Handling Missing values
wine_data_new.fillna(wine_data.mean(),inplace=True)

#Encoding categorical variables if any
encoder = LabelEncoder()
wine_data_new['categorical_column'] = encoder.fit_transform(wine_data['quality'])

#Normalizing features
scaler = StandardScaler()
wine_data_scaled = scaler.fit_transform(wine_data_new.drop('quality',axis=1))

In [45]:
#test train split

X = wine_data_scaled
y= wine_data['quality']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [46]:
#train model

model = RandomForestClassifier(random_state=42)
model.fit(X_train,y_train)

In [48]:
predictions= model.predict(X_test)
print("Accuracy: ",round(accuracy_score(y_test,predictions),4))

Accuracy:  0.9782


In [None]:
print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))

In [None]:
from sklearn.metrics import classification_report

print("Classification Report:\n", classification_report(y_test, predictions, zero_division=0))

In [51]:
from sklearn.model_selection import cross_val_score

cross_val_scores = cross_val_score(model, X, y, cv=5)
cross_val_scores_rounded = [round(score, 4) for score in cross_val_scores]
print("Cross-Validation Scores:", cross_val_scores_rounded)

Cross-Validation Scores: [0.9782, 0.9825, 0.9782, 0.9825, 0.9737]
