In [1]:
import pandas as pd
from matplotlib import pyplot as plt

In [2]:
wine = pd.read_csv('./Datasets/combined_qualities.csv')

In [3]:
wine.shape

(6497, 13)

## Group by Custom Class

We can group the dataset using `color` column easily but
pH is a quantitative variable without clear categories. However, there is a simple fix to this.  
We can create a categorical variable from a quantitative variable by creating our own categories.

### Creating Custom Classes

Let's create a new column called `acidity_levels` with these categories:
`Acidity Levels`:
- **High**: Lowest 25% of pH values
- **Moderately High**: 25% - 50% of pH values
- **Medium**: 50% - 75% of pH values
- **Low**: 75% - max pH value


In [7]:
wine.ph.describe()

count    6497.000000
mean        3.218501
std         0.160787
min         2.720000
25%         3.110000
50%         3.210000
75%         3.320000
max         4.010000
Name: ph, dtype: float64

In [10]:
# Bin edges that will be used to "cut" the data into groups
bin_edges = [2.720000, 3.110000, 3.210000, 3.320000, 4.010000]

# Filled using the above `describe` method

In [9]:
# Labels for the four acidity level groups
bin_names = ['High', 'Moderately High', 'Medium', 'Low']

In [11]:
# Creates acidity_levels column
wine['acidity_levels'] = pd.cut(wine['ph'], bin_edges, labels=bin_names)

# Checks for successful creation of this column
wine.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality,color,acidity_levels
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red,Low
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red,Moderately High
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red,Medium
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red,Moderately High
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red,Low


### Grouping by custom created `acidity_levels` column

In [12]:
wine.groupby('acidity_levels')['quality'].mean()

acidity_levels
High               5.783343
Moderately High    5.784540
Medium             5.850832
Low                5.859593
Name: quality, dtype: float64

# Saving to CSV

In [13]:
wine.to_csv('./Datasets/with_custom_class.csv', index=False)