In [1]:
# load dataset
import pandas as pd

# Load the dataset from the CSV file
df = pd.read_csv("apple_quality.csv")

# Drop every row whose Acidity cell holds the attribution string instead of
# a number. The explicit .copy() makes the filtered frame independent of the
# frame returned by read_csv, so the column assignment below writes to a real
# DataFrame rather than to a slice (avoids SettingWithCopyWarning).
df = df[df['Acidity'] != 'Created_by_Nidula_Elgiriyewithana'].copy()

# With the string row gone, Acidity can be converted to a numeric dtype
df['Acidity'] = pd.to_numeric(df['Acidity'])

# Display column names, non-null counts, and dtypes,
# then the (row count, column count) tuple as the cell's result
df.info()
df.shape

<class 'pandas.core.frame.DataFrame'>
Index: 4000 entries, 0 to 3999
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   A_id         4000 non-null   float64
 1   Size         4000 non-null   float64
 2   Weight       4000 non-null   float64
 3   Sweetness    4000 non-null   float64
 4   Crunchiness  4000 non-null   float64
 5   Juiciness    4000 non-null   float64
 6   Ripeness     4000 non-null   float64
 7   Acidity      4000 non-null   float64
 8   Quality      4000 non-null   object 
dtypes: float64(8), object(1)
memory usage: 312.5+ KB


(4000, 9)

In [2]:
# finding and removing outliers

# Compute Q1, Q2 (median), Q3, the IQR, and the min/max of every numeric feature.
X = df.loc[:, ['Size', 'Weight', 'Sweetness', 'Crunchiness', 'Juiciness', 'Ripeness', 'Acidity']]
Q1 = X.quantile(0.25)
Q2 = X.quantile(0.5)
Q3 = X.quantile(0.75)
IQR = Q3 - Q1
# Named min_vals / max_vals so the Python builtins min() and max() are not
# shadowed for the rest of the kernel session.
min_vals = X.min()
max_vals = X.max()

print("\nQ1:\n{} , \nQ2:\n{} , \nQ3:\n{} , \nIQR:\n{} , \nmin:\n{} , \nmax:\n{}".format(Q1, Q2, Q3, IQR, min_vals, max_vals))

# Print the number of outliers found per feature: a value is an outlier when
# it lies more than 1.5 * IQR below Q1 or above Q3.
outliers = (X < (Q1 - 1.5 * IQR)) | (X > (Q3 + 1.5 * IQR))
print("\nOutliers:\n", outliers.sum())


Q1:
Size          -1.816765
Weight        -2.011770
Sweetness     -1.738425
Crunchiness    0.062764
Juiciness     -0.801286
Ripeness      -0.771677
Acidity       -1.377424
Name: 0.25, dtype: float64 , 
Q2:
Size          -0.513703
Weight        -0.984736
Sweetness     -0.504758
Crunchiness    0.998249
Juiciness      0.534219
Ripeness       0.503445
Acidity        0.022609
Name: 0.5, dtype: float64 , 
Q3:
Size           0.805526
Weight         0.030976
Sweetness      0.801922
Crunchiness    1.894234
Juiciness      1.835976
Ripeness       1.766212
Acidity        1.510493
Name: 0.75, dtype: float64 , 
IQR:
Size           2.622291
Weight         2.042747
Sweetness      2.540347
Crunchiness    1.831470
Juiciness      2.637262
Ripeness       2.537889
Acidity        2.887917
dtype: float64 , 
min:
Size          -7.151703
Weight        -7.149848
Sweetness     -6.894485
Crunchiness   -6.055058
Juiciness     -5.961897
Ripeness      -5.864599
Acidity       -7.010538
dtype: float64 , 
max:
Size   

In [3]:
# Calculate the boundaries for each feature
def boundaries(df, feature):
    """Print and return the 1.5 * IQR outlier fences of one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to inspect.
    feature : str
        Label of the column whose fences are computed.

    Returns
    -------
    tuple
        ``(lower_limit, upper_limit)`` where ``lower_limit = Q1 - 1.5 * IQR``
        and ``upper_limit = Q3 + 1.5 * IQR``.

    Side effects
    ------------
    Prints the feature name on one line and the two limits on the next.
    """
    Q1 = df[feature].quantile(0.25)
    Q3 = df[feature].quantile(0.75)
    IQR = Q3 - Q1
    lower_limit = Q1 - 1.5 * IQR
    upper_limit = Q3 + 1.5 * IQR
    print(feature)
    print(lower_limit, upper_limit)
    # Hand the fences back so callers can reuse them instead of re-deriving them.
    return lower_limit, upper_limit

# Report the IQR fences of each of the 7 numeric features, in column order
numeric_columns = df[['Size', 'Weight', 'Sweetness', 'Crunchiness', 'Juiciness', 'Ripeness', 'Acidity']].columns
for column_name in numeric_columns:
    boundaries(df, column_name)

Size
-5.75020099175 4.73896291425
Weight
-5.075890391874999 3.0950965391249996
Sweetness
-5.54894553775 4.61244239625
Crunchiness
-2.6844403373750003 4.641438949625
Juiciness
-4.7571791193749995 5.791869691624999
Ripeness
-4.578509627375 5.573044401624999
Acidity
-5.7092993302499995 5.84236800775


In [7]:
# function for removing outliers
def remove_outliers(df, features):
    """Return the rows of ``df`` that are inside the IQR fences of every feature.

    For each column in ``features`` the fences are ``Q1 - 1.5 * IQR`` and
    ``Q3 + 1.5 * IQR`` (both inclusive), computed on the full input frame.
    A row is kept only when it is within the fences for all listed columns;
    rows with a missing value in any listed column are dropped because the
    range comparison is False for NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    features : iterable of str
        Column labels to screen for outliers.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, keeping their original index labels.
    """
    # Build the mask on df's own index. A default 0..n-1 index would be
    # aligned by label against df's index on `&=`, which breaks whenever df
    # has been filtered, re-indexed, or otherwise lacks a RangeIndex.
    mask = pd.Series(True, index=df.index, dtype=bool)

    # Narrow the mask one feature at a time
    for feature in features:
        Q1 = df[feature].quantile(0.25)
        Q3 = df[feature].quantile(0.75)
        IQR = Q3 - Q1
        lower_limit = Q1 - 1.5 * IQR
        upper_limit = Q3 + 1.5 * IQR

        # between() is inclusive on both ends, matching >= lower and <= upper
        mask &= df[feature].between(lower_limit, upper_limit)

    # Return the DataFrame without outliers
    return df[mask]

# Define the features screened for outliers
features = ['Size', 'Weight', 'Sweetness', 'Crunchiness', 'Juiciness', 'Ripeness', 'Acidity']

# Get the cleaned DataFrame
cleaned_df = remove_outliers(df, features)

# Summarize the cleaned frame: row count, columns, non-null counts, dtypes.
# (A bare `cleaned_df.head()` here would be discarded, because a notebook
# only displays the last expression of a cell.)
cleaned_df.info()

# Report how many rows were dropped because at least one feature was an outlier
outliers_removed = len(df) - len(cleaned_df)
print("Total rows removed:", outliers_removed)

<class 'pandas.core.frame.DataFrame'>
Index: 3790 entries, 1 to 3999
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   A_id         3790 non-null   float64
 1   Size         3790 non-null   float64
 2   Weight       3790 non-null   float64
 3   Sweetness    3790 non-null   float64
 4   Crunchiness  3790 non-null   float64
 5   Juiciness    3790 non-null   float64
 6   Ripeness     3790 non-null   float64
 7   Acidity      3790 non-null   float64
 8   Quality      3790 non-null   object 
dtypes: float64(8), object(1)
memory usage: 296.1+ KB
Total rows removed: 210


In [67]:
# svm model

from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.svm import SVC
import matplotlib.pyplot as plt


# Build the feature matrix and label vector from the cleaned frame

# Features: every column except the first (A_id) and the last (Quality)
print("X\n")
X = cleaned_df.iloc[:,1:-1]
print(X)

# Labels: the last column (Quality), flattened to a 1-D array
print("Y\n")
y = cleaned_df.iloc[:,[-1]]
y = y.values.ravel()
print(y)

print("dataset size: {}".format(X.shape[0]), "\n")

# Split the dataset into train and test sets [80:20]; the fixed random_state
# keeps the split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features. The scaler is fitted on the training split only
# and then applied to the test split, so no test statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# Initialize and train the model
svm_model = SVC(kernel='rbf')
svm_model.fit(X_train, y_train)


# Make predictions on the held-out split
y_pred = svm_model.predict(X_test)


# Get performance metrics.
# The F1 result is stored as `f1`: assigning it to `f1_score` would rebind the
# imported sklearn function to a float and break any later f1_score(...) call.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, pos_label='good', average='binary')
cm = confusion_matrix(y_test, y_pred)

print("accuracy: ", accuracy)
print("confusion matrix: \n", cm)
print("f1 score: ", f1)
    

X

          Size    Weight  Sweetness  Crunchiness  Juiciness  Ripeness  \
1    -1.195217 -2.839257   3.664059     1.588232   0.853286  0.867530   
2    -0.292024 -1.351282  -1.738429    -0.342616   2.838636 -0.038033   
3    -0.657196 -2.271627   1.324874    -0.097875   3.637970 -3.413761   
4     1.364217 -1.296612  -0.384658    -0.553006   3.030874 -1.303849   
5    -3.425400 -1.409082  -1.913511    -0.555775  -3.853071  1.914616   
...        ...       ...        ...          ...        ...       ...   
3995  0.059386 -1.067408  -3.714549     0.473052   1.697986  2.244055   
3996 -0.293118  1.949253  -0.204020    -0.640196   0.024523 -1.087900   
3997 -2.634515 -2.138247  -2.440461     0.657223   2.199709  4.763859   
3998 -4.008004 -1.779337   2.366397    -0.200329   2.161435  0.214488   
3999  0.278540 -1.715505   0.121217    -1.154075   1.266677 -0.776571   

       Acidity  
1    -0.722809  
2     2.621636  
3     0.790723  
4     0.501984  
5    -2.981523  
...        ...  
3