In [6]:
import pandas as pd
from IPython.display import display

def remove_duplicate(data):
    dups = data.duplicated()

    print(f'Number of duplicate rows: {dups.sum()}')

    data2 = data.drop_duplicates()

    print(f'Number of rows before discarding duplicates: {data.shape[0]}')
    print(f'Number of rows after discarding duplicates: {data2.shape[0]}')

    return data2

def outlier(data):
    data2 = data.drop(['Acidity', 'Quality'], axis=1)

    data2 = data2.apply(pd.to_numeric, errors='coerce')

    Z = (data2 - data2.mean()) / data2.std()

    outliers = (Z.abs() > 3).any(axis=1)

    Z2 = Z[~outliers]

    print(f'Number of rows before discarding outliers: {Z.shape[0]}')
    print(f'Number of rows after discarding outliers: {Z2.shape[0]}')

    return data.loc[~outliers]

def remove_missing(data):
    missing_counts = data.isnull().sum()

    data2 = data.dropna()

    print(f'Number of rows before discarding missing values: {data.shape[0]}')
    print(f'Number of rows after discarding missing values: {data2.shape[0]}')

    return data2

def replace_missing_value_by_median(data):
    data2 = data.copy()

    data2['Weight'] = pd.to_numeric(data2['Weight'], errors='coerce')

    print('Before replacing missing values:')
    print(data2['Weight'].iloc[20:25])

    median_weight = data2['Weight'].median()
    data2['Weight'] = data2['Weight'].fillna(median_weight)

    print('\nAfter replacing missing values by median:')
    print(data2['Weight'].iloc[20:25])

    return data2


def main():
    data = pd.read_csv('apple_quality.csv', header=None)
    data.columns = ['A_id', 'Size', 'Weight', 'Sweetness',
                    'Crunchiness', 'Juiciness', 'Ripeness', 'Acidity',
                    'Quality']
    print(data)
    
    print("Do you want to view data?")
    response = input().lower()
    if response == 'yes':
        view(data)

    print("Do you want to remove noise and further preprocess data?")
    response = input().lower()
    if response == 'yes':
        data = remove_duplicate(data)
        data = outlier(data)
        data = remove_missing(data)
        data = replace_missing_value_by_median(data)
    else:
        quit()

main()


      A_id          Size        Weight     Sweetness   Crunchiness  \
0     A_id          Size        Weight     Sweetness   Crunchiness   
1        0  -3.970048523  -2.512336381   5.346329613  -1.012008712   
2        1  -1.195217191  -2.839256528   3.664058758   1.588232309   
3        2  -0.292023862  -1.351281995  -1.738429162  -0.342615928   
4        3  -0.657195773  -2.271626609   1.324873847  -0.097874716   
...    ...           ...           ...           ...           ...   
3997  3996  -0.293118007   1.949252549   -0.20401993  -0.640195579   
3998  3997  -2.634515299   -2.13824672  -2.440461285   0.657222891   
3999  3998  -4.008003744  -1.779337107   2.366396966  -0.200329367   
4000  3999    0.27853965  -1.715505028   0.121217251  -1.154074758   
4001   NaN           NaN           NaN           NaN           NaN   

        Juiciness      Ripeness                            Acidity  Quality  
0       Juiciness      Ripeness                            Acidity  Quality  
1  

 yes


Unnamed: 0,A_id,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Quality
0,A_id,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Quality
1,0,-3.970048523,-2.512336381,5.346329613,-1.012008712,1.844900361,0.329839797,-0.491590483,good
2,1,-1.195217191,-2.839256528,3.664058758,1.588232309,0.853285795,0.867530082,-0.722809367,good
3,2,-0.292023862,-1.351281995,-1.738429162,-0.342615928,2.838635512,-0.038033328,2.621636473,bad
4,3,-0.657195773,-2.271626609,1.324873847,-0.097874716,3.637970491,-3.413761338,0.790723217,good


Do you want to remove noise and further preprocess data?


 yes


Number of duplicate rows: 0
Number of rows before discarding duplicates: 4002
Number of rows after discarding duplicates: 4002
Number of rows before discarding outliers: 4002
Number of rows after discarding outliers: 3912
Number of rows before discarding missing values: 3912
Number of rows after discarding missing values: 3911
Before replacing missing values:
20    0.356467
23   -0.698501
24   -0.753757
25   -1.428085
26   -3.504792
Name: Weight, dtype: float64

After replacing missing values by median:
20    0.356467
23   -0.698501
24   -0.753757
25   -1.428085
26   -3.504792
Name: Weight, dtype: float64
