# Seatwork 6.1 Exploratory Data Analysis on Your Own Dataset

In [81]:
# Path of the apple-quality CSV file that the notebook loads.
filepath = "/content/apple_quality.csv"

In [82]:
import pandas as pd
import numpy as np

In [83]:
# Read the CSV at `filepath` into the raw (not yet cleaned) DataFrame.
data = pd.read_csv(filepath_or_buffer=filepath)

In [84]:
# Show the raw frame exactly as loaded (the notebook renders a cell's last expression).
data

Unnamed: 0,A_id,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Quality
0,0.0,-3.970049,-2.512336,5.346330,-1.012009,1.844900,0.329840,-0.491590483,good
1,1.0,-1.195217,-2.839257,3.664059,1.588232,0.853286,0.867530,-0.722809367,good
2,2.0,-0.292024,-1.351282,-1.738429,-0.342616,2.838636,-0.038033,2.621636473,bad
3,3.0,-0.657196,-2.271627,1.324874,-0.097875,3.637970,-3.413761,0.790723217,good
4,4.0,1.364217,-1.296612,-0.384658,-0.553006,3.030874,-1.303849,0.501984036,good
...,...,...,...,...,...,...,...,...,...
3996,3996.0,-0.293118,1.949253,-0.204020,-0.640196,0.024523,-1.087900,1.854235285,good
3997,3997.0,-2.634515,-2.138247,-2.440461,0.657223,2.199709,4.763859,-1.334611391,bad
3998,3998.0,-4.008004,-1.779337,2.366397,-0.200329,2.161435,0.214488,-2.229719806,good
3999,3999.0,0.278540,-1.715505,0.121217,-1.154075,1.266677,-0.776571,1.599796456,good


# Data Cleaning

In [86]:
# Build the cleaned frame in a single pass: convert +/-inf to NaN, then drop
# every row that contains at least one NaN (per the original note, the only
# such row is the trailing author-credit line of the file).
# The former `df = data.dropna()` statement was removed: its result was
# overwritten immediately by this statement, which starts again from `data`.
# `.copy()` makes `df` an independent frame, because the notebook goes on to
# assign columns on it.
df = data.replace([np.inf, -np.inf], np.nan).dropna().copy()

In [87]:
# Count fully duplicated rows: duplicated() flags each row that repeats an
# earlier one, and sum() counts the flagged rows (0 means no duplicates).
df.duplicated().sum()

0

In [88]:
# A_id was read as float64 although it holds whole-number identifiers;
# cast the column to int and store it back.
a_id_as_int = df['A_id'].astype(int)
df['A_id'] = a_id_as_int

In [89]:
# Display the frame to confirm that A_id is now shown as integers.
df

Unnamed: 0,A_id,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Quality
0,0,-3.970049,-2.512336,5.346330,-1.012009,1.844900,0.329840,-0.491590483,good
1,1,-1.195217,-2.839257,3.664059,1.588232,0.853286,0.867530,-0.722809367,good
2,2,-0.292024,-1.351282,-1.738429,-0.342616,2.838636,-0.038033,2.621636473,bad
3,3,-0.657196,-2.271627,1.324874,-0.097875,3.637970,-3.413761,0.790723217,good
4,4,1.364217,-1.296612,-0.384658,-0.553006,3.030874,-1.303849,0.501984036,good
...,...,...,...,...,...,...,...,...,...
3995,3995,0.059386,-1.067408,-3.714549,0.473052,1.697986,2.244055,0.137784369,bad
3996,3996,-0.293118,1.949253,-0.204020,-0.640196,0.024523,-1.087900,1.854235285,good
3997,3997,-2.634515,-2.138247,-2.440461,0.657223,2.199709,4.763859,-1.334611391,bad
3998,3998,-4.008004,-1.779337,2.366397,-0.200329,2.161435,0.214488,-2.229719806,good


In [90]:
# Use the apple identifier (A_id) as the row index instead of a regular column.
df = df.set_index('A_id')

In [91]:
# Display the frame to confirm that A_id is now the index.
df

Unnamed: 0_level_0,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Quality
A_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,-3.970049,-2.512336,5.346330,-1.012009,1.844900,0.329840,-0.491590483,good
1,-1.195217,-2.839257,3.664059,1.588232,0.853286,0.867530,-0.722809367,good
2,-0.292024,-1.351282,-1.738429,-0.342616,2.838636,-0.038033,2.621636473,bad
3,-0.657196,-2.271627,1.324874,-0.097875,3.637970,-3.413761,0.790723217,good
4,1.364217,-1.296612,-0.384658,-0.553006,3.030874,-1.303849,0.501984036,good
...,...,...,...,...,...,...,...,...
3995,0.059386,-1.067408,-3.714549,0.473052,1.697986,2.244055,0.137784369,bad
3996,-0.293118,1.949253,-0.204020,-0.640196,0.024523,-1.087900,1.854235285,good
3997,-2.634515,-2.138247,-2.440461,0.657223,2.199709,4.763859,-1.334611391,bad
3998,-4.008004,-1.779337,2.366397,-0.200329,2.161435,0.214488,-2.229719806,good


In [92]:
# List the column labels of the frame (A_id is no longer among them because
# it was moved into the index).
df.columns

Index(['Size', 'Weight', 'Sweetness', 'Crunchiness', 'Juiciness', 'Ripeness',
       'Acidity', 'Quality'],
      dtype='object')

In [93]:
# Inspect the dtype of every column; a numeric-looking column reported as
# `object` was read as text and still needs to be converted.
df.dtypes

Size           float64
Weight         float64
Sweetness      float64
Crunchiness    float64
Juiciness      float64
Ripeness       float64
Acidity         object
Quality         object
dtype: object

In [94]:
# number of rows (records) in the frame
df.shape[0]

4000

In [95]:
# Acidity was read as text (object dtype); parse it into floats.
# errors='coerce' turns any entry that cannot be parsed into NaN instead of raising.
df = df.assign(Acidity=pd.to_numeric(df['Acidity'], errors='coerce'))

In [96]:
# Re-check the dtypes to confirm that Acidity is now float64.
df.dtypes

Size           float64
Weight         float64
Sweetness      float64
Crunchiness    float64
Juiciness      float64
Ripeness       float64
Acidity        float64
Quality         object
dtype: object

In [97]:
# Encode Quality as a numeric Grade column: 1 where Quality is 'good', 0 otherwise.
df['Grade'] = (df['Quality'] == 'good').astype(int)

In [98]:
# Display the frame to confirm the new Grade column next to Quality.
df

Unnamed: 0_level_0,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Quality,Grade
A_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,-3.970049,-2.512336,5.346330,-1.012009,1.844900,0.329840,-0.491590,good,1
1,-1.195217,-2.839257,3.664059,1.588232,0.853286,0.867530,-0.722809,good,1
2,-0.292024,-1.351282,-1.738429,-0.342616,2.838636,-0.038033,2.621636,bad,0
3,-0.657196,-2.271627,1.324874,-0.097875,3.637970,-3.413761,0.790723,good,1
4,1.364217,-1.296612,-0.384658,-0.553006,3.030874,-1.303849,0.501984,good,1
...,...,...,...,...,...,...,...,...,...
3995,0.059386,-1.067408,-3.714549,0.473052,1.697986,2.244055,0.137784,bad,0
3996,-0.293118,1.949253,-0.204020,-0.640196,0.024523,-1.087900,1.854235,good,1
3997,-2.634515,-2.138247,-2.440461,0.657223,2.199709,4.763859,-1.334611,bad,0
3998,-4.008004,-1.779337,2.366397,-0.200329,2.161435,0.214488,-2.229720,good,1


# Data Analysis and Statistics

Mean
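- The mean size and mean sweetness of the samples are slightly below average, i.e. slightly below zero on this data, which appears to be scaled around zero (about -0.50 and -0.47).
- The mean weight is the furthest below zero of all the attributes (about -0.99).
- The mean crunchiness (about 0.99), juiciness (about 0.51), and ripeness (about 0.50) are all above zero, and the mean quality grade is 0.501, meaning that about 50.1% of the samples are labelled good.
- The mean acidity is only slightly above zero (about 0.08).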

In conclusion, the samples appear to have good attributes in terms of their crunchiness, juiciness, ripeness, and quality. Their size, sweetness, and weight are all below average, while their acidity is slightly above average.


In [99]:
# Arithmetic mean of every attribute, computed with numpy.
# Quality is averaged through its numeric 0/1 encoding in the Grade column.
column_means = {
    name: np.mean(df[name])
    for name in ('Size', 'Weight', 'Sweetness', 'Crunchiness',
                 'Juiciness', 'Ripeness', 'Acidity', 'Grade')
}

size_mean = column_means['Size']
weight_mean = column_means['Weight']
sweetness_mean = column_means['Sweetness']
crunchiness_mean = column_means['Crunchiness']
juiciness_mean = column_means['Juiciness']
ripeness_mean = column_means['Ripeness']
acidity_mean = column_means['Acidity']
quality_mean = column_means['Grade']

# Report the results one per line, in the same order as the columns.
for label, value in (
    ('Mean of Size:', size_mean),
    ('Mean of Weight:', weight_mean),
    ('Mean of Sweetness:', sweetness_mean),
    ('Mean of Crunchiness:', crunchiness_mean),
    ('Mean of Juiciness:', juiciness_mean),
    ('Mean of Ripeness:', ripeness_mean),
    ('Mean of Acidity:', acidity_mean),
    ('Mean of Quality:', quality_mean),
):
    print(label, value)

Mean of Size: -0.50301462982675
Mean of Weight: -0.9895465445945
Mean of Sweetness: -0.47047851978824995
Mean of Crunchiness: 0.9854779038585
Mean of Juiciness: 0.5121179684932501
Mean of Ripeness: 0.4982774280305
Mean of Acidity: 0.07687729571600001
Mean of Quality: 0.501


Median
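- The median size, weight, and sweetness of the samples are below average, i.e. negative (about -0.51, -0.98, and -0.50).
- The median crunchiness (about 1.00), juiciness (about 0.53), and ripeness (about 0.50) are above zero, and the median quality grade is 1.0, meaning that at least half of the samples are labelled good.
- The median acidity is slightly above zero (about 0.02), although it is lower than the mean acidity (about 0.08).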

In conclusion, the samples have good attributes in terms of crunchiness, juiciness, ripeness, and quality, but they are below average in terms of size, weight, and sweetness; the median acidity is close to zero (very slightly positive).


In [100]:
# Median of every attribute, computed with numpy.
# Quality is summarised through its numeric 0/1 encoding in the Grade column.
# np.nanmedian() ignores NaN values, so each median stays defined even if the
# errors='coerce' conversion of Acidity left a NaN behind; np.median() would
# return NaN for such a column. On NaN-free data both give the same result.
size_median = np.nanmedian(df['Size'])
weight_median = np.nanmedian(df['Weight'])
sweetness_median = np.nanmedian(df['Sweetness'])
crunchiness_median = np.nanmedian(df['Crunchiness'])
juiciness_median = np.nanmedian(df['Juiciness'])
ripeness_median = np.nanmedian(df['Ripeness'])
acidity_median = np.nanmedian(df['Acidity'])
quality_median = np.nanmedian(df['Grade'])

print('Median of Size:', size_median)
print('Median of Weight:', weight_median)
print('Median of Sweetness:', sweetness_median)
print('Median of Crunchiness:', crunchiness_median)
print('Median of Juiciness:', juiciness_median)
print('Median of Ripeness:', ripeness_median)
print('Median of Acidity:', acidity_median)
print('Median of Quality:', quality_median)

Median of Size: -0.5137025125000001
Median of Weight: -0.9847364865
Median of Sweetness: -0.5047584635
Median of Crunchiness: 0.9982494390000001
Median of Juiciness: 0.5342186584999999
Median of Ripeness: 0.5034447135
Median of Acidity: 0.022608968
Median of Quality: 1.0


Standard Deviation
- Size, sweetness, juiciness, and ripeness have relatively high variability, with standard deviations of about 1.87–1.94.
- Weight (about 1.60) and crunchiness (about 1.40) have more moderate variability.
- Acidity has the highest variability among the attributes, as indicated by its standard deviation of about 2.11, the largest of all.
- Quality has the lowest standard deviation, close to 0.5; because the grade is a binary 0/1 value, 0.5 is the largest spread it can have, and it reflects an almost even split between good and bad samples.

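In conclusion, these standard deviations suggest that there is variability in the dataset across all attributes, with acidity showing the most variation. Quality has the smallest standard deviation, but because it is a binary 0/1 grade this value is not directly comparable with the continuous attributes and does not mean that quality is the most consistent.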


In [101]:
# Standard deviation of every attribute, computed with numpy
# (Quality is measured through its numeric 0/1 encoding in the Grade column).
# np.std is called with its default ddof=0 (population formula), so the values
# are slightly smaller than the sample std that df.describe() reports.
(
    size_std,
    weight_std,
    sweetness_std,
    crunchiness_std,
    juiciness_std,
    ripeness_std,
    acidity_std,
    quality_std,
) = (
    np.std(df[column])
    for column in ('Size', 'Weight', 'Sweetness', 'Crunchiness',
                   'Juiciness', 'Ripeness', 'Acidity', 'Grade')
)

# Report the results one per line, in the same order as the columns.
for label, value in (
    ('Standard Deviation of Size:', size_std),
    ('Standard Deviation of Weight:', weight_std),
    ('Standard Deviation of Sweetness:', sweetness_std),
    ('Standard Deviation of Crunchiness:', crunchiness_std),
    ('Standard Deviation of Juiciness:', juiciness_std),
    ('Standard Deviation of Ripeness:', ripeness_std),
    ('Standard Deviation of Acidity:', acidity_std),
    ('Standard Deviation of Quality:', quality_std),
):
    print(label, value)

Standard Deviation of Size: 1.9278176664540305
Standard Deviation of Weight: 1.602306888228833
Standard Deviation of Sweetness: 1.9431977136530587
Standard Deviation of Crunchiness: 1.4025818486010257
Standard Deviation of Juiciness: 1.9300443723029157
Standard Deviation of Ripeness: 1.8741924577105886
Standard Deviation of Acidity: 2.1100058362278236
Standard Deviation of Quality: 0.499998999999


In [102]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric
# column. The std row is the sample standard deviation, which is why it is
# slightly larger than the np.std values printed earlier in the notebook.
df.describe()

Unnamed: 0,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Grade
count,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0
mean,-0.503015,-0.989547,-0.470479,0.985478,0.512118,0.498277,0.076877,0.501
std,1.928059,1.602507,1.943441,1.402757,1.930286,1.874427,2.11027,0.500062
min,-7.151703,-7.149848,-6.894485,-6.055058,-5.961897,-5.864599,-7.010538,0.0
25%,-1.816765,-2.01177,-1.738425,0.062764,-0.801286,-0.771677,-1.377424,0.0
50%,-0.513703,-0.984736,-0.504758,0.998249,0.534219,0.503445,0.022609,1.0
75%,0.805526,0.030976,0.801922,1.894234,1.835976,1.766212,1.510493,1.0
max,6.406367,5.790714,6.374916,7.619852,7.364403,7.237837,7.404736,1.0
