In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
data=pd.read_csv('diamonds.csv', sep=',')
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
#Describe if the attributes are discrete/continuous, Nominal/Ordinal/Interval/Ratio
print(data.dtypes)

carat      float64
cut         object
color       object
clarity     object
depth      float64
table      float64
price        int64
x          float64
y          float64
z          float64
dtype: object


In [5]:
# Convert 'carat', 'depth', 'table', 'x', 'y', 'z' columns to numeric
cols_to_convert = ['carat', 'depth', 'table', 'x', 'y', 'z']

for col in cols_to_convert:
    data[col] = pd.to_numeric(data[col], errors='coerce')
print(data.dtypes)

carat      float64
cut         object
color       object
clarity     object
depth      float64
table      float64
price        int64
x          float64
y          float64
z          float64
dtype: object


In [6]:
#Give an account of whether there are data issues (i.e. missing values or corrupted data) and describe them if so.
print(data['cut'].value_counts())
print(data['color'].value_counts())
print(data['clarity'].value_counts())

cut
Ideal        21551
Premium      13791
Very Good    12082
Good          4906
Fair          1610
Name: count, dtype: int64
color
G    11292
E     9797
F     9542
H     8304
D     6775
I     5422
J     2808
Name: count, dtype: int64
clarity
SI1     13065
VS2     12258
SI2      9194
VS1      8171
VVS2     5066
VVS1     3655
IF       1790
I1        741
Name: count, dtype: int64


In [4]:
# Check for missing values in the entire dataset
print("Missing values in each column:")
print(data.isnull().sum())

# Check for duplicated rows
print("Duplicated rows: ", data.duplicated().sum())

# Check for rows where all columns are missing
print("Rows with all missing values: ", data.isnull().all(axis=1).sum())

Missing values in each column:
carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64
Duplicated rows:  146
Rows with all missing values:  0


In [5]:
#checks for negative values in columns that should only contain positive values (like 'carat', 'depth', 'table', 'x', 'y', 'z', 'price') 
#and zero values in columns that should not contain zero ('carat', 'x', 'y', 'z'). 
#checks if all values in the 'cut', 'color', and 'clarity' columns are within the expected categories.'
# Convert columns to the proper datatype
data['carat'] = pd.to_numeric(data['carat'], errors='coerce')
data['depth'] = pd.to_numeric(data['depth'], errors='coerce')
data['table'] = pd.to_numeric(data['table'], errors='coerce')
data['x'] = pd.to_numeric(data['x'], errors='coerce')
data['y'] = pd.to_numeric(data['y'], errors='coerce')
data['z'] = pd.to_numeric(data['z'], errors='coerce')
data['price'] = pd.to_numeric(data['price'], errors='coerce')

# Check for negative values in columns that should only contain positive values
for col in ['carat', 'depth', 'table', 'x', 'y', 'z', 'price']:
    if data[data[col] < 0].shape[0] > 0:
        print(f"Corrupted data in {col}: ", data[data[col] < 0].shape[0])

# Check for zero values in columns that should not contain zero
for col in ['carat', 'x', 'y', 'z']:
    if data[data[col] == 0].shape[0] > 0:
        print(f"Corrupted data in {col}: ", data[data[col] == 0].shape[0])

# Check for values in 'cut', 'color', 'clarity' that are not in the expected categories
expected_cuts = ['Ideal', 'Premium', 'Very Good', 'Good', 'Fair']
expected_colors = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
expected_clarity = ['SI1', 'VS2', 'SI2', 'VS1', 'VVS2', 'VVS1', 'IF', 'I1']

if not set(data['cut']).issubset(expected_cuts):
    print("Unexpected values in 'cut'")
if not set(data['color']).issubset(expected_colors):
    print("Unexpected values in 'color'")
if not set(data['clarity']).issubset(expected_clarity):
    print("Unexpected values in 'clarity'")

Corrupted data in x:  8
Corrupted data in y:  7
Corrupted data in z:  20


In [9]:
# Print summary statistics for specific columns
print(data[['carat', 'depth', 'table', 'price', 'x', 'y', 'z']].describe())

              carat         depth         table         price             x  \
count  53940.000000  53940.000000  53940.000000  53940.000000  53940.000000   
mean       0.797940     61.749405     57.457184   3932.799722      5.731157   
std        0.474011      1.432621      2.234491   3989.439738      1.121761   
min        0.200000     43.000000     43.000000    326.000000      0.000000   
25%        0.400000     61.000000     56.000000    950.000000      4.710000   
50%        0.700000     61.800000     57.000000   2401.000000      5.700000   
75%        1.040000     62.500000     59.000000   5324.250000      6.540000   
max        5.010000     79.000000     95.000000  18823.000000     10.740000   

                  y             z  
count  53940.000000  53940.000000  
mean       5.734526      3.538734  
std        1.142135      0.705699  
min        0.000000      0.000000  
25%        4.720000      2.910000  
50%        5.710000      3.530000  
75%        6.540000      4.040000  
