## In this notebook, we will be doing the exploratory data analysis (EDA) of the 'cubic_zirconia.csv' dataset provided.

In [None]:
# importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Load the gemstone CSV into `df` and preview the first rows.
DATA_PATH = '../input/gemstone-price-prediction/cubic_zirconia.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Print a structural summary of the frame: columns, dtypes and non-null counts.
df.info()

### There are a total of 26967 rows and 11 columns in our data and we can see that there are no missing values in our data
### We can see the data types of each column 


In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the data.
df.describe()

### Here we get an overall idea of the max, min and average values in our dataset. We will discuss them further during the univariate analysis.

### Let's first start from our target variable 'price'

In [None]:
# Peek at the first few values of the 'price' column.
df['price'].head()

In [None]:
# Summary statistics for the 'price' column alone.
df['price'].describe()

In [None]:
# Distribution of price: density-normalised histogram with a KDE overlay.
# `sns.distplot` is deprecated in seaborn; `histplot(kde=True, stat='density')`
# is the supported equivalent and keeps the y-axis on a density scale.
plt.figure(figsize=(10, 6))
sns.histplot(df['price'], kde=True, stat='density')

In [None]:
# Skewness of price; a positive value means a longer right tail.
df['price'].skew()

In [None]:
# Kurtosis of price. pandas reports excess (Fisher) kurtosis, so a normal
# distribution scores 0 and a positive value indicates heavier tails.
df['price'].kurt()

### The positive skewness value shows that the data is positively (right) skewed, which we can also see in the plot.
### The positive kurtosis value also indicates that the distribution is leptokurtic (heavier-tailed than a normal distribution).

## Univariate Analysis

In [None]:
# Peek at the first few values of the 'cut' column.
df['cut'].head() 

In [None]:
# Number of rows per distinct 'cut' value, most frequent first.
df['cut'].value_counts()

In [None]:
# Number of rows per distinct 'color' value, most frequent first.
df['color'].value_counts()

In [None]:
# Number of rows per distinct 'carat' value, most frequent first.
df['carat'].value_counts()

In [None]:
# Distribution of carat: density-normalised histogram with a KDE overlay.
# `sns.distplot` is deprecated in seaborn; `histplot(kde=True, stat='density')`
# is the supported equivalent and keeps the y-axis on a density scale.
plt.figure(figsize=(10, 6))
sns.histplot(df['carat'], kde=True, stat='density')

In [None]:
# Number of rows per distinct 'clarity' value, most frequent first.
df['clarity'].value_counts()

In [None]:
# Frequency of each level of the three categorical features, side by side.
# The column is passed by keyword (`x=..., data=df`): newer seaborn releases
# treat the first positional argument as `data`, so passing the Series
# positionally no longer reliably plots it on the x-axis.
fig, axes = plt.subplots(ncols=3, figsize=(22, 6))
for ax, column in zip(axes, ['clarity', 'color', 'cut']):
    sns.countplot(x=column, data=df, ax=ax)
    ax.set_xlabel(column.capitalize())
    ax.set_ylabel('Frequency')

## Observations

### 1. SI1 clarity gemstones have the highest frequency and I1 gemstones have the lowest frequency.

### 2. Ideal cut gemstones have the highest frequency, whereas Fair cut gemstones have the lowest frequency.

### 3. G color gemstones have the highest frequency, whereas J color gemstones have the lowest frequency.

In [None]:
# Range of the 'depth' column as a (min, max) tuple.
depth_col = df['depth']
depth_col.min(), depth_col.max()

In [None]:
# Range of the 'carat' column as a (min, max) tuple.
carat_col = df['carat']
carat_col.min(), carat_col.max()

In [None]:
# Range of the 'price' column as a (min, max) tuple.
price_col = df['price']
price_col.min(), price_col.max()

## Multivariate Analysis

In [None]:
# List the column names available for the plots that follow.
df.columns

#### Let's see the relationship between gemstone carat and price

In [None]:
# Price against carat as a line plot with a shaded error band.
fig, ax = plt.subplots(figsize=(15, 8))
sns.lineplot(data=df, x='carat', y='price', err_style='band', ax=ax)

We can observe from this plot that carat and price have an almost linear relationship.

#### Let's see the relationship between gemstone cut and price

In [None]:
# Price against cut category as a line plot.
fig, ax = plt.subplots(figsize=(15, 8))
sns.lineplot(data=df, x='cut', y='price', ax=ax)

#### From this plot and the earlier frequency counts, we can observe that the Ideal cut has the highest frequency and a fairly low price compared to the other cut types, which suggests that this gemstone is common.

In [None]:
# Price against clarity grade as a line plot.
fig, ax = plt.subplots(figsize=(15, 8))
sns.lineplot(data=df, x='clarity', y='price', ax=ax)

#### VVS1 and SI2 are at the lowest and highest price ranges respectively, whereas SI1, the most frequent clarity grade, has a price range in between the two.

In [None]:
# Price against color grade as a line plot.
fig, ax = plt.subplots(figsize=(15, 8))
sns.lineplot(data=df, x='color', y='price', ax=ax)

### Colors J and I are not only the two priciest categories of gemstones but also have the two lowest frequencies, which suggests that J and I are the rarest gemstones.

In [None]:
# Box plot of price per clarity grade, restricted to rows priced below the cap
# (presumably so the expensive tail does not compress the boxes).
# The original leading `value_counts()` call was dropped: its result was
# discarded, because only a cell's last expression is displayed.
PRICE_CAP = 10000
df_sub = df[df['price'] < PRICE_CAP]
fig, ax = plt.subplots(figsize=(22, 6))
sns.boxplot(x="clarity", y="price", data=df_sub, ax=ax)
plt.show()

In [None]:
# Box plot of price per cut category, restricted to rows priced below the cap
# (presumably so the expensive tail does not compress the boxes).
# The original leading `value_counts()` call was dropped: its result was
# discarded, because only a cell's last expression is displayed.
PRICE_CAP = 10000
df_sub = df[df['price'] < PRICE_CAP]
fig, ax = plt.subplots(figsize=(22, 6))
sns.boxplot(x="cut", y="price", data=df_sub, ax=ax)
plt.show()

In [None]:
# Box plot of price per color grade, restricted to rows priced below the cap
# (presumably so the expensive tail does not compress the boxes).
# The original leading `value_counts()` call was dropped: its result was
# discarded, because only a cell's last expression is displayed.
PRICE_CAP = 10000
df_sub = df[df['price'] < PRICE_CAP]
fig, ax = plt.subplots(figsize=(22, 6))
sns.boxplot(x="color", y="price", data=df_sub, ax=ax)
plt.show()

In [None]:
# Correlation heatmap of the columns left after excluding cut, clarity, carat,
# price and color.
#
# The subset goes into its own variable instead of rebinding `df`: overwriting
# `df` made this cell raise KeyError on a second run and removed columns that
# every earlier cell relies on.
numeric_features = (
    df.drop(columns=['cut', 'clarity', 'carat', 'price', 'color'])
      .select_dtypes(include='number')  # assumes the remaining columns are numeric; any others are skipped
)

# Correlate the actual values. Factorizing first would replace each value with
# an order-of-first-appearance code, which discards numeric ordering and makes
# the coefficients meaningless for continuous columns.
corr = numeric_features.corr()

plt.figure(figsize=(12, 6))
ax = sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns,
                 linewidths=.2, cmap="YlGnBu")