# Import the required libraries and load the data

In [None]:
#importing required libraries
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,roc_curve,accuracy_score,auc,roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import LabelEncoder
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster

In [None]:
#loading the data
# The CSV location can be overridden through the RENTTHERUNWAY_CSV environment
# variable so the notebook is not tied to one machine; when the variable is not
# set, the original local path is used.
import os

DATA_PATH = os.environ.get(
    'RENTTHERUNWAY_CSV',
    'C:/Users/anuj/Desktop/Anuj/Python/GL Study Material/Project 2/renttherunway.csv',
)
df = pd.read_csv(DATA_PATH)
df.head(5)

In [None]:
# Keep a separate copy of the freshly loaded frame; `df` itself is modified by the cleaning cells below.
df1 = df.copy()

# Data cleansing and Exploratory data analysis

In [None]:
# Report the (rows, columns) dimensions of the loaded frame.
print(f"Shape of the dataset: {df.shape}")

In [None]:
# DataFrame.info() writes its report straight to stdout and returns None, so it
# must not be passed to print() (that would append a stray "None" after the report).
print("Information of the dataset:")
df.info()

In [None]:
# Description of numeric data: summary statistics of the numeric columns
df.describe()

In [None]:
# Description of non-numeric data: include='O' restricts the summary to object-dtype columns
df.describe(include='O')

In [None]:
# Checking for unique values: number of distinct values in each column
df.nunique()

- Columns like Unnamed: 0, user_id, review_text, review_date and review_summary can be dropped because they have too many unique values. Since the review-related data is being dropped, review_date can be removed as well.
- Weight and height are non-numeric and need to be converted to numeric.
- Age has a maximum value of 117. We need to check the plot of age to see whether there is any skewness in the data.

In [None]:
# Show every row that is an exact repeat of an earlier row (empty result => no duplicates)
duplicate_mask = df.duplicated()
df.loc[duplicate_mask]

 - No duplicate values found

In [None]:
# Remove the identifier and free-text review columns that are not used in the analysis
redundant_columns = [
    'Unnamed: 0',
    'user_id',
    'review_text',
    'review_summary',
    'review_date',
    'item_id',
]
df = df.drop(columns=redundant_columns)
df.head()

In [None]:
# Weight is stored as text with an 'lbs' suffix: strip the suffix, then parse to a number.
# Values that still cannot be parsed become NaN (errors='coerce').
weight_text = df['weight'].str.replace('lbs', '')
df['weight'] = pd.to_numeric(weight_text, errors = 'coerce')


In [None]:
# Convert the height text into inches (numeric): split on the feet mark ('),
# strip the inch mark (") from both parts, then compute feet * 12 + inches.
height_parts = df['height'].str.split("'")
height_feet = height_parts.str.get(0).str.replace('"','').astype(float)
height_inches = height_parts.str.get(1).str.replace('"','').astype(float)
df['height'] = height_feet*12 + height_inches


In [None]:
# Re-inspect column dtypes and non-null counts after the weight/height conversions above
df.info()

- Height and weight are now converted into numeric datatype

In [None]:
## Check the unique entries of 'rented for' column
# Lists the distinct values currently present in the column.
df['rented for'].unique()

In [None]:
# Fold the 'party: cocktail' label into the plain 'party' category, then re-list the categories
df['rented for'] = df['rented for'].replace({'party: cocktail': 'party'})
df['rented for'].unique()

In [None]:
# Percentage of missing values per column, rounded to two decimals
missing_percentage = df.isna().mean().mul(100).round(2)
print("Percentage of missing values:", missing_percentage, sep="\n")

In [None]:
# Summary statistics of the numeric columns, which now include the converted weight and height
df.describe()

- Weight and age have a large difference between their minimum and maximum values, so the median will be used to fill their missing values.
- For height and rating, the mean will be used to fill the missing values, as the percentage of missing values is low.
- For bust size, rented for and body type, the mode will be used, as they are of categorical datatype and their top values seem to have a decent frequency.

In [None]:
## Imputing the missing values.
# Median for the widely spread columns, mean for the others.
for column in ('weight', 'age'):
    df[column] = df[column].fillna(df[column].median())
for column in ('rating', 'height'):
    df[column] = df[column].fillna(df[column].mean())


In [None]:
# Show the most frequent value(s) of each categorical column that has missing entries
for column in ('bust size', 'body type', 'rented for'):
    print(f'mode of column {column}:', df[column].mode())

In [None]:
#filling the missing values with the common values
# The fill value is taken from the data itself (first mode of each column)
# instead of being typed in by hand, so it stays correct if the data changes.
for column in ('bust size', 'body type', 'rented for'):
    most_common_value = df[column].mode().iloc[0]
    df[column] = df[column].fillna(most_common_value)

In [None]:
# Re-compute the per-column missing percentage to confirm the imputation left no gaps
missing_percentage_imputed = df.isna().mean().mul(100).round(2)
print("Percentage of missing values after imputing:", missing_percentage_imputed, sep="\n")

- No missing values are present

In [None]:
#checking for outliers in age
# The series is passed as x= explicitly: how seaborn treats a lone positional
# argument differs between versions, while the keyword form is stable.
sns.boxplot(x=df['age'])
plt.show()

In [None]:
# Density of the age distribution. `fill=True` replaces the deprecated `shade=True`.
sns.kdeplot(df['age'], color="blue", fill=True)
plt.show()

In [None]:
# Share (in percent) of rows with an implausible age: 100 or older, and exactly 0
total_rows = len(df['age'])
ratio_age_greater_than_100 = (len(df[df['age'] >= 100]) / total_rows) * 100
# NOTE: despite its name, this variable holds the share of rows with age == 0
ratio_age_equal_to_100 = (len(df[df['age'] == 0]) / total_rows) * 100
print(ratio_age_greater_than_100, ratio_age_equal_to_100)

- As the number of datapoints with age >= 100 or age = 0 is very small, we can drop these rows.

In [None]:
#dropping the respective rows
# Remove rows with age >= 100 or age == 0 in a single in-place drop.
invalid_age_rows = df.index[(df['age'] >= 100) | (df['age'] == 0)]
df.drop(invalid_age_rows, axis=0, inplace=True)
# Re-check with the same threshold that was used for the drop (>= 100, not > 100);
# both ratios are expected to be 0 now.
ratio_age_greater_than_100 = (len(df[df['age'] >= 100])/len(df['age']))*100
# NOTE: despite its name, this variable holds the share of rows with age == 0
ratio_age_equal_to_100 = (len(df[df['age'] == 0])/len(df['age']))*100
print(ratio_age_greater_than_100,ratio_age_equal_to_100)


In [None]:
# Box plot of age after removing the implausible values. The series is passed
# as x= explicitly because positional handling differs between seaborn versions.
sns.boxplot(x=df['age'])
plt.show()

In [None]:
# Frequency of each 'rented for' category. The column is passed as x= because
# seaborn's handling of a lone positional argument differs between versions.
plt.figure(figsize = (12,8))
sns.countplot(x=df['rented for'])
plt.show()

In [None]:
# Percentage of rows rented for a wedding, a formal affair or a party
main_occasions = ["wedding", "formal affair", "party"]
sum(len(df[df['rented for'] == occasion]) for occasion in main_occasions) / len(df['rented for']) * 100

- Dropping rows whose 'rented for' value is vacation, date, other, everyday or work, as together they account for only 30% of the data

In [None]:
# Remove, in place, every row whose 'rented for' value is one of the minor occasions
minor_occasions = ["vacation", "other", "date", "everyday", "work"]
minor_occasion_rows = df.index[df['rented for'].isin(minor_occasions)]
df.drop(minor_occasion_rows, axis=0, inplace=True)
df.shape

In [None]:
# Frequency of each 'fit' category. The column is passed as x= because
# seaborn's handling of a lone positional argument differs between versions.
plt.figure(figsize = (12,8))
sns.countplot(x=df['fit'])
plt.show()

- Dropping rows having fit value as small and large because of their insignificant contribution

In [None]:
# Remove, in place, the rows whose 'fit' value is "small" or "large"
excluded_fits = ["small", "large"]
excluded_fit_rows = df.index[df['fit'].isin(excluded_fits)]
df.drop(excluded_fit_rows, axis=0, inplace=True)
df.shape

In [None]:
# Description of non-numeric data after the row filtering above (object-dtype columns only)
df.describe(include='O')

In [None]:
## Label encoding
# Replace every object-dtype column with integer codes; the same encoder
# instance is re-fitted for each column in turn.
df_cat = df.select_dtypes(include='object')
le = LabelEncoder()
for col in df_cat.columns:
    encoded_values = le.fit_transform(df[col])
    df[col] = encoded_values


In [None]:
## Standardization
# Scale a copy of the encoded frame with StandardScaler, then wrap the result
# back into a DataFrame that keeps the original row index and column names.
df_scaled = df.copy()
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df_scaled.values)
scaled_features_df = pd.DataFrame(
    data=scaled_features,
    columns=df_scaled.columns,
    index=df_scaled.index,
)

In [None]:
# Preview the first rows of the standardized features
scaled_features_df.head()

## Principal Component Analysis

In [None]:
## Calculating covariance matrix
# np.cov treats each row as a variable, so the frame is transposed to put the
# features on the rows; the result is a features x features matrix.
cov_matrix = np.cov(scaled_features_df.T)
print('Covariance matrix','\n',cov_matrix)

In [None]:
## Calculating eigen values and eigen vectors
# np.linalg.eig does not guarantee any ordering of the eigenvalues; the next
# cell sorts them before computing the explained variance.
# NOTE(review): cov_matrix is symmetric, so np.linalg.eigh may be the more
# appropriate routine here — confirm before switching, as it changes the output order.
eig_vals, eig_vecs = np.linalg.eig(cov_matrix)
print('Eigen vectors:','\n',eig_vecs)
print('\n')
print('Eigen values:','\n',eig_vals)

In [None]:
## Calculating the variance explained and the cumulative variance explained
# Each eigenvalue's share of the total (in percent), largest first, and the running sum of those shares.
total = sum(eig_vals)
sorted_eig_vals = sorted(eig_vals, reverse=True)
var_exp = [(eig_val / total) * 100 for eig_val in sorted_eig_vals]
cum_var_exp = np.cumsum(var_exp)
print('Variance Explained: ', var_exp)
print('Cummulative Variance Explained: ', cum_var_exp)

- We can see that approximately 80 - 90% of the variance is explained by the first 6 principal components.
- So, we can choose the optimal number of principal components as 6.

In [None]:
## Fitting the PCA model
# Keep 6 components, the number chosen from the cumulative explained variance above.
pca=PCA(n_components = 6)
pca.fit(scaled_features_df)

In [None]:
# Project the standardized features onto the fitted components and label the columns PC1..PC6
pc_names = [f'PC{k}' for k in range(1, 7)]
data_pca = pd.DataFrame(pca.transform(scaled_features_df), columns=pc_names)
data_pca.head()

## K-means Clustering

In [None]:
# Fit K-means for K = 1..14 on the PCA-reduced data and record the inertia
# (within-cluster sum of squares) of each fit for the elbow plot.
cluster_range = range(1,15)
cluster_errors = []

for num_clusters in cluster_range:
    # random_state is fixed so the inertia values are reproducible across runs
    clusters = KMeans(n_clusters=num_clusters, n_init=10, random_state=100)
    clusters.fit(data_pca)
    cluster_errors.append(clusters.inertia_)

clusters_df = pd.DataFrame({'num_clusters':cluster_range, 
                           'cluster_errors':cluster_errors})

clusters_df

In [None]:
## Elbow method
# Plot inertia against the number of clusters to look for the elbow
plt.figure(figsize=(12, 6))
plt.plot(
    clusters_df['num_clusters'],
    clusters_df['cluster_errors'],
    marker='o',
    color='b',
)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters using PCA')
plt.show()

- From the Elbow plot, we can see that the drop in inertia levels off around K=4, so we will use 4 clusters.
- The clusters are labeled as 0,1,2,3.

In [None]:
## Fit the KMeans clustering model using the obtained optimal K
# 4 clusters, 15 centroid initialisations, and a fixed random_state for a repeatable result
kmeans = KMeans(n_clusters=4, n_init=15, random_state=100)
kmeans.fit(data_pca)

In [None]:
## Wrap the cluster assignments of the fitted model in a one-column dataframe
label = pd.DataFrame({'Label': kmeans.labels_})

In [None]:
## Attach the cluster label to each row of principal-component scores (joined on the index)
kmeans_df = data_pca.join(label, how='left')
kmeans_df.head()

In [None]:
# Number of rows assigned to each cluster
kmeans_df['Label'].value_counts()

In [None]:
## Visualising the clusters formed
# x and y are passed by keyword: seaborn 0.12+ rejects more than one positional
# argument here, so the original positional call raises a TypeError there.
sns.scatterplot(x='PC3', y='PC2', hue='Label', data=kmeans_df)
plt.show()

In [None]:
#computing silhouette score
from sklearn.metrics import silhouette_score

# Silhouette score of a K-means fit for each candidate K from 2 to 14.
# A separate estimator name is used so the fitted 4-cluster `kmeans` model is
# not overwritten, random_state/n_init are fixed so the scores are reproducible,
# and each score is stored in `kmeans_score` as well as printed.
kmeans_score = []

for i in range(2,15):
    candidate_kmeans = KMeans(n_clusters=i, n_init=10, random_state=100)
    candidate_kmeans = candidate_kmeans.fit(data_pca)
    labels = candidate_kmeans.predict(data_pca)
    score = silhouette_score(data_pca,labels)
    kmeans_score.append(score)
    print(i,score)

In [None]:
## Agglomerative Clustering

In [None]:
# Build the Ward linkage on the PCA-reduced data and draw its dendrogram
merg = linkage(data_pca, method='ward')
plt.figure(figsize=(18, 7))
dendrogram(merg, leaf_rotation=90)
plt.ylabel('Euclidean distance')
plt.xlabel('Datapoints')
plt.show()

In [None]:
from sklearn.metrics import silhouette_score

# Silhouette score of an agglomerative clustering for each candidate cluster
# count from 2 to 14. fit_predict() already fits the model, so the separate
# fit() call (which doubled the work) is dropped.
for i in range(2,15):
    hier = AgglomerativeClustering(n_clusters=i)
    labels = hier.fit_predict(scaled_features_df)
    print(i,silhouette_score(scaled_features_df,labels))

In [None]:
## Building hierarchical clustering model using the optimal clusters as 4
# The `affinity` argument is omitted: it was deprecated and later removed from
# AgglomerativeClustering, and Euclidean distance (the only metric Ward linkage
# accepts) is already the default, so the model is unchanged.
hie_cluster = AgglomerativeClustering(n_clusters=4, linkage='ward')
hie_cluster_model = hie_cluster.fit(scaled_features_df)

In [None]:
## Wrap the hierarchical cluster assignments in a one-column dataframe and preview it
df_label1 = pd.DataFrame({'Labels': hie_cluster_model.labels_})
df_label1.head()

In [None]:
## joining the label dataframe with the unscaled dataframe
# `dfc` is not defined anywhere in this notebook, so the original line raised a
# NameError. `df` (cleaned, label-encoded, unscaled) is used instead.
# df_label1 carries a fresh 0..n-1 index, while df still has its original row
# labels after the row drops, so df's index is reset to line the rows up by position.
df_hier = df.reset_index(drop=True).join(df_label1)
df_hier.head()

In [None]:
# x and y are passed by keyword: seaborn 0.12+ rejects more than one positional
# argument here, so the original positional call raises a TypeError there.
# NOTE(review): no cell in this notebook creates a 'Num_Total_Purchases' column —
# confirm it exists in df_hier or replace it with the intended column.
sns.barplot(x=df_hier['Labels'], y=df_hier['Num_Total_Purchases'])
plt.show()