In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
kaggle_input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in kaggle_input_files:
    print(file_path)

In [None]:
#import libraries and dependencies
import pandas as pd
import numpy as np
import warnings 
import seaborn as sns
from matplotlib import pyplot as plt

In [None]:
# Load the telco customer data set and preview its first rows.
TELCO_CSV_PATH = "../input/telco-segmentation/telco_csv.csv"
telco = pd.read_csv(TELCO_CSV_PATH)
telco.head()

#### Data Exploration

In [None]:
# Print a concise summary of the raw frame (columns, non-null counts, dtypes).
telco.info()

##### 

##### Continuous Variable summary

In [None]:
# Data-audit report for a single continuous variable
def continuous_var_summary(x):
    """Return descriptive statistics for one numeric column.

    Parameters
    ----------
    x : pandas.Series
        Column to summarise.

    Returns
    -------
    pandas.Series
        Indexed, in order, by N, NMISS, SUM, MEAN, MEDIAN, STD, VAR, MIN,
        the percentiles P1, P5, P10, P25, P50, P75, P90, P95, P99, and MAX.
    """
    percentile_specs = [
        ('P1', 0.01), ('P5', 0.05), ('P10', 0.10),
        ('P25', 0.25), ('P50', 0.50), ('P75', 0.75),
        ('P90', 0.90), ('P95', 0.95), ('P99', 0.99),
    ]

    # Counts and moments first, in the order the report lists them.
    labels = ['N', 'NMISS', 'SUM', 'MEAN', 'MEDIAN', 'STD', 'VAR', 'MIN']
    values = [
        x.count(),
        x.isnull().sum(),
        x.sum(),
        x.mean(),
        x.median(),
        x.std(),
        x.var(),
        x.min(),
    ]

    # Then one entry per requested percentile.
    for label, level in percentile_specs:
        labels.append(label)
        values.append(x.quantile(level))

    labels.append('MAX')
    values.append(x.max())

    return pd.Series(values, index=labels)

# Build the audit report for every float64/int64 column of the raw data
numeric_columns = telco.select_dtypes(["float64", "int64"])
numeric_columns.apply(continuous_var_summary)

##### Outlier Treatment

In [None]:
# Outlier handling: clip each numeric column at its 1st (lower) and 99th (upper) percentiles
def _clip_to_p1_p99(column):
    """Clip a column to the range [1st percentile, 99th percentile]."""
    floor, ceiling = column.quantile(0.01), column.quantile(0.99)
    return column.clip(lower=floor, upper=ceiling)


telco_new = telco.select_dtypes(["float64", "int64"]).apply(_clip_to_p1_p99)

In [None]:
# Heatmap of the pairwise correlations between the clipped numeric columns
correlations = telco_new.corr()
fig, ax = plt.subplots(figsize=(15, 8))
sns.heatmap(correlations, ax=ax)
plt.show()

In [None]:
# Drop variables that look numeric but are not treated as numeric here
# (e.g. region, custcat).
# The frame is rebound instead of mutated with inplace=True, and
# errors="ignore" skips columns that are already gone, so re-running this
# cell is a no-op rather than a KeyError.
categorical_like_cols = ['ed', 'address', 'marital', 'reside', 'retire', 'gender',
                         'region', 'custcat', 'employ']
telco_new = telco_new.drop(columns=categorical_like_cols, errors="ignore")

### Standardizing data
We put all features on the same scale because we will be using a distance-based technique for our segmentation.

In [None]:
# import the package
from sklearn.preprocessing import StandardScaler

# Standardize the remaining features with StandardScaler's default settings
sc = StandardScaler()
telco_scaled = sc.fit_transform(telco_new)

# Preview the scaled values; the feature names are passed through so the
# columns stay readable instead of being numbered 0..n-1.
pd.DataFrame(telco_scaled, columns=telco_new.columns).head()

In [None]:
# import packages for clustering
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn import metrics

#### Variable Reduction using PCA

In [None]:
# Dimensions of the standardized matrix produced by the scaler
telco_scaled.shape

In [None]:
# Fit a PCA that keeps 21 components so the variance of each one can be inspected.
# NOTE(review): 21 is presumably the number of features in telco_scaled - confirm against its shape.
pca=PCA(n_components=21).fit(telco_scaled)

In [None]:
 # Eigenvalues: the variance captured by each fitted principal component
print(pca.explained_variance_)

In [None]:
# Fraction of the total variance that each principal component explains
print(pca.explained_variance_ratio_)

In [None]:
# Tabulate, per component: eigenvalue, % of variance, and cumulative % of variance
variance_pct = pca.explained_variance_ratio_ * 100
cumulative_pct = np.cumsum(pca.explained_variance_ratio_) * 100

pd.DataFrame({
    'Eigen_Values': pca.explained_variance_,
    'Variance': np.round(variance_pct, 2),
    'Cumulative_Variance': np.round(cumulative_pct, 2),
})

In [None]:
# The number of components was chosen as 6: the cumulative variance explained is > 75% and each individual component explains > 0.8 variance

In [None]:
# Refit PCA on the scaled data, keeping only 6 components
pca_new=PCA(n_components=6).fit(telco_scaled)

In [None]:
# Project the scaled data onto the components of the fitted 6-component PCA
reduced_cr = pca_new.transform( telco_scaled )  # the output is the component scores (C1, C2, ...)

In [None]:
# Store the component scores in a DataFrame named `dimensions`.
# Column names (C1, C2, ...) are derived from the width of the array, so this
# cell keeps working if the number of PCA components is changed.
component_names = ["C%d" % (position + 1) for position in range(reduced_cr.shape[1])]
dimensions = pd.DataFrame(reduced_cr, columns=component_names)
dimensions.head()

### Performing Clustering

##### Choosing the number of clusters using the Silhouette Coefficient (SC)

In [None]:
# Silhouette Coefficient for every K from 3 to 20 (inclusive)
k_range = range(3, 21)
scores = []
for k in k_range:
    km = KMeans(n_clusters=k, random_state=123)
    # fit_predict fits the model and hands back the label of every row
    cluster_labels = km.fit_predict(dimensions)
    scores.append(metrics.silhouette_score(dimensions, cluster_labels))

In [None]:
# Silhouette Coefficient as a function of the number of clusters
fig, ax = plt.subplots(figsize=(15, 8))
ax.plot(k_range, scores)
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Silhouette Coefficient')
ax.grid(True)

#### Note
A 5-segment solution is plausible based on the SC scores. If we take the highest SC score, the 6-segment solution is best.

#### Segment Distribution through Elbow analysis
##### It tells us how much additional variance is explained as each cluster is added

In [None]:
# Collect the K-means inertia (clusters.inertia_) for every K from 3 to 20.
# random_state is fixed to the same seed as the other K-means fits in this
# notebook so the elbow curve is reproducible from run to run.
cluster_range = range(3, 21)
cluster_errors = []
for num_cluster in cluster_range:
    clusters = KMeans(n_clusters=num_cluster, random_state=123).fit(dimensions)
    cluster_errors.append(clusters.inertia_)

In [None]:
# Pair each cluster count with its inertia, ready for the elbow plot
clusters_df = pd.DataFrame(
    {
        "num_clusters": cluster_range,
        "cluster_errors": cluster_errors,
    }
)
clusters_df.head()

In [None]:
# Elbow plot: K-means inertia against the number of clusters.
# Axis labels, a title and a grid are set so the figure can be read on its own.
plt.figure(figsize=(15, 8))
plt.plot(clusters_df.num_clusters, clusters_df.cluster_errors, marker="o")
plt.xlabel('Number of clusters')
plt.ylabel('Cluster errors (inertia)')
plt.title('Elbow analysis')
plt.grid(True)
plt.show()

#### Visualization of the K-means model with 6 clusters

In [None]:
# K-means model used for the visualization: 6 clusters, k-means++ initialisation
Kmeans_vis = KMeans(n_clusters=6, init='k-means++', random_state=123)
# Fit on the PCA scores, then keep the cluster label assigned to every row
Kmeans_vis.fit(dimensions)
label = Kmeans_vis.labels_

In [None]:
# Scatter the rows on the first two columns of `dimensions`, one colour per cluster.
# `plt` is already imported at the top of the notebook, so it is not re-imported here.
centroids = Kmeans_vis.cluster_centers_
u_labels = np.unique(label)

plt.figure(figsize=(15, 8))
for i in u_labels:
    in_cluster = label == i
    plt.scatter(dimensions.iloc[in_cluster, 0], dimensions.iloc[in_cluster, 1], label=i)

# The centroids are drawn once, after the loop, instead of once per cluster.
plt.scatter(centroids[:, 0], centroids[:, 1], s=80, color='k', label='Centroids')

plt.xlabel(dimensions.columns[0])
plt.ylabel(dimensions.columns[1])
plt.legend(title='Cluster')  # the per-cluster labels were set but never displayed
plt.show()
