In [34]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, entry) for entry in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [35]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

from sklearn.cluster import DBSCAN, AgglomerativeClustering, KMeans, FeatureAgglomeration, MiniBatchKMeans
from scipy.stats import skew, kurtosis, norm
from sklearn.metrics import silhouette_score

# Notebook-wide settings: dark-grid seaborn theme, and silence every warning.
sns.set_style('darkgrid')
warnings.simplefilter('ignore')


In [36]:
# Read the survey CSV from the Kaggle input directory into `df`, then display it.
df = pd.read_csv(filepath_or_buffer = "../input/mcdonalds/mcdonalds.csv")
df

In [37]:
# (number of rows, number of columns) of the loaded table.
df.shape 

In [38]:
# Column names, non-null counts and dtypes.
df.info()

In [39]:
# Summary statistics of the numeric columns.
df.describe()

In [40]:
df.describe(include = 'O') # Summary (count / unique / top / freq) of the object-dtype, i.e. categorical, columns

In [41]:
# Shape of the subset of rows that exactly repeat an earlier row
# (the original author observed 22 such rows).
df.loc[df.duplicated()].shape

In [42]:
# Remove exact-duplicate rows and renumber the index from 0, then display the result.
df = df.drop_duplicates(ignore_index = True)
df

## Variable Analysis

In [43]:
# Convert the 'Like' rating to plain integers.
# The two extreme answers carry a text prefix ('I hate it!-5', 'I love it!+5');
# every other answer is handled by int(), exactly as the original loop did.
# The conversion is done explicitly instead of through chained .replace() calls,
# which only produced an integer column thanks to pandas' implicit object -> int
# downcasting (deprecated since pandas 2.2). If 'Like' stayed object-dtype it
# would be picked up as a categorical column by the cells below.
like_labels = {'I hate it!-5': -5, 'I love it!+5': 5}


def _like_to_int(answer):
    """Return the integer rating for one raw 'Like' answer (safe to re-apply to ints)."""
    return like_labels[answer] if answer in like_labels else int(answer)


df['Like'] = df['Like'].map(_like_to_int).astype('int64')

In [44]:
# Encode VisitFrequency on an ordinal scale that increases with how often the
# respondent visits. The previous codes were not monotonic ('Once a week' = 1
# but 'Once a year' = 5), so distances and correlations on this column were
# meaningless.
visit_order = [
    'Never',                  # 0 - least frequent
    'Once a year',            # 1
    'Every three months',     # 2
    'Once a month',           # 3
    'Once a week',            # 4
    'More than once a week',  # 5 - most frequent
]
visit_codes = {label: rank for rank, label in enumerate(visit_order)}

# .get(v, v) leaves anything that is not a known label untouched, so re-running
# the cell on already-encoded integers is a no-op.
df['VisitFrequency'] = df['VisitFrequency'].map(lambda v: visit_codes.get(v, v))


In [45]:
# Names of the columns that are still object-dtype (text) at this point.
obj = df.select_dtypes(include = 'object').columns.tolist()
obj

**Variation of Age with Visiting Frequency and Like for different hue:**

In [46]:
# For every categorical column, draw Age against VisitFrequency (left) and
# Age against Like (right), colouring the points by that column's value.
for w in obj:

    plt.figure(figsize = (14,7))

    plt.subplot(121)
    # x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
    sns.scatterplot(x = 'Age', y = 'VisitFrequency', hue = w, data = df, s = 50, alpha = 0.5)
    plt.title(f'Age vs Visiting with hue = {w}')
    plt.legend(bbox_to_anchor = (1,1))
    plt.tight_layout()

    plt.subplot(122)
    sns.scatterplot(x = 'Age', y = 'Like', hue = w, data = df, s = 50, alpha = 0.5)
    plt.title(f'Age vs Like with hue = {w}')
    plt.legend(bbox_to_anchor = (1,1))
    plt.tight_layout()

    plt.show()

#### No specific Patterns Observed from the above graphs

## Distribution of age variable:

In [47]:
# Left panel: box plot of Age. Right panel: density of Age with a fitted normal curve.
plt.figure(figsize = (14, 7))

plt.subplot(1, 2, 1)
df['Age'].plot(kind = 'box')
plt.title('Boxplot of age')
plt.tight_layout()

plt.subplot(1, 2, 2)
sns.distplot(df['Age'], hist = False, fit = norm)
plt.title('Distribution of age')
plt.tight_layout()

# Skewness and kurtosis as computed by scipy.stats with default arguments, rounded to 3 decimals.
age_skewness = np.round(skew(df['Age']), 3)
age_kurtosis = np.round(kurtosis(df['Age']), 3)
print(f'Skewness of age: {age_skewness}.')
print(f'Kurtosis of age: {age_kurtosis}.')

**No outliers observed. The age variable has negative skewness, indicating that its mean value is less than its median value. Also, the distribution of the age variable is platykurtic.**

### Distribution of categorical variables can be inferred from below:

In [48]:
# One pie chart per categorical column showing the share of each category.
for r in obj:
    counts = df[r].value_counts()  # computed once; used for both wedge sizes and labels
    # One offset per wedge (first 0.01, the rest 0.05). For a two-category column
    # this equals the former hard-coded (0.01, 0.05); unlike it, this does not
    # fail when a column has a different number of categories.
    explode = [0.01 if i == 0 else 0.05 for i in range(len(counts))]

    plt.figure(figsize = (14,7))
    plt.pie(counts, labels = counts.index.tolist(),
            explode = explode, autopct = '%.2f%%')
    plt.title(f'Distribution of {r}')
    plt.legend()
    plt.show()

In [49]:
# Heatmap of the pairwise correlations between the numeric columns.
plt.figure(figsize = (14,7))
# Restrict to numeric columns explicitly: the text columns are not encoded yet,
# and pandas >= 2.0 raises on them in .corr() instead of silently dropping them.
sns.heatmap(df.select_dtypes(include = 'number').corr(), annot = True)

## Encoding

In [50]:
# Preview the encoding applied in the next cell: for each categorical column,
# print its sorted distinct values next to the integer codes 0..n-1.
for r in obj:
    categories = sorted(df[r].unique())
    codes = list(range(len(categories)))
    print(f'In {r}, we replace: {categories} by {codes}.')

In [51]:
# Replace each categorical column by its integer category codes, then display the frame.
for e in obj:
    df[e] = pd.Categorical(df[e]).codes
df

## Clustering


In [52]:
# All rows and all columns of the encoded frame as a NumPy array: the clustering input.
df_cluster = df.to_numpy()
df_cluster

## K-Means Clustering

In [53]:
# Within-cluster sum of squares (inertia) for k = 1..10, used by the elbow plot below.
wcss = []

for o in range(1,11):
    # Fixed seed so the curve is identical on every Restart & Run All.
    km = KMeans(n_clusters = o, random_state = 42)
    km.fit(df_cluster)
    wcss.append(km.inertia_)

In [54]:
# Elbow plot: WCSS against the number of clusters k = 1..10.
plt.figure(figsize = (14, 7))
cluster_counts = list(range(1, 11))
plt.plot(cluster_counts, wcss, 'k--o')
plt.xlabel('Number of clusters (k)')
plt.title('Elbow Method to find optimum number of clusters')
plt.ylabel('WCSS')

**Sudden change in WCSS at k = 3. So we select number of clusters = 3.**

In [55]:
# Fit the final 3-cluster K-Means model and report its silhouette score as a percentage.
# The seed is fixed so the cluster ids (0/1/2) discussed in the text below are
# the same on every run; without it the labels can permute between runs.
model_km = KMeans(n_clusters=3, random_state=42)
y_predict = model_km.fit_predict(df_cluster)
print(f'Clustering score of model: {np.round(silhouette_score(df_cluster,model_km.labels_)*100, 2)}%.')

# Column index of the frame that was clustered.
col = df.iloc[:,:].columns

In [56]:
# Show the K-Means cluster assignment on two views of the data:
# Age vs VisitFrequency (left) and Age vs Like (right).
plt.figure(figsize = (14,7))

plt.subplot(121)
sns.scatterplot( x = 'Age', y = 'VisitFrequency' , data = df, s = 50, hue = y_predict, alpha = 0.5)
plt.xlabel("Age")
plt.ylabel("VisitFrequency")
plt.title('Age vs Visiting Frequency by clusters')
plt.legend(bbox_to_anchor = (1,1))

plt.subplot(122)
sns.scatterplot( x = 'Age', y = 'Like' , data = df, s = 50, hue = y_predict, alpha = 0.5)
plt.xlabel("Age")
plt.ylabel("Like")
# This panel plots Like, so the title says so (it used to repeat the left panel's title).
plt.title('Age vs Like by clusters')
plt.legend(bbox_to_anchor = (1,1))
plt.show()

**From the above graph, it is clear that the clusters formed are directly related to the age category one belongs to. Cluster 0 is from above 50 till 70, while cluster 1 starts from around 35 till 50 and cluster 2 is from 20 to 35 years.**


## Agglomerative Clustering

In [57]:
# Average-linkage agglomerative clustering into 3 clusters, plus its silhouette score.
# The model is fitted exactly once: fit_predict() both fits and returns the labels,
# so the former extra .fit() call only repeated the same work.
model_agg = AgglomerativeClustering(n_clusters=3, linkage='average')
y_predict_agg = model_agg.fit_predict(df_cluster)
print(f'Clustering score of model: {np.round(silhouette_score(df_cluster,model_agg.labels_)*100, 2)}%.')


In [58]:
# Show the agglomerative cluster assignment on two views of the data:
# Age vs VisitFrequency (left) and Age vs Like (right).
plt.figure(figsize = (14,7))

plt.subplot(121)
sns.scatterplot( x = 'Age', y = 'VisitFrequency' , data = df, s = 50, hue = y_predict_agg, alpha = 0.5)
plt.xlabel("Age")
plt.ylabel("VisitFrequency")
plt.title('Age vs Visiting Frequency by clusters')
plt.legend(bbox_to_anchor = (1,1))

plt.subplot(122)
sns.scatterplot( x = 'Age', y = 'Like' , data = df, s = 50, hue = y_predict_agg, alpha = 0.5)
plt.xlabel("Age")
plt.ylabel("Like")
# This panel plots Like, so the title says so (it used to repeat the left panel's title).
plt.title('Age vs Like by clusters')
plt.legend(bbox_to_anchor = (1,1))
plt.show()


**From the above graph, it is again clear that the clusters formed are directly related to the age category one belongs to. However, there is a large amount of overlap between the clusters, so it is not possible to know exactly at which age a cluster begins and ends.**

## Profiling Segments

In [59]:
# Display the fully encoded frame before profiling the segments.
df

In [60]:
# Keep only the perception columns by dropping the rating, age, visit-frequency
# and gender columns; display the result.
df1 = df.drop(columns = ['Like','Age','VisitFrequency','Gender'])
df1

In [61]:
# Column means of the perception columns, rounded to 2 decimals.
df1.mean().round(2)

**The average values of the transformed binary numeric segmentation variables
indicate that about half of the respondents (55%) perceive McDonald’s as YUMMY, 91% believe that eating at McDonald’s is CONVENIENT, but only 9% think that McDonald’s food is SPICY.**

In [62]:
from sklearn.decomposition import PCA
 
# PCA keeping 11 components.
pca = PCA(n_components = 11)

In [63]:
# Fit the PCA on the (unscaled) perception columns and project them onto the components.
df2 = pca.fit_transform(df1)
df2

In [64]:
# Names of the 11 perception columns that are fed into the PCA below.
features = [
    'yummy', 'convenient', 'spicy', 'fattening', 'greasy', 'fast',
    'cheap', 'tasty', 'expensive', 'healthy', 'disgusting',
]
features

In [71]:
# Values of the selected feature columns as a NumPy array.
x = df[features].to_numpy()
x

In [72]:
from sklearn.preprocessing import StandardScaler

In [73]:
# Standardise every feature column with a freshly fitted StandardScaler.
# Note: `x` is overwritten with its scaled version.
x = StandardScaler().fit_transform(x)

In [74]:
from sklearn.decomposition import PCA
# PCA on the standardised features; the 11 component scores are stored in a
# DataFrame whose columns are named PC1 ... PC11.
pca = PCA(n_components = 11)
principalComponents = pca.fit_transform(x)
pc_names = [f'PC{i}' for i in range(1, 12)]
principalDf = pd.DataFrame(data = principalComponents, columns = pc_names)
principalDf

In [75]:
# Summary statistics of the component scores, rounded to 4 decimals.
principalDf.describe().round(4)

In [76]:
# Standard deviation of each principal component's scores.
principalDf.std()

In [77]:
# Switch to the white-grid theme and plot the first two principal components.
sns.set(style = 'whitegrid')

sns.scatterplot(data = principalDf, x = 'PC1', y = 'PC2')

In [78]:
# Number of respondents per VisitFrequency code.
df['VisitFrequency'].value_counts()

In [79]:
# Number of distinct values in the 'yummy' column.
df['yummy'].nunique()

In [80]:
# Count of missing values per column.
df.isnull().sum()

In [81]:
# Number of distinct values in the 'Gender' column.
df['Gender'].nunique()

In [111]:
from sklearn.preprocessing import OrdinalEncoder

In [112]:
# Ordinal encoder with the explicit category order 'Yes', 'No'.
# NOTE(review): ord1 is never fitted or applied in the visible notebook — confirm it is still needed.
ord1=OrdinalEncoder(categories=[['Yes','No']])

In [113]:
# Ordinal encoder whose category order runs from "Never" up to "More than once a week".
# NOTE(review): ord2 is never fitted or applied in the visible notebook — confirm it is still needed.
ord2=OrdinalEncoder(categories=[["Never","Once a year","Every three months","Once a month","Once a week","More than once a week"]])
ord2

In [114]:
from sklearn.preprocessing import LabelEncoder 

In [115]:
# Label encoder used for the Gender column in the next cell.
le=LabelEncoder()

In [116]:
# Label-encode Gender. LabelEncoder expects a 1-D array of labels, so pass the
# column as a Series (df['Gender']) rather than as a one-column DataFrame
# (df[['Gender']]), which only worked through an implicit, warned-about flattening.
df['Gender'] = le.fit_transform(df['Gender'])

In [118]:
# Label-encode VisitFrequency with a fresh encoder (separate from `le`).
# NOTE(review): this column was already replaced by integers in an earlier cell — confirm this step is still needed.
df['VisitFrequency'] = LabelEncoder().fit_transform(df['VisitFrequency'])

In [119]:
# Display the frame after label encoding.
df

In [121]:
from sklearn.preprocessing import StandardScaler
# Scaler used to standardise the Age column in the next cell.
ss=StandardScaler()

In [122]:
# Standardise Age in place. The double brackets pass a one-column DataFrame (2-D input) to the scaler.
# Note: this overwrites the raw ages in `df`.
df['Age']=ss.fit_transform(df[['Age']])
df.info()

In [123]:
# Number of respondents per Like rating.
df['Like'].value_counts()

## Target

In [124]:
# Map the textual extreme answer to -5 in the Like column only.
# Assigning through a bare boolean mask (df[mask] = -5) would overwrite EVERY
# column of the matching rows; .loc with an explicit column limits the write.
df.loc[df['Like'] == "I hate it!-5", 'Like'] = -5

In [125]:
# Map the textual extreme answer to 5 in the Like column only.
# Assigning through a bare boolean mask (df[mask] = 5) would overwrite EVERY
# column of the matching rows; .loc with an explicit column limits the write.
df.loc[df['Like'] == "I love it!+5", 'Like'] = 5

In [126]:
# Distribution plot of the 'yummy' column.
# NOTE(review): distplot is reported as deprecated in recent seaborn releases — confirm against the installed version.
sns.distplot(df['yummy'])

In [127]:
# Distribution plot of the 'VisitFrequency' column.
# NOTE(review): distplot is reported as deprecated in recent seaborn releases — confirm against the installed version.
sns.distplot(df['VisitFrequency'])

In [128]:
# Distribution plot of the 'fast' column.
# NOTE(review): distplot is reported as deprecated in recent seaborn releases — confirm against the installed version.
sns.distplot(df['fast'])

In [129]:
from sklearn.cluster import KMeans

In [130]:
# Inertia ("distortion") of K-Means on the processed frame for k = 1..9,
# used by the elbow plot in the next cell.
distortions = []
K = range(1,10)
for k in K:
    # Fixed seed so the curve is identical on every Restart & Run All.
    kmeanModel = KMeans(n_clusters=k, random_state=42)
    kmeanModel.fit(df)
    distortions.append(kmeanModel.inertia_)

In [131]:
# Elbow plot: distortion (inertia) against the number of clusters k.
plt.figure(figsize = (16, 8))
plt.plot(K, distortions, 'bx-')
plt.title('The Elbow Method showing the optimal k')
plt.ylabel('Distortion')
plt.xlabel('k')
plt.show()