# How Not To Put A Nation To Sleep
## Blake Whittington and Gabriele Spiridaviciute
***
## Are People a Good Judge of Their Own Sleep and Health?

### Cluster Desirability Analysis
Now that we have a cleaned dataset with a cluster assigned to every survey respondent, we can start to look at the desirability of each cluster. In this notebook, desirability is based on the frequency of each cluster and on which clusters are homogeneous in their health and sleep ratings.

In [1]:
import pandas as pd
import numpy as np
# Path (relative to this notebook) of the pickled DataFrame that already carries
# a `cluster_group` column.
mod = '../data_file/cleaned_data_pickle/2013_df_MOD.pickle'
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
cluster_df = pd.read_pickle(mod)

In [2]:
# Preview the first 60 rows to sanity-check the columns and cluster labels.
cluster_df.head(60)

Unnamed: 0,id,age,employment_status,gender,state,bed_time_weekdays,wake_up_weekdays,bed_time_wrong,wake_up_wrong,time_awake_wrong,...,hispanic,race,bed_time_weekend_wrong,wake_up_weekend_wrong,total_height_inch,bmi,regions,age_bin,bmi_bin,cluster_group
0,1,39,3,Male,CO,14,17,1000.0,2050.0,1050.0,...,2,White/Caucasian,1000.0,2050.0,73.0,20.98,West,30-40,20-25,2.0
1,2,57,1,Female,AL,10,7,900.0,1800.0,900.0,...,2,White/Caucasian,2100.0,2100.0,69.0,34.7,South,50-60,30-35,4.0
2,3,51,1,Male,NV,14,9,1000.0,1850.0,850.0,...,2,White/Caucasian,1000.0,1900.0,74.0,25.03,West,50-60,25-30,6.0
3,3,38,1,Male,FL,15,10,1025.0,1875.0,850.0,...,2,White/Caucasian,1100.0,2000.0,71.0,,South,,,
4,4,60,1,Male,CT,18,8,1100.0,1825.0,725.0,...,2,White/Caucasian,1100.0,2050.0,69.0,22.89,Northeast,50-60,20-25,10.0
5,5,39,1,Female,MA,18,14,1100.0,1975.0,875.0,...,2,White/Caucasian,1150.0,2075.0,66.0,23.24,Northeast,30-40,20-25,1.0
6,6,36,2,Female,LA,18,9,1100.0,1850.0,750.0,...,2,White/Caucasian,1200.0,2100.0,60.0,35.54,South,30-40,35-40,16.0
7,7,44,3,Male,NY,10,15,900.0,2000.0,1100.0,...,2,White/Caucasian,900.0,2000.0,67.0,19.11,Northeast,40-50,15-20,3.0
8,7,34,1,Female,WA,12,12,950.0,1925.0,975.0,...,2,White/Caucasian,950.0,2000.0,69.0,19.93,West,30-40,15-20,16.0
9,8,38,1,Male,NY,20,7,1150.0,1800.0,650.0,...,2,Refused,1201.7,2175.0,70.0,24.39,Northeast,30-40,20-25,14.0


## Initial Cluster Popularity Preview
To gain insight into which clusters matter most for each attribute, we first look at the most frequent value of each attribute within each cluster, ranked by how often that value occurs.

In [3]:
# Summarise `gender` within each cluster, then rank clusters by the frequency
# of their most common value.
gender_analysis = (
    cluster_df
    .sort_values(by='cluster_group')
    .groupby('cluster_group')['gender']
    .describe()
    .sort_values(by='freq', ascending=False)
)
# Show only the columns from position 2 onward (`top`, `freq`) for the first nine clusters.
gender_analysis.iloc[:, 2:].head(9)

Unnamed: 0_level_0,top,freq
cluster_group,Unnamed: 1_level_1,Unnamed: 2_level_1
3.0,Male,103
2.0,Male,85
1.0,Female,80
4.0,Female,66
11.0,Female,53
7.0,Male,48
14.0,Male,47
0.0,Female,43
18.0,Male,43


In [4]:
# Summarise `marital_status` within each cluster, then rank clusters by the
# frequency of their most common value.
marital_status_analysis = (
    cluster_df
    .sort_values(by='cluster_group')
    .groupby('cluster_group')['marital_status']
    .describe()
    .sort_values(by='freq', ascending=False)
)
# Show only the columns from position 2 onward (`top`, `freq`) for the first six clusters.
marital_status_analysis.iloc[:, 2:].head(6)

Unnamed: 0_level_0,top,freq
cluster_group,Unnamed: 1_level_1,Unnamed: 2_level_1
3.0,married_or_partnered,89
1.0,married_or_partnered,77
2.0,married_or_partnered,77
14.0,single,62
4.0,married_or_partnered,49
6.0,married_or_partnered,48


In [5]:
# Summarise `age_bin` within each cluster, then rank clusters by the frequency
# of their most common bin.
age_analysis = (
    cluster_df
    .sort_values(by='cluster_group')
    .groupby('cluster_group')['age_bin']
    .describe()
    .sort_values(by='freq', ascending=False)
)
# Show only the columns from position 2 onward (`top`, `freq`) for the first six clusters.
age_analysis.iloc[:, 2:].head(6)

Unnamed: 0_level_0,top,freq
cluster_group,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,30-40,79
3.0,40-50,63
2.0,30-40,61
6.0,50-60,59
14.0,20-30,57
4.0,50-60,47


In [6]:
# Summarise `bmi_bin` within each cluster, then rank clusters by the frequency
# of their most common bin.
bmi_analysis = (
    cluster_df
    .sort_values(by='cluster_group')
    .groupby('cluster_group')['bmi_bin']
    .describe()
    .sort_values(by='freq', ascending=False)
)
# Show only the columns from position 2 onward (`top`, `freq`) for the first six clusters.
bmi_analysis.iloc[:, 2:].head(6)

Unnamed: 0_level_0,top,freq
cluster_group,Unnamed: 1_level_1,Unnamed: 2_level_1
3.0,30-35,87
2.0,20-25,76
1.0,25-30,56
14.0,20-25,54
11.0,20-25,53
7.0,25-30,48


In [7]:
# Summarise `health_rating` within each cluster, then rank clusters by the
# frequency of their most common rating.
health_rating_analysis = (
    cluster_df
    .sort_values(by='cluster_group')
    .groupby('cluster_group')['health_rating']
    .describe()
    .sort_values(by='freq', ascending=False)
)
# Show only the columns from position 2 onward (`top`, `freq`) for the first six clusters.
health_rating_analysis.iloc[:, 2:].head(6)

Unnamed: 0_level_0,top,freq
cluster_group,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,Good,99
3.0,Good,84
14.0,Good,56
2.0,Excellent,55
4.0,Good,50
6.0,Good,50


In [8]:
# Summarise `sleep_rating` within each cluster, then rank clusters by the
# frequency of their most common rating.
sleep_rating_analysis = (
    cluster_df
    .sort_values(by='cluster_group')
    .groupby('cluster_group')['sleep_rating']
    .describe()
    .sort_values(by='freq', ascending=False)
)
# Show only the columns from position 2 onward (`top`, `freq`) for the first six clusters.
sleep_rating_analysis.iloc[:, 2:].head(6)

Unnamed: 0_level_0,top,freq
cluster_group,Unnamed: 1_level_1,Unnamed: 2_level_1
3.0,Fairly Good,88
1.0,Fairly Good,80
2.0,Fairly Good,69
6.0,Fairly Good,57
4.0,Fairly Bad,56
14.0,Fairly Good,53


To streamline the decision-making process, we define a function that counts, for each cluster, how many attributes it ranks among the top five clusters for.

In [9]:
# One row per cluster id, 0 through 19.
x = np.arange(20)
sum_df = pd.DataFrame({'cluster_group': x})

In [10]:
# Start the mention tally at zero for each of the 20 rows.
sum_df['mentioned'] = [0]*20

In [11]:
# Cluster labels of each per-attribute summary, in that summary's freq-sorted order.
index_list = [
    analysis.index
    for analysis in (bmi_analysis, age_analysis,
                     health_rating_analysis, sleep_rating_analysis,
                     marital_status_analysis, gender_analysis)
]
def count_cluster_relative_size(sum_df, index_list, top_n=5):
    """Tally how often each cluster appears among the leading labels of an ordering.

    Parameters
    ----------
    sum_df : pandas.DataFrame
        Frame with a numeric ``'mentioned'`` column. Rows are addressed by
        index label, so ``sum_df.index`` must contain every label tallied.
    index_list : iterable of iterables
        Each element is an ordering of cluster labels, best first (for
        example the index of a summary sorted by descending frequency).
    top_n : int, default 5
        Number of leading labels in each ordering that receive one mention.

    Returns
    -------
    pandas.DataFrame
        The same ``sum_df`` object, updated in place.

    Raises
    ------
    KeyError
        If one of the leading labels is not present in ``sum_df.index``.
    """
    from itertools import islice

    for mention in index_list:
        for label in islice(mention, top_n):
            # A single .loc write is required here: the chained form
            # ``sum_df['mentioned'][label] += 1`` writes to a temporary under
            # pandas copy-on-write and would leave ``sum_df`` unchanged.
            sum_df.loc[label, 'mentioned'] += 1
    return sum_df

In [12]:
# Fill the 'mentioned' tally from the six per-attribute cluster orderings.
sum_df = count_cluster_relative_size(sum_df, index_list)

In [13]:

# Keep the six clusters with the highest mention tally.
# NOTE(review): clusters tied on 'mentioned' are ordered by the default sort
# algorithm, which is not guaranteed to be stable — confirm the six that
# survive are the intended ones. Re-running this cell re-sorts the already
# truncated frame.
sum_df = sum_df.sort_values(['mentioned'], ascending=False).head(6)

In [14]:
# Index labels of the six retained clusters; later cells select rows with them.
sum_df.index

Int64Index([2, 3, 1, 4, 14, 6], dtype='int64')

##  Remove Irrelevant Clusters
With a 'mentioned' column now added, the bottom 14 of the 20 clusters can be removed, leaving only the six most pertinent clusters for our analysis.

In [15]:
def clean_unneeded_clusters(x, index):
    """Return a cluster label only if it is one of the clusters being kept.

    Parameters
    ----------
    x : scalar
        Cluster label to check (may be NaN).
    index : iterable
        Labels of the clusters to keep. Any length is accepted; every entry
        is considered.

    Returns
    -------
    scalar
        ``x`` unchanged when it equals an entry of ``index``; ``np.nan``
        otherwise.
    """
    # Compare with == rather than a hash lookup so that 2.0 matches 2 and a
    # NaN input never matches anything.
    if any(x == kept for kept in index):
        return x
    return np.nan

In [16]:
# Replace the cluster label with NaN on every row whose cluster is not one of
# the retained labels in sum_df.index.
cluster_df['cluster_group'] = cluster_df['cluster_group'].apply(
    clean_unneeded_clusters, args=(sum_df.index,)
)

In [17]:
# Snapshot the frame while it still has every row (non-retained clusters are
# NaN at this point); this copy is what the final cell writes to disk.
cluster_df_copy = cluster_df.copy()

In [18]:
# Drop, in place, every row with a NaN cluster label: rows blanked out above as
# well as rows that had no cluster assignment to begin with.
cluster_df.dropna(subset=['cluster_group'], inplace=True)

In [19]:
# Display the frame now restricted to the retained clusters.
cluster_df

Unnamed: 0,id,age,employment_status,gender,state,bed_time_weekdays,wake_up_weekdays,bed_time_wrong,wake_up_wrong,time_awake_wrong,...,hispanic,race,bed_time_weekend_wrong,wake_up_weekend_wrong,total_height_inch,bmi,regions,age_bin,bmi_bin,cluster_group
0,1,39,3,Male,CO,14,17,1000.0,2050.0,1050.0,...,2,White/Caucasian,1000.0,2050.0,73.0,20.98,West,30-40,20-25,2.0
1,2,57,1,Female,AL,10,7,900.0,1800.0,900.0,...,2,White/Caucasian,2100.0,2100.0,69.0,34.70,South,50-60,30-35,4.0
2,3,51,1,Male,NV,14,9,1000.0,1850.0,850.0,...,2,White/Caucasian,1000.0,1900.0,74.0,25.03,West,50-60,25-30,6.0
5,5,39,1,Female,MA,18,14,1100.0,1975.0,875.0,...,2,White/Caucasian,1150.0,2075.0,66.0,23.24,Northeast,30-40,20-25,1.0
7,7,44,3,Male,NY,10,15,900.0,2000.0,1100.0,...,2,White/Caucasian,900.0,2000.0,67.0,19.11,Northeast,40-50,15-20,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
988,944,38,1,Female,NY,14,7,1000.0,1800.0,800.0,...,2,White/Caucasian,1100.0,2000.0,64.0,32.61,Northeast,30-40,30-35,4.0
992,949,36,1,Female,NY,18,11,1100.0,1900.0,800.0,...,2,White/Caucasian,1100.0,2000.0,63.0,25.33,Northeast,30-40,25-30,1.0
993,951,34,2,Female,NJ,16,11,1050.0,1900.0,850.0,...,2,White/Caucasian,1100.0,1900.0,61.0,21.73,Northeast,30-40,20-25,2.0
996,959,26,1,Male,NY,2,13,1201.7,1950.0,748.3,...,2,White/Caucasian,1400.0,2200.0,63.0,23.03,Northeast,20-30,20-25,2.0


In [20]:
# Number of rows in each retained cluster.
cluster_df.groupby('cluster_group')['cluster_group'].count()

cluster_group
1.0     103
2.0      96
3.0     117
4.0      72
6.0      59
14.0     71
Name: cluster_group, dtype: int64

In [21]:
# Restrict every per-attribute summary to the retained clusters. Each result is
# the transposed summary, with one column per retained cluster.
(health_subset, sleep_subset, bmi_subset,
 age_subset, marital_subset, gender_subset) = (
    analysis.T[sum_df.index]
    for analysis in (health_rating_analysis, sleep_rating_analysis, bmi_analysis,
                     age_analysis, marital_status_analysis, gender_analysis)
)

In [22]:
# Rows per retained cluster, as an unnamed-index Series called 'Cluster Count',
# ordered to match sum_df.
cluster_population_count = (
    cluster_df
    .groupby('cluster_group')['cluster_group']
    .count()
    .rename_axis(None)
    .rename('Cluster Count')
    .reindex(sum_df.index)
)
cluster_population_count

2      96
3     117
1     103
4      72
14     71
6      59
Name: Cluster Count, dtype: int64

In [23]:
# Attach each attribute's most common value (`top`) per retained cluster,
# followed by the cluster sizes. Column order matches the order listed here.
for column, subset in (
    ('health', health_subset),
    ('sleep', sleep_subset),
    ('bmi', bmi_subset),
    ('age', age_subset),
    ('marital_status', marital_subset),
    ('gender', gender_subset),
):
    sum_df[column] = subset.T['top']
sum_df['cluster_count'] = cluster_population_count

In [24]:
# Summary table: one row per retained cluster with its most common value for
# each attribute and its size.
sum_df

Unnamed: 0,cluster_group,mentioned,health,sleep,bmi,age,marital_status,gender,cluster_count
2,2,6,Excellent,Fairly Good,20-25,30-40,married_or_partnered,Male,96
3,3,6,Good,Fairly Good,30-35,40-50,married_or_partnered,Male,117
1,1,6,Good,Fairly Good,25-30,30-40,married_or_partnered,Female,103
4,4,4,Good,Fairly Bad,30-35,50-60,married_or_partnered,Female,72
14,14,4,Good,Fairly Good,20-25,20-30,single,Male,71
6,6,2,Good,Fairly Good,25-30,50-60,married_or_partnered,Female,59


In [25]:
# Same table without the first two columns (cluster_group, mentioned), ordered
# by descending index label.
sum_df.iloc[:,2:].sort_index(ascending=False).head(7)

Unnamed: 0,health,sleep,bmi,age,marital_status,gender,cluster_count
14,Good,Fairly Good,20-25,20-30,single,Male,71
6,Good,Fairly Good,25-30,50-60,married_or_partnered,Female,59
4,Good,Fairly Bad,30-35,50-60,married_or_partnered,Female,72
3,Good,Fairly Good,30-35,40-50,married_or_partnered,Male,117
2,Excellent,Fairly Good,20-25,30-40,married_or_partnered,Male,96
1,Good,Fairly Good,25-30,30-40,married_or_partnered,Female,103


In [26]:
# Export the cluster descriptions as a LaTeX table and as a pickle.
# NOTE(review): the LaTeX file is written in descending index order, while the
# pickle keeps sum_df's current row order — confirm the difference is intended.
sum_df.iloc[:,2:].sort_index(ascending=False).head(7).to_latex('../data_file/latex_dataframes/cluster_descriptions.tex', index = True)
sum_df.iloc[:, 2:].to_pickle('../data_file/cleaned_data_pickle/cluster_descriptions.pickle')

In [27]:
# Persist the snapshot taken before rows were dropped: it keeps every row, with
# cluster_group set to NaN for rows outside the retained clusters.
cluster_df_copy.to_csv(r'../data_file/cleaned_data_csv/cluster_oriented_2013_poll.csv', index = False)
cluster_df_copy.to_pickle('../data_file/cleaned_data_pickle/cluster_oriented_2013_poll.pickle')