In [1]:
import pandas as pd

In [None]:
# Load the general demographics data (semicolon-delimited CSV).
azdias = pd.read_csv('Udacity_AZDIAS_Subset.csv', sep=';')

# Load the feature summary file (semicolon-delimited CSV).
feat_info = pd.read_csv('AZDIAS_Feature_Summary.csv', sep=';')

# Preview the first rows through the notebook's rich display instead of
# printing the whole frame as plain text.
feat_info.head()

In [None]:
# Turn each 'missing_or_unknown' entry (a bracketed, comma-separated string)
# into a list of code strings. Entries that are already lists are left alone,
# so re-running this cell does not fail on the second pass.
feat_info['missing_or_unknown'] = feat_info['missing_or_unknown'].apply(
    lambda x: x[1:-1].split(',') if isinstance(x, str) else x
)

# Identify missing or unknown data values and convert them to NaNs.
for attrib, missing_values in zip(feat_info['attribute'], feat_info['missing_or_unknown']):
    codes = []
    for value in missing_values:
        value = value.strip()
        if value == '':
            # An empty bracket pair parses to [''], i.e. no codes for this attribute.
            continue
        # Integer-like codes (optionally negative) are compared as ints;
        # every other code is compared as a string.
        if value.lstrip('-').isnumeric():
            value = int(value)
        codes.append(value)
    if codes:
        # One mask per column covers all of its codes at once.
        azdias.loc[azdias[attrib].isin(codes), attrib] = np.nan

In [None]:
# Share of missing entries in each column: the null count divided by the
# number of rows.
missing_data = azdias.isnull().sum().div(len(azdias))
# Horizontal bar chart with one bar per column.
missing_data.plot.barh(figsize=(10, 20))
# Names of the columns in which more than 20% of the entries are missing.
missing_data.loc[missing_data.gt(0.2)].index.tolist()
['AGER_TYP',
 'GEBURTSJAHR',
 'TITEL_KZ',
 'ALTER_HH',
 'KK_KUNDENTYP',
 'KBA05_BAUMAX']
# Remove the outlier columns (those listed above with more than 20% missing
# entries) from the dataset. Other data engineering tasks such as re-encoding
# and imputation are performed later.
azdias = azdias.drop(
    columns=[
        'AGER_TYP',
        'GEBURTSJAHR',
        'TITEL_KZ',
        'ALTER_HH',
        'KK_KUNDENTYP',
        'KBA05_BAUMAX',
    ]
)

In [None]:
# How much data is missing in each row of the dataset?
missing_data_rows = azdias.isnull().sum(axis=1)

# Split the rows at 10 missing entries, reusing the per-row counts computed
# above: fewer than 10 -> "low", 10 or more -> "high". Both subsets get a
# fresh 0..n-1 index.
missing_data_rows_low = azdias[missing_data_rows < 10].reset_index(drop=True)
missing_data_rows_high = azdias[missing_data_rows >= 10].reset_index(drop=True)
def countplot(columns, num):
    """
    Draw side-by-side count plots comparing the two row subsets.

    For each of the first `num` entries of `columns`, the left panel shows the
    value counts of that column in `missing_data_rows_low` and the right panel
    shows them in `missing_data_rows_high`. Both DataFrames are read from the
    notebook namespace.

    INPUT:
        columns - indexable collection of column names found in both DataFrames
        num - number of columns to plot (one figure row per column)
    OUTPUT:
        None
    """
    fig, axs = plt.subplots(num, 2, figsize=(15, 15))
    fig.subplots_adjust(hspace=2, wspace=.2)
    # Flatten the axes grid so that row i uses positions 2*i (left panel) and
    # 2*i + 1 (right panel).
    axs = axs.ravel()

    for i in range(num):
        column = columns[i]
        left_ax = axs[i * 2]
        right_ax = axs[i * 2 + 1]

        # The Series is passed as `x=` rather than positionally, so the call
        # does not depend on how the installed seaborn version interprets a
        # lone positional argument.
        sns.countplot(x=missing_data_rows_low[column], ax=left_ax)
        left_ax.set_title('missing_data_rows_low')
        sns.countplot(x=missing_data_rows_high[column], ax=right_ax)
        right_ax.set_title('missing_data_rows_high')
        
# Compare the value distributions of the first three columns between the
# low-missing and high-missing row subsets.
countplot(columns=missing_data_rows_high.columns, num=3)

In [None]:
def _parse_missing_codes(raw):
    """
    Turn one `missing_or_unknown` entry into a list of comparable codes.

    Accepts either the raw bracketed, comma-separated string or an already
    split list/tuple of strings. Integer-like codes become ints; every other
    non-empty code stays a string. Anything else yields an empty list.
    """
    if isinstance(raw, str):
        raw = raw.strip().strip('[]').split(',')
    elif not isinstance(raw, (list, tuple)):
        return []

    codes = []
    for code in raw:
        code = str(code).strip()
        if code == '':
            continue
        try:
            codes.append(int(code))
        except ValueError:
            codes.append(code)
    return codes


def clean_data(df):
    """
    Perform feature trimming, re-encoding, and engineering for demographics
    data

    Relies on the notebook-level `feat_info` table (columns `attribute`,
    `missing_or_unknown` and `type`, or the same table already indexed by
    attribute) and on the helper functions `create_interval_decade`,
    `create_binary_movement`, `wealth` and `life_stage`, which are expected to
    be defined elsewhere in the notebook.

    INPUT: Demographics DataFrame
    OUTPUT: Trimmed and cleaned demographics DataFrame
    """
    df_copy = df.copy()

    # Look feature metadata up by attribute name, whether or not feat_info has
    # already been indexed by attribute.
    if 'attribute' in feat_info.columns:
        info = feat_info.set_index('attribute')
    else:
        info = feat_info

    # Identify missing or unknown data values and convert them to NaNs.
    # Codes are compared by value (ints for integer-like codes), so they also
    # match columns that pandas stores as floats.
    for col_name in df_copy.columns:
        if col_name not in info.index:
            continue
        codes = _parse_missing_codes(info.at[col_name, 'missing_or_unknown'])
        if codes:
            df_copy[col_name] = df_copy[col_name].mask(df_copy[col_name].isin(codes))

    # remove selected columns and rows, ...
    c_removed = ['AGER_TYP', 'GEBURTSJAHR', 'TITEL_KZ', 'ALTER_HH', 'KK_KUNDENTYP', 'KBA05_BAUMAX']
    df_copy = df_copy.drop(c_removed, axis=1)

    # Keep only the rows with fewer than 10 missing entries.
    df_copy = df_copy[df_copy.isnull().sum(axis=1) < 10].reset_index(drop=True)

    # Fill the remaining NaNs with the most frequent value of each column.
    # A column with no observed values has no mode and is left untouched.
    for col in df_copy.columns:
        modes = df_copy[col].mode()
        if not modes.empty:
            df_copy[col] = df_copy[col].fillna(modes.iloc[0])

    # select, re-encode, and engineer column values.
    # Categorical features with more than two distinct values are dropped.
    multi_level = [
        column for column in df_copy.columns
        if column in info.index
        and info.at[column, 'type'] == 'categorical'
        and df_copy[column].nunique(dropna=False) > 2
    ]
    df_copy = df_copy.drop(multi_level, axis=1)

    # Replace PRAEGENDE_JUGENDJAHRE with two derived features.
    df_copy['decade'] = df_copy['PRAEGENDE_JUGENDJAHRE'].apply(create_interval_decade)
    df_copy['movement'] = df_copy['PRAEGENDE_JUGENDJAHRE'].apply(create_binary_movement)
    df_copy = df_copy.drop('PRAEGENDE_JUGENDJAHRE', axis=1)

    # Replace CAMEO_INTL_2015 with two derived features.
    df_copy['wealth'] = df_copy['CAMEO_INTL_2015'].apply(wealth)
    df_copy['life_stage'] = df_copy['CAMEO_INTL_2015'].apply(life_stage)
    df_copy = df_copy.drop('CAMEO_INTL_2015', axis=1)

    # One-hot encode OST_WEST_KZ.
    df_copy = pd.get_dummies(data=df_copy, columns=['OST_WEST_KZ'])

    # Drop the remaining mixed-type features that are not re-engineered.
    mixed = ['LP_LEBENSPHASE_FEIN', 'LP_LEBENSPHASE_GROB', 'WOHNLAGE', 'PLZ8_BAUMAX']
    df_copy = df_copy.drop(mixed, axis=1)

    # Return the cleaned dataframe.
    return df_copy

## Feature transformation

In [None]:
# Replace the NaN values in every column with that column's mode.
for col in missing_data_rows_low.columns:
    missing_data_rows_low[col] = missing_data_rows_low[col].fillna(
        missing_data_rows_low[col].mode().iloc[0]
    )

# Apply feature scaling to the general population demographics data: fit the
# scaler on all columns and write the scaled values back into the same frame.
normalizer = StandardScaler()
feature_columns = missing_data_rows_low.columns
missing_data_rows_low[feature_columns] = normalizer.fit_transform(
    missing_data_rows_low[feature_columns]
)
missing_data_rows_low.head()

## Dimensionality reduction

In [None]:
# Apply PCA to the data.
# PCA() is created with no arguments, i.e. with scikit-learn's defaults and no
# explicit limit on the number of components; the fitted object is passed to
# scree_plot below to inspect the explained variance per component.

pca = PCA()
# fit_transform both fits `pca` and returns the data projected onto the fitted
# components.
missing_data_rows_low_pca = pca.fit_transform(missing_data_rows_low)
# Investigate the variance accounted for by each principal component.
def scree_plot(pca):
    '''
    Draw a scree plot for a fitted PCA object.

    Each component's explained-variance ratio is shown as a bar, with the
    running cumulative sum overlaid as a line. The values are plotted exactly
    as stored in `explained_variance_ratio_` (they are not rescaled).

    INPUT: pca - a fitted scikit-learn PCA instance

    OUTPUT:
            None
    '''
    ratios = pca.explained_variance_ratio_
    positions = np.arange(len(ratios))
    running_total = np.cumsum(ratios)

    plt.figure(figsize=(10, 6))
    ax = plt.subplot(111)
    ax.bar(positions, ratios)
    ax.plot(positions, running_total)

    # Hide the x tick marks and enlarge the y tick marks.
    ax.xaxis.set_tick_params(width=0)
    ax.yaxis.set_tick_params(width=2, length=12)

    ax.set_xlabel("Principal Component")
    ax.set_ylabel("Variance Explained (%)")
    plt.title('Explained Variance Per Principal Component')
    

# Show the explained variance of every component of the PCA fitted above.
scree_plot(pca=pca)

In [None]:
# Re-apply PCA to the data while selecting for number of components to retain.
# random_state is fixed so that the projection is reproducible across kernel
# restarts even when scikit-learn picks a randomized SVD solver.
pca = PCA(n_components=41, random_state=42)
missing_data_rows_low_pca = pca.fit_transform(missing_data_rows_low)

In [None]:
def get_kmeans_score(data, center, random_state=None):
    '''
    returns the kmeans score regarding SSE for points to centers
    INPUT:
        data - the dataset you want to fit kmeans to
        center - the number of centers you want (the k value)
        random_state - optional seed forwarded to KMeans; pass an int to get
                       the same score on every run (default None keeps the
                       unseeded behaviour)
    OUTPUT:
        score - the SSE score for the kmeans model fit to the data
    '''
    # Instantiate k-means with the requested number of clusters.
    kmeans = KMeans(n_clusters=center, random_state=random_state)

    # Fit the model to the data.
    model = kmeans.fit(data)

    # The absolute value of the model's score on the same data is returned as
    # the SSE.
    return np.abs(model.score(data))
# Candidate cluster counts: 1, 4, 7, ..., 28.
centers = list(range(1, 30, 3))

# Run k-means clustering on the PCA-transformed data once per candidate
# cluster count and collect the SSE score of each fit.
scores = [get_kmeans_score(missing_data_rows_low_pca, k) for k in centers]

# Investigate the change in SSE across the number of clusters.
plt.plot(centers, scores, linestyle='--', marker='o', color='b')
plt.xlabel('K')
plt.ylabel('SSE')
plt.title('SSE vs. K')

In [None]:
# Re-fit the k-means model with the selected number of clusters and obtain
# cluster predictions for the general population demographics data.
# random_state is fixed so the cluster numbering stays the same on every run;
# later cells refer to individual clusters by their integer label.
kmeans = KMeans(n_clusters=22, random_state=42)
model_general = kmeans.fit(missing_data_rows_low_pca)
predict_general = model_general.predict(missing_data_rows_low_pca)

In [None]:
# What kinds of people are part of a cluster that is overrepresented in the
# customer data compared to the general population?
# Take the customers assigned to cluster 11, undo the PCA projection and then
# the scaling, and round the recovered feature values.
over = normalizer.inverse_transform(
    pca.inverse_transform(customers_clean_pca[predict_customers == 11])
).round()
df_over = pd.DataFrame(over, columns=customers_clean.columns)
df_over.head(10)

In [None]:
# What kinds of people are part of a cluster that is underrepresented in the
# customer data compared to the general population?
# Take the customers assigned to cluster 16, undo the PCA projection and then
# the scaling, and round the recovered feature values.
under = normalizer.inverse_transform(
    pca.inverse_transform(customers_clean_pca[predict_customers == 16])
).round()
df_under = pd.DataFrame(under, columns=customers_clean.columns)
df_under.head(10)
