# AAA Northeast Customer Clustering

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('../project-AAA-northeast-member-clustering/data/processed_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3511 entries, 0 to 3510
Columns: 237 entries, Household Key to x5_Young City Solos
dtypes: float64(236), int64(1)
memory usage: 6.3 MB


In [3]:
df.columns[:30]

Index(['Household Key', 'ERS Member Cost Year 3 sum',
       'ERS Member Cost Year 3 mean', 'Length Of Residence mean',
       'Number of Children mean', 'ERS Member Cost Year 2 sum',
       'ERS Member Cost Year 2 mean', 'Cost 2014 sum', 'Cost 2016 sum',
       'ERS ENT Count Year 3 sum', 'ERS ENT Count Year 3 mean',
       'Mail Responder mean', 'Cost 2015 sum', 'PrimaryMember sum',
       'ERS ENT Count Year 1 sum', 'ERS ENT Count Year 1 mean',
       'Email Available mean', 'Income mean', 'ERS Member Cost Year 1 sum',
       'ERS Member Cost Year 1 mean', 'Cost 2019 sum', 'Credit Ranges mean',
       'Member Tenure Years mean', 'AssociateMember sum', 'Cost 2018 sum',
       'Total Cost sum', 'Member Key count', 'Do Not Direct Mail Solicit mean',
       'Cost 2017 sum', 'ERS ENT Count Year 2 sum'],
      dtype='object')

## Data Exploration
### Product Usage
Only a small portion of households using the products. The most used product has lower than 30% of the usage.

In [4]:
product_list = ['FSV CMSI Flag', 'FSV Credit Card Flag', 'FSV Deposit Program Flag', 
                'FSV Home Equity Flag', 'FSV ID Theft Flag', 'FSV Mortgage Flag',
                'INS Client Flag', 'TRV Globalware Flag', 'New Mover Flag']

In [5]:
df_product_pct = pd.DataFrame({'Percent of Usage': df[product_list].mean()}
                             ).join(pd.DataFrame({'Count of Usage': df[product_list].sum()}))
df_product_pct.sort_values(['Percent of Usage'], ascending = False)

Unnamed: 0,Percent of Usage,Count of Usage
INS Client Flag,0.281686,989.0
TRV Globalware Flag,0.179151,629.0
FSV Credit Card Flag,0.136144,478.0
FSV CMSI Flag,0.082598,290.0
FSV ID Theft Flag,0.058103,204.0
New Mover Flag,0.0544,191.0
FSV Deposit Program Flag,0.006551,23.0
FSV Mortgage Flag,0.003133,11.0
FSV Home Equity Flag,0.000854,3.0


Around 45% of the households do not use any products from AAA Northeast.

In [6]:
# Number of products bought by household
df[product_list].sum(axis=1).value_counts()

0.0    1589
1.0    1220
2.0     539
3.0     135
4.0      25
5.0       3
dtype: int64

In [7]:
1589/3511

0.4525776132156081

## Predictive Analysis
We could use classification models to predict the probability of an household buying a product on the products with more than 1% of household usage. The models build on these products should have be better in generalization than other low-usage products.

For all products with over 5% user, apply the following process for modeling:
- Upsampling
- Create training & test set based on upsampled dataset
- Grid search using Decision Tree (Set the max of depth to 10 instead of None to avoid overfitting)
- Apply bagging with 100 estimators using decision tree and best parameters

In [8]:
from sklearn.utils import resample
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [179]:
%%time
product_list_to_predict = ['FSV CMSI Flag', 'FSV Credit Card Flag', 'FSV ID Theft Flag',
                           'INS Client Flag', 'TRV Globalware Flag', 'New Mover Flag']
df_prob = pd.DataFrame()
for prod in product_list_to_predict:
    # Up Sampling
    # Separate majority and minority classes
    df_majority = df[df[prod]==0]
    df_minority = df[df[prod]==1]

    # Upsample minority class
    df_minority_upsampled = resample(df_minority, 
                                     replace=True,     # sample with replacement
                                     n_samples=len(df_majority),    # to match majority class
                                     random_state=72) # reproducible results

    # Combine majority class with upsampled minority class
    df_upsampled = pd.concat([df_majority, df_minority_upsampled])

    # Display new class counts
    print('====', prod, '====')
    print(df_upsampled[prod].value_counts())
    
    # X - exclude total costs as it will be used for clustering
    cols = [x for x in df_upsampled.columns if (x not in product_list
                                               ) & (x not in ['Household Key', 'Total Cost sum'])]
    X = df_upsampled[cols]

    # y
    y = df_upsampled[prod]
        
    # Training and testing sets    
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.2,
                                                        stratify=y,
                                                        random_state=75)
    
    # Train Decision Tree
    param_grid = {'criterion': ['gini', 'entropy'],
                  'max_depth': [8, 10, 12],
                  'min_samples_split': [2, 3, 4]}
    
    gs_dt = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
    gs_dt.fit(X_train, y_train)
    print("\n== Decision Tree ==")
    print("Best Parameters:", gs_dt.best_params_)
    print("Accuracy on Training Set:", gs_dt.best_score_)

    y_pred_prob = gs_dt.predict_proba(X_test)[:,1]
    print("Accuracy on Test Set:", gs_dt.score(X_test, y_test), 
          "AUC:", roc_auc_score(y_test, y_pred_prob))
    
    y_pred_prob = gs_dt.predict_proba(df[cols])[:,1]
    print("Accuracy on Original Dataset:", gs_dt.score(df[cols], df[prod]), 
          "AUC:", roc_auc_score(df[prod], y_pred_prob))
    
    # Train Bagging 
    dt = DecisionTreeClassifier(criterion=gs_dt.best_estimator_.criterion,
                                max_depth=gs_dt.best_estimator_.max_depth,
                                min_samples_split=gs_dt.best_estimator_.min_samples_split)
    bg =  BaggingClassifier(dt, n_estimators = 100)
    bg.fit(X_train, y_train)
    print("\n== Bagging ==")
    print("Accuracy on Training Set:", bg.score(X_train, y_train))

    y_pred_prob = bg.predict_proba(X_test)[:,1]
    print("Accuracy on Test Set:", bg.score(X_test, y_test),
          "AUC:", roc_auc_score(y_test, y_pred_prob))
    
    y_pred_prob = bg.predict_proba(df[cols])[:,1]
    print("Accuracy on Original Dataset:", bg.score(df[cols], df[prod]),
          "AUC:", roc_auc_score(df[prod], y_pred_prob))
    
    y_pred = bg.predict(df[cols])
    print("Prediction Count:")
    print(pd.Series(y_pred).value_counts(), "\n")
    
    df_prob[prod] = y_pred_prob

df_prob.head()

==== FSV CMSI Flag ====
1.0    3221
0.0    3221
Name: FSV CMSI Flag, dtype: int64

== Decision Tree ==
Best Parameters: {'max_depth': 12, 'criterion': 'gini', 'min_samples_split': 3}
Accuracy on Training Set: 0.8540685355908583
Accuracy on Test Set: 0.8634600465477114 AUC: 0.9166642592325099
Accuracy on Original Dataset: 0.8174309313585872 AUC: 0.9451091436585339

== Bagging ==
Accuracy on Training Set: 0.9683679410052397
Accuracy on Test Set: 0.9317300232738557 AUC: 0.9901247050893158
Accuracy on Original Dataset: 0.9547137567644546 AUC: 0.9922587759209498
Prediction Count:
0.0    3086
1.0     425
dtype: int64 

==== FSV Credit Card Flag ====
1.0    3033
0.0    3033
Name: FSV Credit Card Flag, dtype: int64

== Decision Tree ==
Best Parameters: {'max_depth': 12, 'criterion': 'gini', 'min_samples_split': 2}
Accuracy on Training Set: 0.744233280601357
Accuracy on Test Set: 0.7413509060955519 AUC: 0.7980548189844456
Accuracy on Original Dataset: 0.750783252634577 AUC: 0.8631831582025888



**With high AUC on predicting all products in the original dataset, we can be confident that the probability of purchsing would be a good reference on current and potential buyers of each product.**

## Recommendation System
For the products with usage less than 5%, we can further use recommendation system to generate the probability of buying, based on the probabilities of top 3 models.

In [173]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

# Load the movielens-100k dataset (download it if needed).
data = Dataset.load_builtin('ml-100k')

# Use the famous SVD algorithm.
algo = SVD()

# Run 5-fold cross-validation and print results.
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

ImportError: No module named 'surprise'

## Predict Cost in Next 12 Months
Now we have the year cost, we can use the cost in year n-1 to predict the cost in n. First we have to rearrange the yearly cost columns to only two columns, cost last year and cost this year. Eventually, we can use cost in 2019 to predict cost in 2020.

If the prediction result is not ideal, try using year n-2 and year n-1 to predict year n.

In [174]:
df_cost_prediction = df.drop(['Cost 2014 sum', 'Cost 2015 sum', 'Cost 2016 sum', 
                              'Cost 2017 sum', 'Cost 2018 sum', 'Total Cost sum', 
                              'ERS ENT Count Year 1 sum', 'ERS ENT Count Year 1 mean',
                              'ERS ENT Count Year 2 mean', 'ERS ENT Count Year 3 mean', 
                              'ERS Member Cost Year 1 sum', 'ERS Member Cost Year 1 mean',
                              'ERS Member Cost Year 2 mean', 'ERS Member Cost Year 3 mean'] + product_list, axis = 1)
    # Categorical variables
cat_cols = [x for x in df_cost_prediction.columns if x.startswith('x')]
for col in cat_cols:
    df_cost_prediction[col] = df_cost_prediction[col].astype('category')
df_cost_prediction.iloc[:, :40].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3511 entries, 875 to 100079136
Data columns (total 40 columns):
ERS Member Cost Year 3 sum         3511 non-null float64
Length Of Residence mean           3511 non-null float64
Number of Children mean            3511 non-null float64
ERS Member Cost Year 2 sum         3511 non-null float64
ERS ENT Count Year 3 sum           3511 non-null float64
Mail Responder mean                3511 non-null float64
PrimaryMember sum                  3511 non-null float64
Email Available mean               3511 non-null float64
Income mean                        3511 non-null float64
Cost 2019 sum                      3511 non-null float64
Credit Ranges mean                 3511 non-null float64
Member Tenure Years mean           3511 non-null float64
AssociateMember sum                3511 non-null float64
Member Key count                   3511 non-null float64
Do Not Direct Mail Solicit mean    3511 non-null float64
ERS ENT Count Year 2 sum       

In [157]:
df_cost_prediction['Cost 2019 sum'].describe()

count    3511.000000
mean       29.002273
std        55.264674
min         0.000000
25%         0.000000
50%         0.000000
75%        58.850000
max       584.170000
Name: Cost 2019 sum, dtype: float64

In [105]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error

In [175]:
X = df_cost_prediction.drop(['Cost 2019 sum'], axis=1)
y = df_cost_prediction[['Cost 2019 sum']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 75)

In [177]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

print("R-squared:", lr.score(X_test, y_test))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))

R-squared: 0.3687712590419572
RMSE: 44.99548182166654


In [124]:
# Suppress warnings from using Lasso
import warnings
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

In [178]:
%%time
param_grid = {'alpha': np.linspace(0, 10, 11)}
lasso = Lasso(normalize=True)
gs_lasso = GridSearchCV(lasso, param_grid, cv=5)
gs_lasso.fit(X_train, y_train)
y_pred = gs_lasso.predict(X_test)

print("R-squared:", gs_lasso.score(X_test, y_test))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))

R-squared: 0.36705000867496
RMSE: 45.056787462013325
Wall time: 56.6 s


In [103]:
from sklearn.ensemble import BaggingRegressor

In [118]:
%%time
br = BaggingRegressor(Lasso(), n_estimators=100)
br.fit(X_train, np.ravel(y_train))
y_pred = br.predict(X_test)

print("R-squared:", br.score(X_test, np.ravel(y_test)))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))

R-squared: 0.6807803324681604
RMSE: 31.512361627559592
Wall time: 38.8 s


## Clustering

## Appendix

In [None]:
from sklearn.preprocessing import MinMaxScaler

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

In [None]:
# Scale 

#### kNN

In [16]:
%%time
param_grid = {'n_neighbors': np.arange(8, 20)}
gs_kNN = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
gs_kNN.fit(X_train, y_train)
print("Best Number of Neighbors:", gs_kNN.best_params_)
print("Accuracy on Training Set:", gs_kNN.best_score_)

y_pred_prob = gs_kNN.predict_proba(X_test)[:,1]
print("Accuracy on Test Set:", gs_kNN.score(X_test, y_test))
print("AUC:", roc_auc_score(y_test, y_pred_prob))

Best Number of Neighbors: {'n_neighbors': 8}
Accuracy on Training Set: 0.584634448574969
Accuracy on Test Set: 0.5966303270564915
AUC: 0.6398888103096024
Wall time: 1min 49s


#### Logistic Regression

In [25]:
%%time
param_grid = {'C': [15-6, 1e-5, 1e-4], 
              'tol': [10, 1, 0.1], 
              'max_iter': [1000, 1500]}
gs_lr = GridSearchCV(LogisticRegression(), param_grid, cv=5)
gs_lr.fit(X_train, y_train)
print("Best Parameters:", gs_lr.best_params_)
print("Accuracy on Training Set:", gs_lr.best_score_)

y_pred_prob = gs_lr.predict_proba(X_test)[:,1]
print("Accuracy on Test Set:", gs_lr.score(X_test, y_test))
print("AUC:", roc_auc_score(y_test, y_pred_prob))

Best Parameters: {'tol': 10, 'C': 9, 'max_iter': 1000}
Accuracy on Training Set: 0.5861214374225526
Accuracy on Test Set: 0.5827552031714569
AUC: 0.6002868143957252
Wall time: 17.6 s


#### SVM
SVM takes too much time to train.

#### Random Forest

In [19]:
%%time
param_grid = {'n_estimators':[50, 100, 150],
              'max_depth': [3, 5], 
              'max_features': ['auto', 'sqrt']}
rf = RandomForestClassifier()
gs_rf = GridSearchCV(rf, param_grid, cv=5)
gs_rf.fit(X_train, np.ravel(y_train))
print("Best Parameters:", gs_rf.best_params_)
print("Accuracy on Training Set:", gs_rf.best_score_)

y_pred_prob = gs_rf.predict_proba(X_test)[:,1]
print("Accuracy on Test Set:", gs_rf.score(X_test, y_test))
print("AUC:", roc_auc_score(y_test, y_pred_prob))

Best Parameters: {'max_depth': 5, 'n_estimators': 150, 'max_features': 'sqrt'}
Accuracy on Training Set: 0.6042131350681537
Accuracy on Test Set: 0.6214073339940536
AUC: 0.6567735344963066
Wall time: 1min 25s


In [20]:
y_pred = gs_rf.predict(X_test)
pd.Series(y_pred).value_counts()

0.0    565
1.0    444
dtype: int64