In [313]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import GradientBoostingClassifier

from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import preprocessing

# Marketing Campaign Dataset

In [251]:
marketing_df = pd.read_csv("../data/marketing_campaign.csv", sep='\t')

marketing_df.to_csv("../data/marketing_campaign_legible.csv")

marketing_df

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,04-09-2012,58,635,...,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,08-03-2014,38,11,...,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,21-08-2013,26,426,...,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,10-02-2014,26,11,...,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,19-01-2014,94,173,...,5,0,0,0,0,0,0,3,11,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,10870,1967,Graduation,Married,61223.0,0,1,13-06-2013,46,709,...,5,0,0,0,0,0,0,3,11,0
2236,4001,1946,PhD,Together,64014.0,2,1,10-06-2014,56,406,...,7,0,0,0,1,0,0,3,11,0
2237,7270,1981,Graduation,Divorced,56981.0,0,0,25-01-2014,91,908,...,6,0,1,0,0,0,0,3,11,0
2238,8235,1956,Master,Together,69245.0,0,1,24-01-2014,8,428,...,3,0,0,0,0,0,0,3,11,0


## Cleaning the Data

In [253]:
# Noted that the 'Marital_Status' has some unknown or strange values. Alone and Single are duplicates.
clean_df = marketing_df.copy()

clean_df['Marital_Status'].value_counts()

Married     864
Together    580
Single      480
Divorced    232
Widow        77
Alone         3
YOLO          2
Absurd        2
Name: Marital_Status, dtype: int64

In [254]:
clean_df['Marital_Status'] = clean_df['Marital_Status'].map({'Married':'Married','Together':'Together','Single':'Single','Divorced':'Divorced','Widow':'Widow','Alone': 'Single', 'YOLO':'Unknown', 'Absurd':'Unknown'})

clean_df['Marital_Status'].value_counts()

Married     864
Together    580
Single      483
Divorced    232
Widow        77
Unknown       4
Name: Marital_Status, dtype: int64

In [255]:
# Noted income has 24 null values. Will put average of the incomes to replace the empty cells.

print(clean_df['Income'].isnull().sum())
clean_df = clean_df.fillna(clean_df['Income'].mean())
print(clean_df['Income'].isnull().sum())

24
0


In [256]:
clean_df[marketing_target].value_counts()

0    1158
1    1030
2      52
Name: Teenhome, dtype: int64

In [257]:
# Converting Teenhome to (0,1) for the purpose of this assignment.

clean_df[marketing_target] = clean_df[marketing_target].replace([1,2], 1)

clean_df[marketing_target].value_counts()

0    1158
1    1082
Name: Teenhome, dtype: int64

In [258]:
# Converting Dt_customer to date since the earliest date to make algorithms easier to use.

first = datetime(2012,7,30,0,0,0)

temp = []

for value in clean_df['Dt_Customer']:
    date = value.split('-')
    this = datetime(int(date[2]), int(date[1]), int(date[0]), 0, 0, 0)
    temp.append((this-first).days)
    
# print(temp)

clean_df['Dt_Customer'] = temp

In [271]:
# Z_CostContact and Z_Revenue never change, therefore they are not good features for prediction.
clean_df = clean_df.drop(columns=['Z_CostContact','Z_Revenue'])

In [275]:
clean_df

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Response
0,5524,1957,Graduation,Single,58138.0,0,0,36,58,635,...,10,4,7,0,0,0,0,0,0,1
1,2174,1954,Graduation,Single,46344.0,1,1,586,38,11,...,1,2,5,0,0,0,0,0,0,0
2,4141,1965,Graduation,Together,71613.0,0,0,387,26,426,...,2,10,4,0,0,0,0,0,0,0
3,6182,1984,Graduation,Together,26646.0,1,0,560,26,11,...,0,4,6,0,0,0,0,0,0,0
4,5324,1981,PhD,Married,58293.0,1,0,538,94,173,...,3,6,5,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,10870,1967,Graduation,Married,61223.0,0,1,318,46,709,...,3,4,5,0,0,0,0,0,0,0
2236,4001,1946,PhD,Together,64014.0,2,1,680,56,406,...,2,5,7,0,0,0,1,0,0,0
2237,7270,1981,Graduation,Divorced,56981.0,0,0,544,91,908,...,3,13,6,0,1,0,0,0,0,0
2238,8235,1956,Master,Together,69245.0,0,1,543,8,428,...,5,10,3,0,0,0,0,0,0,0


## Preprocessing

### Feature Selection

In [276]:
# Continuous features
marketing_continuous_features = ['Income','MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 
                                'MntSweetProducts', 'MntGoldProds']

# Categorical features (not including response)
marketing_categorical_features = ['ID','Education','Marital_Status','Kidhome','AcceptedCmp3', 
                                 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1', 'AcceptedCmp2','Complain',
                                 'Response']

# Discrete features
marketing_discrete_features = ['Year_Birth','Recency','NumDealsPurchases','NumWebPurchases',
                               'NumCatalogPurchases','NumStorePurchases','NumWebVisitsMonth','Dt_Customer']

# Teenhome
marketing_target = "Teenhome"

print(len(marketing_continuous_features)+len(marketing_categorical_features)+len(marketing_discrete_features)+1)

27


In [277]:
featureselection_df = clean_df.copy()
ordEnc = OrdinalEncoder()

featureselection_df[marketing_categorical_features] = ordEnc.fit(featureselection_df[marketing_categorical_features]).transform(featureselection_df[marketing_categorical_features])

In [281]:
cor = featureselection_df.corr()

#Correlation with output variable
cor_target = abs(corr[marketing_target])

#Selecting highly correlated features
relevant_features = cor_target[cor_target>0.15]
relevant_features

Year_Birth           0.357522
Teenhome             1.000000
MntFruits            0.182316
MntMeatProducts      0.275840
MntFishProducts      0.210868
MntSweetProducts     0.164644
NumDealsPurchases    0.394173
NumWebPurchases      0.157645
AcceptedCmp5         0.205445
Response             0.161356
Name: Teenhome, dtype: float64

In [279]:
X = featureselection_df.drop(columns=[marketing_target])
y = featureselection_df[marketing_target].values

for i in range (27):
    print("_______________________________________________________________________________________Best "+str(i)+":")
    # Create and fit selector
    selector = SelectKBest(chi2, k=i)
    selector.fit(X, y)
    # Get columns to keep and create new dataframe with those only
    cols = selector.get_support(indices=True)
    features_df_new = X.iloc[:,cols]
    print(features_df_new.columns)

_______________________________________________________________________________________Best 0:
Index([], dtype='object')
_______________________________________________________________________________________Best 1:
Index(['MntMeatProducts'], dtype='object')
_______________________________________________________________________________________Best 2:
Index(['MntMeatProducts', 'MntFishProducts'], dtype='object')
_______________________________________________________________________________________Best 3:
Index(['Income', 'MntMeatProducts', 'MntFishProducts'], dtype='object')
_______________________________________________________________________________________Best 4:
Index(['Income', 'MntFruits', 'MntMeatProducts', 'MntFishProducts'], dtype='object')
_______________________________________________________________________________________Best 5:
Index(['Income', 'MntFruits', 'MntMeatProducts', 'MntFishProducts',
       'MntSweetProducts'],
      dtype='object')
________________________

In [283]:
best_features = ['MntMeatProducts','MntFishProducts','Income','MntFruits','MntSweetProducts','NumDealsPurchases','Dt_Customer',
                'NumWebPurchases','NumCatalogPurchases','AcceptedCmp5','MntGoldProds','NumWebVisitsMonth','Response','AcceptedCmp1',
                'Year_Birth','Education','NumStorePurchases','MntWines','ID','AcceptedCmp3','AcceptedCmp4','Kidhome',
                'AcceptedCmp2','Recency','Marital_Status','Complain']

best_categorical_features = np.intersect1d(best_features, marketing_categorical_features)

In [290]:
featureselection_df[best_categorical_features] = ordEnc.fit(featureselection_df[best_categorical_features]).transform(featureselection_df[best_categorical_features])
featureselection_df[marketing_categorical_features] = featureselection_df[marketing_categorical_features].astype('category') 

X = featureselection_df[best_features]
y = featureselection_df[marketing_target].values

temp = []

for feature in best_features:
    temp.append(feature)
    print("________________________________________________")
    print(temp)
    X_train, X_test, y_train, y_test = train_test_split(X[temp], y, test_size=0.3, stratify=y)
    model = GradientBoostingClassifier(learning_rate=1, max_depth=9, n_estimators=500)
    model.fit(X_train, y_train)
    y_preds_rf = model.predict(X_test)
    ac = accuracy_score(y_test, y_preds_rf)
    print('Overall Accuracy: %.9f'%ac)

________________________________________________
['MntMeatProducts']
Overall Accuracy: 0.629464286
________________________________________________
['MntMeatProducts', 'MntFishProducts']
Overall Accuracy: 0.662202381
________________________________________________
['MntMeatProducts', 'MntFishProducts', 'Income']
Overall Accuracy: 0.705357143
________________________________________________
['MntMeatProducts', 'MntFishProducts', 'Income', 'MntFruits']
Overall Accuracy: 0.755952381
________________________________________________
['MntMeatProducts', 'MntFishProducts', 'Income', 'MntFruits', 'MntSweetProducts']
Overall Accuracy: 0.747023810
________________________________________________
['MntMeatProducts', 'MntFishProducts', 'Income', 'MntFruits', 'MntSweetProducts', 'NumDealsPurchases']
Overall Accuracy: 0.802083333
________________________________________________
['MntMeatProducts', 'MntFishProducts', 'Income', 'MntFruits', 'MntSweetProducts', 'NumDealsPurchases', 'Dt_Customer']
Over

In [300]:
final_best_features = ['MntMeatProducts', 'MntFishProducts', 'Income', 'MntFruits', 'MntSweetProducts', 
                       'NumDealsPurchases', 'Dt_Customer', 'NumWebPurchases', 'NumCatalogPurchases',
                       'AcceptedCmp5', 'MntGoldProds', 'NumWebVisitsMonth', 'Response', 'AcceptedCmp1', 
                       'Year_Birth', 'Education']

featureselection_df = clean_df.copy()
featureselection_df = featureselection_df[final_best_features]
featureselection_df[marketing_target] = clean_df[marketing_target]

featureselection_df

Unnamed: 0,MntMeatProducts,MntFishProducts,Income,MntFruits,MntSweetProducts,NumDealsPurchases,Dt_Customer,NumWebPurchases,NumCatalogPurchases,AcceptedCmp5,MntGoldProds,NumWebVisitsMonth,Response,AcceptedCmp1,Year_Birth,Education,Teenhome
0,546,172,58138.0,88,88,3,36,8,10,0,88,7,1,0,1957,Graduation,0
1,6,2,46344.0,1,1,2,586,1,1,0,6,5,0,0,1954,Graduation,1
2,127,111,71613.0,49,21,1,387,8,2,0,42,4,0,0,1965,Graduation,0
3,20,10,26646.0,4,3,2,560,2,0,0,5,6,0,0,1984,Graduation,0
4,118,46,58293.0,43,27,5,538,5,3,0,15,5,0,0,1981,PhD,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,182,42,61223.0,43,118,2,318,9,3,0,247,5,0,0,1967,Graduation,1
2236,30,0,64014.0,0,0,7,680,8,2,0,8,7,0,1,1946,PhD,1
2237,217,32,56981.0,48,12,1,544,2,3,0,24,6,0,0,1981,Graduation,0
2238,214,80,69245.0,30,30,2,543,6,5,0,61,3,0,0,1956,Master,1


### Feature Transformation

In [311]:
featureEngineering_df = featureselection_df.copy()

final_best_features = ['MntMeatProducts', 'MntFishProducts', 'Income', 'MntFruits', 'MntSweetProducts', 
                       'NumDealsPurchases', 'Dt_Customer', 'NumWebPurchases', 'NumCatalogPurchases',
                       'AcceptedCmp5', 'MntGoldProds', 'NumWebVisitsMonth', 'Response', 'AcceptedCmp1', 
                       'Year_Birth', 'Education']

# Continuous features
continuous_features = ['Income', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 
                                'MntSweetProducts', 'MntGoldProds','Response']

categorical_features = ['Education','AcceptedCmp5', 'AcceptedCmp1']

# Discrete features
discrete_features = ['Year_Birth','NumDealsPurchases','NumWebPurchases','NumCatalogPurchases','NumWebVisitsMonth','Dt_Customer']

print(len(continuous_features)+len(categorical_features)+len(discrete_features)+1)

17


In [312]:
# 1-of-C Encoding for Categorical and Discrete Variables

def encodingHelper(list, item):
    newList = []
    for i in list:
        if str(i)==str(item):
            newList.append(1)
        else:
            newList.append(0)
    return newList

# The categorical variables are mapped to 1-of-C coding
new_categorical_features = []

for cat in categorical_features:
    items = sorted(featureEngineering_df[cat].unique())
    for item in items:
        featureEngineering_df[cat+"_"+str(item).replace(" ", "_")] = encodingHelper(featureEngineering_df[cat], item)
        new_categorical_features.append(cat+"_"+str(item).replace(" ", "_"))
    featureEngineering_df = featureEngineering_df.drop(columns=cat)
    

# All Discrete features should be classified as categorical
featureEngineering_df[new_categorical_features] = featureEngineering_df[new_categorical_features].astype('category') 
featureEngineering_df[marketing_target] = featureEngineering_df[marketing_target].astype('category')  

featureEngineering_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   MntMeatProducts       2240 non-null   int64   
 1   MntFishProducts       2240 non-null   int64   
 2   Income                2240 non-null   float64 
 3   MntFruits             2240 non-null   int64   
 4   MntSweetProducts      2240 non-null   int64   
 5   NumDealsPurchases     2240 non-null   int64   
 6   Dt_Customer           2240 non-null   int64   
 7   NumWebPurchases       2240 non-null   int64   
 8   NumCatalogPurchases   2240 non-null   int64   
 9   MntGoldProds          2240 non-null   int64   
 10  NumWebVisitsMonth     2240 non-null   int64   
 11  Response              2240 non-null   int64   
 12  Year_Birth            2240 non-null   int64   
 13  Teenhome              2240 non-null   category
 14  Education_2n_Cycle    2240 non-null   category
 15  Educ

In [316]:
# The continuous and discrete features are standardized to zero mean and unit variance

numerical_features = continuous_features + discrete_features

scaler = preprocessing.StandardScaler().fit(featureEngineering_df[numerical_features])
featureEngineering_df[numerical_features] = scaler.transform(featureEngineering_df[numerical_features])

print("__________________________________________Mean:")
print(featureEngineering_df[numerical_features].mean(axis=0))
print("___________________________________________STD:")
print(featureEngineering_df[numerical_features].std(axis=0))

# Note that the scaler cannot make the mean EXACTLY zero due to number representation limitations. 
# In practice, this is close enough.

__________________________________________Mean:
Income                 1.751451e-17
MntFruits             -4.173249e-17
MntMeatProducts        2.493045e-17
MntFishProducts        5.105043e-18
MntSweetProducts      -2.131232e-18
MntGoldProds          -2.874685e-17
Response               5.021777e-16
Year_Birth            -8.120687e-15
NumDealsPurchases     -4.909267e-17
NumWebPurchases        1.737202e-16
NumCatalogPurchases   -2.760689e-17
NumWebVisitsMonth     -4.158380e-17
Dt_Customer           -1.130048e-17
dtype: float64
___________________________________________STD:
Income                 1.000223
MntFruits              1.000223
MntMeatProducts        1.000223
MntFishProducts        1.000223
MntSweetProducts       1.000223
MntGoldProds           1.000223
Response               1.000223
Year_Birth             1.000223
NumDealsPurchases      1.000223
NumWebPurchases        1.000223
NumCatalogPurchases    1.000223
NumWebVisitsMonth      1.000223
Dt_Customer            1.000223
dtype

## Oversampling to Balance the Dataset

In [317]:
oversampling_df = featureEngineering_df.copy()

# Balancing (oversampling)
count_class_0, count_class_1 = oversampling_df[marketing_target].value_counts()
df_class_0 = oversampling_df[oversampling_df[marketing_target] == 0]
df_class_1 = oversampling_df[oversampling_df[marketing_target] == 1]
# Oversample
df_class_1_over = df_class_1.sample(count_class_0, replace=True)
final_df = pd.concat([df_class_0, df_class_1_over], axis=0)
# Split back into X_train and y_train    
print('Random over-sampling:\n'+ str(final_df[marketing_target].value_counts()))

Random over-sampling:
1    1158
0    1158
Name: Teenhome, dtype: int64


In [319]:
final_df.to_csv("../data/preprocessedMarketingCampaign.csv")