## Customer Acquisition Prediction

In [124]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.preprocessing import Imputer
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import random
from sklearn.linear_model import LogisticRegression

One could use a web browser to download the data, decompress the file and extract the CSV file, but it is preferable to create a small function to do that. It is useful in particular in cases where the data changes regularly, as it allows you to write a small script that you can run whenever you need to fetch the latest data . Automating the process of fetching the data is also useful if you need to install the dataset on multiple machines.

In [1]:
# Pandas is used for data manipulation
import pandas as pd

# Read in data as pandas dataframe and display first 15 rows
features = pd.read_csv('Email_Promotion_final.csv')
features.head(15)

Unnamed: 0,customer_id,resp,geo_group,res_type,Promotion
0,SB0000024239,,North America,42,3
1,SB0000107372,,North America,27,3
2,SB0000035225,,North America,24,4
3,SB0000115640,,North America,25,3
4,SB0000062290,,EU,24,2
5,SB0000108009,,North America,32,1
6,SB0000094585,,North America,29,4
7,SB0000079909,,North America,31,4
8,SB0000075756,,North America,33,1
9,SB0000110896,,EU,43,2


In [2]:
features['resp'] = features['resp'].apply(lambda x: 0 if x=='No' else 1)

In [3]:
print('The shape of our features is:', features.shape)

The shape of our features is: (2845, 5)


In [4]:
# Descriptive statistics for each column
features.describe()

Unnamed: 0,resp,res_type,Promotion
count,2845.0,2845.0,2845.0
mean,1.0,35.526186,2.497012
std,0.0,11.23088,1.111762
min,1.0,18.0,1.0
25%,1.0,26.0,2.0
50%,1.0,34.0,2.0
75%,1.0,42.0,3.0
max,1.0,88.0,4.0


## Possibility of customer acquisition using Logistic Regression

In [1]:
import pandas as pd
calls_log = pd.read_csv('Email_Promotion_final.csv')
calls_log.head(15)

Unnamed: 0,customer_id,resp,geo_group,res_type,Promotion
0,SB0000024239,Yes,North America,42,3
1,SB0000103256,No,EU,41,3
2,SB0000115357,No,North America,42,3
3,SB0000060380,No,North America,39,3
4,SB0000138548,No,Asia,43,1
5,SB0000080565,No,North America,44,1
6,SB0000100797,No,North America,50,4
7,SB0000017899,No,EU,29,2
8,SB0000083958,No,ROW,44,2
9,SB0000107372,Yes,North America,27,3


In [2]:
calls_log['resp'] = calls_log['resp'].apply(lambda x: 0 if x=='No' else 1)


In [3]:
from sklearn.preprocessing import LabelEncoder

geo_group = calls_log['geo_group'].values
#getting a copy
#Encoding
le = LabelEncoder()
le.fit(geo_group)
geo_group = le.transform(geo_group)
calls_log['geo_group'] = geo_group

In [4]:
X= calls_log.drop(columns=['resp','customer_id'])
y= calls_log['resp']

In [5]:
# import sklearn package to build the model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

model = LogisticRegression()

In [6]:
# Splitting the data into train and test 

from sklearn.model_selection import train_test_split
X_train,X_test, y_train,y_test= train_test_split(X,y,test_size=0.3,random_state=1)
model.fit(X_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [7]:
predictions= model.predict(X_test)

In [8]:
from sklearn.metrics import classification_report

In [9]:
classification_report(y_test,predictions)

  'precision', 'predicted', average, warn_for)


'              precision    recall  f1-score   support\n\n           0       0.88      1.00      0.94      6632\n           1       0.00      0.00      0.00       868\n\n   micro avg       0.88      0.88      0.88      7500\n   macro avg       0.44      0.50      0.47      7500\nweighted avg       0.78      0.88      0.83      7500\n'

In [10]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,predictions)

array([[6632,    0],
       [ 868,    0]], dtype=int64)

In [11]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,predictions)

0.8842666666666666

In [12]:
X_test.head()

Unnamed: 0,geo_group,res_type,Promotion
21492,2,51,3
9488,1,32,3
16933,2,40,2
12604,2,22,1
8222,1,25,4


Calls or crimes that involved most number of shootings are Assaualt and Murder.

In [13]:
data = pd.read_csv('Email_Promotion_test_small.csv')


In [14]:
from sklearn.preprocessing import LabelEncoder

geo_group = data['geo_group'].values
#getting a copy
#Encoding
le = LabelEncoder()
le.fit(geo_group)
geo_group = le.transform(geo_group)
data['geo_group'] = geo_group

In [15]:
data.head()

Unnamed: 0,customer_id,resp,geo_group,res_type,Promotion
0,SB0000024239,,2,42,3
1,SB0000107372,,2,27,3
2,SB0000035225,,2,24,4
3,SB0000115640,,2,25,3
4,SB0000062290,,1,24,2


In [16]:
data_test = data[['geo_group','res_type','Promotion']]

In [24]:
data_test.head()

Unnamed: 0,geo_group,res_type,Promotion
0,2,42,3
1,2,27,3
2,2,24,4
3,2,25,3
4,1,24,2


In [25]:
data_pred= model.predict(data_test)

In [26]:
len(data_pred)

2845

In [27]:
ones = [];
for i in range (len(data_pred)):
    if data_pred[i]=='1':
        ones.append(i)