In [23]:
#First we need to import all the libraries.
import numpy as np
import pandas as pd
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

In [24]:
# Load the dataset
df = pd.read_csv('ConversionRate/conversion_data.csv')
df.head()

Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
0,UK,25,1,Ads,1,0
1,US,23,1,Seo,5,0
2,US,28,1,Seo,4,0
3,China,39,1,Seo,5,0
4,US,30,1,Seo,6,0


In [5]:
# Get values of country
df['country'].unique()

array(['UK', 'US', 'China', 'Germany'], dtype=object)

In [3]:
# Calculate current conversion rate
converted = df['converted'].sum()
sessions = df.shape[0]
conversion_rate = converted/sessions
conversion_rate

0.03225806451612903

In [4]:
# Calculate average page views
avg_pg_views = df['total_pages_visited'].mean()
avg_pg_views

4.872966476913346

In [5]:
# What percent of source variable is Seo?
df[df['source']=='Seo'].count() / sessions

country                0.490323
age                    0.490323
new_user               0.490323
source                 0.490323
total_pages_visited    0.490323
converted              0.490323
dtype: float64

In [6]:
# What percent of source variable is Ads?
df[df['source']=='Ads'].count() / sessions

country                0.280645
age                    0.280645
new_user               0.280645
source                 0.280645
total_pages_visited    0.280645
converted              0.280645
dtype: float64

In [7]:
# What percent of source variable is Direct?
df[df['source']=='Direct'].count() / sessions

country                0.229032
age                    0.229032
new_user               0.229032
source                 0.229032
total_pages_visited    0.229032
converted              0.229032
dtype: float64

In [8]:
# # Plot the distribution of age
# df['age'].hist(by=df['converted'])
# plt.show()

In [15]:
# # Plot the distribution of country who have converted
df[df['converted']==1].country.value_counts().plot(kind='bar')
# plt.ion()
plt.show()
plt.close()

In [16]:
# # Plot the distribution of country who have converted
df[df['converted']==0].country.value_counts().plot(kind='bar')
# plt.ion()
plt.show()
plt.close()

In [25]:
# Delete country variable
del df['country']

In [26]:
# Delete source variable
del df['source']

In [27]:
# Features
X = df.ix[:,'age':'total_pages_visited']

In [28]:
# Labels
y = df['converted']

In [29]:
# Evaluate using a train and a test set
test_size = 0.30
seed = 7
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=test_size, random_state=seed)
model = LogisticRegression()
model.fit(X_train, y_train)
result = model.score(X_test, y_test)
print("Accuracy: {}".format(result*100.0))

Accuracy: 98.50938224752267


In [30]:
# Evaluate using Cross Validation
num_folds = 10
num_instances = len(X)
seed = 1
kfold = cross_validation.KFold(n=num_instances, n_folds=num_folds, random_state=seed)
model = LogisticRegression()
results = cross_validation.cross_val_score(model, X, y, cv=kfold)
print("Accuracy: {} ({})".format(results.mean()*100.0, results.std()*100.0))

Accuracy: 98.51391524351676 (0.08103118534802811)


In [32]:
# Evaluate using Shuffle Split Cross Validation
num_samples = 10
test_size = 0.3
num_instances = len(X)
seed = 7
kfold = cross_validation.ShuffleSplit(n=num_instances, n_iter=num_samples, test_size=test_size, random_state=seed)
model = LogisticRegression()
results = cross_validation.cross_val_score(model, X, y, cv=kfold)
print("Accuracy: {} ({})".format(results.mean()*100.0, results.std()*100.0))

Accuracy: 98.5125448028674 (0.03981694997747037)


In [34]:
# Gaussian Naive Bayes Classification
num_folds = 10
num_instances = len(X)
seed = 7
kfold = cross_validation.KFold(n=num_instances, n_folds=num_folds, random_state=seed)
model = GaussianNB()
results = cross_validation.cross_val_score(model, X, y, cv=kfold)
print("Accuracy: {} ({})".format(results.mean()*100.0, results.std()*100.0))

Accuracy: 98.30613535736875 (0.08247177917305785)


In [35]:
# prepare configuration for cross validation test harness
num_folds = 10
num_instances = len(X)
seed = 7
# prepare models
models = []
models.append(('LR', LogisticRegression()))
models.append(('NB', GaussianNB()))

In [36]:
# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'
for name, model in models:
    kfold = cross_validation.KFold(n=num_instances, n_folds=num_folds, random_state=seed)
    cv_results = cross_validation.cross_val_score(model, X, y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

LR: 0.985139 (0.000810)
NB: 0.983061 (0.000825)
