In [2]:
# Import all the necessary libraries to perform the analysis.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
import scipy.stats as norm
from sklearn import linear_model, preprocessing
%matplotlib inline

In [3]:
# Open the SQLite database file that holds the ad experiment tables.
database = "sqlite_adc.db"
connection = sqlite3.connect(database)

# Stack the CurrentAd and GroupC tables into one result set.
# NOTE: UNION (unlike UNION ALL) also drops rows that are exact duplicates.
query = """
    SELECT * FROM CurrentAd
    UNION
    SELECT * FROM GroupC
"""

# Run the query over the open connection and load the result into a dataframe.
df = pd.read_sql_query(sql=query, con=connection)

# Preview the first 5 rows.
df.head()

Unnamed: 0,index,uuid,num_achievements,num_exercises,num_points,Click,Group,Button,Banner
0,0,981943945,0,0,0,1.0,CurrentAd,No,Yes
1,2,981944745,0,0,0,0.0,CurrentAd,No,Yes
2,3,981944801,0,0,0,0.0,CurrentAd,No,Yes
3,4,981944857,0,0,0,0.0,CurrentAd,No,Yes
4,6,981960025,0,0,0,0.0,CurrentAd,No,Yes


In [4]:
# Split the combined data into one dataframe per ad group, keyed on the
# string labels stored in the Group column ("CurrentAd" and "C").
ad_cur = df[df["Group"] == "CurrentAd"]
ad_c = df[df["Group"] == "C"]

In [18]:
# Sample size used for the analysis: the click-through-rate cells below take
# this many leading rows (via head()) from each ad group.
sample_size = 12000

In [28]:
# Calculate the number of users in the Current Ad sample who clicked on the ad.
# The click mask is built from the sampled rows themselves, so the count does
# not depend on pandas aligning a mask taken from the full ad_cur frame.
ad_cur_sample = ad_cur.head(sample_size)
ad_cur_clicks = float((ad_cur_sample.Click == 1).sum())
print(ad_cur_clicks)

# Print the click-through rate of the Current Ad.
# Divide by the number of rows actually sampled: head() returns fewer than
# sample_size rows when the group is smaller, and dividing by sample_size
# would then understate the rate. An empty sample has no defined rate (NaN).
if len(ad_cur_sample) > 0:
    ad_cur_ctr = ad_cur_clicks / len(ad_cur_sample)
else:
    ad_cur_ctr = float("nan")
print("{:.2%}".format(ad_cur_ctr))

332.0
2.77%


In [29]:
# Calculate the number of users in the Ad C sample who clicked on the ad.
# The click mask is built from the sampled rows themselves, so the count does
# not depend on pandas aligning a mask taken from the full ad_c frame.
ad_c_sample = ad_c.head(sample_size)
ad_c_clicks = float((ad_c_sample.Click == 1).sum())

print(ad_c_clicks)
# Print the click-through rate of Ad C.
# Divide by the number of rows actually sampled: head() returns fewer than
# sample_size rows when the group is smaller, and dividing by sample_size
# would then understate the rate. An empty sample has no defined rate (NaN).
if len(ad_c_sample) > 0:
    ad_c_ctr = ad_c_clicks / len(ad_c_sample)
else:
    ad_c_ctr = float("nan")
print("{:.2%}".format(ad_c_ctr))

1081.0
9.01%


In [8]:
# Run a two sample, independent t-test (clicks of the Current Ad and clicks of Ad C). 
def print_ttest_results(sample1, sample2, alpha=0.05):
    """Run a two-sample independent t-test and print the outcome.

    Prints "<t statistic> : <p-value>" followed by a verdict line saying
    whether the p-value is below the significance level.

    Parameters:
        sample1, sample2: the two samples of observations to compare; they
            are passed unchanged to ttest_ind.
        alpha: significance level the p-value is compared against
            (defaults to 0.05).

    Returns:
        None; the results are only printed.
    """
    # `norm` is this notebook's alias for scipy.stats. Run the test once and
    # read both values from the same result instead of computing it twice.
    result = norm.ttest_ind(sample1, sample2)
    t = result[0]
    p = result[1]

    print(str(t) + " : " + str(p))
    if p < alpha:
        print("This is significant!")
    else:
        print("This is not significant!")

# Compare the Click columns of the two ad groups. Note that the complete
# groups are passed here, not the head(sample_size) samples used for the
# click-through rates above.
print_ttest_results(ad_cur["Click"], ad_c["Click"])

-21.7706434913 : 3.69110937618e-104
This is significant!


In [27]:
# Train a machine learning model to predict whether a user will click on an ad.

# Clean the data set to be trained by a machine learning model.
def clean_data(data):
    """Recode the Group, Button and Banner columns of `data` as integers.

    The dataframe is modified in place:
        Group:  "CurrentAd" -> 0, "C" -> 1
        Button: "No" -> 0, "Yes" -> 1
        Banner: "No" -> 0, "Yes" -> 1

    Values that are not one of the labels above are left as they are and
    then cast to int, so already-encoded data ("0"/"1") passes through
    unchanged and the function can safely be run again on the same frame.

    Parameters:
        data: dataframe containing Group, Button and Banner columns.

    Returns:
        None; `data` is updated in place.

    Raises:
        ValueError: if a column holds a value that is neither a known label
            nor convertible to an integer.
    """
    encodings = (
        ("Group", {"CurrentAd": 0, "C": 1}),
        ("Button", {"No": 0, "Yes": 1}),
        ("Banner", {"No": 0, "Yes": 1}),
    )

    for column, codes in encodings:
        labels = data[column].astype("str")
        # Build the recoded column as a new Series instead of writing
        # integers into the string column through .loc: a string-typed
        # column is not guaranteed to accept non-string values.
        recoded = labels.map(lambda label: codes.get(label, label))
        data[column] = recoded.astype("int")

# Clean a copy of the data. clean_data() modifies its argument in place, so
# cleaning df itself would replace the Group/Button/Banner labels that the
# earlier cells filter on (e.g. df.Group == "C") and make them return empty
# frames when re-run.
train = df.copy()
clean_data(train)

# The target we want to train the model to predict.
target = train.Click

# Store the values of the features (measurable properties) that can help the model recognize patterns.
features = train[["num_achievements", "num_exercises", "num_points", "Group", "Button", "Banner"]].values

# Supervised learning model (learning from a data set of correctly identified observations).
# The raw features are expanded into polynomial features before fitting.
classifier = linear_model.LogisticRegression()
poly = preprocessing.PolynomialFeatures()
poly_features = poly.fit_transform(features)

# Train the model on the data and store the information learned.
# The model is learning the relationship between the features and the target.
classifier_ = classifier.fit(poly_features, target)

# Print the score (accuracy) of the model.
# NOTE: the score is computed on the same rows the model was fitted on, so it
# is a training accuracy, not an estimate of performance on unseen users.
print("Polynomial Model Accuracy: " + "{:.2%}".format(classifier_.score(poly_features, target)))

Polynomial Model Accuracy: 94.33%
