In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix

import seaborn as sns

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
# Register the Jedha colour palette as a plotly template and make it the default
JEDHA_COLORWAY = [
    "#4B9AC7", "#4BE8E0", "#9DD4F3", "#97FBF6",
    "#2A7FAF", "#23B1AB", "#0E3449", "#015955",
]
pio.templates["jedha"] = go.layout.Template(layout_colorway=JEDHA_COLORWAY)
pio.templates.default = "jedha"
# Use the "svg" renderer for every figure shown in this notebook
pio.renderers.default = "svg"
from IPython.display import display

In [2]:
# Load the training set into a DataFrame
data = pd.read_csv(filepath_or_buffer='conversion_data_train.csv')

# Preview the first five rows
data.head(n=5)

Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
0,China,22,1,Direct,2,0
1,UK,21,1,Ads,3,0
2,Germany,20,0,Seo,14,1
3,US,23,1,Seo,3,0
4,US,28,1,Direct,3,0


- The column 'new_user' indicates whether the user has already visited the site: 1 - the user is new, i.e. has not visited the site before; 0 - the user is not new (has visited the site before).
- The column 'source' indicates where the user has come from ('Seo': from a search engine results page, 'Ads': by clicking on an ad, 'Direct': direct traffic, e.g. the user typed the website's URL directly or used a bookmark).
- The column 'converted' indicates whether the user subscribed to the site's newsletter: 1 - yes, 0 - no.

In [3]:
# Using 'describe' method to have an overview of the dataset.
# A bare expression is only rendered when it is the last statement of a cell,
# so the summary table is shown explicitly with display() before the shape
# is printed.
display(data.describe(include = "all"))

print("Shape of the dataframe:", data.shape)

Shape of the dataframe: (284580, 6)


In [4]:
# Share of missing values per column, expressed as a percentage of all rows
print("Percentage of missing values: ")
missing_counts = data.isna().sum()
display(100 * missing_counts / len(data))

Percentage of missing values: 


country                0.0
age                    0.0
new_user               0.0
source                 0.0
total_pages_visited    0.0
converted              0.0
dtype: float64

In [5]:
# Inspect the dtype of every column; as the last expression of the cell, the
# resulting Series is rendered as the cell output.
data.dtypes

country                object
age                     int64
new_user                int64
source                 object
total_pages_visited     int64
converted               int64
dtype: object

In [6]:
# As the dataset is quite big, we take a sample of 5000 rows for the visualisations.
# random_state is fixed so that the same rows are drawn on every run and the
# figures below stay reproducible after a kernel restart.
data_sample = data.sample(n=5000, random_state=42)

In [7]:
# One probability-normalised histogram per numeric feature, coloured and
# faceted by the 'converted' column
num_features = ['age', 'total_pages_visited']
for feature in num_features:
    fig = px.histogram(
        data_sample,
        x=feature,
        color='converted',
        facet_row='converted',
        histnorm='probability',
    )
    fig.show()

In [None]:
# Correlation matrix
import plotly.figure_factory as ff

# Keep only the numeric columns before computing correlations: 'country' and
# 'source' hold strings, and DataFrame.corr() raises on non-numeric columns
# in pandas >= 2.0 (older versions dropped them silently).
corr_matrix = data_sample.select_dtypes(include="number").corr().round(2)

# Annotated heatmap: each cell shows the rounded correlation coefficient
fig = ff.create_annotated_heatmap(corr_matrix.values,
                                  x = corr_matrix.columns.tolist(),
                                  y = corr_matrix.index.tolist())


fig.show()

In [None]:
# Same correlation matrix drawn with seaborn. Only numeric columns are kept
# because DataFrame.corr() raises on the string columns ('country', 'source')
# in pandas >= 2.0.
corr = data_sample.select_dtypes(include="number").corr()
f, ax = plt.subplots(figsize=(9, 6))
sns.heatmap(corr, annot=True, ax=ax)