In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix

import seaborn as sns

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
# Register the Jedha colour palette as a Plotly template and make it the default for all figures
jedha_colorway = [
    "#4B9AC7",
    "#4BE8E0",
    "#9DD4F3",
    "#97FBF6",
    "#2A7FAF",
    "#23B1AB",
    "#0E3449",
    "#015955",
]
pio.templates["jedha"] = go.layout.Template(layout_colorway=jedha_colorway)
pio.templates.default = "jedha"
# Plotly figures are rendered through the "vscode" renderer
pio.renderers.default = "vscode"
from IPython.display import display

In [None]:
# Load the training dataset from the CSV file
data = pd.read_csv(filepath_or_buffer='conversion_data_train.csv')

# Preview the first rows
data.head(n=5)

- The column 'new_user' indicates whether the user is visiting the site for the first time: 1 - the user is new, i.e. has not visited the site before; 0 - the user is not new (has visited the site before).
- The column 'source' indicates where the user has come from ('seo': from a search engine page, 'ads': by clicking on an ad, 'Direct': direct traffic, e.g. the user has typed the website's URL directly or used a bookmark).
- The column 'converted' indicates whether the user subscribed to the site's newsletter: 1 - yes, 0 - no.

In [None]:
# Using 'describe' method to have an overview of the dataset.
# Only the last expression of a cell is rendered automatically, so the summary
# table must be passed to `display` explicitly; otherwise it is computed and discarded.
display(data.describe(include="all"))

print("Shape of the dataframe:", data.shape)

In [None]:
# Share of missing values in each column, expressed as a percentage of the row count
print("Percentage of missing values: ")
n_rows = len(data)
missing_pct = 100 * data.isna().sum() / n_rows
display(missing_pct)

In [None]:
# Checking the data type of each column in the dataframe
data.dtypes

In [None]:
# As the dataset is quite big, we take a sample to make the visualisations.
# - random_state is fixed so the sample (and every plot built from it) is the same on each run.
# - the sample size is capped at the number of rows so the cell does not raise on a small dataset.
data_sample = data.sample(n=min(1000, len(data)), random_state=42)

In [None]:
# One histogram per numerical feature, split into facet rows (and colours) by 'converted'
num_features = ['age', 'total_pages_visited']
for feature in num_features:
    fig = px.histogram(
        data_sample,
        x=feature,
        color='converted',
        facet_row='converted',
        histnorm='probability',
    )
    fig.show()

In [None]:
# Correlation matrix (interactive Plotly version)
import plotly.figure_factory as ff

# Correlation is only defined for numeric columns. Select them explicitly: with
# pandas >= 2.0, calling `corr()` on a frame that still contains string columns
# (e.g. 'source') raises instead of silently dropping them.
corr_matrix = data_sample.select_dtypes(include="number").corr().round(2)

# Annotated heatmap: each cell shows the rounded correlation coefficient
fig = ff.create_annotated_heatmap(corr_matrix.values,
                                  x = corr_matrix.columns.tolist(),
                                  y = corr_matrix.index.tolist())

fig.show()

In [None]:
# Correlation matrix (static seaborn version).
# Restrict to numeric columns first: with pandas >= 2.0, `corr()` raises on a frame
# that still contains string columns (e.g. 'source').
corr = data_sample.select_dtypes(include="number").corr()
f, ax = plt.subplots(figsize=(9, 6))
sns.heatmap(corr, annot=True, ax=ax)
ax.set_title("Correlation matrix (numeric features)")
plt.show()