# Import libraries

In [164]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer



import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
# setting Jedha color palette as default
pio.templates["jedha"] = go.layout.Template(
    layout_colorway=["#4B9AC7", "#4BE8E0", "#9DD4F3", "#97FBF6", "#2A7FAF", "#23B1AB", "#0E3449", "#015955"]
)
pio.templates.default = "jedha"
#pio.renderers.default = "notebook" # to be replaced by "iframe" if working on JULIE
from IPython.display import display

# Read file with labels

In [165]:
data = pd.read_csv('Data/conversion_data_train.csv')
print('Set with labels (our train+test) :', data.shape)

Set with labels (our train+test) : (284580, 6)


In [166]:
data.head()

Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
0,China,22,1,Direct,2,0
1,UK,21,1,Ads,3,0
2,Germany,20,0,Seo,14,1
3,US,23,1,Seo,3,0
4,US,28,1,Direct,3,0


# Explore dataset

In [167]:
# The dataset is quite big : you must create a sample of the dataset before making any visualizations !
data_sample = data.sample(10000)

Let's check some relations between our variables and the target *converted* : 

0. Correlation matrix :

In [168]:
# Correlation matrix
corr_matrix = data_sample.corr()

import plotly.figure_factory as ff

fig = ff.create_annotated_heatmap(corr_matrix.values,
                                  x = corr_matrix.columns.values.tolist(),
                                  y = corr_matrix.index.values.tolist())


fig.show()

We see that there is an important reltion between the total pages visited and the conversion, let's discover it with a graph

1. with *total_pages_visited*

In [169]:
data_temp_0=data_sample.groupby('total_pages_visited').count().reset_index()

In [170]:
data_temp=data_sample.groupby('total_pages_visited').sum().reset_index()

In [171]:
px.bar(data_temp,x='total_pages_visited',y='converted')

we can see that its a kind of a gaussian function, that's what we gonna check next

In [172]:
somme=data_temp['converted'].sum()
data_temp['convertion_prc']=data_temp['converted']/somme

In [173]:
mean= (data_temp['total_pages_visited']*data_temp['convertion_prc']).sum()

In [174]:
std=data_sample.loc[data_sample['converted']==1]['total_pages_visited'].std()

In [175]:
std

4.060008503273308

In [176]:
from scipy.stats import norm


x_axis = np.arange(0, 30, 0.1)


# Create figure
fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=x_axis,
        y=norm.pdf(x_axis,mean,std),
        name='Gaussian function with mean = {} and std = {}'.format(round(mean,2),round(std,2))
                ))

fig.add_trace(
    go.Bar(
        x=data_temp['total_pages_visited'],
        y=data_temp['convertion_prc'],
        name='Prc of converted sales - real values'
    )
)

# Set title
fig.update_layout(
    title_text="Prcentage of converted sales, by pages visited "
)

This confirms somehow our first deduction. Which mean's that most people visit in average 15 pages before buying

Let's verify now the convertion rate depending on the nb of visited pages

In [177]:
data_temp['converstion_rate']=data_temp['converted']/data_temp_0['converted']

In [178]:
px.line(data_temp,x='total_pages_visited',y='converstion_rate',)

As expected, more pages you visite more you're up to buy the product

And the function looks like a sigmoid, let's check it

In [179]:
def sigmoid(x,lam=1,mid=0):
    return 1/(1+np.exp(-lam*(x-mid)))


In [180]:
from scipy.stats import norm


x_axis = np.arange(0, 30, 0.1)


# Create figure
fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=x_axis,
        y=sigmoid(x_axis,0.7,13.5),
        name='Sigmoid function'
                ))

fig.add_trace(
    go.Scatter(
        x=data_temp['total_pages_visited'],
        y=data_temp['converstion_rate'],
        name='Convertion rate - real values'
    )
)

# Set title
fig.update_layout(
    title_text="Evolution of convertion rate with visited pages "
)

That confirms our suspisions !!

All that means that we can only use this function for our classification and it will certunly gives us good results. But since we're in a challenge, the goal here is to have the best result !! 

Let's now check the other variables

country	age	new_user	source

2. with *country*

In [181]:
data_temp_0=data_sample.groupby('country').count().reset_index()
data_temp=data_sample.groupby('country').sum().reset_index()

data_temp['nb_users']=data_temp_0['converted']
data_temp['converstion_rate']=data_temp['converted']/data_temp['nb_users']

px.bar(data_temp,x='country',y='converstion_rate',title='Convertion rate by nation')

In [182]:
px.bar(data_temp,x='country',y='nb_users',title='Nb of users by nation')

UK has the bigest convertion rate, but it's not really significant since their users numbers are quite low comparing to US or China

3. with *age*

In [183]:
data_temp_0=data_sample.groupby('age').count().reset_index()
data_temp=data_sample.groupby('age').sum().reset_index()

data_temp['nb_users']=data_temp_0['converted']
data_temp['converstion_rate']=data_temp['converted']/data_temp['nb_users']

px.bar(data_temp,x='age',y='converstion_rate',title='Convertion rate by age')

In [184]:
px.bar(data_temp,x='age',y='converted',title='Number of convertion by age')

In [185]:
px.bar(data_temp,x='age',y='nb_users',title='Nb of users by age')

We can only see here that the conversion rate is 0 for all users that have more than 49 years

4. with *new_user*

In [186]:
data_temp_0=data_sample.groupby('new_user').count().reset_index()
data_temp=data_sample.groupby('new_user').sum().reset_index()

data_temp['nb_users']=data_temp_0['converted']
data_temp['converstion_rate']=data_temp['converted']/data_temp['nb_users']

px.bar(data_temp,x='new_user',y='converstion_rate',title='Convertion rate of new_users')

We can see here that a new user has less chances to buy the product than an old user

5. with *source*

In [187]:
data_temp_0=data_sample.groupby('source').count().reset_index()
data_temp=data_sample.groupby('source').sum().reset_index()

data_temp['nb_users']=data_temp_0['converted']
data_temp['converstion_rate']=data_temp['converted']/data_temp['nb_users']

px.bar(data_temp,x='source',y='converstion_rate',title='Convertion rate depending on source')

In [188]:
px.bar(data_temp,x='source',y='nb_users',title='Nb of users from sources')

We can see here that there are more users comming from SEO, but the other sources are not to be neglected since Ads has the biggest convertion rate

# Make your model

## Choose variables to use in the model, and create train and test sets
**From the EDA, we know that all features have impact on the convertion rate with the most useful feature is total_pages_visited. Let's create a baseline model by using at first only this feature : in the next cells, we'll make preprocessings and train a logistic regression.**

In [189]:
features_list = ['country',	'age',	'new_user',	'source', 'total_pages_visited']
numeric_indices = [1,4]
categorical2_indices = [0,3]
categorical1_indices = [2]
target_variable = 'converted'

In [190]:
X = data.loc[:, features_list]
Y = data.loc[:, target_variable]

print('Explanatory variables : ', X.columns)
print()

Explanatory variables :  Index(['country', 'age', 'new_user', 'source', 'total_pages_visited'], dtype='object')



In [191]:
# Divide dataset Train set & Test set 
print("Dividing into train and test sets...")
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=0)
print("...Done.")
print()

Dividing into train and test sets...
...Done.



In [192]:
# Convert pandas DataFrames to numpy arrays before using scikit-learn
print("Convert pandas DataFrames to numpy arrays...")
X_train = X_train.values
X_test = X_test.values
Y_train = Y_train.values
Y_test = Y_test.values
print("...Done")

print(X_train[0:5,:])
print(X_test[0:2,:])
print()
print(Y_train[0:5])
print(Y_test[0:2])

Convert pandas DataFrames to numpy arrays...
...Done
[['China' 23 0 'Direct' 4]
 ['US' 25 1 'Direct' 8]
 ['US' 32 1 'Seo' 2]
 ['US' 37 1 'Seo' 3]
 ['UK' 24 1 'Direct' 4]]
[['US' 20 1 'Ads' 7]
 ['US' 31 1 'Seo' 5]]

[0 0 0 0 0]
[0 0]


## Training pipeline

In [193]:
categorical1_transformer = Pipeline(
    steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')), # missing values will be replaced by most frequent value
    ('encoder', OneHotEncoder(drop='first')) # first column will be dropped to avoid creating correlations between features
    ])

categorical2_transformer = Pipeline(
    steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')), # missing values will be replaced by most frequent value
    ('encoder', OneHotEncoder()) # first column will be dropped to avoid creating correlations between features
    ])

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')), # missing values will be replaced by columns' mean
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_indices),
        ('cat', categorical1_transformer, categorical1_indices),
        ('cat2', categorical2_transformer, categorical2_indices)
    ])

In [194]:
# Put here all the preprocessings
print("Encoding categorical features and standardizing numerical features...")
X_train = preprocessor.fit_transform(X_train)
print("...Done")
print(X_train[0:5,:])

Encoding categorical features and standardizing numerical features...
...Done
[[-0.91516278 -0.26070136  0.          1.          0.          0.
   0.          0.          1.          0.        ]
 [-0.67320988  0.93728655  1.          0.          0.          0.
   1.          0.          1.          0.        ]
 [ 0.17362526 -0.85969532  1.          0.          0.          0.
   1.          0.          0.          1.        ]
 [ 0.7785075  -0.56019834  1.          0.          0.          0.
   1.          0.          0.          1.        ]
 [-0.79418633 -0.26070136  1.          0.          0.          1.
   0.          0.          1.          0.        ]]


In [195]:
# Train model
print("Train model...")
classifier = LogisticRegression() # 
classifier.fit(X_train, Y_train)
print("...Done.")

Train model...
...Done.


In [196]:
# Predictions on training set
print("Predictions on training set...")
Y_train_pred = classifier.predict(X_train)
print("...Done.")
print(Y_train_pred)
print()

Predictions on training set...
...Done.
[0 0 0 ... 0 0 0]



## Test pipeline

In [197]:
# Use X_test, and the same preprocessings as in training pipeline, 
# but call "transform()" instead of "fit_transform" methods (see example below)

print("Encoding categorical features and standardizing numerical features...")

X_test = preprocessor.transform(X_test)
print("...Done")
print(X_test[0:5,:])

Encoding categorical features and standardizing numerical features...
...Done
[[-1.27809213  0.63778957  1.          0.          0.          0.
   1.          1.          0.          0.        ]
 [ 0.05264881  0.03879562  1.          0.          0.          0.
   1.          0.          0.          1.        ]
 [-0.31028053 -0.26070136  0.          0.          0.          0.
   1.          0.          0.          1.        ]
 [-0.67320988 -0.26070136  1.          0.          1.          0.
   0.          1.          0.          0.        ]
 [ 1.62534265  0.63778957  0.          0.          0.          0.
   1.          1.          0.          0.        ]]


In [198]:
# Predictions on test set
print("Predictions on test set...")
Y_test_pred = classifier.predict(X_test)
print("...Done.")
print(Y_test_pred)
print()

Predictions on test set...
...Done.
[0 0 0 ... 0 0 0]



## Performance assessment

In [199]:
# WARNING : Use the same score as the one that will be used by Kaggle !
# Here, the f1-score will be used to assess the performances on the leaderboard
print("f1-score on train set : ", f1_score(Y_train, Y_train_pred))
print("f1-score on test set : ", f1_score(Y_test, Y_test_pred))

f1-score on train set :  0.7648957632817752
f1-score on test set :  0.7568513119533526


In [200]:
# You can also check more performance metrics to better understand what your model is doing
print("Confusion matrix on train set : ")
print(confusion_matrix(Y_train, Y_train_pred))
print()
print("Confusion matrix on test set : ")
print(confusion_matrix(Y_test, Y_test_pred))
print()

Confusion matrix on train set : 
[[246939    960]
 [  2536   5687]]

Confusion matrix on test set : 
[[27392   109]
 [  308   649]]



**Our baseline model reaches a f1-score of almost 76%. Now, let's refine our model and try to beat this score ! 🚀🚀**

In [201]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Perform grid search
print("Grid search...")
# Grid of values to be tested
params = {
    "C":np.logspace(-3,3,7),
    "penalty":["l1","l2"]
}
gridsearch = GridSearchCV(classifier, param_grid = params, cv = 5) # cv : the number of folds to be used for CV
gridsearch.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch.best_params_)
print("Best accuracy : ", gridsearch.best_score_)

Grid search...




35 fits failed out of a total of 70.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
35 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Youness\anaconda3\envs\ipykernel_py2\lib\site-packages\sklearn\model_selection\_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Youness\anaconda3\envs\ipykernel_py2\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Youness\anaconda3\envs\ipykernel_py2\lib\site-packages\sklearn\linear_model\_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' o

...Done.
Best hyperparameters :  {'C': 10.0, 'penalty': 'l2'}
Best accuracy :  0.9863463476086849


In [202]:
# Predictions on training set
print("Predictions on training set...")
Y_train_pred = gridsearch.predict(X_train)
print("...Done.")
# Predictions on test set
print("Predictions on test set...")
Y_test_pred = gridsearch.predict(X_test)
print("...Done.")
print("f1-score on train set : ", f1_score(Y_train, Y_train_pred))
print("f1-score on test set : ", f1_score(Y_test, Y_test_pred))

Predictions on training set...
...Done.
Predictions on test set...
...Done.
f1-score on train set :  0.7649788178333669
f1-score on test set :  0.7568513119533526


**There is some slightly better results but not sufficient, let's use diffrent models !!**

In [203]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import operator
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestClassifier

In [204]:

#pipeline parameters
parameters = \
    [ \
        {
            'clf': [LogisticRegression()],
            'C':np.logspace(-3,3,7),
            'penalty':["l1","l2"]
        },

        #{
         #   'clf': [SVC()],
          #  'C': [0.001, 0.1, 1],
           # 'kernel': ['linear', 'rbf']
        #},

        {
            'clf': [DecisionTreeClassifier()],
            'max_depth': [4, 6, 8, 10, 15],
            'min_samples_leaf': [1, 2, 5],
            'min_samples_split': [2, 4, 8,16]
        },
        
        #{
         #   'clf': [XGBRegressor()],
          #  'max_depth': [3, 4, 10, 15],
        #}

        #{
         #   'clf': [RandomForestClassifier()],
          #  'max_depth': [2, 4, 6, 8, 10],
           # 'min_samples_leaf': [1, 2, 5],
           # 'min_samples_split': [2, 4, 8],
           # 'n_estimators': [10, 20, 40, 60, 80, 100]
        #}

        
    ]

#evaluating multiple classifiers
#based on pipeline parameters
#-------------------------------
result=[]

for params in parameters:

    #classifier
    clf = params['clf'][0]

    #getting arguments by
    #popping out classifier
    params.pop('clf')

    #cross validation using
    #Grid Search
    grid = GridSearchCV(clf, param_grid=params, cv=3, scoring= 'f1')
    grid.fit(X_train, Y_train)

    #storing result
    result.append\
    (
        {
            'grid': grid,
            'classifier': grid.best_estimator_,
            'best score': grid.best_score_,
            'best params': grid.best_params_,
            'cv': grid.cv
        }
    )

#sorting result by best score
result = sorted(result, key=operator.itemgetter('best score'),reverse=True)

#saving best classifier
grid = result[0]['grid']




21 fits failed out of a total of 42.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
21 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Youness\anaconda3\envs\ipykernel_py2\lib\site-packages\sklearn\model_selection\_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Youness\anaconda3\envs\ipykernel_py2\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Youness\anaconda3\envs\ipykernel_py2\lib\site-packages\sklearn\linear_model\_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' o

In [205]:
result

[{'grid': GridSearchCV(cv=3, estimator=LogisticRegression(),
               param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                           'penalty': ['l1', 'l2']},
               scoring='f1'),
  'classifier': LogisticRegression(C=10.0),
  'best score': 0.7642974816376572,
  'best params': {'C': 10.0, 'penalty': 'l2'},
  'cv': 3},
 {'grid': GridSearchCV(cv=3, estimator=DecisionTreeClassifier(),
               param_grid={'max_depth': [4, 6, 8, 10, 15],
                           'min_samples_leaf': [1, 2, 5],
                           'min_samples_split': [2, 4, 8, 16]},
               scoring='f1'),
  'classifier': DecisionTreeClassifier(max_depth=6, min_samples_split=16),
  'best score': 0.755780468710653,
  'best params': {'max_depth': 6,
   'min_samples_leaf': 1,
   'min_samples_split': 16},
  'cv': 3}]

In [206]:
result

[{'grid': GridSearchCV(cv=3, estimator=LogisticRegression(),
               param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                           'penalty': ['l1', 'l2']},
               scoring='f1'),
  'classifier': LogisticRegression(C=10.0),
  'best score': 0.7642974816376572,
  'best params': {'C': 10.0, 'penalty': 'l2'},
  'cv': 3},
 {'grid': GridSearchCV(cv=3, estimator=DecisionTreeClassifier(),
               param_grid={'max_depth': [4, 6, 8, 10, 15],
                           'min_samples_leaf': [1, 2, 5],
                           'min_samples_split': [2, 4, 8, 16]},
               scoring='f1'),
  'classifier': DecisionTreeClassifier(max_depth=6, min_samples_split=16),
  'best score': 0.755780468710653,
  'best params': {'max_depth': 6,
   'min_samples_leaf': 1,
   'min_samples_split': 16},
  'cv': 3}]

In [207]:
# Predictions on training set
print("Predictions on training set...")
Y_train_pred = grid.predict(X_train)
print("...Done.")
# Predictions on test set
print("Predictions on test set...")
Y_test_pred = grid.predict(X_test)
print("...Done.")
print("f1-score on train set : ", f1_score(Y_train, Y_train_pred))
print("f1-score on test set : ", f1_score(Y_test, Y_test_pred))

Predictions on training set...
...Done.
Predictions on test set...
...Done.
f1-score on train set :  0.7649788178333669
f1-score on test set :  0.7568513119533526


# Train best classifier on all data and use it to make predictions on X_without_labels
**Before making predictions on the file conversion_data_test.csv, let's train our model on ALL the data that was in conversion_data_train.csv. Sometimes, this allows to make tiny improvements in the score because we're using more examples to train the model.**

In [208]:
# Concatenate our train and test set to train your best classifier on all data with labels
X = np.append(X_train,X_test,axis=0)
Y = np.append(Y_train,Y_test)

grid.fit(X,Y)



21 fits failed out of a total of 42.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
21 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Youness\anaconda3\envs\ipykernel_py2\lib\site-packages\sklearn\model_selection\_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Youness\anaconda3\envs\ipykernel_py2\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Youness\anaconda3\envs\ipykernel_py2\lib\site-packages\sklearn\linear_model\_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' o

GridSearchCV(cv=3, estimator=LogisticRegression(),
             param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'penalty': ['l1', 'l2']},
             scoring='f1')

In [209]:
# Read data without labels
data_without_labels = pd.read_csv('Data/conversion_data_test.csv')
print('Prediction set (without labels) :', data_without_labels.shape)

# Warning : check consistency of features_list (must be the same than the features 
# used by your best classifier)
X_without_labels = data_without_labels

# Convert pandas DataFrames to numpy arrays before using scikit-learn
print("Convert pandas DataFrames to numpy arrays...")
X_without_labels = X_without_labels.values
print("...Done")

print(X_without_labels[0:5,:])

Prediction set (without labels) : (31620, 5)
Convert pandas DataFrames to numpy arrays...
...Done
[['UK' 28 0 'Seo' 16]
 ['UK' 22 1 'Direct' 5]
 ['China' 32 1 'Seo' 1]
 ['US' 32 1 'Ads' 6]
 ['China' 25 0 'Seo' 3]]


In [210]:
# WARNING : PUT HERE THE SAME PREPROCESSING AS FOR YOUR TEST SET
# CHECK YOU ARE USING X_without_labels
print("Encoding categorical features and standardizing numerical features...")

X_without_labels = preprocessor.transform(X_without_labels)
print("...Done")
print(X_without_labels[0:5,:])

Encoding categorical features and standardizing numerical features...
...Done
[[-0.31028053  3.33326238  0.          0.          0.          1.
   0.          0.          0.          1.        ]
 [-1.03613923  0.03879562  1.          0.          0.          1.
   0.          0.          1.          0.        ]
 [ 0.17362526 -1.15919229  1.          1.          0.          0.
   0.          0.          0.          1.        ]
 [ 0.17362526  0.3382926   1.          0.          0.          0.
   1.          1.          0.          0.        ]
 [-0.67320988 -0.56019834  0.          1.          0.          0.
   0.          0.          0.          1.        ]]


In [211]:
# Make predictions and dump to file
# WARNING : MAKE SURE THE FILE IS A CSV WITH ONE COLUMN NAMED 'converted' AND NO INDEX !
# WARNING : FILE NAME MUST HAVE FORMAT 'conversion_data_test_predictions_[name].csv'
# where [name] is the name of your team/model separated by a '-'
# For example : [name] = AURELIE-model1
data_pred = {
    'converted': grid.predict(X_without_labels)
}

Y_predictions = pd.DataFrame(columns=['converted'],data=data_pred)
Y_predictions.to_csv('conversion_data_test_predictions_Youness_Logistic_Reg.csv', index=False)


## Analyzing the coefficients and interpreting the result
**In this template, we just trained a model with only one feature (total_pages_visited), so there's no analysis to be done about the feature importance 🤔**

**Once you've included more features in your model, please take some time to analyze the model's parameters and try to find some lever for action to improve the newsletter's conversion rate 😎😎**

In [212]:
X = data.loc[:, features_list]


In [213]:
[features_list[i] for i in categorical2_indices]

['country', 'source']

In [219]:
Categorical2_coefs_var=[]
for category in [features_list[i] for i in categorical2_indices]:
    x=np.sort(X[category].unique())
    x=x[~pd.isna(x)]
    x=[category+'_'+str(i) for i in x]
    Categorical2_coefs_var+=x

In [220]:
coefs_labels=[features_list[i] for i in numeric_indices]+['New_user_1']+Categorical2_coefs_var

In [221]:
px.bar(x=coefs_labels,y=grid.best_estimator_.coef_[0])

We can see here that :
- It's better to invest in Germany, UK and US rather than China. 
- Focusing more on existing young users and gain more
- Making the pages more attractive, in order to lead the user to visit the most
