In [1]:
import os
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


Mounted at /content/drive


In [2]:
os.chdir('/content/drive/MyDrive/111_goldfinger/data_modeling')

In [None]:
!pip install shapash

In [None]:
!pip install matplotlib==3.4
!pip install sklearn


In [4]:

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore") # to avoid deprecation warnings
import sys


#Graph libraries

import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import seaborn as sns
%matplotlib inline


#Preprocessing libraries

from sklearn.model_selection import train_test_split


#Model Selection

from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestClassifier


from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score


import sklearn.neighbors._base
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base



In [5]:
#Loading dataset
df = pd.read_csv('/content/drive/MyDrive/111_goldfinger/data_modeling/tech.csv')

In [6]:
df.sample()

Unnamed: 0.1,Unnamed: 0,content_len,title_len_char,h1_len,nb_h2,nb_h3,nb_links,has_canonical,is_top_ten,content_score,title_score,technologies,majestic,mjrank,a_rank,ref_sn,ref_ip,spend
59908,59908,8789,86,7,1,3,0,1,1,0.82397,45.454545,109.0,300384.0,303776.0,626843.0,698.0,846.0,758.0


In [7]:
df = df.drop(columns=['Unnamed: 0'])

In [8]:
df = df[['content_len', 'title_len_char', 'h1_len', 'nb_h3', 'nb_links',
       'has_canonical', 'is_top_ten', 'content_score', 'title_score',
      'ref_sn']]

In [9]:
df.dropna(subset=['ref_sn'], inplace=True)

In [10]:
df = df[df.ref_sn != 0]

# **Part II Preprocessing & model selection**

# Pipelines

Splitting features & target

In [11]:
target = 'is_top_ten'

X = df.loc[:, df.columns != target]
Y = df.loc[:,target] 

Y = Y.apply(lambda x: float(x[1:]) if type(x)==str else x)

#Split the data into a train set and test set 
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

print("...train test split Done !")

...train test split Done !


# **PART II : Model fitting**

In [12]:
#Instanciate models

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score

model = RandomForestClassifier(max_depth= 15, 
                                  n_estimators= 300
                                  ,max_leaf_nodes=1024)
model.fit(X_train, Y_train)
Y_train_pred    = model.predict(X_train)
Y_test_pred     = model.predict(X_test)
score           = model.score(X_test, Y_test)
accuracy_test   = accuracy_score(Y_test, Y_test_pred)
recall_test     = recall_score(Y_test, Y_test_pred)
f1_score_test   = f1_score(Y_test, Y_test_pred)
print('model : ', model, ' - score : ', score,  'accuracy_test : ', accuracy_test, 'recall : ', recall_test, 'f1_score_test' , f1_score_test)

model :  RandomForestClassifier(max_depth=15, max_leaf_nodes=1024, n_estimators=300)  - score :  0.7272326593576112 accuracy_test :  0.7272326593576112 recall :  0.5770042194092827 f1_score_test 0.6386922571699627


# Understand my model with shapash

Declare and Compile SmartExplainer

In [13]:
from shapash.explainer.smart_explainer import SmartExplainer

In [14]:
xpl = SmartExplainer()

In [16]:
Y_pred = pd.DataFrame(model.fit(X_train, Y_train).predict(X_test),columns=['pred'],index=X_test.index)

In [17]:
xpl.compile(
    x       = X_test,
    model   = model.fit(X_train, Y_train),
    y_pred  = Y_pred
            )


Backend: Shap TreeExplainer


In [18]:
xpl.plot.features_importance()

**Understand how a feature contributes**

In [19]:
xpl.plot.contribution_plot("content_len")

In [22]:
xpl.plot.contribution_plot('nb_links')

In [20]:
xpl.plot.contribution_plot('ref_sn')

In [57]:
xpl.add(y_pred=Y_pred)
xpl.plot.contribution_plot(col='content_score')


In [58]:
xpl.plot.top_interactions_plot(nb_top_interactions=5)