In [2]:
import pandas as pd
import numpy as np
import re
import time

import bs4 as bs4
import json

import glob
import tqdm

# Use the fully qualified option name. pandas resolves option names as regex
# patterns, and the abbreviated "max.columns" is ambiguous on pandas >= 1.4
# (it also matches "styler.render.max_columns"), which raises OptionError.
pd.set_option("display.max_columns", 131)

from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.preprocessing import MaxAbsScaler, StandardScaler
from scipy.sparse import csr_matrix

from lightgbm import LGBMClassifier

#https://strftime.org/
%matplotlib inline
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [3]:
# Load the three labelled datasets, keeping only rows that have a label in "y".
df1 = pd.read_csv("dados/raw_data_with_labels.csv").dropna(subset=["y"])
print(df1.shape)
# This file stores its row labels in the first column and carries an extra "p"
# column, which is dropped before stacking.
df2 = pd.read_csv("dados/active_labels1_done.csv", index_col=0).dropna(subset=["y"]).drop(columns=['p'])
print(df2.shape)
df_active_learning = pd.read_csv('dados/raw_data_with_labels_active_learning.csv').dropna(subset=["y"])
print(df_active_learning.shape)
# Each file numbers its rows independently, so stacking them as-is produces
# repeated index labels. ignore_index=True gives every row a unique label, which
# keeps the index-aligned assignments and boolean masks used later unambiguous.
df = pd.concat([df1, df2, df_active_learning], axis=0, ignore_index=True)
print(df.shape)

(772, 21)
(200, 21)
(1217, 21)
(2189, 21)


In [4]:
# Share of fully duplicated rows before de-duplication ...
share_dup_before = df.duplicated().mean()
print(share_dup_before)

# ... drop them in place, then confirm that none remain.
df.drop_duplicates(inplace=True)
share_dup_after = df.duplicated().mean()
print(share_dup_after)

0.07172224760164458
0.0


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

## 1. e 2. Limpeza da data

In [6]:
# Cleaned view of the raw frame: title as-is, upload date parsed as a datetime
# (expects YYYY-MM-DD), and view counts cast to int with negatives floored to 0.
upload_dates = pd.to_datetime(df['upload_date'], format='%Y-%m-%d')
view_counts = df['view_count'].map(lambda v: int(v) if not v < 0 else 0)
df_limpo = df[['title']].assign(date=upload_dates, views=view_counts)

## 3. Features

In [7]:
# Target vector: an independent copy of the label column of `df`.
# `features` is not pre-created here: the next cell builds it from scratch, so
# an empty placeholder frame would just be overwritten.
y = df['y'].copy()

In [8]:
# Reference date used to compute each video's age in days.
# pd.Timestamp replaces datetime.datetime.strptime: `datetime` is never imported
# in this notebook and was only available as a side effect of `%pylab inline`.
data_extract = pd.Timestamp('2020-11-21')

# Age of each video in (fractional) days at the reference date. Kept as a local
# Series because it is only needed to derive the daily view rate.
tempo_desde_pub = (data_extract - df_limpo['date']) / np.timedelta64(1, 'D')

# Numeric features: raw view count and average views per day since upload.
# NOTE(review): a video dated exactly on `data_extract` has age 0, which makes
# the rate infinite (or NaN when views is also 0) — confirm none exist.
features = pd.DataFrame(index=df_limpo.index)
features['views'] = df_limpo['views']
features['views_por_dia'] = features['views'] / tempo_desde_pub

In [9]:
# Preview the first rows of the numeric feature frame.
features.head()

Unnamed: 0,views,views_por_dia
0,379,94.75
1,124,20.666667
2,13,1.857143
3,655,72.777778
4,1,0.090909


In [10]:
# Time-based split: videos uploaded before the cut-off date are used for
# training, videos uploaded on or after it for validation.
data_corte = "2020-10-01"
mask_train = df_limpo['date'] < data_corte
mask_val = df_limpo['date'] >= data_corte

Xtrain, ytrain = features.loc[mask_train], y.loc[mask_train]
Xval, yval = features.loc[mask_val], y.loc[mask_val]

# Display the shapes of the four splits.
(Xtrain.shape, Xval.shape, ytrain.shape, yval.shape)

((1560, 2), (472, 2), (1560,), (472,))

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Video titles for each side of the time split.
title_train = df_limpo.loc[mask_train, 'title']
title_val = df_limpo.loc[mask_val, 'title']

# TF-IDF over 1- to 4-grams that occur in at least two training titles. The
# vocabulary is learned on the training titles only and reused for validation.
title_vec = TfidfVectorizer(ngram_range=(1, 4), min_df=2)
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)


In [12]:
# (training titles, vocabulary size) of the TF-IDF matrix.
title_bow_train.shape

(1560, 5141)

In [13]:
from scipy.sparse import hstack, vstack

# Final design matrices: the numeric feature columns come first, followed by
# the TF-IDF title columns, stacked side by side as one sparse matrix per split.
train_blocks = [Xtrain, title_bow_train]
val_blocks = [Xval, title_bow_val]

Xtrain_wtitle = hstack(train_blocks)
Xval_wtitle = hstack(val_blocks)

In [14]:
# Shapes of the combined (numeric + TF-IDF) train and validation matrices.
Xtrain_wtitle.shape, Xval_wtitle.shape

((1560, 5143), (472, 5143))

# 4 RF

In [15]:
# Random-forest hyper-parameters, collected in one place for readability.
rf_params = dict(
    n_estimators=1193,
    max_depth=926,
    max_features='log2',
    min_samples_split=2,
    min_samples_leaf=1,
    class_weight='balanced',
    random_state=123,
    n_jobs=6,
)
mdl_rf = RandomForestClassifier(**rf_params)

# Fit on the combined numeric + title matrix; the fitted estimator is displayed.
mdl_rf.fit(Xtrain_wtitle, ytrain)

RandomForestClassifier(class_weight='balanced', max_depth=926,
                       max_features='log2', n_estimators=1193, n_jobs=6,
                       random_state=123)

In [16]:
# Validation scores from the random forest: column 1 of predict_proba, i.e. the
# probability of the second class in mdl_rf.classes_ (presumably y == 1 — confirm).
p_rf = mdl_rf.predict_proba(Xval_wtitle)[:, 1]

In [17]:
# Validation (average precision, ROC AUC) for the random forest.
average_precision_score(yval, p_rf), roc_auc_score(yval, p_rf)

(0.6353631240364814, 0.7620367437440607)

# 5 LGBM

In [18]:
# LightGBM hyper-parameters, in the order:
# learning rate, max depth, min child samples, subsample, colsample_bytree, n_estimators.
params = [0.05669659554050318, 1, 6, 0.9046083457312004, 0.7251165239960375, 879]
(lr, max_depth, min_child_samples,
 subsample, colsample_bytree, n_estimators) = params

# Title vectorizer settings.
min_df, ngram_range = 2, (1, 4)

# Re-fit the title vectorizer on the training titles and rebuild both design
# matrices (numeric columns first, TF-IDF columns after).
title_vec = TfidfVectorizer(ngram_range=ngram_range, min_df=min_df)
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)

Xtrain_wtitle = hstack([Xtrain, title_bow_train])
Xval_wtitle = hstack([Xval, title_bow_val])

# num_leaves is tied to max_depth so a tree can be at most a full binary tree
# of that depth.
lgbm_kwargs = dict(
    learning_rate=lr,
    num_leaves=2 ** max_depth,
    max_depth=max_depth,
    min_child_samples=min_child_samples,
    subsample=subsample,
    colsample_bytree=colsample_bytree,
    bagging_freq=1,
    n_estimators=n_estimators,
    random_state=0,
    class_weight="balanced",
    n_jobs=6,
)
mdl_lgbm = LGBMClassifier(**lgbm_kwargs)
mdl_lgbm.fit(Xtrain_wtitle, ytrain)

# Validation scores: column 1 of predict_proba.
p_lgbm = mdl_lgbm.predict_proba(Xval_wtitle)[:, 1]




In [19]:
# Validation (average precision, ROC AUC) for LightGBM.
average_precision_score(yval, p_lgbm), roc_auc_score(yval, p_lgbm)

(0.664624318709879, 0.7800621634463097)

# 7 Logistic Reg

In [20]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer

from sklearn import set_config
# Ask scikit-learn to display estimators/pipelines as diagrams in cell outputs.
set_config(display='diagram')

In [21]:
from sklearn.base import BaseEstimator, TransformerMixin
class DenseTransformer(TransformerMixin, BaseEstimator):
    """Stateless transformer that converts a sparse matrix to a dense array.

    Inherits from ``BaseEstimator`` so that ``get_params`` / ``set_params``
    exist, which scikit-learn needs to clone the step or display it inside a
    ``Pipeline``.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; return ``self`` so the step can be chained."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense ``numpy.ndarray``.

        ``toarray()`` is used instead of ``todense()``: the latter returns a
        ``numpy.matrix``, which scikit-learn >= 1.2 refuses as estimator input.
        Input that has no ``toarray`` method (already dense) is returned as-is
        instead of raising ``AttributeError``.
        """
        return X.toarray() if hasattr(X, "toarray") else X

In [32]:
# Earlier experiments scaled the two numeric columns by hand on a CSR copy of
# the matrices (StandardScaler / MaxAbsScaler); the scaling now happens inside
# the pipeline below, so the raw X*_wtitle matrices are fed to it directly.

# Column layout of X*_wtitle: columns 0-1 are the numeric features, every
# remaining column is a TF-IDF title feature.
numeric_cols = [0, 1]
title_cols = np.arange(2, Xtrain_wtitle.shape[1])

# Stage 1: densify the numeric columns. toarray() is used rather than todense()
# because todense() returns a numpy.matrix, which scikit-learn >= 1.2 rejects.
# The title columns go through a most-frequent imputer.
preprocessor_dense = ColumnTransformer(
    transformers=[
        ('to_dense', FunctionTransformer(lambda x: x.toarray(), accept_sparse=True), numeric_cols),
        ('rest', SimpleImputer(strategy='most_frequent'), title_cols)
        ])

# Stage 2: scale the numeric columns to unit variance without centring
# (with_mean=False); the title columns are imputed again, as before.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(with_mean=False), numeric_cols),
        ('rest', SimpleImputer(strategy='most_frequent'), title_cols)
        ])

lr_pipeline = Pipeline(steps=[('preprocessor', preprocessor_dense),
                              ('preprocessor2', preprocessor),
                              ('classifier', LogisticRegression(C=10, n_jobs=6, random_state=0))])

# hstack returns a COO matrix; ColumnTransformer needs column indexing, hence CSR.
lr_pipeline.fit(csr_matrix(Xtrain_wtitle), ytrain)

In [33]:
# Validation scores from the logistic-regression pipeline (column 1 of
# predict_proba); the matrix is converted to CSR exactly as it was for fitting.
p_lr = lr_pipeline.predict_proba(csr_matrix(Xval_wtitle))[:, 1]

In [34]:
# Validation (average precision, ROC AUC) for the logistic regression.
average_precision_score(yval, p_lr), roc_auc_score(yval, p_lr)

(0.686366624650308, 0.7998099461514095)

# 8 Ensemble

(0.6353631240364814, 0.7620367437440607) RF  
(0.664624318709879,  0.7800621634463097) LGBM  
(0.686366624650308,  0.7998099461514095) LR  

In [25]:
# Unweighted blend: the element-wise mean of the three models' validation scores.
blend_total = p_lr + p_rf + p_lgbm
p = blend_total / 3

# Validation (average precision, ROC AUC) of the blend.
(average_precision_score(yval, p), roc_auc_score(yval, p))

(0.6710924617955591, 0.7940093443142223)

In [26]:
# Pairwise correlation (pandas default: Pearson) between the three models'
# validation scores.
pd.DataFrame({"LR": p_lr, "RF": p_rf, "LGBM": p_lgbm}).corr()

Unnamed: 0,LR,RF,LGBM
LR,1.0,0.903238,0.896303
RF,0.903238,1.0,0.891817
LGBM,0.896303,0.891817,1.0


In [38]:
# Weighted blend; the random forest currently gets zero weight.
w_rf, w_lr, w_lgbm = 0.0, 0.7, 0.3
p = w_rf * p_rf + w_lr * p_lr + w_lgbm * p_lgbm

# Validation (average precision, ROC AUC) of the weighted blend.
(average_precision_score(yval, p), roc_auc_score(yval, p))

(0.6857993789397483, 0.8018094710167881)

(0.23117553771909904, 0.6964675355310491) - 0.5/0.5  
(0.23866391160240463, 0.6962906174441233) - 0.4/0.6  
(0.2449271153955049, 0.6967329126614378) - 0.3/0.7  
(0.24568903874837777, 0.6967329126614378) - 0.2/0.8  

(0.24567146005469367, 0.6897151618800496) - 0.3/0.7 - lgbm ngram 1,3
(0.24809974466463763, 0.690865129445067) - 0.4/0.6 

In [None]:
# TODO: reduce the complexity of the vectorizer

# 9 Salvar modelos

In [42]:
import joblib as jb
import dill

In [56]:
# Persist the fitted LightGBM and random-forest models with joblib.
jb.dump(mdl_lgbm, "modelos/lgbm_20200208.pkl.z")
jb.dump(mdl_rf, "modelos/random_forest_20200208.pkl.z")
# The logistic-regression pipeline is written with dill instead of joblib
# (presumably because it contains a lambda inside a FunctionTransformer, which
# the standard pickler cannot serialize — TODO confirm). The file is written by
# dill.dump directly, so it must be read back with dill.load, not joblib.load.
# jb.dump(lr_pipeline, "logistic_reg_20200208.pkl.z")
with open('modelos/logistic_reg_20200208.pkl.z', 'wb') as file:
    dill.dump(lr_pipeline, file)
# The title vectorizer is saved as well, and jb.dump's return value (the list
# of written file names) is the cell output.
jb.dump(title_vec, "modelos/title_vectorizer_20200208.pkl.z")

['title_vectorizer_20200208.pkl.z']

Para ler o modelo salvo com dill


In [58]:

# Read the pipeline back from the same path it was written to: the save cell
# stores it under "modelos/", so the bare file name would not be found.
# SECURITY: dill.load can execute arbitrary code, so only load files you created.
with open('modelos/logistic_reg_20200208.pkl.z', 'rb') as file:
    B = dill.load(file)

In [59]:
# Sanity check of the reloaded pipeline on the validation matrix. The pipeline
# scales the numeric columns itself, so it receives the same CSR input used for
# `p_lr`. `Xval_wtitle2` only exists in commented-out code and would raise
# NameError on a fresh kernel.
B.predict_proba(csr_matrix(Xval_wtitle))[:, 1]

array([5.96883043e-01, 1.08192292e-02, 6.02835793e-04, 5.49007304e-01,
       7.45698556e-01, 1.40884878e-02, 4.80221532e-02, 5.42009012e-01,
       2.96478971e-02, 5.72679088e-02, 1.46224944e-01, 1.79306404e-01,
       1.33632700e-01, 4.08135535e-01, 7.81327923e-02, 4.07242989e-01,
       1.71316796e-02, 9.25540929e-01, 1.07878515e-01, 1.35807860e-01,
       2.68446090e-02, 2.90589270e-02, 9.17312671e-02, 6.06504979e-01,
       5.84671266e-02, 4.71346620e-02, 5.96883043e-01, 5.21369359e-03,
       6.92011029e-03, 5.68193844e-03, 1.88721086e-01, 1.68708994e-02,
       3.47757072e-01, 2.85456202e-01, 2.67362611e-01, 3.30041317e-01,
       1.79728314e-01, 5.15253672e-01, 5.96079152e-01, 7.82803569e-01,
       3.35254690e-02, 1.69067985e-01, 4.61902562e-02, 9.30660993e-02,
       2.63286180e-01, 4.15470977e-01, 6.43751204e-01, 3.93959900e-01,
       2.23635881e-01, 7.38869137e-02, 3.80702295e-01, 3.97441478e-02,
       1.76530062e-01, 1.76017290e-01, 5.48989295e-02, 5.28894779e-01,
      