In [1]:
import pandas as pd
import numpy as np
import re
import time

import bs4 as bs4
import json

import glob
import tqdm

pd.set_option("max.columns", 131)

#https://strftime.org/
%matplotlib inline
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
df1 = pd.read_csv("dados/raw_data_with_labels.csv").dropna(subset=["y"])
df1.shape

(772, 21)

In [3]:
df2 = pd.read_csv("dados/active_labels1_done.csv", index_col=0).dropna(subset=["y"]).drop(columns=['p'])
df2.shape

(200, 21)

In [4]:
df_active_learning = pd.read_csv('dados/raw_data_with_labels_active_learning.csv').dropna(subset=["y"])
df_active_learning.shape

(1217, 21)

In [5]:
df = pd.concat([df1,df2, df_active_learning], axis=0)
df.shape

(2189, 21)

In [6]:
df.drop_duplicates(inplace=True)

In [7]:
df.duplicated().mean()

0.0

In [8]:
df.duplicated(['title']).mean()

0.06594488188976377

In [9]:
df.shape

(2032, 21)

In [10]:
df.reset_index(inplace=True, drop=True)

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

## 1. e 2. Limpeza dados

In [12]:
df_limpo = pd.DataFrame(index=df.index)
df_limpo['title'] = df['title']
df_limpo['date'] = pd.to_datetime(df['upload_date'], format='%Y-%m-%d')
df_limpo['views'] = df['view_count'].map(lambda x: 0 if x<0 else int(x))


## 3. Features

In [13]:
features = pd.DataFrame(index=df_limpo.index)
y = df['y'].copy()

In [14]:
features = pd.DataFrame(index=df_limpo.index)
data_extract = datetime.datetime.strptime('2020-11-21', "%Y-%m-%d")

features['tempo_desde_pub'] = (data_extract - df_limpo['date']) / np.timedelta64(1, 'D')
features['views'] = df_limpo['views']
features['views_por_dia'] = features['views'] / features['tempo_desde_pub']
features = features.drop(['tempo_desde_pub'], axis=1)

In [15]:
features.head()

Unnamed: 0,views,views_por_dia
0,379,94.75
1,124,20.666667
2,13,1.857143
3,655,72.777778
4,1,0.090909


In [16]:
data_corte = "2020-10-01"
mask_train = df_limpo['date'] < data_corte
mask_val = (df_limpo['date'] >= data_corte)

Xtrain, Xval = features[mask_train], features[mask_val]
ytrain, yval = y[mask_train], y[mask_val]
Xtrain.shape, Xval.shape, ytrain.shape, yval.shape

((1560, 2), (472, 2), (1560,), (472,))

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

title_train = df_limpo[mask_train]['title']
title_val = df_limpo[mask_val]['title']

title_vec = TfidfVectorizer(min_df=2, ngram_range=(1,4))
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)


intro to machine learning -> intro, to, machine, learning  -> ngram_range=(1,1)  
intro to machine learning -> intro, to, machine, learning, intro to, to machine, machine learning -> ngram_range=(1,2)  
intro to machine learning -> intro to, to machine, machine learning -> ngram_range=(2,2)  


In [18]:
title_bow_train.shape

(1560, 5141)

In [19]:
from scipy.sparse import hstack, vstack

Xtrain_wtitle = hstack([Xtrain, title_bow_train])
Xval_wtitle = hstack([Xval, title_bow_val])

In [20]:
Xtrain_wtitle.shape, Xval_wtitle.shape

((1560, 5143), (472, 5143))

# 4 Random Forest

In [21]:
# mdl = RandomForestClassifier(n_estimators=1000, random_state=0, min_samples_leaf=1, class_weight="balanced", n_jobs=6)
# mdl.fit(Xtrain_wtitle, ytrain)

In [22]:
mdl = RandomForestClassifier(class_weight='balanced', max_depth=2,
                                        max_features='log2', min_samples_leaf=3,
                                        min_samples_split=4, n_estimators=1000,
                                        n_jobs=6, random_state=123)
mdl.fit(Xtrain_wtitle, ytrain)

RandomForestClassifier(class_weight='balanced', max_depth=2,
                       max_features='log2', min_samples_leaf=3,
                       min_samples_split=4, n_estimators=1000, n_jobs=6,
                       random_state=123)

In [23]:
p = mdl.predict_proba(Xval_wtitle)[:, 1]

In [24]:
from sklearn.metrics import roc_auc_score, average_precision_score

In [25]:
average_precision_score(yval, p)

0.5993872676901968

In [26]:
roc_auc_score(yval, p)

0.7102074754513779

dados antigos

ap 0.5235463345585548, auc 0.665264593872563 - mindf=2

aumentando a validação

ap 0.5212442841200238, auc 0.6644575163398694 - mindf=1
ap 0.5301462501031342, auc 0.6810588235294117 - mindf=2

aumentando o treino mantendo validacao

ap 0.5437704978451068 auc 0.6669271869493549 - mindf=2

aumentando tudo

ap 0.5483320689705957 auc 0.6671111111111112 - mindf=2




#### RF ap 0.5993872676901968, auc 0.7102074754513779 - min_df=2, ngram_range=(1,4)  

In [28]:
from skopt import forest_minimize

In [29]:
def tune_rf(params):
    print(params)
    max_depth = params[0]
    max_features = params[1]
    min_samples_leaf = params[2]
    min_samples_split = params[3]
    n_estimators = params[4]
    
    min_df = 2#params[6]
    ngram_range = (1,4)#(1, params[7])
    
    title_vec = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range)
    title_bow_train = title_vec.fit_transform(title_train)
    title_bow_val = title_vec.transform(title_val)
    
    Xtrain_wtitle = hstack([Xtrain, title_bow_train])
    Xval_wtitle = hstack([Xval, title_bow_val])
    
    mdl = RandomForestClassifier(class_weight='balanced', max_depth=max_depth,
                                        max_features=max_features, min_samples_leaf=min_samples_leaf,
                                        min_samples_split=min_samples_split, n_estimators=n_estimators,
                                        n_jobs=6, random_state=123)
    mdl.fit(Xtrain_wtitle, ytrain)
    
    p = mdl.predict_proba(Xval_wtitle)[:, 1]
    
    print(roc_auc_score(yval, p))
    
    return -roc_auc_score(yval, p)


space = [(10, 1000), # max_depth
          ('auto', 'sqrt', 'log2'), # max_features
          (1, 10), # min_samples_leaf
          (2, 20), # min_samples_split
          (100, 2000)] # n_estimators
          #(1,5), # min_df
          #(1,5)] # ngram_range

res = forest_minimize(tune_rf, space, random_state=160745, n_random_starts=20, n_calls=50, verbose=1)

Iteration No: 1 started. Evaluating function at random point.
[868, 'sqrt', 5, 2, 834]
0.7070597085840988
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 1.4890
Function value obtained: -0.7071
Current minimum: -0.7071
Iteration No: 2 started. Evaluating function at random point.
[164, 'sqrt', 9, 3, 269]
0.6966166455495725
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.5107
Function value obtained: -0.6966
Current minimum: -0.7071
Iteration No: 3 started. Evaluating function at random point.
[714, 'log2', 6, 3, 1134]
0.7101579822616408
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 1.7553
Function value obtained: -0.7102
Current minimum: -0.7102
Iteration No: 4 started. Evaluating function at random point.
[901, 'auto', 6, 13, 954]
0.7033774152676591
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 1.6047
Function value obtained: -0.7034
Current minimum: -0.7102
Iteration No: 5 started. Evaluating functi

In [30]:
params = res.x
print(params)
max_depth = params[0]
max_features = params[1]
min_samples_leaf = params[2]
min_samples_split = params[3]
n_estimators = params[4]

min_df = 2#params[6]
ngram_range = (1,4)#(1, params[7])

title_vec = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range)
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)

Xtrain_wtitle = hstack([Xtrain, title_bow_train])
Xval_wtitle = hstack([Xval, title_bow_val])

mdl = RandomForestClassifier(class_weight='balanced', max_depth=max_depth,
                                        max_features=max_features, min_samples_leaf=min_samples_leaf,
                                        min_samples_split=min_samples_split, n_estimators=n_estimators,
                                        n_jobs=6, random_state=123)
mdl.fit(Xtrain_wtitle, ytrain)

p = mdl.predict_proba(Xval_wtitle)[:, 1]

average_precision_score(yval, p), roc_auc_score(yval, p)

[926, 'log2', 1, 2, 1193]


(0.6353631240364814, 0.7620367437440607)

# 5 LightGBM

In [31]:
from lightgbm import LGBMClassifier

In [32]:
mdl = LGBMClassifier(random_state=0, class_weight="balanced", n_jobs=6)
mdl.fit(Xtrain_wtitle, ytrain)

LGBMClassifier(class_weight='balanced', n_jobs=6, random_state=0)

In [33]:
p = mdl.predict_proba(Xval_wtitle)[:, 1]



In [34]:
average_precision_score(yval, p), roc_auc_score(yval, p)

(0.5847352924773831, 0.7459019638897688)

# 6 Bayesian Optimization

In [35]:
def tune_lgbm(params):
    print(params)
    lr = params[0]
    max_depth = params[1]
    min_child_samples = params[2]
    subsample = params[3]
    colsample_bytree = params[4]
    n_estimators = params[5]
    
    min_df = 2#params[6]
    ngram_range = (1,4)#(1, params[7])
    
    title_vec = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range)
    title_bow_train = title_vec.fit_transform(title_train)
    title_bow_val = title_vec.transform(title_val)
    
    Xtrain_wtitle = hstack([Xtrain, title_bow_train])
    Xval_wtitle = hstack([Xval, title_bow_val])
    
    mdl = LGBMClassifier(learning_rate=lr, num_leaves=2 ** max_depth, max_depth=max_depth, 
                         min_child_samples=min_child_samples, subsample=subsample,
                         colsample_bytree=colsample_bytree, bagging_freq=1,n_estimators=n_estimators, random_state=0, 
                         class_weight="balanced", n_jobs=6)
    mdl.fit(Xtrain_wtitle, ytrain)
    
    p = mdl.predict_proba(Xval_wtitle)[:, 1]
    
    print(roc_auc_score(yval, p))
    
    return -average_precision_score(yval, p)


space = [(1e-3, 1e-1, 'log-uniform'), # lr
          (1, 10), # max_depth
          (1, 20), # min_child_samples
          (0.05, 1.), # subsample
          (0.05, 1.), # colsample_bytree
          (100,1000),] # n_estimators
          #(1,5), # min_df
          #(1,5)] # ngram_range

res = forest_minimize(tune_lgbm, space, random_state=160745, n_random_starts=20, n_calls=50, verbose=1)

Iteration No: 1 started. Evaluating function at random point.
[0.009944912110647982, 5, 1, 0.4677107511929402, 0.49263223036174764, 272]
0.7364388660120368
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 2.5896
Function value obtained: -0.6156
Current minimum: -0.6156
Iteration No: 2 started. Evaluating function at random point.
[0.0010385556240017917, 2, 10, 0.14183771058242609, 0.7437489153990157, 249]
0.6637333702882483
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.2778
Function value obtained: -0.5731
Current minimum: -0.6156
Iteration No: 3 started. Evaluating function at random point.
[0.00209745522423282, 5, 6, 0.1541824778996655, 0.8682075103820793, 273]
0.7049414000633514
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.6779
Function value obtained: -0.5689
Current minimum: -0.6156
Iteration No: 4 started. Evaluating function at random point.
[0.016490254525097375, 9, 9, 0.6502182010234373, 0.6866210554187129, 82

In [36]:
res.x

[0.05669659554050318, 1, 6, 0.9046083457312004, 0.7251165239960375, 879]

### lr, max_depth, min_child_samples, subsample, colsample_bytree, n_estimators, min_df, ngram_range

In [37]:

params = res.x
print(params)
lr = params[0]
max_depth = params[1]
min_child_samples = params[2]
subsample = params[3]
colsample_bytree = params[4]
n_estimators = params[5]

min_df = 2#params[6]
ngram_range = (1,4)#(1, params[7])

title_vec = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range)
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)

Xtrain_wtitle = hstack([Xtrain, title_bow_train])
Xval_wtitle = hstack([Xval, title_bow_val])

mdl = LGBMClassifier(learning_rate=lr, num_leaves=2 ** max_depth, max_depth=max_depth, 
                        min_child_samples=min_child_samples, subsample=subsample,
                        colsample_bytree=colsample_bytree, bagging_freq=1,n_estimators=n_estimators, random_state=0, 
                        class_weight="balanced", n_jobs=6)
mdl.fit(Xtrain_wtitle, ytrain)

p = mdl.predict_proba(Xval_wtitle)[:, 1]

average_precision_score(yval, p), roc_auc_score(yval, p)

[0.05669659554050318, 1, 6, 0.9046083457312004, 0.7251165239960375, 879]


(0.664624318709879, 0.7800621634463097)

# 7 Logistic Reg

In [38]:
from sklearn.preprocessing import MaxAbsScaler, StandardScaler
from scipy.sparse import csr_matrix

In [39]:
Xtrain_wtitle2 = csr_matrix(Xtrain_wtitle.copy())
Xval_wtitle2 = csr_matrix(Xval_wtitle.copy())

scaler = StandardScaler()
# scaler = MaxAbsScaler()


Xtrain_wtitle2[:, :2] = scaler.fit_transform(Xtrain_wtitle2[:, :2].todense())
Xval_wtitle2[:, :2] = scaler.transform(Xval_wtitle2[:, :2].todense())

# Xtrain_wtitle2 = scaler.fit_transform(Xtrain_wtitle2)
# Xval_wtitle2 = scaler.transform(Xval_wtitle2)

  self._set_arrayXarray(i, j, x)


In [58]:
Xval_wtitle2.shape

(472, 2606)

In [40]:

mdl = LogisticRegression(C=10,n_jobs=6, random_state=0)
mdl.fit(Xtrain_wtitle2, ytrain)

LogisticRegression(C=10, n_jobs=6, random_state=0)

In [41]:
p = mdl.predict_proba(Xval_wtitle2)[:, 1]

In [42]:
average_precision_score(yval, p), roc_auc_score(yval, p)

(0.6863234204050065, 0.8000475134621476)

In [None]:
(0.6830179524739917, 0.7977510294583465) - sem tuning, standardscaler, C=10

(0.6573139874877728, 0.776805511561609) - sem tuning, maxabsscaler
(0.6284990574361318, 0.7407546721571111) - C=0.1, maxabsscaler

(0.6656405691163344, 0.7787456445993031) - C=0.5, maxabsscaler