# Carregando as bibliotecas

In [1]:
import pandas as pd
import numpy as np
import re
import time
import json

#tratamento dos dados
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from scipy.sparse import vstack

#modelagem
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

#avaliação
from sklearn.metrics import roc_auc_score, average_precision_score

# Show up to 150 columns when displaying a DataFrame. The full option key is
# used because abbreviated keys are resolved by regex and become ambiguous
# (OptionError) once more than one registered option matches the pattern.
pd.set_option("display.max_columns", 150)
#%matplotlib inline
#%pylab inline

# Carregando os dados

In [2]:
# Load the labelled dataset (semicolon-separated, first column used as index)
# and keep only the rows whose label column `p` is filled in.
df = (
    pd.read_csv("raw_data_com_labels.csv", index_col=0, delimiter=';')
    .loc[lambda frame: frame['p'].notnull()]
)
df.shape

(1549, 15)

# Criando um DF limpo para ser usado no modelo

In [3]:
# Start an empty frame that shares df's index; the cleaned columns are added
# to it one cell at a time below.
df_limpo = pd.DataFrame(index=df.index)

In [4]:
# Only the index exists at this point, so the preview has no columns.
df_limpo.head()

0
1
2
3
4


In [5]:
# Show the dtype of every raw column.
df.dtypes

title               object
p                    int64
view_count           int64
description         object
uploader            object
upload_date         object
categories          object
tags                object
duration           float64
webpage_url         object
like_count         float64
average_rating      object
query               object
tempo_desde_pub    float64
thumbnail           object
dtype: object

In [6]:
# Copy the upload date over unchanged (no parsing or casting is done here).
df_limpo['upload_date'] = df['upload_date']

In [7]:
# Preview the clean frame after adding upload_date.
df_limpo.head()

Unnamed: 0,upload_date
0,2021-01-01
1,2021-01-01
2,2021-01-01
3,2021-01-01
4,2021-01-01


In [8]:
# View counts are copied as they are in the raw frame.
df_limpo['view_count'] = df['view_count']
df_limpo.head()

Unnamed: 0,upload_date,view_count
0,2021-01-01,128
1,2021-01-01,1
2,2021-01-01,1
3,2021-01-01,13
4,2021-01-01,98


In [9]:
# Missing durations are replaced by 0 before the cast, since NaN cannot be
# converted to an integer.
df_limpo['duration'] = df['duration'].fillna(0).astype('int')
df_limpo.head()

Unnamed: 0,upload_date,view_count,duration
0,2021-01-01,128,252
1,2021-01-01,1,102
2,2021-01-01,1,342
3,2021-01-01,13,1004
4,2021-01-01,98,3348


In [10]:
# Same treatment as duration: missing like counts become 0, then cast to int.
df_limpo['like_count'] = df['like_count'].fillna(0).astype('int')
df_limpo.head()

Unnamed: 0,upload_date,view_count,duration,like_count
0,2021-01-01,128,252,29
1,2021-01-01,1,102,3
2,2021-01-01,1,342,0
3,2021-01-01,13,1004,2
4,2021-01-01,98,3348,12


In [11]:
# tempo_desde_pub ("time since publication"): missing values become 0, then
# cast to int. The title text is copied unchanged.
df_limpo['tempo_desde_pub'] = df['tempo_desde_pub'].fillna(0).astype('int')
df_limpo['title'] = df['title']
df_limpo.head()

Unnamed: 0,upload_date,view_count,duration,like_count,tempo_desde_pub,title
0,2021-01-01,128,252,29,28853,Introduction to Machine Learning | Ù…Ù‚Ø¯Ù…Ø© ...
1,2021-01-01,1,102,3,28853,Zero Python & Machine Learning Experience to D...
2,2021-01-01,1,342,0,28853,Mario I/O Machine Learning For Games
3,2021-01-01,13,1004,2,28853,Model Evaluation Tutorial | Machine Learning (...
4,2021-01-01,98,3348,12,28853,Online Certified Winter Training in Machine Le...


# Features

In [12]:
# Model inputs are assembled in `features`, which shares df_limpo's index.
features = pd.DataFrame(index=df_limpo.index)
# Target: an independent copy of the label column p.
y = df['p'].copy()

In [13]:
# Number of days between each upload date and the fixed reference date
# 2099-12-31. Display only: nothing is assigned in this cell.
(pd.to_datetime("2099-12-31")- pd.to_datetime(df_limpo["upload_date"])) / np.timedelta64(1,'D')

0       28853.0
1       28853.0
2       28853.0
3       28853.0
4       28853.0
         ...   
1544    29538.0
1545    29541.0
1546    29548.0
1547    29548.0
1548    29549.0
Name: upload_date, Length: 1549, dtype: float64

In [14]:
# Age of each video in days, measured against the fixed reference date
# 2099-12-31. It only serves as the denominator of views_por_dia and is not
# kept as a feature column.
dias_desde_pub = (
    pd.to_datetime("2099-12-31") - pd.to_datetime(df_limpo["upload_date"])
) / np.timedelta64(1, 'D')

features['views'] = df_limpo['view_count']
features['views_por_dia'] = features['views'] / dias_desde_pub
features['like_count'] = df_limpo['like_count']
features['duration'] = df_limpo['duration']
features.head()

Unnamed: 0,views,views_por_dia,like_count,duration
0,128,0.004436,29,252
1,1,3.5e-05,3,102
2,1,3.5e-05,0,342
3,13,0.000451,2,1004
4,98,0.003397,12,3348


In [15]:
# Check that every feature column is numeric.
features.dtypes

views              int64
views_por_dia    float64
like_count         int32
duration           int32
dtype: object

# Transformar string em numero

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Time-based split: uploads before the cutoff go to training, uploads on or
# after it go to validation. The comparison is applied to upload_date exactly
# as stored in df_limpo (no datetime conversion is done here).
data_corte = '2020-06-15'
mask_train = df_limpo['upload_date'] < data_corte
mask_val = df_limpo['upload_date'] >= data_corte

Xtrain = features[mask_train]
Xval = features[mask_val]
ytrain = y[mask_train]
yval = y[mask_val]
Xtrain.shape, Xval.shape, ytrain.shape, yval.shape

((758, 4), (791, 4), (758,), (791,))

In [17]:
# Display the training features (the notebook truncates the middle rows).
Xtrain

Unnamed: 0,views,views_por_dia,like_count,duration
280,16,0.000551,437,776
281,260,0.008948,41,262
282,25,0.000860,107,320
283,1,0.000034,110,1489
284,469,0.016137,25,700
...,...,...,...,...
1544,97,0.003284,6,3042
1545,1,0.000034,37,3364
1546,1,0.000034,0,1185
1547,1,0.000034,28,4019


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Titles for each split, selected with the same masks as Xtrain / Xval so the
# rows stay aligned.
title_train = df_limpo.loc[mask_train, 'title']
title_val = df_limpo.loc[mask_val, 'title']

# The two knobs to tune on the TfidfVectorizer are ngram_range and min_df.
title_vec = TfidfVectorizer(ngram_range=(1, 4), min_df=2)

# The vocabulary is learned from the training titles only; the validation
# titles are projected onto that same vocabulary.
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)

title_bow_train.shape, title_bow_val.shape

((758, 2362), (791, 2362))

In [19]:
from scipy.sparse import hstack, vstack

# Column layout of the combined matrices: the numeric feature columns come
# first, followed by the TF-IDF title columns.
Xtrain_wtitle = hstack((Xtrain, title_bow_train))
Xval_wtitle = hstack((Xval, title_bow_val))
Xtrain_wtitle.shape, Xval_wtitle.shape

((758, 2366), (791, 2366))

In [31]:
# NOTE(review): scratch cell. It stacks the four feature columns into a dense
# array (one row per feature) and checks its type. `A` is not referenced by
# any other cell visible in this notebook, and the execution count is out of
# sequence; confirm it can be removed.
A=np.array([features['views'], features['views_por_dia'], features['like_count'], features['duration']])
type(A)

numpy.ndarray

In [20]:
# Random forest trained on the numeric + TF-IDF title matrix.
# random_state is fixed so repeated runs build the same forest.
mdl_rf = RandomForestClassifier(
    n_estimators=1000,
    min_samples_leaf=1,
    class_weight='balanced',
    random_state=0,
    n_jobs=8,
)
mdl_rf.fit(Xtrain_wtitle, ytrain)

RandomForestClassifier(class_weight='balanced', n_estimators=1000, n_jobs=8,
                       random_state=0)

In [21]:
# Keep only the second column of predict_proba: the predicted probability of
# the second class (presumably p == 1; verify against mdl_rf.classes_).
p_rf = mdl_rf.predict_proba(Xval_wtitle)[:, 1]

In [22]:
# Validation scores of the random forest: ROC AUC and average precision.
auc = roc_auc_score(yval, p_rf)
ap = average_precision_score(yval, p_rf)
print("ap {0}, auc {1}".format(ap, auc))

ap 0.6266115544959638, auc 0.6455478136127264


In [None]:
# RF
# ap 0.5947022576368816, auc 0.6213304044880499 - mindf=2 | min_samples_leaf=1
# ap 0.5900455914171499, auc 0.6176255859825293 - mindf=2 | min_samples_leaf=2 (nao ficou bom, volta min_samples_leaf para 1)
# ap 0.5918350690775017, auc 0.6202064707841279 - mindf=1 | min_samples_leaf=1 (nao ficou bom, com mindf=2 é melhor)
# ap 0.5933163862452793, auc 0.6136549939800702 - mindf=3 | min_samples_leaf=1 (nao ficou bom, com mindf=2 é melhor)
# 
# ap 0.6126405596292688, auc 0.6406326049645208 - alterando o ngram_range(1,2)
# ap 0.6228697044180295, auc 0.642371340010759 - ngram=1,3
# ap 0.6266115544959638, auc 0.6455478136127264 - ngram=1,4 (este foi melhor)
# ap 0.621918956000376, auc 0.6452115941286471 - ngram=1,5 (começou a ficar ruim)


# LightGBM e Bayesian Optimization

In [None]:
# !pip install lightgbm

In [23]:
from lightgbm import LGBMClassifier

In [24]:
# Baseline LightGBM: only the seed, class weighting and thread count are set.
# It is trained on the same matrix as the random forest.
mdl_lgbm = LGBMClassifier(random_state=0, class_weight="balanced", n_jobs=8)
mdl_lgbm.fit(Xtrain_wtitle, ytrain)

LGBMClassifier(class_weight='balanced', n_jobs=8, random_state=0)

In [25]:
# Second column of predict_proba, same convention as used for p_rf.
p_lgbm = mdl_lgbm.predict_proba(Xval_wtitle)[:, 1]



In [26]:
# Validation scores of the untuned LightGBM model.
auc = roc_auc_score(yval, p_lgbm)
ap = average_precision_score(yval, p_lgbm)
print("ap {0}, auc {1}".format(ap, auc))

ap 0.5431495671105874, auc 0.5550055075953583


In [None]:
#sem tunar nao vence a RF
# ap 0.6266115544959638, auc 0.6455478136127264 - ngram=1,4 (este foi melhor)

In [None]:
!pip install scikit-optimize

In [None]:
from skopt import forest_minimize

In [None]:
def tune_lgbm(params):
    """Objective for the hyper-parameter search.

    ``params`` holds eight values, in this order: learning rate, max_depth,
    min_child_samples, subsample, colsample_bytree, n_estimators, the TF-IDF
    ``min_df`` and the upper bound of the TF-IDF n-gram range.

    The function rebuilds the title TF-IDF matrices with the given vectorizer
    settings, trains a LightGBM classifier and scores it on the validation
    split. It reads the notebook-level ``title_train``, ``title_val``,
    ``Xtrain``, ``Xval``, ``ytrain`` and ``yval``; everything it builds stays
    local to the call.

    Prints ``params`` and the validation ROC AUC, and returns the NEGATIVE
    validation average precision (the optimiser minimises its objective).
    """
    print(params)
    (lr, max_depth, min_child_samples, subsample,
     colsample_bytree, n_estimators, min_df, max_ngram) = params[:8]

    # Text features: vocabulary fitted on the training titles only.
    vectorizer = TfidfVectorizer(min_df=min_df, ngram_range=(1, max_ngram))
    bow_train = vectorizer.fit_transform(title_train)
    bow_val = vectorizer.transform(title_val)

    # Numeric columns first, TF-IDF columns after them.
    X_fit = hstack([Xtrain, bow_train])
    X_eval = hstack([Xval, bow_val])

    model = LGBMClassifier(
        learning_rate=lr,
        num_leaves=2 ** max_depth,  # leaf budget tied to the depth limit
        max_depth=max_depth,
        min_child_samples=min_child_samples,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        bagging_freq=1,
        n_estimators=n_estimators,
        random_state=0,
        class_weight="balanced",
        n_jobs=8,
    )
    model.fit(X_fit, ytrain)

    proba = model.predict_proba(X_eval)[:, 1]

    print(roc_auc_score(yval, proba))

    return -average_precision_score(yval, proba)


# Search space, one entry per position of the `params` list that tune_lgbm
# receives (the order here must match the indexing inside tune_lgbm).
space = [(1e-3, 1e-1, 'log-uniform'), # lr, sampled on a log scale
          (1, 10), # max_depth
          (1, 20), # min_child_samples
          (0.05, 1.), # subsample
          (0.05, 1.), # colsample_bytree
          (100,1000), # n_estimators
          (1,5), # min_df of the title TfidfVectorizer
          (1,5)] # upper bound of the TfidfVectorizer ngram_range

# 50 evaluations of tune_lgbm in total, the first 20 at random points; the
# seed is fixed so the search can be repeated.
res = forest_minimize(tune_lgbm, space, random_state=160745, n_random_starts=20, n_calls=50, verbose=1)

In [None]:
# ap = 0.6101 auc = 0.6199951328227066

#[0.02040968500392374, 10, 10, 0.8065127710023762, 0.1427989294161636, 730, 3, 5]


In [None]:
# Best parameter list found by the search, in the same order as `space`.
res.x

In [28]:
# Hyper-parameters written out by hand (the same values recorded in the note
# after the search), in the same order as `space`.
params = [
    0.02040968500392374,  # learning rate
    10,                   # max_depth
    10,                   # min_child_samples
    0.8065127710023762,   # subsample
    0.1427989294161636,   # colsample_bytree
    730,                  # n_estimators
    3,                    # min_df
    5,                    # upper bound of ngram_range
]
lr, max_depth, min_child_samples, subsample, colsample_bytree, n_estimators = params[:6]

min_df = params[6]
ngram_range = (1, params[7])

# NOTE: the TF-IDF matrices are NOT rebuilt with min_df / ngram_range in this
# cell (that step is disabled), so those two values are not applied. The model
# below is trained on the existing Xtrain_wtitle / Xval_wtitle, i.e. on the
# features produced by the notebook-level title_vec.
# To re-enable, refit a TfidfVectorizer(min_df=min_df, ngram_range=ngram_range)
# on title_train, transform title_val, and hstack both with Xtrain / Xval.

mdl_lgbm = LGBMClassifier(
    learning_rate=lr,
    num_leaves=2 ** max_depth,
    max_depth=max_depth,
    min_child_samples=min_child_samples,
    subsample=subsample,
    colsample_bytree=colsample_bytree,
    bagging_freq=1,
    n_estimators=n_estimators,
    random_state=0,
    class_weight="balanced",
    n_jobs=6,
)
mdl_lgbm.fit(Xtrain_wtitle, ytrain)

p_lgbm = mdl_lgbm.predict_proba(Xval_wtitle)[:, 1]






In [29]:
# Validation scores of the LightGBM model refit with the chosen parameters.
auc = roc_auc_score(yval, p_lgbm)
ap = average_precision_score(yval, p_lgbm)
print("ap {0}, auc {1}".format(ap, auc))

ap 0.5703275895680011, auc 0.5935330583805107


# Logistic Regression

In [None]:
from sklearn.preprocessing import MaxAbsScaler, StandardScaler
from scipy.sparse import csr_matrix

In [None]:
# Logistic regression is sensitive to feature scale, so every column (numeric
# features and TF-IDF terms alike) is divided by its maximum absolute value.
# StandardScaler was the alternative tried for this step.
scaler = MaxAbsScaler()

# Work on CSR copies so the matrices used by the tree models stay untouched.
# The scaler is fitted on the training matrix only and reused for validation.
Xtrain_wtitle2 = scaler.fit_transform(csr_matrix(Xtrain_wtitle.copy()))
Xval_wtitle2 = scaler.transform(csr_matrix(Xval_wtitle.copy()))

In [None]:
# Scaling must not change the shape of the validation matrix.
Xval_wtitle2.shape

In [None]:
# Logistic regression on the max-abs-scaled matrix; C is the value being
# compared by hand in the results note below.
mdl = LogisticRegression(C=1,n_jobs=8, random_state=0)
mdl.fit(Xtrain_wtitle2, ytrain)

In [None]:
# Second column of predict_proba on the scaled validation matrix.
p = mdl.predict_proba(Xval_wtitle2)[:, 1]

In [None]:
# (average precision, ROC AUC) of the logistic regression on validation.
average_precision_score(yval, p), roc_auc_score(yval, p)

In [None]:
### c = 0.5
# standard = (0.5231246471114438, 0.5451110484924558)
# max      = (0.6257356371363406, 0.6511834925839588)

#C=1 (este foi melhor)
#(0.6283906364135851, 0.6527333043010477)

#c=10
# (0.6216164878389561, 0.6465980992391833)

In [None]:
from sklearn.pipeline import make_pipeline

In [None]:
lr_pipeline = make_pipeline(MaxAbsScaler(), LogisticRegression(C=1, penalty='l2',n_jobs=8, random_state=0))
lr_pipeline.fit(Xtrain_wtitle2, ytrain)

In [None]:
p_lr = lr_pipeline.predict_proba(Xval_wtitle2)[:, 1]

In [None]:
# (average precision, ROC AUC) of the logistic-regression pipeline.
average_precision_score(yval, p_lr), roc_auc_score(yval, p_lr)

In [None]:
# Equal-weight average of the three models' probabilities. This rebinds `p`,
# which until here held the plain logistic-regression probabilities.
p = (p_lr + p_rf + p_lgbm)/3
average_precision_score(yval, p), roc_auc_score(yval, p)

In [None]:
# Pairwise correlation between the three models' validation predictions.
pd.DataFrame({"LR": p_lr, "RF": p_rf, "LGBM": p_lgbm}).corr()

In [None]:
# (average precision, ROC AUC) pair for each individual model, computed in the
# order LR, RF, LGBM.
p1, p2, p3 = (
    (average_precision_score(yval, probs), roc_auc_score(yval, probs))
    for probs in (p_lr, p_rf, p_lgbm)
)
print("LR = {}".format(p1))
print("RF = {}".format(p2))
print("LGBM = {}".format(p3))

In [None]:
# Weighted blend of two models: 20% random forest, 80% LightGBM.
# NOTE(review): the ratios in the results note below do not say which model
# each number refers to; confirm they match these weights.
p = 0.2*p_rf + 0.8*p_lgbm
average_precision_score(yval, p), roc_auc_score(yval, p)

In [None]:
#(0.624290020880282, 0.6395406921638446) 50/50
#(0.6239057753147723, 0.6357686297615084) 40/60 (pior)
#(0.6268357416439372, 0.642422573455952) 60/40 (melhorou)
#(0.6279119062688748, 0.6449522248123576) 70/30 (melhor ainda)
#(0.6282607174657145, 0.6468094372006046) 80/20 (melhorou mais ainda)
#(0.6269306882398227, 0.6465340574326921) 90/10 (ficou ruim)

# Salvar modelos

In [30]:
import joblib as jb

In [27]:
# Persist the fitted random forest to disk with joblib.
jb.dump(mdl_rf, "random_forest_20210103.pkl.z")

['random_forest_20210103.pkl.z']

In [31]:
# Persist the LightGBM model currently bound to mdl_lgbm (the last one fitted).
jb.dump(mdl_lgbm, "lgbm_20210103.pkl.z")

['lgbm_20210103.pkl.z']

In [None]:
# Persist the whole logistic-regression pipeline (scaler step + classifier).
jb.dump(lr_pipeline, "logistic_reg_20210103.pkl.z")

In [28]:
# Persist the fitted title TfidfVectorizer alongside the models.
jb.dump(title_vec, "title_vectorizer_20210103.pkl.z")

['title_vectorizer_20210103.pkl.z']