In [1]:
import pandas as pd
import numpy as np
import tqdm
import datetime
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

from scipy import sparse

# Show every column when displaying a DataFrame. The full option key is used
# instead of the partial pattern "max.columns", which only works through
# pandas' regex matching of option names and can break or become ambiguous.
pd.set_option("display.max_columns", None)

# Render matplotlib figures inside the notebook.
%matplotlib inline
# NOTE(review): %pylab populates the interactive namespace from numpy and
# matplotlib (see the message printed below this cell). No later cell visible
# here appears to rely on those implicit names -- confirm and consider removing.
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Load the labelled dataset (first CSV column is the index) and keep only the
# rows that actually carry a label in `y`.
df = (
    pd.read_csv("raw_data_all_labeled2.csv", index_col=0)
    .dropna(subset=["y"])
)
df.shape

(1410, 15)

In [3]:
# Number of rows that are exact duplicates of an earlier row (all columns compared).
df.duplicated().sum()

0

In [4]:
# Number of rows whose `title` repeats the title of an earlier row.
df.duplicated(subset=['title']).sum()

0

### 1. Limpeza da data

In [5]:
# Cleaned frame, aligned on the raw frame's index: the title is copied as-is
# and `upload_date` is parsed into a datetime column named `date`.
df_limpo = pd.DataFrame(index=df.index).assign(
    title=df['title'],
    date=pd.to_datetime(df['upload_date']),
)

### 2. Limpeza de views

In [6]:
# Missing view counts are treated as zero views.
views = df['view_count'].fillna(value=0)
df_limpo = df_limpo.assign(views=views)

### 3. Features 

In [7]:
# Target vector, copied so later edits never touch the raw frame.
y = df['y'].copy()

# Age of each video in days, measured from the date this cell is executed.
# NOTE(review): because `today` is the run date, `views_por_dia` (and every
# metric computed downstream) changes from one day to the next -- consider
# pinning the date on which the data was collected.
today = datetime.datetime.today().strftime("%Y-%m-%d")
days_since_pub = (pd.to_datetime(today) - df_limpo['date']) / np.timedelta64(1, 'D')

# Numeric features: raw view count and average views per day (3 decimals).
# The age itself is only an intermediate value and is not kept as a feature.
features = pd.DataFrame(index=df_limpo.index)
features['views'] = df_limpo['views']
features['views_por_dia'] = (features['views'] / days_since_pub).round(3)

In [8]:
# Temporal split: videos uploaded before the cut-off date are used for
# training, videos uploaded on or after it for validation.
split_date = '2020-03-10'
mask_train = df_limpo['date'] < split_date
mask_val = df_limpo['date'] >= split_date

Xtrain, ytrain = features[mask_train], y[mask_train]
Xval, yval = features[mask_val], y[mask_val]
Xtrain.shape, Xval.shape, ytrain.shape, yval.shape

((714, 2), (696, 2), (714,), (696,))

In [9]:
title_train = df_limpo.loc[mask_train, 'title']
title_val = df_limpo.loc[mask_val, 'title']

# min_df=2: a term only becomes a column if it occurs in at least two training
# titles; ngram_range=(1, 2) keeps unigrams and bigrams.
# The vocabulary is learned on the training titles only and then reused,
# unchanged, to encode the validation titles.
title_vec = TfidfVectorizer(ngram_range=(1, 2), min_df=2)
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)

In [10]:
# (number of training titles, number of TF-IDF columns kept by the vectorizer)
title_bow_train.shape

(714, 1134)

In [11]:
# Concatenate the numeric features with the columns produced by the
# TfidfVectorizer; the numeric features come first, followed by the TF-IDF terms.
Xtrain_wtitle = sparse.hstack([Xtrain, title_bow_train])
Xval_wtitle = sparse.hstack([Xval, title_bow_val])
Xtrain_wtitle.shape, Xval_wtitle.shape

((714, 1136), (696, 1136))

In [12]:
# Baseline: random forest on numeric + title features, with class weights
# balanced and a fixed seed.
mdl = RandomForestClassifier(
    n_estimators=1000,
    min_samples_leaf=1,
    class_weight='balanced',
    random_state=0,
    n_jobs=4,
)
mdl.fit(Xtrain_wtitle, ytrain)

RandomForestClassifier(class_weight='balanced', n_estimators=1000, n_jobs=4,
                       random_state=0)

In [13]:
# Validation scores: second column of predict_proba
# (presumably the positive class, y == 1 -- confirm against mdl.classes_).
p = mdl.predict_proba(Xval_wtitle)[:, 1]

In [14]:
# (average precision, ROC AUC) of the random forest on the validation set.
(
    metrics.average_precision_score(y_true=yval, y_score=p),
    metrics.roc_auc_score(y_true=yval, y_score=p),
)

(0.3969764593456903, 0.677681712479876)

### 5. LightGBM

In [15]:
from lightgbm import LGBMClassifier

In [16]:
# LightGBM with default hyper-parameters, balanced class weights and a fixed seed.
mdl = LGBMClassifier(
    class_weight='balanced',
    random_state=0,
    n_jobs=4,
)
mdl.fit(Xtrain_wtitle, ytrain)

LGBMClassifier(class_weight='balanced', n_jobs=4, random_state=0)

In [17]:
# Validation scores of the LightGBM model: second column of predict_proba
# (presumably the positive class, y == 1 -- confirm against mdl.classes_).
p = mdl.predict_proba(Xval_wtitle)[:, 1]



In [18]:
# (average precision, ROC AUC) of the default LightGBM model on the validation set.
(
    metrics.average_precision_score(y_true=yval, y_score=p),
    metrics.roc_auc_score(y_true=yval, y_score=p),
)

(0.34044253688199166, 0.6430922425615646)

### 6. Bayesian Optimization

In [19]:
from skopt import forest_minimize

In [20]:
# One record per evaluated configuration, appended by tune_lgbm.
results = []


def tune_lgbm(params):
    """Objective function for the hyper-parameter search.

    Rebuilds the title TF-IDF features with the given vectorizer settings,
    fits an LGBMClassifier on the training split and scores it on the
    validation split.

    Parameters
    ----------
    params : sequence of 8 values, in this order
        learning rate, max_depth, min_child_samples, subsample,
        colsample_bytree, n_estimators, TF-IDF ``min_df`` and the upper bound
        of the TF-IDF n-gram range (the lower bound is always 1).

    Returns
    -------
    float
        The negated validation average precision, so that a minimizer
        maximizes average precision.

    Side effects
    ------------
    Prints a blank line, the parameters and the validation ROC AUC, and
    appends ``{'params', 'roc', 'avg_prec'}`` to the module-level ``results``.
    """
    print()
    print(params)

    (lr, max_depth, min_child_samples, subsample,
     colsample_bytree, n_estimators, min_df, max_ngram) = params

    # Text features are part of the search, so the vectorizer is refitted
    # (on the training titles only) for every candidate.
    vectorizer = TfidfVectorizer(min_df=min_df, ngram_range=(1, max_ngram))
    bow_train = vectorizer.fit_transform(title_train)
    bow_val = vectorizer.transform(title_val)

    X_fit = sparse.hstack([Xtrain, bow_train])
    X_eval = sparse.hstack([Xval, bow_val])

    # num_leaves is tied to max_depth as 2 ** max_depth.
    booster = LGBMClassifier(
        learning_rate=lr,
        num_leaves=2 ** max_depth,
        max_depth=max_depth,
        min_child_samples=min_child_samples,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        bagging_freq=1,
        n_estimators=n_estimators,
        random_state=0,
        class_weight='balanced',
        n_jobs=4,
    )
    booster.fit(X_fit, ytrain)

    scores = booster.predict_proba(X_eval)[:, 1]
    roc = metrics.roc_auc_score(yval, scores)
    print(roc)
    avg_prec = metrics.average_precision_score(yval, scores)

    results.append({'params': params, 'roc': roc, 'avg_prec': avg_prec})
    return -avg_prec


In [21]:
# Search space, one entry per element of the `params` list consumed by
# tune_lgbm (same order).
space = [
    (0.001, 0.1, 'log-uniform'),  # learning rate, sampled on a log scale
    (1, 10),                      # max_depth
    (1, 20),                      # min_child_samples
    (0.05, 1.0),                  # subsample
    (0.05, 1.0),                  # colsample_bytree
    (100, 1000),                  # n_estimators
    (1, 5),                       # TF-IDF min_df
    (1, 5),                       # upper bound of the TF-IDF n-gram range
]

In [22]:
# Run the search: 20 random evaluations first, then model-guided proposals,
# 50 evaluations of tune_lgbm in total. The seed makes the sequence repeatable.
res = forest_minimize(
    tune_lgbm,
    space,
    n_calls=50,
    n_random_starts=20,
    random_state=160745,
    verbose=1,
)

Iteration No: 1 started. Evaluating function at random point.

[0.009944912110647982, 5, 1, 0.4677107511929402, 0.49263223036174764, 272, 3, 1]




0.6575696142150139
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.3110
Function value obtained: -0.3645
Current minimum: -0.3645
Iteration No: 2 started. Evaluating function at random point.

[0.053887464791860025, 1, 15, 0.7437489153990157, 0.8675167974293533, 549, 3, 4]




0.644958559418043
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.2844
Function value obtained: -0.3450
Current minimum: -0.3645
Iteration No: 3 started. Evaluating function at random point.

[0.004151454520895999, 6, 20, 0.8682075103820793, 0.9491436163200662, 411, 4, 3]




0.6161171069107387
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.3712
Function value obtained: -0.3215
Current minimum: -0.3645
Iteration No: 4 started. Evaluating function at random point.

[0.0014099928811969545, 9, 9, 0.6502182010234373, 0.6866210554187129, 828, 5, 2]




0.6190388170055454
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 1.0578
Function value obtained: -0.3335
Current minimum: -0.3645
Iteration No: 5 started. Evaluating function at random point.

[0.08530558241838007, 8, 19, 0.2137736299768322, 0.1313765544201984, 961, 4, 1]




0.6172023135173812
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 0.2398
Function value obtained: -0.2898
Current minimum: -0.3645
Iteration No: 6 started. Evaluating function at random point.

[0.003567949451535685, 10, 19, 0.7232951768944309, 0.7298538828427115, 939, 4, 3]




0.6211555661558644
Iteration No: 6 ended. Evaluation done at random point.
Time taken: 1.3212
Function value obtained: -0.3147
Current minimum: -0.3645
Iteration No: 7 started. Evaluating function at random point.

[0.014828577273549474, 7, 1, 0.18428087097824575, 0.3261556557915816, 274, 1, 2]




0.65959692326039
Iteration No: 7 ended. Evaluation done at random point.
Time taken: 0.7871
Function value obtained: -0.3562
Current minimum: -0.3645
Iteration No: 8 started. Evaluating function at random point.

[0.0015212976972079912, 3, 12, 0.44234694306528044, 0.399351303640462, 272, 3, 5]
0.6506648381134101
Iteration No: 8 ended. Evaluation done at random point.
Time taken: 0.1490
Function value obtained: -0.3640
Current minimum: -0.3645
Iteration No: 9 started. Evaluating function at random point.

[0.01946212855369041, 9, 18, 0.5235636153223084, 0.6728679300083596, 747, 4, 5]




0.6124023612187705
Iteration No: 9 ended. Evaluation done at random point.
Time taken: 0.5895
Function value obtained: -0.2866
Current minimum: -0.3645
Iteration No: 10 started. Evaluating function at random point.

[0.0012116790683302117, 3, 2, 0.06616307483844217, 0.23025600705315752, 677, 2, 5]




0.6503667044302665
Iteration No: 10 ended. Evaluation done at random point.
Time taken: 0.5349
Function value obtained: -0.3171
Current minimum: -0.3645
Iteration No: 11 started. Evaluating function at random point.

[0.0053139776214487944, 6, 9, 0.14251441334450304, 0.8175761405215897, 297, 1, 5]




0.6565082583030232
Iteration No: 11 ended. Evaluation done at random point.
Time taken: 0.4956
Function value obtained: -0.3419
Current minimum: -0.3645
Iteration No: 12 started. Evaluating function at random point.

[0.0068572961982704935, 10, 5, 0.2390386584472456, 0.49053406102209746, 176, 2, 4]




0.6301174646711586
Iteration No: 12 ended. Evaluation done at random point.
Time taken: 0.4731
Function value obtained: -0.3255
Current minimum: -0.3645
Iteration No: 13 started. Evaluating function at random point.

[0.00781968225875022, 3, 4, 0.7078936710077383, 0.31818755505678337, 275, 4, 4]




0.648828334625246
Iteration No: 13 ended. Evaluation done at random point.
Time taken: 0.3856
Function value obtained: -0.3471
Current minimum: -0.3645
Iteration No: 14 started. Evaluating function at random point.

[0.017293945600511968, 2, 15, 0.9007557574888567, 0.41026441194439994, 316, 5, 1]
0.658124142865661
Iteration No: 14 ended. Evaluation done at random point.
Time taken: 0.1005
Function value obtained: -0.3610
Current minimum: -0.3645
Iteration No: 15 started. Evaluating function at random point.

[0.012250750764764855, 8, 6, 0.5976582413192033, 0.2474882432951916, 516, 4, 4]




0.6204221572953311
Iteration No: 15 ended. Evaluation done at random point.
Time taken: 0.8976
Function value obtained: -0.3172
Current minimum: -0.3645
Iteration No: 16 started. Evaluating function at random point.

[0.018353598126553926, 4, 3, 0.47305622526323254, 0.1404164811277527, 133, 4, 1]
0.693924035537535
Iteration No: 16 ended. Evaluation done at random point.
Time taken: 0.1036
Function value obtained: -0.3771
Current minimum: -0.3771
Iteration No: 17 started. Evaluating function at random point.

[0.0010383234748454694, 9, 19, 0.9256771571832196, 0.9321438677645206, 312, 4, 3]




0.5953789279112754
Iteration No: 17 ended. Evaluation done at random point.
Time taken: 0.4046
Function value obtained: -0.2994
Current minimum: -0.3771
Iteration No: 18 started. Evaluating function at random point.

[0.004955229758078229, 5, 5, 0.06939551310802591, 0.4193273080472823, 725, 4, 1]




0.6465327052650409
Iteration No: 18 ended. Evaluation done at random point.
Time taken: 0.2427
Function value obtained: -0.3354
Current minimum: -0.3771
Iteration No: 19 started. Evaluating function at random point.

[0.0699516121742407, 9, 10, 0.6477856515609233, 0.8594430701440198, 616, 1, 1]




0.6273507840915866
Iteration No: 19 ended. Evaluation done at random point.
Time taken: 1.0128
Function value obtained: -0.3320
Current minimum: -0.3771
Iteration No: 20 started. Evaluating function at random point.

[0.0014752743467850462, 5, 4, 0.9747950537021096, 0.982207187458162, 909, 2, 4]




0.6131775087949436
Iteration No: 20 ended. Evaluation done at random point.
Time taken: 1.3557
Function value obtained: -0.3324
Current minimum: -0.3771
Iteration No: 21 started. Searching for the next optimal point.

[0.014934852659125829, 2, 20, 0.4621033256790714, 0.2217898302101986, 118, 5, 1]
0.6579273746347862




Iteration No: 21 ended. Search finished for the next optimal point.
Time taken: 0.3006
Function value obtained: -0.3658
Current minimum: -0.3771
Iteration No: 22 started. Searching for the next optimal point.

[0.07082476192522262, 4, 20, 0.2711648462527161, 0.1533960872833824, 344, 5, 1]
0.6119551606940552




Iteration No: 22 ended. Search finished for the next optimal point.
Time taken: 0.3469
Function value obtained: -0.2926
Current minimum: -0.3771
Iteration No: 23 started. Searching for the next optimal point.

[0.09318959443807255, 2, 16, 0.43811627103137935, 0.21909758309755356, 205, 5, 1]
0.6326754516725299




Iteration No: 23 ended. Search finished for the next optimal point.
Time taken: 0.3133
Function value obtained: -0.3304
Current minimum: -0.3771
Iteration No: 24 started. Searching for the next optimal point.

[0.014101174657771673, 3, 19, 0.7121809663493952, 0.5954880344652083, 189, 2, 1]
0.639806809373323




Iteration No: 24 ended. Search finished for the next optimal point.
Time taken: 0.3319
Function value obtained: -0.3632
Current minimum: -0.3771
Iteration No: 25 started. Searching for the next optimal point.

[0.0117790322874868, 4, 19, 0.5929328516172685, 0.24005504701129232, 169, 4, 1]
0.6587144475582851




Iteration No: 25 ended. Search finished for the next optimal point.
Time taken: 0.3273
Function value obtained: -0.3656
Current minimum: -0.3771
Iteration No: 26 started. Searching for the next optimal point.

[0.026563982249110236, 4, 20, 0.5292714547580233, 0.47331059495798705, 208, 4, 1]
0.6460855047403256




Iteration No: 26 ended. Search finished for the next optimal point.
Time taken: 0.3905
Function value obtained: -0.3578
Current minimum: -0.3771
Iteration No: 27 started. Searching for the next optimal point.

[0.06967973173814067, 4, 8, 0.5403805593219373, 0.16137031859064052, 159, 5, 1]
0.6551070299922485




Iteration No: 27 ended. Search finished for the next optimal point.
Time taken: 0.3355
Function value obtained: -0.3431
Current minimum: -0.3771
Iteration No: 28 started. Searching for the next optimal point.

[0.015740036590146427, 1, 18, 0.5791632990536987, 0.15905765279349326, 310, 5, 1]
0.6595909605867272




Iteration No: 28 ended. Search finished for the next optimal point.
Time taken: 0.3291
Function value obtained: -0.3711
Current minimum: -0.3771
Iteration No: 29 started. Searching for the next optimal point.

[0.014125406192093213, 3, 19, 0.47315583339587314, 0.09409713984902018, 344, 5, 2]
0.6384950211674915




Iteration No: 29 ended. Search finished for the next optimal point.
Time taken: 0.3710
Function value obtained: -0.3443
Current minimum: -0.3771
Iteration No: 30 started. Searching for the next optimal point.

[0.004871334242637707, 7, 18, 0.9667940997212466, 0.3243702516590816, 121, 4, 1]
0.6593882296821895




Iteration No: 30 ended. Search finished for the next optimal point.
Time taken: 0.3566
Function value obtained: -0.3613
Current minimum: -0.3771
Iteration No: 31 started. Searching for the next optimal point.

[0.001862037165403463, 10, 20, 0.7594594688994479, 0.06663099351693305, 118, 3, 1]
0.678778844433844




Iteration No: 31 ended. Search finished for the next optimal point.
Time taken: 0.3509
Function value obtained: -0.4003
Current minimum: -0.4003
Iteration No: 32 started. Searching for the next optimal point.

[0.01976576194940411, 8, 18, 0.953326386334843, 0.15390708027847727, 115, 1, 3]
0.6531631983781526




Iteration No: 32 ended. Search finished for the next optimal point.
Time taken: 0.4134
Function value obtained: -0.3774
Current minimum: -0.4003
Iteration No: 33 started. Searching for the next optimal point.

[0.0740196757617546, 10, 18, 0.9126843362675201, 0.2834841274415109, 113, 4, 4]
0.6085266233379047




Iteration No: 33 ended. Search finished for the next optimal point.
Time taken: 0.4277
Function value obtained: -0.3077
Current minimum: -0.4003
Iteration No: 34 started. Searching for the next optimal point.

[0.031757396185583694, 6, 15, 0.21853423300046687, 0.1364153368912104, 125, 3, 1]
0.6460497286983483




Iteration No: 34 ended. Search finished for the next optimal point.
Time taken: 0.3685
Function value obtained: -0.3311
Current minimum: -0.4003
Iteration No: 35 started. Searching for the next optimal point.

[0.0012216888738411404, 9, 10, 0.8000959684386181, 0.0921452148410233, 130, 1, 3]
0.6315067676346073




Iteration No: 35 ended. Search finished for the next optimal point.
Time taken: 0.4250
Function value obtained: -0.3599
Current minimum: -0.4003
Iteration No: 36 started. Searching for the next optimal point.

[0.008631630476462334, 9, 20, 0.988690233262526, 0.12840274753114395, 276, 3, 1]
0.6713076143342673




Iteration No: 36 ended. Search finished for the next optimal point.
Time taken: 0.4231
Function value obtained: -0.3879
Current minimum: -0.4003
Iteration No: 37 started. Searching for the next optimal point.

[0.0017258500409867844, 9, 19, 0.6102690607378527, 0.12978903428630878, 337, 2, 1]
0.6522509093077336




Iteration No: 37 ended. Search finished for the next optimal point.
Time taken: 0.4141
Function value obtained: -0.3765
Current minimum: -0.4003
Iteration No: 38 started. Searching for the next optimal point.

[0.001057161569864939, 10, 17, 0.6718951513178328, 0.14211099300528046, 389, 3, 1]




0.6608550474032555
Iteration No: 38 ended. Search finished for the next optimal point.
Time taken: 0.4806
Function value obtained: -0.3709
Current minimum: -0.4003
Iteration No: 39 started. Searching for the next optimal point.

[0.001136190340318591, 10, 19, 0.773898212780823, 0.15743843775893793, 536, 1, 1]




0.6594418937451554
Iteration No: 39 ended. Search finished for the next optimal point.
Time taken: 0.5835
Function value obtained: -0.3785
Current minimum: -0.4003
Iteration No: 40 started. Searching for the next optimal point.

[0.0048857279688804, 10, 20, 0.7156837860476691, 0.127808500514754, 636, 2, 1]




0.6420428119968994
Iteration No: 40 ended. Search finished for the next optimal point.
Time taken: 0.5823
Function value obtained: -0.3655
Current minimum: -0.4003
Iteration No: 41 started. Searching for the next optimal point.

[0.0019023125590401234, 3, 17, 0.778814631847701, 0.14437306403337846, 703, 3, 1]
0.6721066126050921




Iteration No: 41 ended. Search finished for the next optimal point.
Time taken: 0.4569
Function value obtained: -0.3866
Current minimum: -0.4003
Iteration No: 42 started. Searching for the next optimal point.

[0.0010747277017444188, 8, 16, 0.9708126393869809, 0.05177143064045592, 237, 4, 1]
0.6739192653986047




Iteration No: 42 ended. Search finished for the next optimal point.
Time taken: 0.4177
Function value obtained: -0.3479
Current minimum: -0.4003
Iteration No: 43 started. Searching for the next optimal point.

[0.0016711959115784437, 10, 20, 0.8332589929598432, 0.05354348512825167, 372, 3, 2]
0.6638661976030051




Iteration No: 43 ended. Search finished for the next optimal point.
Time taken: 0.4661
Function value obtained: -0.3836
Current minimum: -0.4003
Iteration No: 44 started. Searching for the next optimal point.

[0.002681733220441722, 5, 20, 0.9690855482360399, 0.25305348522161253, 822, 3, 2]




0.6560193190626676
Iteration No: 44 ended. Search finished for the next optimal point.
Time taken: 0.6839
Function value obtained: -0.3586
Current minimum: -0.4003
Iteration No: 45 started. Searching for the next optimal point.

[0.017298334401955676, 9, 20, 0.6221501179754911, 0.13313694311715674, 745, 3, 3]




0.611048834297299
Iteration No: 45 ended. Search finished for the next optimal point.
Time taken: 0.6313
Function value obtained: -0.3048
Current minimum: -0.4003
Iteration No: 46 started. Searching for the next optimal point.

[0.0020786590455062794, 4, 18, 0.322289727163339, 0.09997158610536157, 343, 3, 1]
0.660336294794586




Iteration No: 46 ended. Search finished for the next optimal point.
Time taken: 0.3579
Function value obtained: -0.3576
Current minimum: -0.4003
Iteration No: 47 started. Searching for the next optimal point.

[0.0031638248504240653, 10, 17, 0.6440167160702353, 0.07846538644901674, 936, 3, 2]




0.6650945083775565
Iteration No: 47 ended. Search finished for the next optimal point.
Time taken: 0.7335
Function value obtained: -0.3796
Current minimum: -0.4003
Iteration No: 48 started. Searching for the next optimal point.

[0.0017723373063171426, 10, 18, 0.7817519228568389, 0.054225821346231697, 418, 4, 2]




0.645650229562936
Iteration No: 48 ended. Search finished for the next optimal point.
Time taken: 0.4809
Function value obtained: -0.3484
Current minimum: -0.4003
Iteration No: 49 started. Searching for the next optimal point.

[0.053539787146168444, 7, 20, 0.9806497569171833, 0.0985142521272917, 290, 3, 1]
0.6529306541053008




Iteration No: 49 ended. Search finished for the next optimal point.
Time taken: 0.4117
Function value obtained: -0.3795
Current minimum: -0.4003
Iteration No: 50 started. Searching for the next optimal point.

[0.03137052946911072, 9, 20, 0.6125784338354515, 0.08679440024036054, 728, 1, 1]




0.6151332657563651
Iteration No: 50 ended. Search finished for the next optimal point.
Time taken: 0.5825
Function value obtained: -0.3441
Current minimum: -0.4003


In [23]:
# Best configuration found: the record with the highest validation average
# precision. max() returns the first maximal record in a single pass, which is
# the same element the previous full descending sort put at position 0.
max(results, key=lambda record: record['avg_prec'])

{'params': [0.001862037165403463,
  10,
  20,
  0.7594594688994479,
  0.06663099351693305,
  118,
  3,
  1],
 'roc': 0.678778844433844,
 'avg_prec': 0.4003489238281557}

### 7. Logistic Reg

In [26]:
from sklearn.preprocessing import MaxAbsScaler, StandardScaler

In [51]:
# Scale every column by its maximum absolute value for the linear model. The
# scaler is fitted on the training matrix only and then applied to validation.
# (Alternative tried earlier: StandardScaler on the two numeric columns only.)
scaler = MaxAbsScaler()
Xtrain_wtitle2 = scaler.fit_transform(Xtrain_wtitle.tocsr())
Xval_wtitle2 = scaler.transform(Xval_wtitle.tocsr())

In [52]:
# Sanity check: scaling must not change the shape of the validation matrix.
Xval_wtitle2.shape

(696, 1136)

In [56]:
# Logistic regression on the scaled features; C=0.5 is the best value recorded
# in the experiment log at the end of this section.
mdl = LogisticRegression(
    C=0.5,
    random_state=4,
    n_jobs=4,
)
mdl.fit(Xtrain_wtitle2, ytrain)

LogisticRegression(C=0.5, n_jobs=4, random_state=4)

In [57]:
# Validation scores of the logistic regression: second column of predict_proba
# (presumably the positive class, y == 1 -- confirm against mdl.classes_).
p = mdl.predict_proba(Xval_wtitle2)[:, 1]

In [58]:
# (average precision, ROC AUC) of the logistic regression on the validation set.
(
    metrics.average_precision_score(y_true=yval, y_score=p),
    metrics.roc_auc_score(y_true=yval, y_score=p),
)

(0.41472090277819385, 0.6588873650945083)

In [59]:
# Experiment log -- (average precision, ROC AUC) on the validation set:
# (0.4043414314912761, 0.6789338739490788) - no tuning, StandardScaler
# (0.3988238048468208, 0.6462226462345716) - no tuning, MaxAbsScaler
# (0.33826219541849384, 0.6082881163913899) - C=10, MaxAbsScaler
# (0.41472090277819385, 0.6588873650945083) - C=0.5, MaxAbsScaler