In [1]:
import pandas as pd

# Load the Kaggle train and test tables from paths relative to the notebook.
# Later cells read the 'Page content' column from both frames, 'Popularity'
# from the training frame and 'Id' from the test frame.
df_train = pd.read_csv('./dataset/kaggle/train.csv')
df_test = pd.read_csv('./dataset/kaggle/test.csv')

In [2]:
import re
from bs4 import BeautifulSoup


# Timestamp substituted when an article has no usable <time datetime="..."> value.
_DEFAULT_DATETIME = 'Wed, 10 Oct 2014 15:00:43'

# "<day>, <date> <month> <year> <hour>:<minute>:<second>"
_DATETIME_RE = re.compile(
    r'([\w]+),\s+([\d]+)\s+([\w]+)\s+([\d]+)\s+([\d]+):([\d]+):([\d]+)')


def _normalize_author(author):
    """Turn a raw byline into space-separated, underscore-joined author tokens."""
    # Collapse whitespace, lowercase, and drop a leading "by ".
    author = re.sub(r'\s+', ' ', author.strip().lower())
    if author.startswith('by '):
        author = author[3:]
    # Use '&' as the single separator: " and " becomes " & ", and anything
    # from an '&' up to the last ';' (e.g. an HTML entity) collapses to '&'.
    author = re.sub('&.*;', '&', author.replace(' and ', ' & '))

    if ',' not in author:
        author_list = re.split(r'\s*&\s*', author)
    else:
        parts = re.split(r'\s*,\s*', author)
        last = parts[-1]
        # When the last comma-separated part has no '&', or the text after its
        # final '&' is longer than three words, only the first part is kept.
        # NOTE(review): presumably the trailing part is then a title or
        # affiliation rather than another name - confirm against the data.
        if '&' not in last or len(last.split('&')[-1].strip().split()) > 3:
            author_list = [parts[0]]
        else:
            author_list = parts[:-1] + re.split(r'\s*&\s*', last)

    # One token per author: inner whitespace becomes '_'.
    return ' '.join(re.sub(r'\s+', '_', a) for a in author_list)


def _split_datetime(date_time):
    """Split a timestamp string into (day, date, month, year, hour, minute, second)."""
    match_obj = _DATETIME_RE.search(date_time)
    if match_obj is None:
        # Unparsable timestamp: fall back to the same default that is used
        # when the timestamp is missing, instead of failing on `.groups()`.
        match_obj = _DATETIME_RE.search(_DEFAULT_DATETIME)
    day, date, month, year, hour, minute, second = match_obj.groups()
    return day.lower(), date, month.lower(), year, hour, minute, second


def preprocessor(text):
    """Extract flat features from the HTML of one article page.

    Parameters
    ----------
    text : str
        Raw HTML of the page, parsed with BeautifulSoup's 'html.parser'.

    Returns
    -------
    tuple
        (title, author, channel, topic, day, date, month, year, hour, minute,
        second, content_len, num_see_also, num_image, num_a).
        title/channel are lowercased strings; author/topic are space-separated
        tokens with '_' inside each token; day/month are lowercased; date,
        year, hour, minute and second are the digit strings captured from the
        timestamp; the last four entries are integer counts.

    Raises
    ------
    AttributeError, TypeError, KeyError
        If an expected element (h1, article-info div, article tag with
        data-channel, article-topics footer, article-content section) is absent.
    """
    soup = BeautifulSoup(text, 'html.parser')

    # find title
    title = soup.body.h1.string.strip().lower()

    # find author: prefer span.author_name, then the first span, then the first link
    article_info = soup.head.find('div', {'class': 'article-info'})
    author_name = article_info.find('span', {'class': 'author_name'})
    if author_name is not None:
        author = author_name.get_text()
    elif article_info.span is not None:
        author = article_info.span.string
    else:
        author = article_info.a.string
    author = _normalize_author(author)

    # find channel
    channel = soup.body.article['data-channel'].strip().lower()

    # find topic: one underscore-joined token per footer link
    a_list = soup.body.find('footer', {'class': 'article-topics'}).find_all('a')
    topic_list = [a.string.strip().lower() for a in a_list]
    topic = ' '.join(re.sub(r'\s+', '_', t) for t in topic_list)

    # find datetime
    try:
        date_time = article_info.time['datetime']
    except (TypeError, KeyError, AttributeError):
        # No <time> element (None is not subscriptable) or no datetime attribute.
        date_time = _DEFAULT_DATETIME
    day, date, month, year, hour, minute, second = _split_datetime(date_time)

    # find content
    content = soup.body.find('section', {'class': 'article-content'}).get_text()
    content_len = len(content)

    # find see also (case-insensitive occurrences inside the content text)
    num_see_also = len(re.findall('see also', content.lower()))

    # find image
    num_image = len(soup.body.find_all('img'))

    # find a
    num_a = len(soup.body.find_all('a'))

    return title, author, channel, topic, day, date, month, year, \
        hour, minute, second, content_len, num_see_also, num_image, num_a


# Run the extractor over every page, training rows first and test rows after,
# so the combined frame can later be split back by the training row count.
feature_list = [
    preprocessor(page)
    for frame in (df_train, df_test)
    for page in frame['Page content']
]

# One row per article; column order mirrors the tuple returned by preprocessor().
df_combine = pd.DataFrame(
    data=feature_list,
    columns=[
        'Title', 'Author', 'Channel', 'Topic',
        'Day', 'Date', 'Month', 'Year', 'Hour', 'Minute', 'Second',
        'Content_Len', 'Num_See_Also', 'Num_Image', 'Num_A',
    ],
)

  author = re.sub('\s+', ' ', author.strip().lower())
  author_list = re.split('\s*&\s*', author)
  authors = re.split('\s*,\s*', author)
  author_list += re.split('\s*&\s*', authors[-1])
  author = ' '.join([re.sub('\s+', '_', a) for a in author_list])
  topic = ' '.join([re.sub('\s+', '_', t) for t in topic_list])
  match_obj = re.search('([\w]+),\s+([\d]+)\s+([\w]+)\s+([\d]+)\s+([\d]+):([\d]+):([\d]+)', date_time)


In [3]:
# Preview the first rows of the extracted feature table.
df_combine.head()

Unnamed: 0,Title,Author,Channel,Topic,Day,Date,Month,Year,Hour,Minute,Second,Content_Len,Num_See_Also,Num_Image,Num_A
0,nasa's grand challenge: stop asteroids from de...,clara_moskowitz,world,asteroid asteroids challenge earth space u.s. ...,wed,19,jun,2013,15,4,30,3591,4,1,21
1,google's new open source patent pledge: we won...,christina_warren,tech,apps_and_software google open_source opn_pledg...,thu,28,mar,2013,17,40,55,1843,1,1,16
2,ballin': 2014 nfl draft picks get to choose th...,sam_laird,entertainment,entertainment nfl nfl_draft sports television,wed,7,may,2014,19,15,20,6646,1,1,9
3,cameraperson fails deliver slapstick laughs,sam_laird,watercooler,sports video videos watercooler,fri,11,oct,2013,2,26,50,1821,1,0,11
4,nfl star helps young fan prove friendship with...,connor_finnegan,entertainment,entertainment instagram instagram_video nfl sp...,thu,17,apr,2014,3,31,43,8919,1,51,14


In [4]:
# Ordinal codes for the lowercase day / month abbreviations stored in
# df_combine (e.g. 'wed', 'jun'). Values missing from a map become NaN.
day_map = {'mon': 1, 'tue': 2, 'wed': 3,
           'thu': 4, 'fri': 5, 'sat': 6, 'sun': 7}
month_map = {'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
             'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}

# Work on a copy so df_combine keeps the untouched extracted features.
df_copy = df_combine.copy()
df_copy['Day'] = df_copy['Day'].map(day_map)
df_copy['Month'] = df_copy['Month'].map(month_map)

# Date, Year and Hour are digit strings captured by a regex in preprocessor();
# make them real integers instead of relying on the estimators' input
# validation to coerce strings to numbers later on.
df_copy = df_copy.astype({'Date': int, 'Year': int, 'Hour': int})

# Keep only Author, Topic, Day, Date, Month, Year, Hour and Content_Len.
df_copy = df_copy.drop(columns=['Title', 'Channel', 'Minute', 'Second', 'Num_See_Also', 'Num_Image', 'Num_A'])

In [5]:
# Preview the reduced, numerically encoded feature table.
df_copy.head()

Unnamed: 0,Author,Topic,Day,Date,Month,Year,Hour,Content_Len
0,clara_moskowitz,asteroid asteroids challenge earth space u.s. ...,3,19,6,2013,15,3591
1,christina_warren,apps_and_software google open_source opn_pledg...,4,28,3,2013,17,1843
2,sam_laird,entertainment nfl nfl_draft sports television,3,7,5,2014,19,6646
3,sam_laird,sports video videos watercooler,5,11,10,2013,2,1821
4,connor_finnegan,entertainment instagram instagram_video nfl sp...,4,17,4,2014,3,8919


In [6]:
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer

# Download the NLTK corpora used by the WordNetLemmatizer in tokenizer_wnl below.
# NOTE(review): 'omw-1.4' is presumably required alongside 'wordnet' by the
# installed NLTK version - confirm before removing either call.
nltk.download('wordnet')
nltk.download('omw-1.4')


def tokenizer(text):
    """Split a document into whitespace-separated tokens.

    Parameters
    ----------
    text : str or numpy.ndarray
        The document, or an array whose first element is the document.
        NOTE(review): arrays presumably show up because the ColumnTransformer
        selects the column with a list (e.g. [0]), so each row arrives as a
        one-element array - confirm.

    Returns
    -------
    list of str
        The stripped text split on runs of whitespace. An empty document
        yields [''] (a single empty token).
    """
    # isinstance also accepts ndarray subclasses, unlike an exact type comparison.
    if isinstance(text, np.ndarray):
        text = text[0]
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    return re.split(r'\s+', text.strip())


def tokenizer_wnl(text):
    """Tokenize a document and lemmatize every token with WordNet.

    Parameters
    ----------
    text : str or numpy.ndarray
        The document, or an array whose first element is the document.

    Returns
    -------
    list of str
        Lemmatized tokens, in document order.
    """
    # isinstance also accepts ndarray subclasses, unlike an exact type comparison.
    if isinstance(text, np.ndarray):
        text = text[0]
    # Drop an apostrophe suffix and keep the stem: "nasa's" -> "nasa".
    text = re.sub(r"([\w]+)'[\w]+", r'\1', text)
    # Remove dots so abbreviations stay a single token: "u.s." -> "us".
    text = text.replace('.', '')
    # Every remaining run of non-word characters becomes one space.
    text = re.sub(r'[^\w]+', ' ', text)
    wnl = WordNetLemmatizer()
    return [wnl.lemmatize(s) for s in re.split(r'\s+', text.strip())]

  return re.split('\s+', text.strip())
  text = re.sub("([\w]+)'[\w]+",
  text = re.sub('\.', '', text)
  text = re.sub('[^\w]+', ' ', text)
  return [wnl.lemmatize(s) for s in re.split('\s+', text.strip())]
[nltk_data] Downloading package wordnet to C:\Users\Cark C3
[nltk_data]     PVT\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to C:\Users\Cark C3
[nltk_data]     PVT\AppData\Roaming\nltk_data...


In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Column 0 of the feature matrix is Author and column 1 is Topic; every other
# column is passed through unchanged.

# Random-forest variant: bag-of-words for both Author and Topic.
trans_forest = ColumnTransformer(
    transformers=[
        ('Author', CountVectorizer(lowercase=False, tokenizer=tokenizer), [0]),
        ('Topic', CountVectorizer(lowercase=False, tokenizer=tokenizer_wnl), [1]),
    ],
    remainder='passthrough',
    n_jobs=-1,
)

# Variant for the other models: Author is dropped, Topic is bag-of-words.
trans_other = ColumnTransformer(
    transformers=[
        ('Author', 'drop', [0]),
        ('Topic', CountVectorizer(lowercase=False, tokenizer=tokenizer_wnl), [1]),
    ],
    remainder='passthrough',
    n_jobs=-1,
)

In [8]:
from sklearn.model_selection import train_test_split

# df_copy stacks the training rows first and the test rows after them, so the
# training row count is the split point.
X_train_raw = df_copy.to_numpy()[:len(df_train)]
X_test = df_copy.to_numpy()[len(df_train):]

# Binary target: 1 where Popularity == 1, otherwise 0.
y_train_raw = (df_train['Popularity'].to_numpy() == 1).astype(int)

# Fixed 80/20 hold-out split of the labelled rows.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_raw,
    y_train_raw,
    test_size=0.2,
    random_state=0,
)

In [9]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_auc_score


def training(clf):
    """Cross-validate `clf`, then fit it on the hold-out training split.

    Reads the notebook-level arrays X_train_raw / y_train_raw (cross-validation)
    and X_train / y_train / X_valid / y_valid (hold-out fit and scoring).

    Prints the mean and standard deviation of the cross-validated ROC AUC for
    the train and validation folds, then the ROC AUC of the fitted estimator
    on the hold-out train and validation splits.

    Parameters
    ----------
    clf : estimator
        Must support fit and predict_proba; it is fitted in place.

    Returns
    -------
    estimator
        The same `clf` object, fitted on (X_train, y_train).
    """
    # Cross-validated AUC on all labelled rows (cv is left at its default).
    cv_results = cross_validate(
        clf,
        X_train_raw,
        y_train_raw,
        scoring='roc_auc',
        return_train_score=True,
        return_estimator=True,
    )
    cv_train = cv_results['train_score']
    cv_valid = cv_results['test_score']
    print('train score: {:.5f} (+/-{:.5f})'.format(np.mean(cv_train), np.std(cv_train)))
    print('valid score: {:.5f} (+/-{:.5f})'.format(np.mean(cv_valid), np.std(cv_valid)))

    # Single fit on the 80% split, scored on both sides of the hold-out.
    clf.fit(X_train, y_train)
    train_auc = roc_auc_score(y_train, clf.predict_proba(X_train)[:, 1])
    print('train score: {:.5f}'.format(train_auc))
    valid_auc = roc_auc_score(y_valid, clf.predict_proba(X_valid)[:, 1])
    print('valid score: {:.5f}'.format(valid_auc))
    return clf

In [12]:
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier

# LightGBM on the Topic bag-of-words plus the pass-through numeric columns.
lgbm = Pipeline(steps=[
    ('ct', trans_other),
    ('clf', LGBMClassifier(n_estimators=300, learning_rate=0.009, random_state=0)),
])
lgbm = training(lgbm)

[LightGBM] [Info] Number of positive: 10906, number of negative: 11208
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019448 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1575
[LightGBM] [Info] Number of data points in the train set: 22114, number of used features: 596
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493172 -> initscore=-0.027315
[LightGBM] [Info] Start training from score -0.027315
[LightGBM] [Info] Number of positive: 10905, number of negative: 11209
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014550 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1550
[LightGBM] [Info] Number of data points in the train set: 22114, number of used features: 584
[LightGBM] [Info

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Random forest on the Author and Topic bag-of-words plus the pass-through columns.
forest = Pipeline(steps=[
    ('ct', trans_forest),
    ('clf', RandomForestClassifier(n_estimators=300, random_state=0, n_jobs=-1)),
])
forest = training(forest)

train score: 1.00000 (+/-0.00000)
valid score: 0.58552 (+/-0.00989)
train score: 1.00000
valid score: 0.58378


In [16]:
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

# XGBoost on the Topic bag-of-words plus the pass-through numeric columns.
xgboost = Pipeline(steps=[
    ('ct', trans_other),
    ('clf', XGBClassifier(n_estimators=300, verbosity=0)),
])
xgboost = training(xgboost)

train score: 0.81282 (+/-0.00500)
valid score: 0.58232 (+/-0.01309)
train score: 0.81618
valid score: 0.57805


In [25]:
from sklearn.pipeline import Pipeline
from catboost import CatBoostClassifier

# CatBoost on the Topic bag-of-words plus the pass-through numeric columns.
catboost = Pipeline(steps=[
    ('ct', trans_other),
    ('clf', CatBoostClassifier(
        n_estimators=290,
        learning_rate=0.06,
        eval_metric='AUC',
        verbose=False,
    )),
])
catboost = training(catboost)

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [23]:
pip install --upgrade numpy catboost


Collecting numpy
  Downloading numpy-2.1.2-cp312-cp312-win_amd64.whl.metadata (59 kB)
Note: you may need to restart the kernel to use updated packages.


In [26]:
from sklearn.ensemble import VotingClassifier
# Soft-voting ensemble: predicted probabilities are averaged with these weights.
voting = VotingClassifier(
    estimators=[
        ('lgbm', lgbm),
        ('forest', forest),
        ('catboost', catboost),
    ],
    voting='soft',
    weights=[1, 0.2, 0.05],
)
voting = training(voting)

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "c:\Users\Cark C3 PVT\AppData\Local\Programs\Python\Python312\Lib\site-packages\joblib\externals\loky\process_executor.py", line 426, in _process_worker
    call_item = call_queue.get(block=True, timeout=timeout)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Cark C3 PVT\AppData\Local\Programs\Python\Python312\Lib\multiprocessing\queues.py", line 122, in get
    return _ForkingPickler.loads(res)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
ModuleNotFoundError: No module named 'numpy._core.numeric'
"""

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "c:\Users\Cark C3 PVT\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Cark C3 PVT\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Cark C3 PVT\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\validation.py", line 66, in inner_f
    return f(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^
  File "c:\Users\Cark C3 PVT\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\ensemble\_voting.py", line 423, in fit
    return super().fit(X, transformed_y, **fit_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Cark C3 PVT\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\ensemble\_voting.py", line 104, in fit
    self.estimators_ = Parallel(n_jobs=self.n_jobs)(
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Cark C3 PVT\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\parallel.py", line 74, in __call__
    return super().__call__(iterable_with_config)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Cark C3 PVT\AppData\Local\Programs\Python\Python312\Lib\site-packages\joblib\parallel.py", line 1918, in __call__
    return output if self.return_generator else list(output)
                                                ^^^^^^^^^^^^
  File "c:\Users\Cark C3 PVT\AppData\Local\Programs\Python\Python312\Lib\site-packages\joblib\parallel.py", line 1847, in _get_sequential_output
    res = func(*args, **kwargs)
          ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Cark C3 PVT\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\parallel.py", line 136, in __call__
    return self.function(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Cark C3 PVT\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\ensemble\_base.py", line 40, in _fit_single_estimator
    estimator.fit(X, y, **fit_params)
  File "c:\Users\Cark C3 PVT\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Cark C3 PVT\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\pipeline.py", line 469, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Cark C3 PVT\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\pipeline.py", line 406, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Cark C3 PVT\AppData\Local\Programs\Python\Python312\Lib\site-packages\joblib\memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Cark C3 PVT\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\pipeline.py", line 1310, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Cark C3 PVT\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\_set_output.py", line 316, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Cark C3 PVT\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Cark C3 PVT\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\compose\_column_transformer.py", line 976, in fit_transform
    result = self._call_func_on_transformers(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Cark C3 PVT\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\compose\_column_transformer.py", line 885, in _call_func_on_transformers
    return Parallel(n_jobs=self.n_jobs)(jobs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Cark C3 PVT\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\parallel.py", line 74, in __call__
    return super().__call__(iterable_with_config)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Cark C3 PVT\AppData\Local\Programs\Python\Python312\Lib\site-packages\joblib\parallel.py", line 2007, in __call__
    return output if self.return_generator else list(output)
                                                ^^^^^^^^^^^^
  File "c:\Users\Cark C3 PVT\AppData\Local\Programs\Python\Python312\Lib\site-packages\joblib\parallel.py", line 1650, in _get_outputs
    yield from self._retrieve()
  File "c:\Users\Cark C3 PVT\AppData\Local\Programs\Python\Python312\Lib\site-packages\joblib\parallel.py", line 1754, in _retrieve
    self._raise_error_fast()
  File "c:\Users\Cark C3 PVT\AppData\Local\Programs\Python\Python312\Lib\site-packages\joblib\parallel.py", line 1789, in _raise_error_fast
    error_job.get_result(self.timeout)
  File "c:\Users\Cark C3 PVT\AppData\Local\Programs\Python\Python312\Lib\site-packages\joblib\parallel.py", line 745, in get_result
    return self._return_or_raise()
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Cark C3 PVT\AppData\Local\Programs\Python\Python312\Lib\site-packages\joblib\parallel.py", line 763, in _return_or_raise
    raise self._result
joblib.externals.loky.process_executor.BrokenProcessPool: A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.


In [None]:
# Model used for the submission.
best_model = voting

# Probability of the positive class (second predict_proba column) per test row.
y_score = best_model.predict_proba(X_test)[:, 1]

# Submission file: one row per test Id with its predicted popularity score.
df_pred = pd.DataFrame(data={'Id': df_test['Id'], 'Popularity': y_score})
df_pred.to_csv(path_or_buf='test_pred.csv', index=False)