# Import the necessary packages

In [1]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score
import re
from bs4 import BeautifulSoup
import pandas as pd
from nltk.stem.porter import PorterStemmer
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier,  AdaBoostClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import numpy as np
import scipy as sp
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from xgboost.sklearn import XGBClassifier
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# This function streams the dataset from a CSV file in chunks

In [2]:
def get_stream(path, size):
    """Lazily yield the CSV at ``path`` piece by piece.

    ``size`` is forwarded to ``pd.read_csv`` as ``chunksize``; every item
    produced by the resulting reader is yielded unchanged.
    """
    yield from pd.read_csv(path, chunksize=size)

# HTML Preprocessor
We extract several features from the original HTML of each page, including:
* Time
* Category
* Data Channel

# Convert time from a string to integers so that it can be used as a feature

In [3]:
def preprocessor_time(text):
    """Extract and normalise the text of every ``<time>`` tag in an HTML page.

    Parameters
    ----------
    text : str
        Raw HTML of one page.

    Returns
    -------
    str
        The concatenated ``<time>`` texts, lower-cased, with every run of
        non-word characters collapsed to a single space, followed by a space
        and any emoticons found in the text (with their '-' nose removed).
        When the page has no ``<time>`` tag the result is a single space.
    """
    time_tags = BeautifulSoup(text, 'html.parser').find_all('time')
    # Each tag's text is followed by one space, exactly as the original
    # incremental concatenation produced.
    text = "".join(tag.get_text() + " " for tag in time_tags)

    # Raw strings: '\)', '\(' and '\W' are regex escapes, not Python string
    # escapes, and non-raw literals trigger invalid-escape warnings.
    emoticon_pattern = r'(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    emoticons = re.findall(emoticon_pattern, text)
    text = re.sub(emoticon_pattern, '', text)

    # convert to lowercase and append all emoticons behind (with space in between)
    # replace('-','') removes nose of emoticons
    text = re.sub(r'[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-', '')
    return text

def generate_time_num_feature(raw_text):
    """Build numeric time features for every page in ``raw_text``.

    For each page the normalised ``<time>`` text (see ``preprocessor_time``)
    is split on spaces. The first four tokens are converted to integers
    (0 when a token is missing or not numeric), and a fifth value holds the
    day of week of the date built from the first three tokens (-1 when fewer
    than three tokens exist or the date cannot be parsed).
    # NOTE(review): presumably the four tokens are year, month, day and hour;
    # this depends on the timestamp format inside the pages - confirm.

    Parameters
    ----------
    raw_text : iterable of str
        Raw HTML pages.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_pages, 5)``; an empty input gives shape ``(0, 5)``
        so the result can still be concatenated along axis 1.
    """
    feature = []
    for r_t in raw_text:
        tokens = preprocessor_time(r_t).split(' ')

        row = []
        for token in tokens[:4]:
            try:
                row.append(int(token))
            except ValueError:
                # Token is not an integer literal.
                row.append(0)
        # Missing tokens count as 0, like unparsable ones.
        row.extend([0] * (4 - len(row)))

        if len(tokens) >= 3:
            try:
                day_of_week = pd.Timestamp(tokens[0] + '-' + tokens[1] + '-' + tokens[2]).dayofweek
            except Exception:
                # Best effort: any date that cannot be parsed maps to -1.
                # ``Exception`` (not a bare except) so Ctrl-C still interrupts
                # a long feature-extraction run.
                day_of_week = -1
        else:
            day_of_week = -1
        row.append(day_of_week)

        feature.append(np.array(row))
    # reshape keeps two dimensions even when ``raw_text`` is empty.
    return np.array(feature).reshape(-1, 5)

# This part computes, for each category tag, how often it occurs and the sum of the labels of the pages it occurs on

In [4]:
def get_frequency(text_, y):
    """Accumulate per-category statistics into module-level dictionaries.

    Every ``<a href>`` of every page is inspected. Relative links (no
    'http' in the href) that contain a ``category`` path segment contribute
    the segment that follows ``category`` as a tag. For each tag:

    * ``frequency[tag]`` counts its occurrences,
    * ``average_label[tag]`` holds the running *sum* of the labels of the
      pages it occurred on (despite the name it is never divided),
    * ``dict_[tag]`` stores an integer id assigned in order of first
      appearance, starting at 1 on every call.

    The three dictionaries must already exist as module-level globals.

    Parameters
    ----------
    text_ : iterable of str
        Raw HTML pages.
    y : iterable
        Label of each page, aligned with ``text_``.

    Returns
    -------
    None
    """
    next_id = 1
    for label, html in tqdm(zip(y, text_)):
        # NOTE(review): no parser is named here while preprocessor_time uses
        # 'html.parser'; bs4 then picks whichever parser is installed.
        # Left unchanged so the extracted tags stay the same - confirm.
        soup = BeautifulSoup(html)
        for link in soup.find_all('a'):
            href = link.get('href')
            if href is None or 'category' not in href or 'http' in href:
                continue
            parts = href.split('/')
            # Stop one short of the end: a trailing 'category' segment has no
            # following tag and used to raise IndexError.
            for idx, part in enumerate(parts[:-1]):
                if part != 'category':
                    continue
                tag = parts[idx + 1]
                if tag not in frequency:
                    dict_[tag] = next_id
                    frequency[tag] = 1
                    average_label[tag] = label
                    next_id += 1
                else:
                    frequency[tag] += 1
                    average_label[tag] += label


def preprocessor_category_frequency(text_):
    """Collect the label sums of the significant category tags of one page.

    Relative links (no 'http' in the href) containing a ``category`` path
    segment contribute the following segment as a tag. A tag is kept only
    when it is present in the module-level ``frequency`` dictionary and the
    absolute value of its entry in ``average_label`` exceeds 50.

    Parameters
    ----------
    text_ : str
        Raw HTML of one page.

    Returns
    -------
    tuple[list, int]
        The ``average_label`` values of the kept tags in document order, and
        the length of that list.
    """
    soup = BeautifulSoup(text_)
    values = []
    for link in soup.find_all('a'):
        href = link.get('href')
        if href is None or 'category' not in href or 'http' in href:
            continue
        parts = href.split('/')
        # Stop one short of the end: a trailing 'category' segment has no
        # following tag and used to raise IndexError.
        for idx, part in enumerate(parts[:-1]):
            if part != 'category':
                continue
            tag = parts[idx + 1]
            if tag in frequency and abs(average_label[tag]) > 50:
                values.append(average_label[tag])
    return values, len(values)


def generate_cate_num_feature(raw_text):
    """Build a fixed-width category feature matrix for the given pages.

    For each page the values returned by ``preprocessor_category_frequency``
    are truncated to the first five entries and right-padded with zeros to
    exactly five columns.

    Parameters
    ----------
    raw_text : iterable of str
        Raw HTML pages.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_pages, 5)``; an empty input gives shape ``(0, 5)``
        so the result can still be concatenated along axis 1.
    """
    n_slots = 5
    feature = []
    for r_t in tqdm(raw_text):
        values, _ = preprocessor_category_frequency(r_t)
        row = list(values[:n_slots])
        row.extend([0] * (n_slots - len(row)))
        feature.append(row)
    # reshape keeps two dimensions even when ``raw_text`` is empty.
    return np.array(feature).reshape(-1, n_slots)

# Extract the data channel

In [5]:
def preprocess_datachannel(text_):
    """Return the normalised ``data-channel`` attribute of a page's article.

    The first ``<article>`` element is looked up and its ``data-channel``
    attribute is read. When the element or the attribute is missing the
    literal string ``'None'`` is used instead.

    Parameters
    ----------
    text_ : str
        Raw HTML of one page.

    Returns
    -------
    str
        The channel, lower-cased, with every run of non-word characters
        collapsed to a single space, followed by a space and any emoticons
        found in the value (with their '-' nose removed).
    """
    article = BeautifulSoup(text_).find('article')
    # ``find`` returns None when the page has no <article>; treat that the
    # same as a missing attribute instead of raising AttributeError.
    text = article.get('data-channel') if article is not None else None
    if text is None:
        text = 'None'

    # Raw strings: '\)', '\(' and '\W' are regex escapes, not Python string
    # escapes, and non-raw literals trigger invalid-escape warnings.
    emoticon_pattern = r'(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    emoticons = re.findall(emoticon_pattern, text)
    text = re.sub(emoticon_pattern, '', text)
    # convert to lowercase and append all emoticons behind (with space in between)
    # replace('-','') removes nose of emoticons
    text = re.sub(r'[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-', '')
    return text

def get_datachannel_frequency(raw_text, y):
    """Accumulate per-data-channel statistics into module-level dictionaries.

    Each page's channel is obtained with ``preprocess_datachannel``. For
    every channel, ``frequency_data_channel`` counts its occurrences and
    ``data_channel_average`` holds the running sum of the labels seen with
    it. Both dictionaries are module-level globals and are updated in place;
    nothing is returned.
    """
    channels = [preprocess_datachannel(page) for page in tqdm(raw_text)]
    for channel, label in tqdm(zip(channels, y)):
        if channel in frequency_data_channel:
            frequency_data_channel[channel] += 1
            data_channel_average[channel] += label
            continue
        # First time this channel is seen.
        frequency_data_channel[channel] = 1
        data_channel_average[channel] = label


def generate_datachannel(raw_text):
    """Encode every page's data channel as a single numeric column.

    A channel present in the module-level ``frequency_data_channel`` is
    replaced by its entry in ``data_channel_average``; any other channel is
    encoded as -1. The result is a NumPy array with one row per page and
    exactly one column.
    """
    encoded = []
    for page in raw_text:
        channel = preprocess_datachannel(page)
        known = channel in frequency_data_channel
        encoded.append(data_channel_average[channel] if known else -1)
    column = np.array(encoded)
    return column.reshape(column.shape[0], 1)

# Model

In [6]:
class numerical_estimator():
    """XGBoost classifier over hand-crafted numeric features of HTML pages.

    The feature functions in ``self.pipes`` each turn the raw pages into a
    2-D array; the arrays are concatenated column-wise, standardised with a
    ``StandardScaler`` fitted during ``fit`` and passed to the classifier.

    After every call to ``fit``, ``predict`` or ``predict_proba``:

    * ``self.features`` holds the list of per-pipe arrays,
    * ``self.all_feature`` holds the scaled, concatenated matrix.
    """

    def __init__(self):
        # Feature extractors, applied in this order; each maps the raw pages
        # to a 2-D array with one row per page.
        self.pipes = [
            generate_time_num_feature,
            generate_cate_num_feature,
            generate_datachannel,
        ]
        self.estimator = XGBClassifier(n_estimators=5000, max_depth=5, gamma=6, objective='binary:logistic', eval_metric='auc')
        self.scaler = StandardScaler()

    def _build_features(self, X):
        """Run every pipe on ``X`` and return the unscaled, column-wise concatenation."""
        self.features = [pipe(X) for pipe in self.pipes]
        self.all_feature = np.concatenate(self.features, axis=1)
        return self.all_feature

    def fit(self, X, Y):
        """Fit the scaler and the classifier on pages ``X`` with labels ``Y``.

        Prints the shape of the feature matrix and returns ``self`` so calls
        can be chained.
        """
        raw_features = self._build_features(X)
        print(raw_features.shape)
        self.scaler.fit(raw_features)
        self.all_feature = self.scaler.transform(raw_features)
        self.estimator.fit(self.all_feature, Y)
        return self

    def predict(self, X):
        """Return the classifier's ``predict`` output for pages ``X``.

        Prints the shape of the scaled feature matrix.
        """
        self.all_feature = self.scaler.transform(self._build_features(X))
        print(self.all_feature.shape)
        return self.estimator.predict(self.all_feature)

    def predict_proba(self, X):
        """Return the classifier's ``predict_proba`` output for pages ``X``.

        Prints the shape of the scaled feature matrix.
        """
        self.all_feature = self.scaler.transform(self._build_features(X))
        print(self.all_feature.shape)
        return self.estimator.predict_proba(self.all_feature)

# Training Process

In [10]:
# Train on the first 22000-row chunk of ./train.csv.
stream = get_stream(path='./train.csv', size=22000)
# NOTE(review): `classes` is not referenced anywhere else in the visible code.
classes = np.array([-1, 1])
batch = next(stream)
X_train, y_train = batch['Page content'], batch['Popularity']

# Module-level lookup tables. get_frequency / get_datachannel_frequency fill
# them in place, and the feature functions used by numerical_estimator read
# them as globals, so they must exist before any of those are called.
average_label = {}
data_channel_average = {}
frequency = {}
frequency_data_channel = {}
dict_ = {}
get_frequency(X_train, y_train)
get_datachannel_frequency(X_train, y_train)

model = numerical_estimator()
model.fit(X_train, y_train)
# AUC on the same rows that were used both to fill the label-sum tables and
# to fit the model.
score = roc_auc_score(y_train, model.predict_proba(X_train)[:,1])

print('train: {}'.format(score))

def get_stream_from_x(path, size, specific_x):
    """Read the CSV at ``path`` in chunks of ``size`` rows, dropping the first ``specific_x`` chunks.

    The index of every dropped chunk is printed. The remaining chunks are
    concatenated with ``pd.concat`` and returned as one DataFrame.
    """
    kept = []
    reader = pd.read_csv(path, chunksize=size, iterator=True)
    for index, chunk in enumerate(reader):
        if index < specific_x:
            # Still inside the prefix to skip.
            print(index)
            continue
        kept.append(chunk)
    return pd.concat(kept)

# Validate on everything after the first 22000-row chunk of ./train.csv,
# i.e. the rows that were not used for fitting above.
batch = get_stream_from_x(path='./train.csv', size=22000, specific_x=1)
X_val, y = batch['Page content'], batch['Popularity']

# AUC of the positive-class probability (second predict_proba column).
score = roc_auc_score(y, model.predict_proba(X_val)[:,1])

print('validation: {}'.format(score))

22000it [01:05, 334.41it/s]
100%|██████████| 22000/22000 [01:04<00:00, 343.74it/s]
22000it [00:00, 2132729.81it/s]
100%|██████████| 22000/22000 [01:04<00:00, 339.69it/s]


(22000, 11)


100%|██████████| 22000/22000 [01:05<00:00, 338.41it/s]


(22000, 11)
train: 0.6352969482075104
0


100%|██████████| 5643/5643 [00:16<00:00, 346.75it/s]


(5643, 11)
validation: 0.6198783031281242


# Write output

In [80]:
# Predict the positive-class probability for every test page and write the
# submission file with the columns 'Id' and 'Popularity'.
df_test = pd.read_csv('./test.csv')
X_test = df_test['Page content']

y_pred = model.predict_proba(X_test)

# Build the frame in one step: DataFrame.append inside a loop copies the whole
# frame on every row and no longer exists in pandas >= 2.0.
df_test_pred = pd.DataFrame({
    'Id': df_test['Id'].astype('int').to_numpy(),
    'Popularity': y_pred[:, 1],
})

# Echo every prediction, as the original cell did. Dedicated loop names are
# used so the validation labels stored in `y` are not overwritten.
for test_id, popularity in zip(df_test_pred['Id'], df_test_pred['Popularity']):
    print(test_id, popularity)

df_test_pred.to_csv("./num_superXGB_ALL3.csv", index=False)

100%|███████████████████████████████████████████████████████████████████████████| 11847/11847 [00:42<00:00, 276.60it/s]


(11847, 11)
27643 0.40801543
27644 0.41718253
27645 0.43180516
27646 0.62999976
27647 0.4531927
27648 0.43458065
27649 0.61135745
27650 0.74415475
27651 0.756093
27652 0.5380286
27653 0.5018427
27654 0.7776527
27655 0.41276345
27656 0.5780408
27657 0.40772924
27658 0.5491435
27659 0.47923982
27660 0.65379894
27661 0.5263879
27662 0.47285333
27663 0.4943906
27664 0.58678424
27665 0.5826525
27666 0.45412096
27667 0.51041377
27668 0.49462277
27669 0.60558236
27670 0.52169454
27671 0.56834245
27672 0.47020808
27673 0.4100436
27674 0.5308885
27675 0.41167688
27676 0.5882036
27677 0.43989187
27678 0.42140934
27679 0.42425624
27680 0.52790344
27681 0.44078952
27682 0.4876263
27683 0.5053579
27684 0.52121615
27685 0.50240827
27686 0.42765012
27687 0.5004773
27688 0.43983555
27689 0.43458065
27690 0.40785745
27691 0.5570948
27692 0.40785912
27693 0.4918864
27694 0.6581338
27695 0.62833655
27696 0.5934689
27697 0.39144856
27698 0.5446564
27699 0.48511654
27700 0.44072276
27701 0.4372274
27702 0.