# Let's find out how good your Medium article is!

I've done this kernel as part of the 6th assignment of [mlcourse.ai](http://mlcourse.ai). <br>
In this particular kernel I use no hacks (such as the all-zeros submission trick) and got an MAE of ~1.76 on the leaderboard. <br>
The very same submission with the all-zeros hack applied scores around 1.46.

Importing all the necessary modules: 

In [None]:
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from nltk import (PorterStemmer, WordNetLemmatizer)
from sklearn.feature_extraction.text import (CountVectorizer,
                                             TfidfVectorizer)
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import (RidgeCV, Ridge)
from sklearn.model_selection import (GridSearchCV, 
                                     cross_val_score,
                                     train_test_split)
from sklearn.preprocessing import StandardScaler
from scipy.sparse import (csr_matrix, hstack)

Just a bunch of constants for easier access to the files in the kernel:

In [None]:
# Directory that holds the input files; modify this if you need to.
PATH_TO_DATA = '../input'

# Names of the individual input files.
TARGET_FILE = 'train_log1p_recommends.csv'
TRAIN_FILE = 'train.json'
TEST_FILE = 'test.json'

# Full paths: the directory joined with each of the file names above.
TARGET_PATH, TRAIN_PATH, TEST_PATH = (
    os.path.join(PATH_TO_DATA, file_name)
    for file_name in (TARGET_FILE, TRAIN_FILE, TEST_FILE)
)

Some functions and parts of code are actually reused from [this baseline](http://www.kaggle.com/kashnitsky/ridge-countvectorizer-baseline)

The following code will help to throw away all HTML tags from an article content.

In [None]:
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    """HTML parser that throws the markup away and keeps only the text."""

    def __init__(self):
        # Let HTMLParser initialise its own state instead of re-creating it by
        # hand; convert_charrefs=True makes the parser hand character
        # references (e.g. ``&amp;``) to handle_data already decoded.
        super().__init__(convert_charrefs=True)
        # Text chunks in the order the parser reported them.
        self.fed = []

    def handle_data(self, d):
        """Store one chunk of text found outside of tags."""
        self.fed.append(d)

    def get_data(self):
        """Return everything collected so far as a single string."""
        return ''.join(self.fed)


def strip_tags(html):
    """Return the text of ``html`` with all HTML tags removed.

    Args:
        html: markup to clean, as a string.

    Returns:
        The concatenated text content, with character references decoded.
    """
    s = MLStripper()
    s.feed(html)
    # With convert_charrefs enabled the parser holds back trailing text that
    # contains an unterminated '&' (it could be a charref cut in half) until
    # it is told that the input is complete. Without close() such a tail,
    # e.g. "... AT&T", would be silently lost.
    s.close()
    return s.get_data()

Supplementary function to read a JSON line without crashing on escape characters.

In [None]:
def read_json_line(line=None):
    """Parse one JSON document, blanking out characters that break the parser.

    Whenever ``json.loads`` rejects the text, the character at the reported
    error position is replaced with a space and parsing is retried, until the
    text parses or can no longer be repaired.

    Args:
        line: the JSON text to parse.

    Returns:
        The decoded Python object.

    Raises:
        json.JSONDecodeError: if the error is reported at/after the end of the
            text, or at a character that already is a space (another
            replacement could not change the outcome).
    """
    while True:
        try:
            return json.loads(line)
        except json.JSONDecodeError as err:
            # The decoder reports the offending index directly; there is no
            # need to parse it out of the error message.
            bad_idx = err.pos
            if bad_idx >= len(line) or line[bad_idx] == ' ':
                raise
            # Remove the offending character and try again.
            line = line[:bad_idx] + ' ' + line[bad_idx + 1:]

And a bunch of functions for feature creation and extraction from the .json files:

In [None]:
def get_authors(path_to_file):
    """Read the author handle of every record in a JSON-lines file.

    The handle is the part of ``record['author']['url']`` that follows the
    first ``'@'``.

    Args:
        path_to_file: path of a UTF-8 file with one JSON document per line.

    Returns:
        A list of handles, one per line, in file order.
    """
    with open(path_to_file, encoding='utf-8') as json_lines:
        records = (read_json_line(raw_line) for raw_line in json_lines)
        return [record['author']['url'].split('@')[1] for record in records]

In [None]:
def extract_published_date(path_to_file):
    """Read the publication timestamp of every record in a JSON-lines file.

    Args:
        path_to_file: path of a UTF-8 file with one JSON document per line.

    Returns:
        A DataFrame with a single ``'date'`` column holding the values of
        ``record['published']['$date']`` converted with ``pd.to_datetime``,
        one row per line, in file order.
    """
    with open(path_to_file, encoding='utf-8') as json_lines:
        raw_dates = [read_json_line(raw_line)['published']['$date']
                     for raw_line in json_lines]
    frame = pd.DataFrame(raw_dates, columns=['date'])
    return frame.assign(date=pd.to_datetime(frame['date']))

In [None]:
def add_time_features(df, X_sparse):
    """Append time-of-day and day-of-week indicator columns to a matrix.

    Ten 0/1 columns are added to the right of ``X_sparse``, in this order:
    morning (hours 7-11), day (12-18), evening (19-23), night (0-6),
    Monday, Tuesday, Wednesday, Thursday, Friday, weekend.

    Args:
        df: DataFrame whose ``'date'`` column holds timestamps.
        X_sparse: matrix with one row per row of ``df``.

    Returns:
        The horizontally stacked result in CSR format.
    """
    hour = df['date'].apply(lambda ts: ts.hour)
    weekday = df['date'].apply(lambda ts: ts.weekday())

    # Parts of the day; both bounds of every range are inclusive.
    day_part_bounds = ((7, 11), (12, 18), (19, 23), (0, 6))
    indicators = [hour.between(low, high) for low, high in day_part_bounds]
    # One flag per working day (Monday is 0), then a single weekend flag.
    indicators.extend(weekday == day_idx for day_idx in range(5))
    indicators.append(weekday >= 5)

    # hstack needs 2-D blocks, so every flag becomes an (n, 1) int column.
    columns = [flag.astype('int').values.reshape(-1, 1) for flag in indicators]
    return hstack([X_sparse, *columns]).tocsr()

In [None]:
def get_contents(path_to_file):
    """Read the ``'content'`` field of every record in a JSON-lines file.

    Args:
        path_to_file: path of a UTF-8 file with one JSON document per line.

    Returns:
        A list with one content value per line, in file order.
    """
    with open(path_to_file, encoding='utf-8') as json_lines:
        return [read_json_line(raw_line)['content'] for raw_line in json_lines]

In [None]:
def get_content_features(contents):
    """Derive markup counts and length buckets from raw article HTML.

    Args:
        contents: iterable of HTML strings, one per article.

    Returns:
        A ``(counts, length_types)`` tuple of 2-D arrays with one row per
        article:

        * ``counts`` - occurrences of ``<h1``, ``<h2``, ``<h3``, ``<img`` and
          ``href=`` in the raw HTML (five columns, in that order);
        * ``length_types`` - one-hot bucket of the number of
          whitespace-separated words left after stripping the tags:
          short (< 1350), medium (< 2700), long (< 6750), huge (>= 6750).
    """
    # Substrings counted in the raw markup, in output column order. Links are
    # counted through their ``href=`` attribute: ``<href`` is not an HTML
    # tag, so searching for it never matches real links.
    markers = ('<h1', '<h2', '<h3', '<img', 'href=')

    marker_counts = list()
    content_lengths = list()
    for content in contents:
        marker_counts.append([content.count(marker) for marker in markers])
        # Length is measured in words of the visible text, not of the markup.
        content_lengths.append(len(strip_tags(content).split()))

    # reshape keeps the result 2-D (0 x 5) even when there are no articles.
    counts = np.array(marker_counts).reshape(-1, len(markers))

    content_lengths = np.array(content_lengths)
    is_short = (content_lengths < 1350).astype('int')
    is_medium = ((content_lengths >= 1350) & (content_lengths < 2700)).astype('int')
    is_long = ((content_lengths >= 2700) & (content_lengths < 6750)).astype('int')
    is_huge = (content_lengths >= 6750).astype('int')

    length_types = np.hstack([is_short.reshape(-1, 1),
                              is_medium.reshape(-1, 1),
                              is_long.reshape(-1, 1),
                              is_huge.reshape(-1, 1)])

    return counts, length_types

In [None]:
def get_titles(path_to_file):
    """Read the ``'title'`` field of every record in a JSON-lines file.

    Args:
        path_to_file: path of a UTF-8 file with one JSON document per line.

    Returns:
        A list with one title per line, in file order.
    """
    with open(path_to_file, encoding='utf-8') as json_lines:
        return [read_json_line(raw_line)['title'] for raw_line in json_lines]

In [None]:
def get_title_features(titles):
    """One-hot encode titles by their length in whitespace-separated words.

    Args:
        titles: iterable of title strings.

    Returns:
        An integer array with one row per title and four 0/1 columns:
        short (< 6 words), medium (6-10), long (11-19), huge (>= 20).
    """
    word_counts = np.array([len(title.split()) for title in titles])
    # Bucket index per title: 0 for < 6, 1 for [6, 11), 2 for [11, 20),
    # 3 for >= 20 words.
    bucket_ids = np.digitize(word_counts, bins=[6, 11, 20])
    # Comparing a column of bucket ids with 0..3 yields the one-hot rows.
    return (bucket_ids.reshape(-1, 1) == np.arange(4)).astype('int')

In [None]:
class StemmingLemmatizingTokenizer(object):
    """Callable tokenizer that lemmatizes verbs and then stems every token.

    Calling an instance with a document returns a list of tokens: punctuation
    signs, digits and a few extra Unicode characters are turned into spaces,
    the text is split with ``word_tokenize``, tokens shorter than three
    characters are dropped, and the remaining ones are lemmatized as verbs
    and stemmed.

    Args:
        stemmer: zero-argument factory of an object with a ``stem(token)``
            method.
        lemmatizer: zero-argument factory of an object with a
            ``lemmatize(token, pos=...)`` method.
    """

    def __init__(self, stemmer=PorterStemmer, lemmatizer=WordNetLemmatizer):
        # strings of punctuation signs and digits
        from string import punctuation, digits

        self.stemmer = stemmer()
        self.lemmatizer = lemmatizer()

        # Some other unicode chars found in the content: curly single and
        # double quotes plus the zero-width space. The left single quote is
        # listed explicitly; the literal used to contain the right single
        # quote twice, which left the opening quote glued to its token.
        other_unicode_chars = '‘’”“\u200b'
        chars_to_remove = ''.join((punctuation,
                                   digits,
                                   other_unicode_chars))
        # The table is the same for every document, so it is built once per
        # tokenizer instead of once per call.
        self._transtab = str.maketrans(chars_to_remove,
                                       ' ' * len(chars_to_remove))

    def __call__(self, doc):
        """Return the list of normalized tokens of ``doc``."""
        # getting rid of punctuation signs and digits
        cleaned = doc.translate(self._transtab)
        # going through all tokens with 3 or more chars,
        # lemmatizing the verbs first, then stemming all words
        return [self.stemmer.stem(self.lemmatizer.lemmatize(token, pos='v'))
                for token in word_tokenize(cleaned)
                if len(token) >= 3]

In [None]:
from sklearn.feature_extraction import text 
# The vectorizers compare stop words against tokens produced by the custom
# tokenizer, so the built-in English list is passed through that very same
# tokenizer first. Words for which the tokenizer returns nothing contribute
# nothing to the result.
_stop_word_tokenizer = StemmingLemmatizingTokenizer()
stop_words = [token
              for word in text.ENGLISH_STOP_WORDS
              for token in _stop_word_tokenizer(word)]

We initialize the vectorizers and the scaler up front, because the test-set features must be vectorized and scaled with the parameters fitted on the train set.

In [None]:
def _make_tfidf_vectorizer(ngram_range):
    """Build a TF-IDF vectorizer; only the n-gram range differs between uses."""
    return TfidfVectorizer(ngram_range=ngram_range,
                           tokenizer=StemmingLemmatizingTokenizer(),
                           stop_words=stop_words,
                           max_features=200000)


# Bag of author handles.
author_vectorizer = CountVectorizer()
# Scaler for the markup counts.
counts_scaler = StandardScaler()
# Article bodies: unigrams and bigrams.
content_vectorizer = _make_tfidf_vectorizer(ngram_range=(1, 2))
# Titles: bigrams and trigrams.
title_vectorizer = _make_tfidf_vectorizer(ngram_range=(2, 3))

And let's create the train data set!

In [None]:
%%time
# Author handles of the training articles, encoded as token counts. The
# vectorizer is fitted here; the test set later only calls transform().
authors = get_authors(TRAIN_PATH)
author_sparse = author_vectorizer.fit_transform(authors)

In [None]:
%%time
# Publication timestamps turned into part-of-day / weekday indicator columns
# and appended to the right of the author matrix.
date_df = extract_published_date(TRAIN_PATH)
train_data = add_time_features(date_df, author_sparse)

In [None]:
%%time
# Article bodies exactly as stored in the file; tags are stripped later,
# only where plain text is needed.
raw_contents = get_contents(TRAIN_PATH)

In [None]:
%%time
# Markup counts and content-length buckets; only the counts are standardized.
# The scaler is fitted here and reused unchanged for the test set.
counts, length_types = get_content_features(raw_contents)
counts_scaled = counts_scaler.fit_transform(counts)

In [None]:
%%time
# Fit the content TF-IDF vectorizer on tag-free article text. A generator is
# passed, so the stripped copies are produced one at a time.
content_sparse = content_vectorizer.fit_transform((strip_tags(content) 
                                                   for content in raw_contents))

In [None]:
%%time
# Titles of the training articles and their one-hot length buckets.
titles = get_titles(TRAIN_PATH)
title_length_types = get_title_features(titles)

In [None]:
%%time
# Fit the title TF-IDF vectorizer (bigrams and trigrams) on the raw titles.
title_sparse = title_vectorizer.fit_transform(titles)

In [None]:
# Final training matrix: every feature group side by side. train_data already
# holds the author and time columns; the test matrix is assembled in the very
# same column order.
train_data = hstack([train_data,
                     title_sparse,
                     title_length_types,
                     content_sparse,
                     counts_scaled,
                     length_types]).tocsr()

Extracting the target, which is log(1 + number of recommends):

In [None]:
# Target table, indexed by the 'id' column of the CSV.
# NOTE(review): assumes its rows are in the same order as the lines of
# train.json - confirm.
train_target = pd.read_csv(TARGET_PATH, index_col='id')
# Keep only the values of the 'log_recommends' column as an array.
train_target = train_target['log_recommends'].values

Just checking how well the basic Ridge model does on a hold-out part of the train set.

In [None]:
%%time
# Ridge with default parameters as a quick sanity check.
ridge = Ridge()
# Hold out part of the training rows for validation; the split size is the
# train_test_split default and the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(train_data, train_target, random_state=17)
ridge.fit(X_train, y_train);
ridge_pred = ridge.predict(X_test)
# Overlay the distributions of true and predicted targets on the hold-out part.
plt.hist(y_test, bins=30, alpha=.5, color='red',
         label='true values', range=(0,10));
plt.hist(ridge_pred, bins=30, alpha=.5, color='green',
         label='predicted values', range=(0,10));
plt.legend();
valid_mae = mean_absolute_error(y_test, ridge_pred)
# MAE in the units of the target, followed by its expm1 transform.
print(valid_mae, np.expm1(valid_mae))

Now we train our model on the full train data.

In [None]:
%%time
# Candidate regularization strengths for the cross-validated search.
# NOTE(review): 0.5 sits out of order in an otherwise increasing grid;
# possibly 0.05 was intended - confirm.
alphas = (0.005, 0.01, 0.5, 0.1, 1)
# 5-fold cross-validation scored by (negated) mean absolute error.
# NOTE(review): gcv_mode presumably only matters for the leave-one-out mode
# (cv=None) and has no effect with cv=5 - confirm against the RidgeCV docs.
ridge = RidgeCV(alphas=alphas, cv=5, gcv_mode='auto',
                scoring='neg_mean_absolute_error')
# This rebinds `ridge`, replacing the hold-out model fitted above.
ridge.fit(train_data, train_target);
print('alpha: ', ridge.alpha_) # i'm just curious what it would be

And let's create test data finally.

In [None]:
%%time
# Same feature pipeline as for the train set, but every vectorizer and the
# scaler are only applied (transform), never re-fitted, so that the test
# columns line up with the training columns.
authors = get_authors(TEST_PATH)
author_sparse = author_vectorizer.transform(authors)

# Time-of-day / weekday indicators appended to the author matrix.
date_df = extract_published_date(TEST_PATH)
test_data = add_time_features(date_df, author_sparse)

# Markup counts (scaled with the statistics fitted on the train set) and
# content-length buckets.
raw_contents = get_contents(TEST_PATH)
counts, length_types = get_content_features(raw_contents)
counts_scaled = counts_scaler.transform(counts)

# TF-IDF of the tag-free article text.
stripped_contents = [strip_tags(content) for content in raw_contents]
content_sparse = content_vectorizer.transform(stripped_contents)

# Title length buckets and title TF-IDF.
titles = get_titles(TEST_PATH)
title_length_types = get_title_features(titles)
title_sparse = title_vectorizer.transform(titles)

# Feature groups stacked in the same order as in the training matrix.
test_data = hstack([test_data, 
                    title_sparse,
                    title_length_types,
                    content_sparse, 
                    counts_scaled,
                    length_types]).tocsr()

Predicting, writing to file and checking the public score!

In [None]:
# Predictions of the cross-validated Ridge model for the test matrix.
ridge_test_pred = ridge.predict(test_data)

In [None]:
def write_submission_file(prediction, filename,
                          path_to_sample=os.path.join(PATH_TO_DATA, 'sample_submission.csv')):
    """Write predictions to a CSV laid out like the sample submission.

    Args:
        prediction: values for the ``'log_recommends'`` column, one per row
            of the sample submission.
        filename: path of the CSV file to create.
        path_to_sample: path of the sample submission, which supplies the
            ``'id'`` index.
    """
    template = pd.read_csv(path_to_sample, index_col='id')
    template.assign(log_recommends=prediction).to_csv(filename)

In [None]:
# Save the test predictions; the sample submission found under PATH_TO_DATA
# (the function's default) provides the ids.
write_submission_file(prediction=ridge_test_pred, 
                      filename='ridgeCV_200k_stemming_lemmatizing.csv')

As I said earlier, we can do a little hack on this assignment. <br>
If we submit an all-zeros submission, the resulting MAE equals the mean of log recommends on the test set (the targets are non-negative). <br>
How to use that is entirely your decision!