In [1]:
import numpy as np
import pandas as pd

import os
import json
from tqdm import tqdm_notebook
from html.parser import HTMLParser

from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# throwing away HTML tags from article content

class MLStripper(HTMLParser):
    """HTML parser that discards markup and keeps only the text between tags."""

    def __init__(self):
        # Let HTMLParser initialise all of its own state instead of calling
        # reset() by hand; convert_charrefs=True makes entities such as
        # "&amp;" arrive in handle_data already decoded.
        super().__init__(convert_charrefs=True)
        # Legacy flag kept only so existing readers of the attribute still work.
        self.strict = False
        # Text chunks in the order the parser reported them.
        self.fed = []

    def handle_data(self, d):
        """Store one chunk of text found outside of tags."""
        self.fed.append(d)

    def get_data(self):
        """Return everything collected so far as a single string."""
        return ''.join(self.fed)


def strip_tags(html):
    """Return the text of ``html`` with all tags removed.

    Args:
        html: markup as a string.

    Returns:
        The concatenated text content, with character references decoded.
    """
    s = MLStripper()
    s.feed(html)
    # With convert_charrefs enabled the parser may hold back trailing text
    # (e.g. text ending in a bare "&"); close() forces it to be flushed.
    s.close()
    return s.get_data()

In [3]:
### Read a JSON line without crashing on invalid escape characters

def read_json_line(line=None):
    """Parse one JSON document, blanking out characters the decoder rejects.

    Each time ``json.loads`` fails, the character at the reported error
    position is replaced by a space and parsing is retried, so a line with
    several offending characters is repaired one character at a time.

    Args:
        line: text holding a single JSON document.

    Returns:
        The decoded Python object.

    Raises:
        json.JSONDecodeError: if the error cannot be repaired by blanking a
            character (the position lies past the end of the text, or the
            character there is already a space).
    """
    while True:
        try:
            return json.loads(line)
        except json.JSONDecodeError as err:
            # Use the structured error position rather than parsing the message.
            bad_idx = err.pos
            # Blanking would not change the text here, so retrying could never
            # succeed; surface the decoder's own error instead of looping.
            if bad_idx >= len(line) or line[bad_idx] == ' ':
                raise
            line = line[:bad_idx] + ' ' + line[bad_idx + 1:]

In [4]:
# Top-level JSON keys that are each exported to their own CSV file.
features_to_extract = ['content', 'published', 'title', 'author']


def _feature_record(feat, data):
    """Convert the raw JSON value of one feature into a row (dict) for its table."""
    if feat == 'content':
        return {feat: strip_tags(data)}
    if feat == 'published':
        return {feat: pd.to_datetime(data['$date'])}
    if feat == 'author':
        # Presumably a mapping of author fields; its keys become the CSV
        # columns -- verify against the raw data.
        return data
    return {feat: data}


def extract_features_and_write(input_path, input_filename, output_path, is_train=True):
    """Split a JSON-lines file into one CSV file per extracted feature.

    Every non-blank line of the input is parsed with ``read_json_line``; for
    each name in ``features_to_extract`` one row is collected, and the rows
    are written to ``<output_path>/<prefix>_<feature>.csv`` where the prefix
    is ``train`` or ``test``.

    Args:
        input_path: directory containing the input file.
        input_filename: name of the JSON-lines file inside ``input_path``.
        output_path: directory for the CSV files (created if missing).
        is_train: selects the ``train`` (True) or ``test`` (False) prefix.

    Raises:
        KeyError: if a parsed line lacks one of the extracted features.
    """
    prefix = 'train' if is_train else 'test'

    # Rows are gathered in plain lists and turned into DataFrames once at the
    # end: growing a DataFrame row by row is quadratic, and DataFrame.append
    # no longer exists in pandas 2.x.
    records = {feat: [] for feat in features_to_extract}

    with open(os.path.join(input_path, input_filename), encoding='utf-8') as input_json_file:
        for line in tqdm_notebook(input_json_file):
            if not line.strip():
                # A blank line carries no document to parse.
                continue
            json_data = read_json_line(line)
            for feat in features_to_extract:
                records[feat].append(_feature_record(feat, json_data[feat]))

    if output_path:
        os.makedirs(output_path, exist_ok=True)

    for feat in features_to_extract:
        feat_filename = os.path.join(output_path, '{}_{}.csv'.format(prefix, feat))
        pd.DataFrame(records[feat]).to_csv(feat_filename, encoding='utf-8')
            

In [5]:
# Write the train_*.csv feature files from the raw training dump.
extract_features_and_write(
    './data/raw',
    'train.json',
    './data/processed',
    is_train=True,
)




In [6]:
# Write the test_*.csv feature files from the raw test dump.
extract_features_and_write(
    './data/raw',
    'test.json',
    './data/processed',
    is_train=False,
)




### Compute TF-IDF features for 'content' and save them

In [None]:
# Load the extracted article texts; the first CSV column is used as the row index.
df_train_content, df_test_content = (
    pd.read_csv(csv_path, encoding='utf-8', index_col=0)
    for csv_path in (
        './data/processed/train_content.csv',
        './data/processed/test_content.csv',
    )
)

In [None]:
# Word unigrams and bigrams, with the vocabulary capped at 100000 features.
content_tfidf = TfidfVectorizer(
    max_features=100000,
    ngram_range=(1, 2),
)

In [None]:
%%time

# The frames loaded from the CSV files are df_train_content / df_test_content;
# df_train / df_test are never defined in this notebook.
# Empty articles may come back from the CSV as NaN, which the vectorizer
# rejects, so they are replaced by empty strings first.
train_texts = df_train_content['content'].fillna('')
test_texts = df_test_content['content'].fillna('')

# Fit on the training texts only, then apply the same vocabulary to both splits.
content_tfidf.fit(train_texts)

content_train = content_tfidf.transform(train_texts)
content_test = content_tfidf.transform(test_texts)

In [None]:
# Persist both sparse TF-IDF matrices as .npz files.
sparse.save_npz(file='./data/processed/train_content_tfidf.npz', matrix=content_train)
sparse.save_npz(file='./data/processed/test_content_tfidf.npz', matrix=content_test)