In [1]:
import numpy as np
## for data
import pandas as pd
import collections
import json
import string 
## for plotting
import matplotlib.pyplot as plt
import seaborn as sns
## for text processing
import re
import nltk
from nltk.tokenize import word_tokenize
## for sentiment
from textblob import TextBlob
## for ner, pos
import spacy
# Shared spaCy pipeline used for NER and POS tagging further down.
nlp = spacy.load("en_core_web_lg")
# NLTK resources needed by the notebook: WordNet for the lemmatizer, the
# stopword list for stopword removal and the Punkt models for word_tokenize.
# Already-installed packages are left untouched by nltk.download.
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
## parameters searching
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
## rmse
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/annazhukovets/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
import os
import logging
# a function  to create and save logs in the log files
def log(path, file):
    """[Create a log file to record the experiment's logs]

    The root logger is configured for INFO-level console output and a
    file handler writing to ``path/file`` is attached to it. Calling the
    function again with the same target re-uses the existing handler, so
    re-running a notebook cell does not duplicate every log line.

    Arguments:
        path {string} -- path to the directory (created if it is missing)
        file {string} -- file name

    Returns:
        [obj] -- [logger that record logs]
    """

    # make sure the target directory exists; an empty path means the CWD
    if path:
        os.makedirs(path, exist_ok=True)

    # FileHandler stores an absolute path, so compare against the same form
    log_file = os.path.abspath(os.path.join(path, file))

    console_logging_format = "%(levelname)s %(message)s"
    file_logging_format = "%(levelname)s: %(asctime)s: %(message)s"

    # configure logger (no-op when the root logger already has handlers)
    logging.basicConfig(level=logging.INFO, format=console_logging_format)
    logger = logging.getLogger()

    # basicConfig may have been skipped, so make sure INFO records get through
    if logger.level > logging.INFO:
        logger.setLevel(logging.INFO)

    # attach the file handler only once per log file
    already_attached = any(
        isinstance(existing, logging.FileHandler)
        and existing.baseFilename == log_file
        for existing in logger.handlers
    )
    if not already_attached:
        # FileHandler opens the file in append mode and creates it if needed
        handler = logging.FileHandler(log_file)
        handler.setLevel(logging.INFO)
        handler.setFormatter(logging.Formatter(file_logging_format))
        logger.addHandler(handler)

    return logger

In [3]:
# Load the training split; the path is relative to the notebook's working directory.
train = pd.read_csv('comlit_data/train.csv')
# Bare expression so the notebook renders the frame as the cell output.
train

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845
...,...,...,...,...,...,...
2829,25ca8f498,https://sites.ehe.osu.edu/beyondpenguins/files...,CC BY-SA 3.0,When you think of dinosaurs and where they liv...,1.711390,0.646900
2830,2c26db523,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,So what is a solid? Solids are usually hard be...,0.189476,0.535648
2831,cd19e2350,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,The second state of matter we will discuss is ...,0.255209,0.483866
2832,15e2e9e7a,https://en.wikibooks.org/wiki/Geometry_for_Ele...,CC BY-SA 3.0,Solids are shapes that you can actually touch....,-0.215279,0.514128


In [4]:
# Separate the regression target from the remaining columns.
y = train['target']
X = train.drop(columns=['target'])

In [5]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import KBinsDiscretizer

class regressor_stratified_cv:
    """Repeated stratified K-fold splitter for a continuous target.

    The target is first binned into ``group_count`` ordinal groups with a
    ``KBinsDiscretizer``; the resulting bin labels are then handed to
    ``RepeatedStratifiedKFold`` as the stratification classes.

    Arguments:
        n_splits {int} -- number of folds per repetition
        n_repeats {int} -- number of repetitions
        group_count {int} -- number of bins the target is discretized into
        random_state {int} -- seed forwarded to RepeatedStratifiedKFold
        strategy {string} -- binning strategy forwarded to KBinsDiscretizer
    """

    def __init__(self, n_splits = 10, n_repeats = 2, group_count = 10,
                 random_state = 0, strategy = 'quantile'):
        self.group_count = group_count
        self.strategy = strategy
        self.cvkwargs = dict(n_splits = n_splits, n_repeats = n_repeats,
                             random_state = random_state)
        self.cv = RepeatedStratifiedKFold(**self.cvkwargs)
        self.discretizer = KBinsDiscretizer(n_bins = self.group_count, encode = 'ordinal',
                                            strategy = self.strategy)

    def split(self, X, y, groups = None):
        """Return the train/test index splits produced by the wrapped CV object.

        ``y`` may be a list, a NumPy array or a pandas Series. It is converted
        to a single-column array before binning, because ``y[:, None]`` is
        multi-dimensional indexing, which pandas no longer supports on a Series.
        """
        y_column = np.asarray(y).reshape(-1, 1)
        kgroups = self.discretizer.fit_transform(y_column)[:, 0]
        return self.cv.split(X, kgroups, groups)

    def get_n_splits(self, X = None, y = None, groups = None):
        """Return the number of splitting iterations of the wrapped CV object."""
        return self.cv.get_n_splits(X, y, groups)

In [6]:
# n_splits = 5
# n_repeats = 2
# group_count = 10
# cv = regressor_stratified_cv(n_splits = n_splits, n_repeats = n_repeats,
#                            group_count = group_count, random_state = 0, strategy = 'quantile')

# for train_index, test_index in cv.split(X, y):
#     X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#     y_train, y_test = y[train_index], y[test_index]

In [7]:
# from matplotlib import pyplot


# pyplot.hist(y_train, bins = 10, alpha=0.5, label='train')
# pyplot.hist(y_test, bins = 10, alpha=0.5, label='test');

In [8]:
from sklearn.base import BaseEstimator

# define the class FeatureGenerator
# to generate new features
class FeatureGenerator(BaseEstimator):
    """Generate hand-crafted text features from the ``excerpt`` column.

    ``transform`` returns a copy of the input frame with these columns added:

    * ``word_count``, ``char_count``, ``sentence_count`` -- counts based on
      splitting the excerpt on single spaces / full stops (``sentence_count``
      is the number of "."-separated pieces),
    * ``avg_word_length``, ``avg_sentence_lenght`` -- ratios of the counts above,
    * ``sentiment`` -- TextBlob polarity of the excerpt,
    * ``nlp_text`` -- the spaCy document produced by the module-level ``nlp``,
    * ``ner_tags`` / ``pos_tags`` -- lists of ``{(text, tag): count}`` dicts,
    * ``ner_tags_<LABEL>`` / ``pos_tags_<TAG>`` -- occurrences of each tag.
    """

    # entity labels that each get a "ner_tags_<LABEL>" column
    NER_TAGS = ('QUANTITY', 'MONEY', 'GPE',
                'NORP', 'CARDINAL', 'LOC',
                'ORDINAL', 'PRODUCT', 'FAC',
                'LANGUAGE', 'TIME', 'LAW',
                'EVENT', 'ORG', 'PERCENT',
                'WORK_OF_ART', 'PERSON', 'DATE')

    # fine-grained POS tags that each get a "pos_tags_<TAG>" column
    POS_TAGS = ('CC', 'POS', 'WDT', 'VBP', 'FW', ':', 'PRP$',
                'WRB', 'PRP', 'RP', 'RBS', 'NNP', 'CD', 'EX', 'PDT',
                'VBN', 'WP$', 'JJ', 'SYM', 'VBG', 'VB', 'JJS', 'VBD',
                'WP', ',', 'NNS', 'NN', 'VBZ', 'MD', 'RB', 'DT',
                'JJR', 'UH', 'NNPS', 'TO', 'RBR')

    def __init__(self):
        pass

    def fit(self, x_dataset, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    @staticmethod
    def _count_items(items):
        """Count the elements of a list.

        Returns a list of single-entry dicts ``{item: count}``, most frequent
        first; ties keep their first-seen order.
        """
        return [{item: count}
                for item, count in collections.Counter(items).most_common()]

    @staticmethod
    def _tag_total(counted_pairs, tag):
        """Sum the counts of all ``(text, label)`` pairs whose label equals ``tag``."""
        return sum(count
                   for entry in counted_pairs
                   for (_, label), count in entry.items()
                   if label == tag)

    def transform(self, x_dataset):
        """Return a copy of ``x_dataset`` extended with the generated features.

        Arguments:
            x_dataset {DataFrame} -- frame with a text column named "excerpt"

        Returns:
            [DataFrame] -- [copy of the input with the feature columns added]
        """
        # work on a copy so the caller's frame is not modified as a side effect
        x_dataset = x_dataset.copy()

        # num of words in excerpt
        x_dataset['word_count'] = x_dataset["excerpt"].apply(lambda x: len(str(x).split(" ")))
        # num of chars in excerpt
        x_dataset['char_count'] = x_dataset["excerpt"].apply(lambda x: sum(len(word) for word in str(x).split(" ")))
        # num of sentences in excerpt
        x_dataset['sentence_count'] = x_dataset["excerpt"].apply(lambda x: len(str(x).split(".")))
        # avg word len in excerpt
        x_dataset['avg_word_length'] = x_dataset['char_count'] / x_dataset['word_count']
        # avg sentence len in excerpt
        x_dataset['avg_sentence_lenght'] = x_dataset['word_count'] / x_dataset['sentence_count']
        # sentiment index of excerpt
        x_dataset["sentiment"] = x_dataset["excerpt"].apply(lambda x: TextBlob(x).sentiment.polarity)
        # preprocessing for nlp; the module-level spaCy pipeline is re-used
        # instead of loading a second copy of the model on every call
        x_dataset["nlp_text"] = x_dataset["excerpt"].apply(lambda x: nlp(x))

        # ner tag text, extract the (text, label) pairs and count them
        x_dataset["ner_tags"] = x_dataset["nlp_text"].apply(
            lambda doc: self._count_items([(ent.text, ent.label_) for ent in doc.ents]))
        # extract ner features
        for feature in self.NER_TAGS:
            x_dataset["ner_tags_" + feature] = x_dataset["ner_tags"].apply(
                lambda counted, tag=feature: self._tag_total(counted, tag))

        # pos tag text, extract the (text, tag) pairs and count them
        x_dataset["pos_tags"] = x_dataset["nlp_text"].apply(
            lambda doc: self._count_items([(token.text, token.tag_) for token in doc]))
        # extract pos features
        for feature in self.POS_TAGS:
            x_dataset["pos_tags_" + feature] = x_dataset["pos_tags"].apply(
                lambda counted, tag=feature: self._tag_total(counted, tag))

        return x_dataset

In [9]:
# define the class IDFVectorizer
# to generate new feature with mean of idf
class IDFVectorizer(BaseEstimator):
    """Add the mean inverse document frequency of each excerpt as a feature.

    ``fit`` learns the document frequencies from the frame it is given and
    ``transform`` re-uses them, so train and test data are scored against the
    same statistics. If ``transform`` is called on an instance that was never
    fitted, the statistics are computed from the frame being transformed.

    ``transform`` returns a copy of the input with two columns added:

    * ``excerpt_proc`` -- list of lower-cased, punctuation-free, stopword-free,
      lemmatized tokens,
    * ``idf_vec`` -- mean of ``log(N / (df + 1))`` over the distinct tokens
      (NaN when an excerpt has no tokens left).
    """

    def __init__(self):
        pass

    @staticmethod
    def _tokenize(excerpts):
        """Turn a Series of raw excerpts into a Series of token lists."""
        from nltk.corpus import stopwords
        from nltk.stem import WordNetLemmatizer

        stop_words = set(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        strip_punctuation = str.maketrans('', '', string.punctuation)

        def to_tokens(text):
            # removal of punctuation
            text = text.translate(strip_punctuation)
            # removal of stopwords
            kept = [word for word in text.split() if word not in stop_words]
            # lemmatization
            lemmas = " ".join(lemmatizer.lemmatize(word) for word in kept)
            # tokenization
            return list(word_tokenize(lemmas))

        # lower casing happens first so the stopword list matches
        return excerpts.str.lower().apply(to_tokens)

    @staticmethod
    def _document_frequencies(token_lists):
        """Count, for every token, the number of documents that contain it."""
        doc_freq = collections.Counter()
        for tokens in token_lists:
            doc_freq.update(set(tokens))
        return doc_freq

    def fit(self, x_dataset, y=None):
        """Learn the corpus size and document frequencies from ``x_dataset``."""
        token_lists = self._tokenize(x_dataset["excerpt"])
        self.n_docs_ = len(token_lists)
        self.doc_freq_ = self._document_frequencies(token_lists)
        return self

    def transform(self, x_dataset):
        """Return a copy of ``x_dataset`` with ``excerpt_proc`` and ``idf_vec`` added.

        Arguments:
            x_dataset {DataFrame} -- frame with a text column named "excerpt"

        Returns:
            [DataFrame] -- [copy of the input with the two new columns]
        """
        # work on a copy so the caller's frame is not modified as a side effect
        x_dataset = x_dataset.copy()

        token_lists = self._tokenize(x_dataset["excerpt"])
        x_dataset["excerpt_proc"] = token_lists

        if hasattr(self, "doc_freq_"):
            n_docs, doc_freq = self.n_docs_, self.doc_freq_
        else:
            # not fitted: fall back to statistics of the frame being transformed
            n_docs = len(token_lists)
            doc_freq = self._document_frequencies(token_lists)

        def mean_idf(tokens):
            vocabulary = sorted(set(tokens))
            if not vocabulary:
                # nothing left after cleaning; avoid averaging an empty list
                return np.nan
            # tokens unseen during fit get a document frequency of 0
            return np.mean([np.log(n_docs / (doc_freq.get(token, 0) + 1))
                            for token in vocabulary])

        x_dataset['idf_vec'] = token_lists.apply(mean_idf)

        return x_dataset

In [10]:
# Pre-processing step:
# drop the listed columns and pass every other column through unchanged
from sklearn.compose import ColumnTransformer
# Column-level clean-up: the listed columns are dropped and every other
# column is passed through untouched.
pre_process = ColumnTransformer(
    transformers=[
        ('drop_columns', 'drop', [
            'id',
            'url_legal',
            'license',
            'excerpt',
            'standard_error',
            'ner_tags',
            'pos_tags',
            'excerpt_proc',
            'nlp_text',
        ]),
    ],
    remainder='passthrough',
)

In [11]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
# Only the feature-generation step is active; the other steps are kept
# commented out for reference.
pipe = Pipeline(steps=[
    ('get_new_columns', FeatureGenerator()),
    # ('idf_vect', IDFVectorizer()),
    # ('pre_processing',pre_process),
    # ('random_forest', RandomForestRegressor(bootstrap = True, max_depth = 40,
    #                                        max_features = 'auto', min_samples_split = 10,
    #                                        min_samples_leaf = 4, n_estimators = 800))
])

# fit the pipeline with the training data
pipe.fit(X, y)

Pipeline(steps=[('get_new_columns', FeatureGenerator())])

In [12]:
# X is rebound to the feature-augmented frame returned by the pipeline.
# To start again from the raw columns, re-run the cell that builds X from `train`.
X = pipe.transform(X)

In [13]:
import dill as pickle

# # test
# with open('pickles/test.pk', 'wb') as output:
#     pickle.dump(test, output)

# Persist the generated features (X). `pickle` here is the `dill` alias
# imported in this cell; open() does not create the 'pickles/' directory,
# so it has to exist already.
with open('pickles/X.pk', 'wb') as output:
    pickle.dump(X, output)

# Persist the target (y) next to the features.
with open('pickles/y.pk', 'wb') as output:
    pickle.dump(y, output)
