In [1]:
import numpy as np
## for data
import pandas as pd
import collections
import json
import string 
## for plotting
import matplotlib.pyplot as plt
import seaborn as sns
## for text processing
import re
import nltk
from nltk.tokenize import word_tokenize
## for sentiment
from textblob import TextBlob
## for ner, pos
import spacy
nlp = spacy.load("en_core_web_lg")
nltk.download('wordnet')
## parameters searching
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
## rmse
from sklearn.metrics import mean_squared_error
## pickle
import dill as pickle

from sklearn.base import BaseEstimator

from sklearn.ensemble import GradientBoostingRegressor

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/annazhukovets/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
import os
import logging
# a function  to create and save logs in the log files
def log(path, file):
    """[Create a log file to record the experiment's logs]
    
    Arguments:
        path {string} -- path to the directory
        file {string} -- file name
    
    Returns:
        [obj] -- [logger that record logs]
    """

    # check if the file exist
    log_file = os.path.join(path, file)

    if not os.path.isfile(log_file):
        open(log_file, "w+").close()

    console_logging_format = "%(levelname)s %(message)s"
    file_logging_format = "%(levelname)s: %(asctime)s: %(message)s"

    # configure logger
    logging.basicConfig(level=logging.INFO, format=console_logging_format)
    logger = logging.getLogger()
    
    # create a file handler for output file
    handler = logging.FileHandler(log_file)

    # set the logging level for log file
    handler.setLevel(logging.INFO)
    
    # create a logging format
    formatter = logging.Formatter(file_logging_format)
    handler.setFormatter(formatter)

    # add the handlers to the logger
    logger.addHandler(handler)

    return logger

In [3]:
# X
with open('pickles/X.pk', 'rb') as data:
    X = pickle.load(data)

# y
with open('pickles/y.pk', 'rb') as data:
    y = pickle.load(data)

In [4]:
logger = log(path="logs/", file="logs.csv")

# logger.info("Train GBR ('max_depth': 4, 'n_estimators': 150)")

In [5]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import KBinsDiscretizer

class regressor_stratified_cv:
    def __init__(self, n_splits = 10, n_repeats = 2, group_count = 10,
                 random_state = 0, strategy = 'quantile'):
        self.group_count = group_count
        self.strategy = strategy
        self.cvkwargs = dict(n_splits = n_splits, n_repeats = n_repeats, 
                             random_state = random_state)
        self.cv = RepeatedStratifiedKFold(**self.cvkwargs)
        self.discretizer = KBinsDiscretizer(n_bins = self.group_count, encode = 'ordinal',
                                            strategy = self.strategy)  
            
    def split(self, X, y, groups = None):
        kgroups=self.discretizer.fit_transform(y[:, None])[:, 0]
        return self.cv.split(X, kgroups, groups)
    
    def get_n_splits(self, X, y, groups = None):
        return self.cv.get_n_splits(X, y, groups)

In [6]:
# define the class IDFVectorizer
# to generate new feature with mean of idf
class IDFVectorizer(BaseEstimator):

    def __init__(self):
        pass

    def fit(self, x_dataset, y=None):
        return self

    def transform(self, x_dataset):
        
        # removal of punctuation
        PUNCT_TO_REMOVE = string.punctuation
        def remove_punctuation(text):
            return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))
        
        # removal of stopwords
        from nltk.corpus import stopwords
        ", ".join(stopwords.words('english'))
        STOPWORDS = set(stopwords.words('english'))
        def remove_stopwords(text):
            return " ".join([word for word in str(text).split() if word not in STOPWORDS])
        
        # lemmatization 
        from nltk.stem import WordNetLemmatizer
        lemmatizer = WordNetLemmatizer()
        def lemmatize_words(text):
            return " ".join([lemmatizer.lemmatize(word) for word in text.split()])
        
        # word frequency in docs
        def doc_freq(word):
            c = 0
            try:
                c = DF[word]
            except:
                pass
            return c
        
        # idf vector generation
        def mean_of_vector(tokens):
            idf_vec = []
            for token in np.unique(tokens):
                df = doc_freq(token)
                idf = np.log(N/(df + 1))
                try:
                    idf_vec.append(idf)
                except:
                    pass
            
            return np.mean(idf_vec)
        
        # lower casing
        x_dataset["excerpt_proc"] = x_dataset["excerpt"].str.lower()
        # removal of punctuation
        x_dataset["excerpt_proc"] = x_dataset["excerpt_proc"].apply(lambda text: remove_punctuation(text))
        # removal of stopwords
        x_dataset["excerpt_proc"] = x_dataset["excerpt_proc"].apply(lambda text: remove_stopwords(text))
        # lemmatization 
        x_dataset["excerpt_proc"] = x_dataset["excerpt_proc"].apply(lambda text: lemmatize_words(text))    
        # tokenizetion
        x_dataset["excerpt_proc"] = x_dataset["excerpt_proc"].apply(lambda x: [token for token in word_tokenize(x)])
        
        N = len(x_dataset["excerpt"])
        DF = {}
        for i in range(N):
            tokens = x_dataset["excerpt_proc"].iloc[i]
            for w in tokens:
                try:
                    DF[w].add(i)
                except:
                    DF[w] = {i}
            

        for i in DF:
            DF[i] = len(DF[i]) 

        x_dataset['idf_vec'] = x_dataset["excerpt_proc"].apply(lambda x:  mean_of_vector(x))
        
        return x_dataset

In [7]:
# pre-processsing step
# Drop the columns 
from sklearn.compose import ColumnTransformer
pre_process = ColumnTransformer(remainder='passthrough',
                                transformers=[('drop_columns', 'drop', ['id', 
                                                                        'url_legal', 
                                                                        'license',
                                                                        'excerpt',
                                                                        'standard_error',
                                                                        'ner_tags',
                                                                        'pos_tags',
                                                                        'excerpt_proc',
                                                                        'nlp_text'
                                                                       ])])

In [8]:
from sklearn.pipeline import Pipeline
pipe = Pipeline([('idf_vect', IDFVectorizer()),
                 ('pre_processing',pre_process),
                 ('gbr', GradientBoostingRegressor(max_depth = 2, n_estimators = 200))
                ])

In [9]:
n_splits = 5
n_repeats = 2
group_count = 10
cv = regressor_stratified_cv(n_splits = n_splits, n_repeats = n_repeats,
                           group_count = group_count, random_state = 0, strategy = 'quantile')


logger.info("Train GBR w/o params")
i = 0
for train_index, test_index in cv.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    pipe.fit(X_train, y_train)
    predict = pipe.predict(X_test)
    rmse = mean_squared_error(y_test, predict, squared=False)
    logger.info("The rmse for GBR iteration {}: {:.3f}".format(i, rmse))
    logger.info("-------------------------------")
    i += 1

INFO Train GBR w/o params
INFO The rmse for GBR iteration 0: 0.863
INFO -------------------------------
INFO The rmse for GBR iteration 1: 0.862
INFO -------------------------------
INFO The rmse for GBR iteration 2: 0.822
INFO -------------------------------
INFO The rmse for GBR iteration 3: 0.806
INFO -------------------------------
INFO The rmse for GBR iteration 4: 0.841
INFO -------------------------------
INFO The rmse for GBR iteration 5: 0.825
INFO -------------------------------
INFO The rmse for GBR iteration 6: 0.840
INFO -------------------------------
INFO The rmse for GBR iteration 7: 0.848
INFO -------------------------------
INFO The rmse for GBR iteration 8: 0.859
INFO -------------------------------
INFO The rmse for GBR iteration 9: 0.855
INFO -------------------------------


In [10]:
# n_estimators = [50, 100, 150, 200]
# max_depth = [2, 4, 6, 8]

In [11]:
# param_grid = {'gbr__n_estimators': n_estimators,
# 'gbr__max_depth': max_depth}

In [12]:
# grid = GridSearchCV(pipe, param_grid, scoring="neg_mean_absolute_error", n_jobs=-1, cv=3,verbose=1)

In [13]:
# grid.fit(X, y)

In [14]:
# print("Best parameters: {}".format(grid.best_params_))

In [15]:
# # get importance
# importance = pipe.steps[3][1].feature_importances_
# # summarize feature importance
# for i,v in enumerate(importance):
# 	print('Feature: %0d, Score: %.5f' % (i,v))
# # plot feature importance
# pyplot.bar([x for x in range(len(importance))], importance)
# pyplot.show()

In [16]:
# # n_estimators
# n_estimators = [int(x) for x in np.linspace(start = 200, stop = 1000, num = 5)]

# # max_features
# max_features = ['auto', 'sqrt']

# # max_depth
# max_depth = [int(x) for x in np.linspace(20, 100, num = 5)]
# max_depth.append(None)

# # min_samples_split
# min_samples_split = [2, 5, 10]

# # min_samples_leaf
# min_samples_leaf = [1, 2, 4]

# # bootstrap
# bootstrap = [True, False]

# # Create the random grid
# random_grid = {'rrandom_forest__n_estimators': n_estimators,
#                'random_forest__max_features': max_features,
#                'random_forest__max_depth': max_depth,
#                'random_forest__min_samples_split': min_samples_split,
#                'random_forest__min_samples_leaf': min_samples_leaf,
#                'random_forest__bootstrap': bootstrap}

# random_search = RandomizedSearchCV(estimator=pipe,
#                                    param_distributions=random_grid,
#                                    n_iter=5,
#                                    scoring='neg_root_mean_squared_error',
#                                    cv=3, 
#                                    verbose=1, 
#                                    random_state=8)