In [11]:
import numpy as np
## for data
import pandas as pd
import collections
import json
import string 
## for plotting
import matplotlib.pyplot as plt
import seaborn as sns
## for text processing
import re
import nltk
from nltk.tokenize import word_tokenize
## for sentiment
from textblob import TextBlob
## for ner, pos
import spacy
nlp = spacy.load("en_core_web_lg")
nltk.download('wordnet')
## parameters searching
from sklearn.model_selection import GridSearchCV
## rmse
from sklearn.metrics import mean_squared_error
## pickle
import dill as pickle

from sklearn.base import BaseEstimator

from sklearn.svm import SVR

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
df = pd.read_csv('../input/commonlitreadabilityprize/train.csv')

In [13]:
X = df.drop(['target', 'standard_error'], axis=1)
# X = train[['excerpt']]
y = df['target']

In [14]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import KBinsDiscretizer

class regressor_stratified_cv:
    def __init__(self, n_splits = 10, n_repeats = 2, group_count = 10,
                 random_state = 0, strategy = 'quantile'):
        self.group_count = group_count
        self.strategy = strategy
        self.cvkwargs = dict(n_splits = n_splits, n_repeats = n_repeats, 
                             random_state = random_state)
        self.cv = RepeatedStratifiedKFold(**self.cvkwargs)
        self.discretizer = KBinsDiscretizer(n_bins = self.group_count, encode = 'ordinal',
                                            strategy = self.strategy)  
            
    def split(self, X, y, groups = None):
        kgroups=self.discretizer.fit_transform(y[:, None])[:, 0]
        return self.cv.split(X, kgroups, groups)
    
    def get_n_splits(self, X, y, groups = None):
        return self.cv.get_n_splits(X, y, groups)

In [15]:
# import the BaseEstimator
from sklearn.base import BaseEstimator

# define the class OutletTypeEncoder
# This will be our custom transformer that will create 3 new binary columns
# custom transformer must have methods fit and transform
class FeatureGenerator(BaseEstimator):

    def __init__(self):
        pass

    def fit(self, x_dataset, y=None):
        return self

    def transform(self, x_dataset):
        
        # utils function to count the element of a list
        def utils_lst_count(lst):
            dic_counter = collections.Counter()
            for x in lst:
                dic_counter[x] += 1
            dic_counter = collections.OrderedDict(
                       sorted(dic_counter.items(),
                       key=lambda x: x[1], reverse=True))
            lst_count = [ {key:value} for key,value in dic_counter.items() ]
            return lst_count
        
        # utils function create new column for each tag category
        def utils_new_features(lst_dics_tuples, tag):
            if len(lst_dics_tuples) > 0:
                tag_type = []
                for dic_tuples in lst_dics_tuples:
                    for tuple in dic_tuples:
                        type, n = tuple[1], dic_tuples[tuple]
                        tag_type = tag_type + [type]*n
                        dic_counter = collections.Counter()
                        for x in tag_type:
                            dic_counter[x] += 1
                return dic_counter[tag]
            else:
                return 0

            
        # num of words in excerpt
        x_dataset['word_count'] = x_dataset["excerpt"].apply(lambda x: len(str(x).split(" ")))
        # num of chars in excerpt
        x_dataset['char_count'] = x_dataset["excerpt"].apply(lambda x: sum(len(word) for word in str(x).split(" ")))
        # num of sentences in excerpt
        x_dataset['sentence_count'] = x_dataset["excerpt"].apply(lambda x: len(str(x).split(".")))
        # avg word len in excerpt
        x_dataset['avg_word_length'] = x_dataset['char_count'] / x_dataset['word_count']
        # avg sentence len in excerpt
        x_dataset['avg_sentence_lenght'] = x_dataset['word_count'] / x_dataset['sentence_count']
        # sentiment index of excerpt
        x_dataset["sentiment"] = x_dataset["excerpt"].apply(lambda x: TextBlob(x).sentiment.polarity)
        x_dataset["nlp_text"] = x_dataset["excerpt"].apply(lambda x: nlp(x) )
        # ner tag text and exctract tags into a list
        x_dataset["ner_tags"] = x_dataset["nlp_text"].apply(lambda x: [(tag.text, tag.label_) 
                                for tag in x.ents] )
        # count ner tags
        x_dataset["ner_tags"] = x_dataset["ner_tags"].apply(lambda x: utils_lst_count(x))
        # extract ner features
        ner_tags_set = ['QUANTITY', 'MONEY', 'GPE',
                    'NORP', 'CARDINAL', 'LOC',
                    'ORDINAL', 'PRODUCT', 'FAC',
                    'LANGUAGE', 'TIME', 'LAW',
                    'EVENT', 'ORG', 'PERCENT',
                    'WORK_OF_ART', 'PERSON', 'DATE']
        for feature in ner_tags_set:
            x_dataset["ner_tags_" + feature] = x_dataset["ner_tags"].apply(lambda x:
                                                                 utils_new_features(x, feature))
        
        # pos tag text and exctract tags into a list
        x_dataset["pos_tags"] = x_dataset["nlp_text"].apply(lambda x: [(token.text, token.tag_) 
                                for token in x] )
        # count pos tags
        x_dataset["pos_tags"] = x_dataset["pos_tags"].apply(lambda x: utils_lst_count(x))
        # extract pos features
        pos_tags_set = ['CC', 'POS', 'WDT', 'VBP', 'FW', ':', 'PRP$',
                    'WRB', 'PRP', 'RP', 'RBS', 'NNP', 'CD', 'EX', 'PDT',
                    'VBN', 'WP$', 'JJ', 'SYM', 'VBG', 'VB', 'JJS', 'VBD',
                    'WP', ',', 'NNS', 'NN', 'VBZ', 'MD', 'RB', 'DT',
                    'JJR', 'UH', 'NNPS', 'TO', 'RBR']
    
        for feature in pos_tags_set:
            x_dataset["pos_tags_" + feature] = x_dataset["pos_tags"].apply(lambda x:
                                                                 utils_new_features(x, feature))
    
        return x_dataset

In [16]:
# define the class IDFVectorizer
# to generate new feature with mean of idf
class IDFVectorizer(BaseEstimator):

    def __init__(self):
        pass

    def fit(self, x_dataset, y=None):
        return self

    def transform(self, x_dataset):
        
        # removal of punctuation
        PUNCT_TO_REMOVE = string.punctuation
        def remove_punctuation(text):
            return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))
        
        # removal of stopwords
        from nltk.corpus import stopwords
        ", ".join(stopwords.words('english'))
        STOPWORDS = set(stopwords.words('english'))
        def remove_stopwords(text):
            return " ".join([word for word in str(text).split() if word not in STOPWORDS])
        
        # lemmatization 
        from nltk.stem import WordNetLemmatizer
        lemmatizer = WordNetLemmatizer()
        def lemmatize_words(text):
            return " ".join([lemmatizer.lemmatize(word) for word in text.split()])
        
        # word frequency in docs
        def doc_freq(word):
            c = 0
            try:
                c = DF[word]
            except:
                pass
            return c
        
        # idf vector generation
        def mean_of_vector(tokens):
            idf_vec = []
            for token in np.unique(tokens):
                df = doc_freq(token)
                idf = np.log(N/(df + 1))
                try:
                    idf_vec.append(idf)
                except:
                    pass
            
            return np.mean(idf_vec)
        
        # lower casing
        x_dataset["excerpt_proc"] = x_dataset["excerpt"].str.lower()
        # removal of punctuation
        x_dataset["excerpt_proc"] = x_dataset["excerpt_proc"].apply(lambda text: remove_punctuation(text))
        # removal of stopwords
        x_dataset["excerpt_proc"] = x_dataset["excerpt_proc"].apply(lambda text: remove_stopwords(text))
        # lemmatization 
        x_dataset["excerpt_proc"] = x_dataset["excerpt_proc"].apply(lambda text: lemmatize_words(text))    
        # tokenizetion
        x_dataset["excerpt_proc"] = x_dataset["excerpt_proc"].apply(lambda x: [token for token in word_tokenize(x)])
        
        N = len(x_dataset["excerpt"])
        DF = {}
        for i in range(N):
            tokens = x_dataset["excerpt_proc"].iloc[i]
            for w in tokens:
                try:
                    DF[w].add(i)
                except:
                    DF[w] = {i}
            

        for i in DF:
            DF[i] = len(DF[i]) 

        x_dataset['idf_vec'] = x_dataset["excerpt_proc"].apply(lambda x:  mean_of_vector(x))
        
        return x_dataset

In [17]:
# pre-processsing step
# Drop the columns 
from sklearn.compose import ColumnTransformer
pre_process = ColumnTransformer(remainder='passthrough',
                                transformers=[('drop_columns', 'drop', ['id', 
                                                                        'url_legal', 
                                                                        'license',
                                                                        'excerpt',
                                                                        'ner_tags',
                                                                        'pos_tags',
                                                                        'excerpt_proc',
                                                                        'nlp_text'
                                                                       ])])

In [18]:
from sklearn.pipeline import Pipeline
pipe = Pipeline([ #('get_new_columns', FeatureGenerator()),
                  ('idf_vect', IDFVectorizer()),
                  ('pre_processing', pre_process),
                 ('svr', SVR(C = 0.01, kernel = 'linear'))
                ])

In [19]:
preproc_pipe = Pipeline([ ('get_new_columns', FeatureGenerator())
                ])

In [20]:
X_proc = preproc_pipe.fit_transform(X)

In [22]:
n_splits = 5
n_repeats = 2
group_count = 10
cv = regressor_stratified_cv(n_splits = n_splits, n_repeats = n_repeats,
                           group_count = group_count, random_state = 0, strategy = 'quantile')


# logger.info("Train SVR")
i = 0
for train_index, test_index in cv.split(X_proc, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    pipe.fit(X_train, y_train)
    predict = pipe.predict(X_test)
    rmse = mean_squared_error(y_test, predict, squared=False)
    print(rmse)
#     logger.info("The rmse for SVR iteration {}: {:.3f}".format(i, rmse))
#     logger.info("-------------------------------")
    i += 1

0.7700154100760551
0.7694876468184112
0.7394585485116418
0.7457136256167464
0.7500215929438292
0.7459458130134728
0.7402906664982553
0.7705820872962144
0.7675792852875439
0.7481124396495366


In [23]:
test = pd.read_csv('../input/commonlitreadabilityprize/test.csv')

In [24]:
test = preproc_pipe.transform(test)

In [25]:
predictions = pipe.predict(test)

In [26]:
submission = pd.DataFrame({'id':test['id'],'target':predictions})

In [27]:
submission.to_csv('submission.csv',index=False)

In [None]:
# C = [.01, 1]
# degree = [3, 4]
# gamma = ['scale']
# coef0 = [.01, 1]

# parameters = [
#   {'svr__C': C, 'svr__kernel':['linear']},
#   {'svr__C': C, 'svr__kernel':['poly'], 'svr__degree':degree, 'svr__gamma':gamma, 'svr__coef0': coef0},
#   {'svr__C': C, 'svr__kernel':['rbf'], 'svr__gamma':gamma},
#   {'svr__C': C, 'svr__kernel':['sigmoid'], 'svr__gamma':gamma, 'svr__coef0': coef0},
# ]

In [None]:
# grid = GridSearchCV(pipe, param_grid=parameters, cv=3, scoring="neg_mean_absolute_error", n_jobs=-1, verbose=1)

In [None]:
# grid.fit(X, y)

In [None]:
# print("Best parameters: {}".format(grid.best_params_))

In [None]:
# # get importance
# importance = pipe.steps[3][1].feature_importances_
# # summarize feature importance
# for i,v in enumerate(importance):
# 	print('Feature: %0d, Score: %.5f' % (i,v))
# # plot feature importance
# pyplot.bar([x for x in range(len(importance))], importance)
# pyplot.show()

In [None]:
# # n_estimators
# n_estimators = [int(x) for x in np.linspace(start = 200, stop = 1000, num = 5)]

# # max_features
# max_features = ['auto', 'sqrt']

# # max_depth
# max_depth = [int(x) for x in np.linspace(20, 100, num = 5)]
# max_depth.append(None)

# # min_samples_split
# min_samples_split = [2, 5, 10]

# # min_samples_leaf
# min_samples_leaf = [1, 2, 4]

# # bootstrap
# bootstrap = [True, False]

# # Create the random grid
# random_grid = {'rrandom_forest__n_estimators': n_estimators,
#                'random_forest__max_features': max_features,
#                'random_forest__max_depth': max_depth,
#                'random_forest__min_samples_split': min_samples_split,
#                'random_forest__min_samples_leaf': min_samples_leaf,
#                'random_forest__bootstrap': bootstrap}

# random_search = RandomizedSearchCV(estimator=pipe,
#                                    param_distributions=random_grid,
#                                    n_iter=5,
#                                    scoring='neg_root_mean_squared_error',
#                                    cv=3, 
#                                    verbose=1, 
#                                    random_state=8)