### Problema

Vamos a identificar el autor (Edgar Allan Poe, Mary Shelley, and HP Lovecraft) de un texto a partir de carateristicas derivadas del texto. 

In [2]:
import numpy as np 
import pandas as pd 
import re
from nltk.corpus import stopwords
from nltk import download
from typing import List
from sklearn.model_selection import train_test_split

df = pd.read_csv('data/train.csv')

df.dropna(axis=0)
df.set_index('id', inplace = True)

df.head()

Unnamed: 0_level_0,text,author
id,Unnamed: 1_level_1,Unnamed: 2_level_1
id26305,"This process, however, afforded me no means of...",EAP
id17569,It never once occurred to me that the fumbling...,HPL
id11008,"In his left hand was a gold snuff box, from wh...",EAP
id27763,How lovely is spring As we looked from Windsor...,MWS
id12958,"Finding nothing else, not even gold, the Super...",HPL


In [3]:
download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jose.osorio\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
### yet, upon, must, thing, man

In [5]:
stopWords = stopwords.words('english')
new_stop_words = ["yet", "upon", "must", "thing", "man"]
stopWords = stopWords + new_stop_words


In [6]:


def make_stopwords(custom_stopwords:List[str]) -> set:
    stopWords = stopwords.words('english')
    stopWords = set(stopWords + custom_stopwords)
    return stopWords
def procesamiento_base(df:pd.DataFrame, custom_stopwords:List[str]) -> pd.DataFrame:
    stopWords = make_stopwords(custom_stopwords)
    df['texto_procesado'] = df['text'].apply(lambda x: re.sub(r'[^\w\s]','', x.lower()))
    df['len_texto'] = df['texto_procesado'].apply(lambda x: len(x))
    df['palabras'] = df['texto_procesado'].apply(lambda x: len(x.split(' ')))
    df['palabras_sin_stopwords'] = df['texto_procesado'].apply(lambda x: len([t for t in x.split(' ') if t not in stopWords]))
    df['len_avg_palabra'] = df['texto_procesado'].apply(lambda x: np.mean([len(t) for t in x.split(' ') if t not in stopWords]) if len([len(t) for t in x.split(' ') if t not in stopWords]) > 0 else 0)
    df['num_comas'] = df['text'].apply(lambda x: x.count(','))

    return(df)
custom_stopwords = ["yet", "upon", "must", "thing", "man"]
df = procesamiento_base(df, custom_stopwords)

df.head()

Unnamed: 0_level_0,text,author,texto_procesado,len_texto,palabras,palabras_sin_stopwords,len_avg_palabra,num_comas
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
id26305,"This process, however, afforded me no means of...",EAP,this process however afforded me no means of a...,224,41,21,6.380952,4
id17569,It never once occurred to me that the fumbling...,HPL,it never once occurred to me that the fumbling...,70,14,6,6.166667,0
id11008,"In his left hand was a gold snuff box, from wh...",EAP,in his left hand was a gold snuff box from whi...,195,36,19,5.947368,4
id27763,How lovely is spring As we looked from Windsor...,MWS,how lovely is spring as we looked from windsor...,202,34,21,6.47619,3
id12958,"Finding nothing else, not even gold, the Super...",HPL,finding nothing else not even gold the superin...,170,27,16,7.1875,2


In [7]:
MODELING_COLS = [c for c in df.columns.values if c  not in ['id','author']]
MODELING_FEATURES= [c for c in df.columns.values if c  not in ['id','text','author','texto_procesado']]
target = 'author'


In [8]:
MODELING_COLS

['text',
 'texto_procesado',
 'len_texto',
 'palabras',
 'palabras_sin_stopwords',
 'len_avg_palabra',
 'num_comas']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(df[MODELING_COLS], df[target], test_size=0.33, random_state=42)
X_train.head()

Unnamed: 0_level_0,text,texto_procesado,len_texto,palabras,palabras_sin_stopwords,len_avg_palabra,num_comas
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
id19417,"This panorama is indeed glorious, and I should...",this panorama is indeed glorious and i should ...,91,18,6,6.666667,1
id09522,"There was a simple, natural earnestness about ...",there was a simple natural earnestness about h...,240,44,18,6.277778,4
id22732,"Who are you, pray, that I, Duc De L'Omelette, ...",who are you pray that i duc de lomelette princ...,387,74,38,5.552632,9
id10351,He had gone in the carriage to the nearest tow...,he had gone in the carriage to the nearest tow...,118,24,11,5.363636,0
id24580,"There is no method in their proceedings, beyon...",there is no method in their proceedings beyond...,71,13,5,7.0,1


In [10]:
from sklearn.base import BaseEstimator, TransformerMixin

class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns: list):
        if not isinstance(columns, list):
            self.columns = [columns]
        else:
            self.columns = columns

    def fit(self, X: pd.DataFrame, y: pd.Series = None):
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        X = X[self.columns]
        return X


class ClipTransformer(TransformerMixin, BaseEstimator):
    
    def __init__(self, q = None):
      self.q_1 = None
      self.q_2 = None
      self.q = q

    def fit(self, X, y=None):
        if self.q is None: 
          self.q_1, self.q_2  = X.quantile(q = [0.1, 0.9])
        else: 
          self.q_1, self.q_2  = X.quantile(q = self.q)
        return self

    def transform(self, X):
        Xclip = np.array(np.clip(X, self.q_1, self.q_2)).reshape(-1, 1)
        return Xclip

class SingleSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]
    

In [11]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from custom_pipeline_steps.custom_steps import SingleSelector, ClipTransformer, ColumnSelector

text = Pipeline([
                ('selector', SingleSelector(key='texto_procesado')),
                ('tfidf', TfidfVectorizer( stop_words='english', ngram_range=(1,2)))
            ])

numerics = Pipeline([
        ('selector', ColumnSelector(columns=['len_avg_palabra', 'num_comas'])),
        ('scaler', StandardScaler())])

num_len = Pipeline([
        ('selector', SingleSelector(key="len_texto")),
        ("clipper", ClipTransformer(q=[0.05, 0.95])),
        ('scaler', StandardScaler())
])

features = FeatureUnion([("text_tdfi", text), ("numeric", numerics), ("len_cli", num_len)])

features.fit_transform(X_train)

<13117x150268 sparse matrix of type '<class 'numpy.float64'>'
	with 325163 stored elements in Compressed Sparse Row format>

In [12]:
from sklearn.ensemble import RandomForestClassifier

pipeline = Pipeline([
    ('features',features),
    ('classifier', RandomForestClassifier(random_state = 42)),
])

pipeline.fit(X_train, y_train)

preds = pipeline.predict(X_test)
np.mean(preds == y_test)

0.6960693283813061

In [14]:
from joblib import dump, load
import joblib
with open("model/pipeline.pkl", "wb") as m:
    joblib.dump(pipeline, m)

In [15]:
with open("model/pipeline.pkl", "rb") as m:
    pipeline_pred = joblib.load(m)

In [16]:
pipeline_pred.predict(X_test)

array(['EAP', 'EAP', 'EAP', ..., 'MWS', 'EAP', 'EAP'], dtype=object)