In [32]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Load the training sentences, drop every row that has a missing value, and
# index the frame by the sentence id.
# dropna() returns a new frame rather than modifying the existing one, so its
# result must be kept; chaining also avoids in-place mutation, which makes the
# cell safe to re-run.
df = (
    pd.read_csv('train.csv')
    .dropna(axis=0)
    .set_index('id')
)

df.head()

Unnamed: 0_level_0,text,author
id,Unnamed: 1_level_1,Unnamed: 2_level_1
id26305,"This process, however, afforded me no means of...",EAP
id17569,It never once occurred to me that the fumbling...,HPL
id11008,"In his left hand was a gold snuff box, from wh...",EAP
id27763,How lovely is spring As we looked from Windsor...,MWS
id12958,"Finding nothing else, not even gold, the Super...",HPL


In [33]:
import re
import nltk
from nltk.corpus import stopwords

# English stop-word list from NLTK, held as a set so that processing() can do
# fast membership checks when it filters stop words out of each sentence.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be installed locally - confirm.
stopWords = set(stopwords.words('english'))


#creating a function to encapsulate preprocessing, to make it easy to replicate on submission data
def processing(df):
    """Return a copy of `df` with text-derived feature columns added.

    The input frame must have a string column named 'text'. The caller's frame
    is left untouched; all new columns are written to a copy.

    Columns added:
        processed           lower-cased text with every character that is
                            neither a word character nor whitespace removed
        length              number of characters in `processed`
        words               number of whitespace-separated words in `processed`
        words_not_stopword  number of those words that are not in `stopWords`
        avg_word_length     mean length of the non-stop-words (0.0 if there are none)
        commas              number of ',' characters in the raw text
        tokens              nltk.word_tokenize output for the raw text
        pos                 nltk.pos_tag output for `tokens`: (word, tag) pairs
        adjectives          number of tags starting with 'JJ'
        nouns               number of tags starting with 'NN'
        verbs               number of tags starting with 'VB'

    Relies on the module-level names `re`, `np`, `nltk` and `stopWords`.
    NOTE(review): word_tokenize / pos_tag presumably need their NLTK models
    to be installed - confirm for a fresh environment.
    """
    # Work on a copy so that calling this function never mutates the caller's frame.
    df = df.copy()

    def content_words(sentence):
        # str.split() with no argument splits on any run of whitespace and never
        # yields empty strings. split(' ') would: stripping punctuation can leave
        # double spaces, and each resulting '' would be counted as a zero-length
        # non-stop-word.
        return [word for word in sentence.split() if word not in stopWords]

    def count_tags(prefix):
        # Number of tokens per row whose part-of-speech tag starts with `prefix`.
        return df['pos'].apply(
            lambda tagged: sum(tag.startswith(prefix) for _, tag in tagged)
        )

    #lowering and removing punctuation
    df['processed'] = df['text'].apply(lambda x: re.sub(r'[^\w\s]','', x.lower()))

    #numerical feature engineering
    #total length of sentence
    df['length'] = df['processed'].apply(len)
    #get number of words
    df['words'] = df['processed'].apply(lambda x: len(x.split()))
    # Build the non-stop-word list once per row and derive both statistics from it.
    non_stop = df['processed'].apply(content_words)
    df['words_not_stopword'] = non_stop.apply(len)
    #get the average word length
    df['avg_word_length'] = non_stop.apply(
        lambda kept: np.mean([len(word) for word in kept]) if kept else 0.0
    )
    #get the number of commas
    df['commas'] = df['text'].apply(lambda x: x.count(','))
    #tokenize
    df['tokens'] = df['text'].apply(nltk.word_tokenize)
    #tag every token with its part of speech using NLTK's tagger
    df['pos'] = df['tokens'].apply(nltk.pos_tag)
    #count tokens per part-of-speech family: JJ* adjectives, NN* nouns, VB* verbs
    df['adjectives'] = count_tags('JJ')
    df['nouns'] = count_tags('NN')
    df['verbs'] = count_tags('VB')
    return df

# Run the shared preprocessing on the training frame, then preview the
# resulting feature columns.
df = processing(df)

df.head()

Unnamed: 0_level_0,text,author,processed,length,words,words_not_stopword,avg_word_length,commas,tokens,pos,adjectives,nouns,verbs
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
id26305,"This process, however, afforded me no means of...",EAP,this process however afforded me no means of a...,224,41,21,6.380952,4,"[This, process, ,, however, ,, afforded, me, n...","[(This, DT), (process, NN), (,, ,), (however, ...",2,10,6
id17569,It never once occurred to me that the fumbling...,HPL,it never once occurred to me that the fumbling...,70,14,6,6.166667,0,"[It, never, once, occurred, to, me, that, the,...","[(It, PRP), (never, RB), (once, RB), (occurred...",1,2,2
id11008,"In his left hand was a gold snuff box, from wh...",EAP,in his left hand was a gold snuff box from whi...,195,36,19,5.947368,4,"[In, his, left, hand, was, a, gold, snuff, box...","[(In, IN), (his, PRP$), (left, JJ), (hand, NN)...",5,10,4
id27763,How lovely is spring As we looked from Windsor...,MWS,how lovely is spring as we looked from windsor...,202,34,21,6.47619,3,"[How, lovely, is, spring, As, we, looked, from...","[(How, WRB), (lovely, RB), (is, VBZ), (spring,...",6,10,5
id12958,"Finding nothing else, not even gold, the Super...",HPL,finding nothing else not even gold the superin...,170,27,16,7.1875,2,"[Finding, nothing, else, ,, not, even, gold, ,...","[(Finding, VBG), (nothing, NN), (else, RB), (,...",1,7,5


In [34]:
from sklearn.model_selection import train_test_split

# Columns that are never model inputs: the raw text and the label.
# 'id' is already the index of df, so it is listed only defensively.
non_feature_columns = ['id', 'text', 'author']
# Text and list-valued columns ('tokens' and 'pos' hold Python lists), which
# must not be treated as numeric features.
non_numeric_columns = non_feature_columns + ['processed', 'tokens', 'pos']

# Everything the pipelines may select from (the selectors pick columns by name).
features = [c for c in df.columns.values if c not in non_feature_columns]
# Only the genuinely numeric engineered features.
numeric_features = [c for c in df.columns.values if c not in non_numeric_columns]
target = 'author'

# Hold out 33% of the rows for the final evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.33, random_state=42)
X_train.head()

Unnamed: 0_level_0,processed,length,words,words_not_stopword,avg_word_length,commas,tokens,pos,adjectives,nouns,verbs
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
id19417,this panorama is indeed glorious and i should ...,91,18,6,6.666667,1,"[This, panorama, is, indeed, glorious, ,, and,...","[(This, DT), (panorama, NN), (is, VBZ), (indee...",1,3,2
id09522,there was a simple natural earnestness about h...,240,44,18,6.277778,4,"[There, was, a, simple, ,, natural, earnestnes...","[(There, EX), (was, VBD), (a, DT), (simple, NN...",5,10,7
id22732,who are you pray that i duc de lomelette princ...,387,74,38,5.552632,9,"[Who, are, you, ,, pray, ,, that, I, ,, Duc, D...","[(Who, WP), (are, VBP), (you, PRP), (,, ,), (p...",2,21,11
id10351,he had gone in the carriage to the nearest tow...,118,24,11,5.363636,0,"[He, had, gone, in, the, carriage, to, the, ne...","[(He, PRP), (had, VBD), (gone, VBN), (in, IN),...",1,8,3
id24580,there is no method in their proceedings beyond...,71,13,5,7.0,1,"[There, is, no, method, in, their, proceedings...","[(There, EX), (is, VBZ), (no, DT), (method, NN...",0,4,1


In [35]:
from sklearn.base import BaseEstimator, TransformerMixin

class TextSelector(BaseEstimator, TransformerMixin):
    """
    Pipeline step that pulls one named column out of the incoming data frame.

    The column is returned via single-label indexing (for a pandas DataFrame
    that is a 1-D Series), which is the form used for the text column that
    feeds the TF-IDF step.
    """
    def __init__(self, key):
        # Name of the column to extract.
        self.key = key

    def fit(self, X, y=None):
        # Nothing is learned from the data; return the transformer unchanged.
        return self

    def transform(self, X):
        selected_column = X[self.key]
        return selected_column
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Pipeline step that pulls one named column out of the incoming data frame.

    The column is returned via list indexing (for a pandas DataFrame that is a
    one-column, 2-D frame), which is the form used for the numeric columns
    that feed a scaler.
    """
    def __init__(self, key):
        # Name of the column to extract.
        self.key = key

    def fit(self, X, y=None):
        # Nothing is learned from the data; return the transformer unchanged.
        return self

    def transform(self, X):
        wanted_columns = [self.key]
        return X[wanted_columns]

In [36]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# Text branch: take the cleaned 'processed' column and turn it into TF-IDF
# weights, ignoring scikit-learn's built-in English stop words.
text = Pipeline(steps=[
    ('selector', TextSelector(key='processed')),
    ('tfidf', TfidfVectorizer(stop_words='english')),
])

# Fit on the training split and display the resulting sparse matrix as a sanity check.
text.fit_transform(X_train)

<13117x21516 sparse matrix of type '<class 'numpy.float64'>'
	with 148061 stored elements in Compressed Sparse Row format>

In [37]:
from sklearn.preprocessing import StandardScaler

# Numeric branch for sentence length: select the 'length' column, then standardise it.
length = Pipeline(steps=[
    ('selector', NumberSelector(key='length')),
    ('standard', StandardScaler()),
])

# Fit on the training split and display the scaled values as a sanity check.
length.fit_transform(X_train)

array([[-0.50769254],
       [ 0.88000324],
       [ 2.24907223],
       ...,
       [-0.46112557],
       [-0.14447015],
       [-0.39593181]])

In [38]:
#here are the features
def _scaled_numeric_pipeline(key):
    """Build a pipeline that selects the column `key` and standardises it.

    Each call returns a fresh Pipeline with its own StandardScaler, so the
    scalers are fitted independently per feature.
    """
    return Pipeline([
        ('selector', NumberSelector(key=key)),
        ('standard', StandardScaler()),
    ])


# One selector + scaler pipeline per engineered numeric feature.
words = _scaled_numeric_pipeline('words')
words_not_stopword = _scaled_numeric_pipeline('words_not_stopword')
avg_word_length = _scaled_numeric_pipeline('avg_word_length')
commas = _scaled_numeric_pipeline('commas')
adjectives = _scaled_numeric_pipeline('adjectives')
nouns = _scaled_numeric_pipeline('nouns')
verbs = _scaled_numeric_pipeline('verbs')

In [39]:
from sklearn.pipeline import FeatureUnion

# Combine the TF-IDF branch and every scaled numeric branch into a single
# feature matrix; the names given here become the parameter prefixes used
# when tuning (e.g. 'text').
feats = FeatureUnion(transformer_list=[
    ('text', text),
    ('length', length),
    ('words', words),
    ('words_not_stopword', words_not_stopword),
    ('avg_word_length', avg_word_length),
    ('commas', commas),
    ('adjectives', adjectives),
    ('nouns', nouns),
    ('verbs', verbs),
])

# Wrap the union in a one-step pipeline and run it on the training split to
# inspect the shape of the combined matrix.
feature_processing = Pipeline(steps=[('feats', feats)])
feature_processing.fit_transform(X_train)

<13117x21524 sparse matrix of type '<class 'numpy.float64'>'
	with 252997 stored elements in Compressed Sparse Row format>

In [48]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression


# Full model: the combined feature union followed by a logistic-regression
# classifier (seeded so repeated runs give the same fit).
pipeline = Pipeline(steps=[
    ('features', feats),
    ('classifier', LogisticRegression(random_state=42)),
])

# Cross-validate on the whole labelled frame; `cv` is the number of folds.
scores = cross_val_score(estimator=pipeline, X=df[features], y=df[target], cv=2)

print('Cross-validation scores:', scores)


# 2 fold data: [0.77088866 0.77740321]

# 10 fold data: [0.79213483 0.81358529 0.80081716 0.79213483 0.82686415 0.79009193
# 0.79928498 0.81001021 0.81001021 0.8109351 ]

# 20 fold data: [0.79162411 0.77630235 0.80898876 0.81511747 0.79366701 0.81001021
# 0.79979571 0.79570991 0.80898876 0.82635342 0.7854954  0.79775281
# 0.80796731 0.7834525  0.80694586 0.79877426 0.79673136 0.82737487
# 0.81307457 0.80777096]

Cross-validation scores: [0.77088866 0.77740321]


In [41]:
# Show every parameter name exposed by the nested pipeline; the
# double-underscore paths are the keys a hyperparameter grid refers to.
pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'features', 'classifier', 'features__n_jobs', 'features__transformer_list', 'features__transformer_weights', 'features__verbose', 'features__text', 'features__length', 'features__words', 'features__words_not_stopword', 'features__avg_word_length', 'features__commas', 'features__adjectives', 'features__nouns', 'features__verbs', 'features__text__memory', 'features__text__steps', 'features__text__verbose', 'features__text__selector', 'features__text__tfidf', 'features__text__selector__key', 'features__text__tfidf__analyzer', 'features__text__tfidf__binary', 'features__text__tfidf__decode_error', 'features__text__tfidf__dtype', 'features__text__tfidf__encoding', 'features__text__tfidf__input', 'features__text__tfidf__lowercase', 'features__text__tfidf__max_df', 'features__text__tfidf__max_features', 'features__text__tfidf__min_df', 'features__text__tfidf__ngram_range', 'features__text__tfidf__norm', 'features__text__tfidf__preprocessor', 'features_

In [42]:
from sklearn.model_selection import GridSearchCV
import warnings
# Suppress all Python warnings from here on so the search output stays readable.
warnings.filterwarnings('ignore')

# Search space: two TF-IDF document-frequency ceilings crossed with
# unigram-only versus unigram+bigram vocabularies.
hyperparameters = {
    'features__text__tfidf__max_df': [0.9, 0.95],
    'features__text__tfidf__ngram_range': [(1, 1), (1, 2)],
}

# Exhaustive search over the grid, scored with 8-fold cross-validation.
clf = GridSearchCV(estimator=pipeline, param_grid=hyperparameters, cv=8)

# Fit and tune model
clf.fit(X_train, y_train)

GridSearchCV(cv=8,
             estimator=Pipeline(steps=[('features',
                                        FeatureUnion(transformer_list=[('text',
                                                                        Pipeline(steps=[('selector',
                                                                                         TextSelector(key='processed')),
                                                                                        ('tfidf',
                                                                                         TfidfVectorizer(stop_words='english'))])),
                                                                       ('length',
                                                                        Pipeline(steps=[('selector',
                                                                                         NumberSelector(key='length')),
                                                                                        ('stan

In [43]:
# Hyperparameter combination selected by the grid search.
clf.best_params_

{'features__text__tfidf__max_df': 0.9,
 'features__text__tfidf__ngram_range': (1, 1)}

In [44]:
# Evaluate the tuned model on the held-out split.
# No explicit refit step is needed: the grid search was created with the
# default refit behaviour, so clf.fit(...) has already retrained the best
# parameter combination on all of X_train, and predict / predict_proba use
# that refitted estimator.
# Imported here, matching this notebook's per-cell import style, so the cell
# also runs on a fresh kernel.
from sklearn.metrics import classification_report

preds = clf.predict(X_test)
probs = clf.predict_proba(X_test)

# Print the accuracy explicitly: a bare expression in the middle of a cell is
# evaluated but never displayed.
print('Accuracy:', np.mean(preds == y_test))
print(classification_report(y_test,preds))

              precision    recall  f1-score   support

         EAP       0.74      0.84      0.79      2587
         HPL       0.81      0.75      0.77      1852
         MWS       0.82      0.74      0.78      2023

    accuracy                           0.78      6462
   macro avg       0.79      0.77      0.78      6462
weighted avg       0.79      0.78      0.78      6462



In [45]:
# Load the competition test set and run it through the same preprocessing as
# the training data.
submission = processing(pd.read_csv('test.csv'))

# Class probabilities from the tuned model, one row per test sentence.
predictions = clf.predict_proba(submission)

# Label each probability column with the class it belongs to, in the order
# reported by the fitted classifier.
class_labels = clf.best_estimator_.named_steps['classifier'].classes_
preds = pd.DataFrame(data=predictions, columns=class_labels)

# Assemble the submission table: sentence id as the index, one probability column per author.
result = pd.concat([submission[['id']], preds], axis=1).set_index('id')
result.head()

Unnamed: 0_level_0,EAP,HPL,MWS
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
id02310,0.255812,0.075432,0.668755
id24541,0.858864,0.036704,0.104432
id00134,0.241647,0.686072,0.072282
id27757,0.676866,0.205296,0.117838
id04081,0.689335,0.227963,0.082702


In [30]:
#We definitely have a lot more stored elements, since we are now holding data for three more
#features. I definitely thought this would reduce the performance, as they take up more memory
#and are three new classes to store. I think the precision increased for the model, which is good,
#so I guess that when it comes to adding new features there will be a trade-off between
#performance and precision. The performance with fewer features is definitely
#faster, so it comes down to what we hope to gain from the classifier.