In [38]:
import pandas as pd
import numpy as np
import math
import nltk
from nltk.stem import WordNetLemmatizer 
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from joblib import Parallel, delayed

In [2]:
test_set = pd.read_csv('test.csv')
training_set = pd.read_csv('train.csv')

In [3]:
training_set.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [4]:
print(training_set[training_set.author == 'EAP'].shape)
print(training_set[training_set.author == 'HPL'].shape)
print(training_set[training_set.author == 'MWS'].shape)

(7900, 3)
(5635, 3)
(6044, 3)


In [5]:
df_train, df_val = train_test_split(training_set, stratify=training_set['author'], random_state=20, test_size=0.2, shuffle=True)

In [6]:
x_train = df_train['text']
y_train = df_train['author']
x_val = df_val['text']
y_val = df_val['author']

In [7]:
label_enc = LabelEncoder()
y_train = label_enc.fit_transform(y_train)
y_val = label_enc.transform(y_val)

In [24]:
tfidf_vectorizer = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')
x_train_tfidf = tfidf_vectorizer.fit_transform(x_train)
x_val_tfidf = tfidf_vectorizer.transform(x_val)

In [20]:
tfidf_vectorizer.vocabulary_

{'lollipop': 6531,
 'points': 8381,
 'celebrated': 1575,
 'magazine': 6703,
 'sustain': 10994,
 'evidently': 3752,
 'tremendous': 11579,
 'past': 8038,
 'comprehension': 2012,
 'points lollipop': 8382,
 'countenance': 2282,
 'brightened': 1290,
 'perceiving': 8115,
 'lips': 6452,
 'doubt': 3185,
 'murmured': 7314,
 'word': 12517,
 'stereotomy': 10667,
 'term': 11189,
 'applied': 475,
 'species': 10482,
 'pavement': 8069,
 'aeons': 187,
 'dwelt': 3336,
 'wandered': 12205,
 'gardens': 4552,
 'quaint': 8811,
 'pagodas': 7910,
 'peep': 8092,
 'pleasing': 8344,
 'clumps': 1855,
 'bushes': 1391,
 'white': 12370,
 'walks': 12196,
 'bordered': 1185,
 'delicate': 2695,
 'blossoms': 1122,
 'half': 4986,
 'frightened': 4456,
 'taken': 11080,
 'hand': 5007,
 'shudderingly': 10118,
 'withdrew': 12474,
 'strove': 10775,
 'collect': 1895,
 'oppodeldoc': 7785,
 'returned': 9309,
 'letter': 6349,
 'unopened': 11834,
 'alas': 269,
 'soon': 10396,
 'discovered': 3007,
 'employ': 3527,
 'year': 12618,
 't

In [21]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.
    :param actual: Array containing the actual target classes
    :param predicted: Matrix with class predictions, one probability per class
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

In [29]:
logreg_model = LogisticRegression(C=7.0)

In [23]:
logreg_model.fit(x_train_tfidf, y_train)



LogisticRegression(C=7.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [24]:
logreg_model.score(x_val_tfidf, y_val)

0.8069458631256384

In [25]:
logreg_probs = logreg_model.predict_proba(x_val_tfidf)
multiclass_logloss(y_val, logreg_probs)

0.5117374604108654

In [26]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [37]:
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [8]:
ls = LancasterStemmer()
lem = WordNetLemmatizer()
stop_words = stopwords.words('english')

def normalize(text):
   words = word_tokenize(text) 
   words = [w for w in words if not w in stop_words]

   stemw = [ls.stem(w) for w in words]
   
   # 2- Lemmatization
   lemw = [lem.lemmatize(w) for w in stemw]
   return ' '.join(lemw)

def get_preprocessed_data(data):
    data = data.apply(normalize)
    return data

In [9]:
normalize(x_train[0])

'thi process , howev , afford mean ascertain dimend dungeon ; i might mak circuit , return point whent i set , without aw fact ; perfect uniform seem wal .'

In [27]:
def next_chunk(data):
    for i in range(4):
        yield data[math.ceil(i * len(data) / 4):math.ceil((i + 1) * len(data) / 4)]

x_train_preprocessed = np.hstack(Parallel(n_jobs=4, backend='loky', verbose=10)\
                            (delayed(get_preprocessed_data)(data) for data in next_chunk(x_train)))
%timeit x_train_preprocessed
# get_preprocessed_data(x_train[:math.ceil(len_x_train / 4)])

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    1.8s
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    1.8s remaining:    1.8s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    2.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    2.0s finished


18.3 ns ± 0.0198 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)


In [10]:
%timeit preprocessed = get_preprocessed_data(x_train)

6.83 s ± 11.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [22]:
x_val_preprocessed = get_preprocessed_data(x_val)

In [28]:
x_train_prep_tfidf = tfidf_vectorizer.fit_transform(x_train_preprocessed)
x_val_prep_tfidf = tfidf_vectorizer.transform(x_val_preprocessed)

In [36]:
logreg_model = LogisticRegression(C=11.0)
logreg_model.fit(x_train_prep_tfidf, y_train)

LogisticRegression(C=11.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [37]:
logreg_model.score(x_val_prep_tfidf, y_val)

0.7913687436159347

## Random Forest

In [66]:
rfmodel = RandomForestClassifier(n_estimators=100, random_state=1, min_samples_leaf=3, n_jobs=1)

In [67]:
rfmodel.fit(x_train_prep_tfidf, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)

In [69]:
rfmodel.score(x_val_prep_tfidf, y_val)

0.7017364657814096

In [None]:
(Parallel(n_jobs=4, backend='loky', verbose=10)\
                            (delayed()(data) for data in next_chunk(x_train))

## Neural networks