<a href="https://colab.research.google.com/github/Yibei990826/author_identification/blob/main/a_basic_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
import pandas as pd
import numpy as np
import xgboost as xgb
from tqdm import tqdm

from sklearn.svm import SVC
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

from keras.models import Sequential
from keras.layers import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers import Embedding
from keras.layers import BatchNormalization
from keras.utils import np_utils
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping

from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [35]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.
    :param actual: Array containing the actual target classes
    :param predicted: Matrix with class predictions, one probability per class
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

In [11]:
test_id = '14G3JoM_1Sa0wth9lZGpxtgThs5XIWGBk'
train_id = '1wImpqT63U1Hlip9PKtO0R_f-iAqzHqpq'

url_test = 'https://drive.google.com/uc?id={}'.format(test_id)
url_train = 'https://drive.google.com/uc?id={}'.format(train_id)

train = pd.read_csv(url_train)
test = pd.read_csv(url_test)

In [20]:
train.shape

(19579, 3)

In [22]:
train['author'].value_counts()

EAP    7900
MWS    6044
HPL    5635
Name: author, dtype: int64

In [24]:
# use labelEncoder to transform labels in to numerics
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(train.author.values)

In [25]:
X = train['text'].values

In [26]:
X_train, X_val, y_train, y_val = train_test_split(X, y,
                                                  stratify = y,
                                                  random_state=42,
                                                  test_size=0.1, shuffle=True)

## 1. Basic Model:


#### 1.1 Tf-idf + Logistic Regression

In [29]:
# Initiate the TFVectorizer and vectorize the text
vectorizer = TfidfVectorizer(max_features=None,
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')

X_tf = vectorizer.fit_transform(X)



In [30]:
# Split the vectorized text into train and validation set
X_train_tf, X_val_tf, y_train, y_val = train_test_split(X_tf, y,
                                                  stratify = y,
                                                  random_state=42,
                                                  test_size=0.1, shuffle=True)

In [37]:
# Use logistic Regression model to distinguish
clf = LogisticRegression(C=1.0)
clf.fit(X_train_tf, y_train)

predictions = clf.predict_proba(X_val_tf)
print ("logloss: %0.3f " % multiclass_logloss(y_val, predictions))

logloss: 0.768 


#### 1.2 Counter vectorizer

In [39]:
# Initiate the CountVectorizer and vectorize the text
vectorizer = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), stop_words = 'english')

X_tf = vectorizer.fit_transform(X)

# Split the vectorized text into train and validation set
X_train_tf, X_val_tf, y_train, y_val = train_test_split(X_tf, y,
                                                  stratify = y,
                                                  random_state=42,
                                                  test_size=0.1, shuffle=True)

In [40]:
clf = LogisticRegression(C=1.0)
clf.fit(X_train_tf, y_train)

predictions = clf.predict_proba(X_val_tf)
print ("logloss: %0.3f " % multiclass_logloss(y_val, predictions))

logloss: 0.527 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### 1.3 Naive Bayes classifier

In [41]:
# Fitting a simple Naive Bayes on Counts
clf = MultinomialNB()
clf.fit(X_train_tf, y_train)
predictions = clf.predict_proba(X_val_tf)

print ("logloss: %0.3f " % multiclass_logloss(y_val, predictions))

logloss: 0.485 


1.4 SVM Classifier

In [42]:
# Apply SVD, I chose 120 components. 120-200 components are good enough for SVM model.
svd = decomposition.TruncatedSVD(n_components=120)
svd.fit(X_train_tf)
xtrain_svd = svd.transform(X_train_tf)
xvalid_svd = svd.transform(X_val_tf)

# Scale the data obtained from SVD. Renaming variable to reuse without scaling.
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl = scl.transform(xtrain_svd)
xvalid_svd_scl = scl.transform(xvalid_svd)

In [44]:
# Fitting a simple SVM
clf = SVC(C=1.0, probability=True) # since we need probabilities
clf.fit(xtrain_svd_scl, y_train)
predictions = clf.predict_proba(xvalid_svd_scl)

print ("logloss: %0.3f " % multiclass_logloss(y_val, predictions))

logloss: 0.783 


#### 1.5 XGB Classifier

In [45]:
# Fitting a simple xgboost on tf-idf svd features
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8,
                        subsample=0.8, nthread=10, learning_rate=0.1)
clf.fit(xtrain_svd, y_train)
predictions = clf.predict_proba(xvalid_svd)

print ("logloss: %0.3f " % multiclass_logloss(y_val, predictions))

logloss: 0.792 


#### 1.6 Grid Search

In [46]:
# user make_scorer to d
mll_scorer = metrics.make_scorer(multiclass_logloss, greater_is_better=False, needs_proba=True)

In [47]:
# Create the pipeline
svd = TruncatedSVD()
scl = preprocessing.StandardScaler()
lr_model = LogisticRegression()

clf = pipeline.Pipeline([('svd', svd),
                         ('scl', scl),
                         ('lr', lr_model)])
param_grid = {'svd__n_components' : [120, 180],
              'lr__C': [0.1, 1.0, 10],
              'lr__penalty': ['l1', 'l2']}

In [50]:
# Initialize Grid Search Model
model = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=mll_scorer,
                                 verbose=10, n_jobs=-1, cv=2)

# Fit Grid Search Model
model.fit(X_train_tf, y_train)

print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 2 folds for each of 12 candidates, totalling 24 fits


12 fits failed out of a total of 24.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 54, in _check_s

Best score: -0.757
Best parameters set:
	lr__C: 1.0
	lr__penalty: 'l2'
	svd__n_components: 180
