In [10]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [11]:
import pandas as pd
import numpy as np

#tqdm for progress bars
from tqdm import tqdm

import xgboost as xgb
from sklearn.svm import SVC
from sklearn import preprocessing, decomposition, model_selection,metrics,pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

#keras/TF library
from keras.models import Sequential
from keras.layers.recurrent import LSTM,GRU,SimpleRNN
from keras.layers.core import Dense, Activation, Dropout
from tensorflow.keras.layers import BatchNormalization
from keras.utils import np_utils
from keras.layers import GlobalMaxPooling1D,Conv1D,MaxPooling1D,Flatten,Bidirectional,SpatialDropout1D,Embedding
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping

#nltk library
from nltk import word_tokenize
from nltk.corpus import stopwords

#ignore the warnings
import warnings
warnings.filterwarnings('ignore')

In [12]:
# Load the three competition CSV files with pandas.
# `train` supplies the `text` and `author` columns used in later cells;
# `test` and `sample` are only previewed in this part of the notebook.
train  = pd.read_csv("../input/spooky-authors-csv/train.csv")
test   = pd.read_csv("../input/spooky-authors-csv/test.csv")
sample = pd.read_csv("../input/spooky-authors-csv/sample_submission.csv")

In [13]:
# Report the (rows, columns) shape of the training and test frames.
print("shape of training data:", train.shape)
print("shape of testing data:", test.shape)

In [14]:
# Preview the first rows of the training frame.
train.head()

In [15]:
# Preview the first rows of the test frame.
test.head()

In [16]:
# Preview the first rows of the sample submission file.
sample.head()

Note: There are three classes (authors), so this is a three-class text classification problem. Kaggle states that submissions are evaluated using multi-class logarithmic loss; therefore, for each id, we need to predict a probability for each author.

Label-encode the author column using `LabelEncoder` from scikit-learn.

In [17]:
# Learn the author -> integer mapping, then apply it to get the target vector `y`.
encoder = preprocessing.LabelEncoder().fit(train["author"].values)
y = encoder.transform(train["author"].values)
print(y[:5])

Split the data into training and held-out sets using scikit-learn's `train_test_split` function; its `test_size` parameter sets the fraction of the rows to use as test data.

In [18]:
# Hold out 10% of the rows for evaluation; the fixed seed keeps the shuffled split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train["text"].values,
    y,
    test_size=0.1,
    shuffle=True,
    random_state=42,
)

In [19]:
# TF-IDF features over word 1- to 3-grams.
# TfidfVectorizer also accepts the CountVectorizer parameters (min_df, ngram_range, ...).
tfv = TfidfVectorizer(min_df=3,                 # ignore n-grams seen in fewer than 3 documents
                      max_features=None,        # no cap on the vocabulary size
                      strip_accents='unicode',analyzer='word',token_pattern=r'\w{1,}',
                      ngram_range=(1,3),
                      use_idf=True,smooth_idf=True,  # real booleans: integer flags are rejected by newer scikit-learn
                      stop_words='english')

# Learn the vocabulary and IDF weights from the training split only, so that the
# held-out texts do not leak into the features they are later used to evaluate.
X_train_tfv = tfv.fit_transform(X_train)
X_test_tfv = tfv.transform(X_test)

In [20]:
# Print the TF-IDF training matrix produced by tfv.transform above
# (presumably a scipy sparse matrix, so this shows stored entries rather than a dense table).
print(X_train_tfv)

In [21]:
def multiclass_logloss(actual,predicted,eps=1e-15):
    """Multi-class logarithmic loss (lower is better).

    Parameters
    ----------
    actual : array-like
        Either a 1-D sequence of integer class ids, or a 2-D one-hot
        matrix with the same shape as `predicted`.
    predicted : array-like of shape (n_samples, n_classes)
        Predicted probability of each class for each sample.
    eps : float
        Probabilities are clipped to [eps, 1 - eps] so that log(0) never occurs.

    Returns
    -------
    float
        Mean negative log-probability assigned to the true classes.
    """
    # Accept plain lists as well as NumPy arrays.
    actual = np.asarray(actual)
    predicted = np.asarray(predicted, dtype=float)

    # Expand a vector of class ids into a one-hot matrix.
    if len(actual.shape) == 1:
        one_hot = np.zeros((actual.shape[0], predicted.shape[1]))
        one_hot[np.arange(actual.shape[0]), actual.astype(int)] = 1
        actual = one_hot

    # Clip the probabilities into [eps, 1 - eps] before taking the log.
    clipped = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    total = np.sum(actual * np.log(clipped))
    return -1.0 / rows * total

In [22]:
# Logistic regression baseline trained on the TF-IDF features
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(C=1.0).fit(X_train_tfv, y_train)
prediction = clf.predict_proba(X_test_tfv)  # per-class probabilities for the held-out rows

print("logloss: %0.3f" % multiclass_logloss(y_test, prediction))

In [23]:
# Raw count features over word 1- to 3-grams (English stop words removed).
ctv = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), stop_words = 'english')

# Learn the vocabulary from the training split only, so that the held-out texts
# do not leak into the features they are later used to evaluate.
X_train_ctv = ctv.fit_transform(X_train)
X_test_ctv = ctv.transform(X_test)

In [24]:
# Same logistic regression, this time trained on the count features
clf = LogisticRegression(C=1.0).fit(X_train_ctv, y_train)
prediction = clf.predict_proba(X_test_ctv)

print("logloss: %0.3f" % multiclass_logloss(y_test, prediction))

# Naive Bayes

In [25]:
# Multinomial Naive Bayes on the TF-IDF features
clf = MultinomialNB().fit(X_train_tfv, y_train)
prediction = clf.predict_proba(X_test_tfv)

print("logloss: %0.3f" % multiclass_logloss(y_test, prediction))

In [26]:
# Multinomial Naive Bayes on the count features
clf = MultinomialNB().fit(X_train_ctv, y_train)
prediction = clf.predict_proba(X_test_ctv)

print("logloss: %0.3f" % multiclass_logloss(y_test, prediction))

# SVM

SVM is a very slow algorithm and takes a long time to fit, so we will
reduce the dimensionality with Singular Value Decomposition before applying SVM,
and we will also standardize the data.

In [27]:
# Reduce the TF-IDF matrix to 120 dense components. The decomposition is fitted
# on the training split only and seeded so that re-running the notebook gives
# the same components (same seed as the train/test split).
svd = decomposition.TruncatedSVD(n_components=120, random_state=42)
svd.fit(X_train_tfv)
X_train_svd = svd.transform(X_train_tfv)
X_test_svd = svd.transform(X_test_tfv)

# Standardize the SVD components using statistics from the training split.
scl = preprocessing.StandardScaler()
scl.fit(X_train_svd)

X_train_svd_scl = scl.transform(X_train_svd)
X_test_svd_scl = scl.transform(X_test_svd)

In [28]:
# SVM on the scaled SVD components; probability=True enables predict_proba,
# which the log-loss metric needs.
svm = SVC(C=1.0, probability=True).fit(X_train_svd_scl, y_train)
prediction = svm.predict_proba(X_test_svd_scl)

print("logloss: %0.3f" % multiclass_logloss(y_test, prediction))

# Xgboost

In [29]:
# XGBoost on the TF-IDF features (converted to CSC format for fitting and prediction)
clf = xgb.XGBClassifier(
    max_depth=7,
    n_estimators=200,
    colsample_bytree=0.8,
    subsample=0.8,
    nthread=10,
    learning_rate=0.1,
).fit(X_train_tfv.tocsc(), y_train)
prediction = clf.predict_proba(X_test_tfv.tocsc())

print("logloss: %0.3f" % multiclass_logloss(y_test, prediction))

In [30]:
# XGBoost on the (unscaled) SVD components
clf = xgb.XGBClassifier(
    max_depth=7,
    n_estimators=200,
    colsample_bytree=0.8,
    subsample=0.8,
    nthread=10,
    learning_rate=0.1,
).fit(X_train_svd, y_train)
prediction = clf.predict_proba(X_test_svd)

print("logloss: %0.3f" % multiclass_logloss(y_test, prediction))

# Hyperparameter Optimization

In [31]:
# Scorer wrapping multiclass_logloss: it is fed predicted probabilities, and
# greater_is_better=False makes scikit-learn negate the loss so that "higher is better".
# NOTE(review): `needs_proba` is superseded by `response_method="predict_proba"` in
# recent scikit-learn releases -- confirm against the installed version before upgrading.
mll_scorer = metrics.make_scorer(multiclass_logloss, greater_is_better=False,needs_proba=True)

# Seeded so that repeated runs produce the same decomposition.
svd = decomposition.TruncatedSVD(random_state=42)
scl = preprocessing.StandardScaler()
# The grid searched over this pipeline tries both 'l1' and 'l2' penalties. The default
# lbfgs solver cannot fit 'l1', so those candidates would fail; 'saga' supports both
# penalties. max_iter is raised because saga may need more passes to converge.
lr_model = LogisticRegression(solver='saga', max_iter=1000)

# SVD -> standardization -> logistic regression, tuned as one estimator.
clf = pipeline.Pipeline([('svd',svd),
                         ('scl',scl),
                         ('lr',lr_model)])

In [32]:
# Search space for the pipeline; keys follow the "<step name>__<parameter>" convention.
params_grid = dict(
    svd__n_components=[120, 180],
    lr__C=[0.1, 1.0, 10],
    lr__penalty=['l1', 'l2'],
)

In [33]:
# Exhaustive 2-fold grid search over params_grid, scored with mll_scorer;
# refit=True retrains the best configuration on all of the training features.
model = GridSearchCV(
    estimator=clf,
    param_grid=params_grid,
    scoring=mll_scorer,
    cv=2,
    refit=True,
    n_jobs=-1,
    verbose=10,
)

# run the search
model.fit(X_train_tfv, y_train)

# best_score_ comes from mll_scorer, which was built with greater_is_better=False
print('Best score: %0.3f' % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(params_grid):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

In [34]:
# Grid search over the smoothing parameter of Multinomial Naive Bayes
nb = MultinomialNB()
clf = pipeline.Pipeline([('nb', nb)])

params_grid = {'nb__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

model = GridSearchCV(
    estimator=clf,
    param_grid=params_grid,
    scoring=mll_scorer,
    cv=2,
    refit=True,
    n_jobs=-1,
    verbose=10,
)
model.fit(X_train_tfv, y_train)

print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(params_grid):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

# Word2Vec

In [35]:
#gensim library allow us to access pre trained embeddings
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api

In [36]:
# Load the pre-trained "glove-twitter-25" embeddings through gensim's downloader API
# and keep the returned model object.
model_twitter_glove = api.load("glove-twitter-25") # 25 is the dimensionality of each word vector

In [37]:
# The models below look vectors up in a plain {word: vector} dictionary,
# so copy every entry of the gensim model into one.
# (gensim exposes its vocabulary through the `key_to_index` mapping)
all_words = list(model_twitter_glove.key_to_index)
embedding_index = {w: model_twitter_glove.get_vector(w) for w in all_words}

print('Total words in embeddings %d' % len(embedding_index))

In [38]:
# English stop words from the nltk library, stored as a set so that the
# per-token membership test in sen2vec is a constant-time lookup
stop_words = set(stopwords.words('english'))

def sen2vec(s, embeddings=None, tokenize=None, stops=None):
    """Turn a sentence into one L2-normalised vector by summing its word vectors.

    The text is lower-cased and tokenized; stop words and non-alphabetic
    tokens are dropped, and the vectors of the remaining known words are summed.

    Parameters
    ----------
    s : object
        The sentence; converted with ``str(s)`` before processing.
    embeddings : mapping of str -> vector, optional
        Word-vector lookup. Defaults to the notebook-level ``embedding_index``.
    tokenize : callable, optional
        Function splitting a string into tokens. Defaults to nltk's ``word_tokenize``.
    stops : container of str, optional
        Words to ignore. Defaults to the notebook-level ``stop_words``.

    Returns
    -------
    numpy.ndarray
        The normalised sentence vector, or an all-zero vector when no token has
        an embedding (or when the summed vector has zero length).
    """
    # Fall back to the notebook globals only when no override is given.
    if embeddings is None:
        embeddings = embedding_index
    if tokenize is None:
        tokenize = word_tokenize
    if stops is None:
        stops = stop_words

    # Tokenize the *lower-cased* text, so that capitalised words still match
    # the stop-word list and the embedding vocabulary.
    tokens = tokenize(str(s).lower())
    tokens = [w for w in tokens if w not in stops and w.isalpha()]

    # Words missing from the embedding index are skipped.
    vectors = [embeddings[w] for w in tokens if w in embeddings]

    if not vectors:
        # Match the embedding dimensionality; 25 is the size of the GloVe
        # vectors used in this notebook and serves as the fallback.
        sample = next(iter(embeddings.values()), None) if embeddings else None
        return np.zeros(25 if sample is None else len(sample))

    v = np.sum(np.array(vectors), axis=0)
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        # Avoid dividing by zero (which would yield NaNs).
        return np.zeros_like(v)
    return v / norm

In [39]:
# Embed every sentence of both splits (tqdm shows a progress bar)
X_train_glove = list(map(sen2vec, tqdm(X_train)))
X_test_glove = list(map(sen2vec, tqdm(X_test)))

In [40]:
# Convert the lists of per-sentence vectors into NumPy arrays (rebinding the same names).
X_train_glove = np.array(X_train_glove)
X_test_glove = np.array(X_test_glove)

# XGBoost on GloVe

In [41]:
# Gradient-boosted trees on the sentence-level GloVe features.
# The `silent` flag is deprecated in XGBoost in favour of `verbosity`;
# verbosity=1 (warnings only) corresponds to the former silent=False.
clf = xgb.XGBClassifier(n_estimators=200,nthread=10,verbosity=1)
clf.fit(X_train_glove, y_train)

# per-class probabilities for the held-out rows
predictions = clf.predict_proba(X_test_glove)

print ("logloss: %0.3f " % multiclass_logloss(y_test, predictions))

# Simple NN

In [42]:
# Neural networks train better on standardized inputs: fit the scaler on the
# training vectors and apply the same transformation to both splits.
scl = preprocessing.StandardScaler().fit(X_train_glove)
X_train_glove_scl = scl.transform(X_train_glove)
X_test_glove_scl = scl.transform(X_test_glove)

# One-hot encode the integer labels for the categorical cross-entropy loss.
y_train_enc, y_test_enc = (np_utils.to_categorical(labels) for labels in (y_train, y_test))

In [43]:
# Feed-forward network on the 25-dimensional sentence vectors:
# two Dense(25) + Dropout + BatchNorm stages, then a 3-way softmax output.
model = Sequential([
    Dense(25, input_dim=25, activation='relu'),
    Dropout(0.2),
    BatchNormalization(),

    Dense(25, input_dim=25, activation='relu'),
    Dropout(0.2),
    BatchNormalization(),

    Dense(3),
    Activation('softmax'),
])

model.compile(optimizer='adam', loss='categorical_crossentropy')

In [44]:
# Train for just 5 epochs in batches of 50, reporting the loss on the
# held-out split (passed as validation_data) after each epoch.
model.fit(X_train_glove_scl,y=y_train_enc,batch_size=50,epochs=5,verbose=1,
          validation_data=(X_test_glove_scl,y_test_enc))

# Simple RNN

In [45]:
# Recurrent models consume integer sequences, so tokenize the text first.
max_len = 70  # every sequence is padded / truncated to this many tokens

token = text.Tokenizer(num_words=None)
token.fit_on_texts(list(X_train) + list(X_test))
word_index = token.word_index  # word -> integer id mapping learned by the tokenizer

# text -> list of word ids -> fixed-length integer matrix
X_train_sec = token.texts_to_sequences(X_train)
X_train_pad = sequence.pad_sequences(X_train_sec, maxlen=max_len)

X_test_sec = token.texts_to_sequences(X_test)
X_test_pad = sequence.pad_sequences(X_test_sec, maxlen=max_len)

In [46]:
# Baseline recurrent model: the embedding layer is learned from scratch
# (the pre-trained GloVe vectors are not used here).
model = Sequential()
model.add(Embedding(len(word_index)+1,25,input_length=max_len))
model.add(SimpleRNN(100))
# The output layer needs a softmax: categorical_crossentropy expects class
# probabilities by default, not raw linear activations.
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy',optimizer='adam')


In [47]:
# Baseline recurrent model: the embedding layer is learned from scratch
# (the pre-trained GloVe vectors are not used here).
model = Sequential()
model.add(Embedding(len(word_index)+1,25,input_length=max_len))
model.add(SimpleRNN(100))
# The output layer needs a softmax: categorical_crossentropy expects class
# probabilities by default, not raw linear activations.
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy',optimizer='adam')

In [48]:
# Train the current `model` on the padded sequences for 5 epochs (batch size 100),
# reporting the loss on the held-out split after each epoch.
model.fit(X_train_pad,y=y_train_enc,epochs=5,batch_size=100,validation_data=(X_test_pad,y_test_enc))

In [49]:
# One row per tokenizer id (plus row 0); 25 columns because the word vectors are 25-dimensional.
embedding_matrix = np.zeros((len(word_index) + 1, 25))

for term, row in word_index.items():
    # .get() yields None for words that have no pre-trained vector
    vector = embedding_index.get(term)
    if vector is None:
        continue  # leave that row at zero
    embedding_matrix[row] = vector

In [50]:
# SimpleRNN on top of the pre-trained GloVe vectors.
model = Sequential([
    # The embedding weights come from embedding_matrix and are frozen
    # (trainable=False), so they are not updated during training.
    Embedding(len(word_index)+1, 25,
              weights=[embedding_matrix],
              input_length=max_len,
              trainable=False),
    SimpleRNN(100),
    Dropout(0.2),
    Dense(3, activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [51]:
# Train the current `model` on the padded sequences for 5 epochs (batch size 100),
# reporting the loss on the held-out split after each epoch.
model.fit(X_train_pad,y=y_train_enc,epochs=5,batch_size=100,validation_data=(X_test_pad,y_test_enc))

# LSTM

In [52]:
# LSTM over frozen GloVe embeddings, followed by two wide dense layers
# with heavy dropout and a 3-way softmax output.
model = Sequential([
    Embedding(len(word_index)+1, 25,
              weights=[embedding_matrix],
              input_length=max_len,
              trainable=False),

    SpatialDropout1D(0.3),
    LSTM(100, dropout=0.3, recurrent_dropout=0.3),

    Dense(1024, activation='relu'),
    Dropout(0.8),

    Dense(1024, activation='relu'),
    Dropout(0.8),

    Dense(3),
    Activation('softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [53]:
# Train the current `model` for 10 epochs (batch size 100), reporting the loss
# on the held-out split after each epoch.
model.fit(X_train_pad,y=y_train_enc,batch_size=100,epochs=10,verbose=1,
          validation_data=(X_test_pad,y_test_enc))

# GRU

In [54]:
# Same architecture as the LSTM model, with a GRU as the recurrent layer.
model = Sequential([
    Embedding(len(word_index)+1, 25,
              weights=[embedding_matrix],
              input_length=max_len,
              trainable=False),

    SpatialDropout1D(0.3),
    GRU(100),

    Dense(1024, activation='relu'),
    Dropout(0.8),

    Dense(1024, activation='relu'),
    Dropout(0.8),

    Dense(3),
    Activation('softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [55]:
# Train the current `model` for 5 epochs (batch size 100), reporting the loss
# on the held-out split after each epoch.
model.fit(X_train_pad,y=y_train_enc,batch_size=100,epochs=5,verbose=1,
          validation_data=(X_test_pad,y_test_enc))

# Bidirectional LSTM

In [56]:
# Bidirectional LSTM over frozen GloVe embeddings: the sequence is read in both
# directions before the dense layers and the 3-way softmax output.
model = Sequential([
    Embedding(len(word_index)+1, 25,
              weights=[embedding_matrix],
              input_length=max_len,
              trainable=False),

    Bidirectional(LSTM(25, dropout=0.3, recurrent_dropout=0.3)),

    Dense(1024, activation='relu'),
    Dropout(0.8),

    Dense(1024, activation='relu'),
    Dropout(0.8),

    Dense(3),
    Activation('softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [57]:
# Train the current `model` for 5 epochs (batch size 100), reporting the loss
# on the held-out split after each epoch.
model.fit(X_train_pad,y=y_train_enc,batch_size=100,epochs=5,verbose=1,
          validation_data=(X_test_pad,y_test_enc))