In [1]:
import pandas as pd
from sklearn import preprocessing, model_selection
from sklearn.linear_model import LogisticRegression
import numpy as np

In [2]:
train_pd = pd.read_csv("./train.csv")
test_pd = pd.read_csv("./test.csv")
sample_pd = pd.read_csv("./sample_submission.csv")

In [3]:
sample_pd.head(5)

Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.403494,0.287808,0.308698
1,id24541,0.403494,0.287808,0.308698
2,id00134,0.403494,0.287808,0.308698
3,id27757,0.403494,0.287808,0.308698
4,id04081,0.403494,0.287808,0.308698


Predict the author's probabilities

In [4]:
train_pd.head(5)

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


Text ids, the text and related authors are given in the training set

In [5]:
test_pd.head(5)

Unnamed: 0,id,text
0,id02310,"Still, as I urged our leaving Ireland with suc..."
1,id24541,"If a fire wanted fanning, it could readily be ..."
2,id00134,And when they had broken down the frail door t...
3,id27757,While I was thinking how I should possibly man...
4,id04081,I am not sure to what limit his knowledge may ...


Given the text, predict one of the authors. It is a multiclass label prediction.

Encode the labels to integers

In [6]:
label_enc = preprocessing.LabelEncoder()
y = label_enc.fit_transform(train_pd.author)

Split training data into train and validation

In [7]:
# x is the text, and y are the labels
xtrain, xvalid, ytrain, yvalid = model_selection.train_test_split(train_pd.text.values, y,
                                                stratify = y,
                                                random_state = 42,
                                                test_size=0.1, shuffle=True)

In [8]:
print(xtrain.shape, ytrain.shape)
print(xvalid.shape, yvalid.shape)

(17621,) (17621,)
(1958,) (1958,)


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectorizer.fit(xtrain)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [10]:
print(vectorizer.vocabulary_ )



In [11]:
print(vectorizer.idf_)

[10.08375622 10.08375622  9.67829111 ... 10.08375622 10.08375622
 10.08375622]


In [12]:
vector = vectorizer.transform([xtrain[0]])

In [13]:
print(vector.shape)

(1, 23979)


In [14]:
print(vector.toarray)

<bound method _cs_matrix.toarray of <1x23979 sparse matrix of type '<class 'numpy.float64'>'
	with 19 stored elements in Compressed Sparse Row format>>


TFIDF

In [15]:
tfidfvector = TfidfVectorizer(min_df=3, max_features=None,
                            strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                            ngram_range=(1,3), use_idf=1, smooth_idf=1, sublinear_tf=1,
                             stop_words='english')

In [16]:
tfidfvector.fit(list(xtrain) + list(xvalid))

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=1,
        stop_words='english', strip_accents='unicode', sublinear_tf=1,
        token_pattern='\\w{1,}', tokenizer=None, use_idf=1,
        vocabulary=None)

In [17]:
xtrain_tfidfvector = tfidfvector.transform(xtrain)
xvalid_tfidfvector = tfidfvector.transform(xvalid)

Fit a regression model

In [18]:
lr = LogisticRegression(C=1.0)
lr.fit(xtrain_tfidfvector, ytrain)
predictions = lr.predict_proba(xvalid_tfidfvector)

In [19]:
#Calculate log loss 
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.
    :param actual: Array containing the actual target classes
    :param predicted: Matrix with class predictions, one probability per class
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

In [20]:
print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.626 


Word Counts

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
countvector = CountVectorizer(analyzer = 'word', token_pattern=r'\w{1,}',
                            ngram_range=(1,3), stop_words= 'english')

In [25]:
#fit the data
countvector.fit(list(xtrain)+list(xvalid))
xtrain_countvector = countvector.transform(xtrain)
xvalid_countvector = countvector.transform(xvalid)

In [26]:
#fit Logistic regression
lr_cv = LogisticRegression(C=1.0)
lr_cv.fit(xtrain_countvector, ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [28]:
predictions_cv = lr_cv.predict_proba(xvalid_countvector)

In [29]:
print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions_cv))

logloss: 0.528 


In [31]:
#Fitting Naive Bayes on tfidf
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(xtrain_tfidfvector, ytrain)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [32]:
predictions_nb_tfidf = nb.predict_proba(xvalid_tfidfvector)

In [33]:
print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions_nb_tfidf))

logloss: 0.578 


In [34]:
#Naive bayes on counts
nb.fit(xtrain_countvector, ytrain)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [35]:
predictions_nb_cv = nb.predict_proba(xvalid_countvector)

In [36]:
print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions_nb_cv))

logloss: 0.485 


SVD

In [39]:
#Reduce dimensionality before applying SVM. And standardise the data for SVD
#SVM on TFIDF data
from sklearn.decomposition import TruncatedSVD


In [47]:
svd = TruncatedSVD(n_components=120)
svd.fit(xtrain_tfidfvector)
xtrain_svd_tf = svd.transform(xtrain_tfidfvector)
xvalid_svd_tf = svd.transform(xvalid_tfidfvector)

In [48]:
#Standardise the data
scl = preprocessing.StandardScaler()

In [49]:
scl.fit(xtrain_svd)
xtrain_svd_scl = scl.transform(xtrain_svd_tf)
xvalid_svd_scl = scl.transform(xvalid_svd_tf)

In [50]:
from sklearn.svm import SVC
svd_model = SVC(C=1.0, probability=True)

In [53]:
svd_model.fit(xtrain_svd_scl, ytrain)
predictions_svm_tfidf = svd_modeldeldeldelmodel.predict_proba(xvalid_svd_scl)
print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions_svm_tfidf))

logloss: 0.725 
