# Spooky Author Identification

## 1) Importing the libraries

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import nltk
import spacy
import re

## 2) Reading Input files

In [0]:
df_train = pd.read_csv('drive/My Drive/Pytorch_DataSet/Spooky Authors/train.csv')
df_test = pd.read_csv('drive/My Drive/Pytorch_DataSet/Spooky Authors/test.csv')

In [3]:
df_train.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19579 entries, 0 to 19578
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      19579 non-null  object
 1   text    19579 non-null  object
 2   author  19579 non-null  object
dtypes: object(3)
memory usage: 459.0+ KB


In [5]:
df_test.head()

Unnamed: 0,id,text
0,id02310,"Still, as I urged our leaving Ireland with suc..."
1,id24541,"If a fire wanted fanning, it could readily be ..."
2,id00134,And when they had broken down the frail door t...
3,id27757,While I was thinking how I should possibly man...
4,id04081,I am not sure to what limit his knowledge may ...


In [6]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8392 entries, 0 to 8391
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      8392 non-null   object
 1   text    8392 non-null   object
dtypes: object(2)
memory usage: 131.2+ KB


In [7]:
df_train.describe()

Unnamed: 0,id,text,author
count,19579,19579,19579
unique,19579,19579,3
top,id02045,"I spoke at length, and perseveringly of my dev...",EAP
freq,1,1,7900


In [8]:
df_test.describe()

Unnamed: 0,id,text
count,8392,8392
unique,8392,8392
top,id11038,In the fourth niche the statue was veiled; it ...
freq,1,1


## 3) Data Cleaning

For this part,<br>
- Removing leading and trailing white spaces.
- Removing any non text character.
- Lowercase all the words.
- Removing punctuation marks.
- Removing stop words.


In [9]:
re.sub('[^a-zA-Z]',' ','123')

'   '

In [10]:
s = "string. Wit'h. Punctuation?"
re.sub(r'[^\w\s]','',s)

'string With Punctuation'

In [11]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [12]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
from nltk.corpus import stopwords
#print(stopwords.words('english'))

In [0]:
from nltk.tokenize import word_tokenize 
stop_words = set(stopwords.words('english'))

In [0]:
# lets write function for it.

def text_cleaning(text):
  text = re.sub(r'[^a-zA-Z]',' ',text) # removing non-character text
  text = re.sub(r'[^\w\s]',' ',text)    # removing punctuation marks
  text = text.strip()                  # removing leading and trailing white spaces
  text = text.lower()                  # converting everything to lowercase
  return text

In [0]:
# For train file

df_train['text'] = df_train['text'].apply(lambda x : text_cleaning(x))

# For test file

df_test['text'] = df_test['text'].apply(lambda x : text_cleaning(x))

In [17]:
print(df_train['text'][0])

this process  however  afforded me no means of ascertaining the dimensions of my dungeon  as i might make its circuit  and return to the point whence i set out  without being aware of the fact  so perfectly uniform seemed the wall


Now, we will perform<br>
- As text is given, so at last creating a dictionary for text.

In [0]:
# print(df_train['text'].head())

In [0]:
# print(df_test['text'].head())

In [0]:
txt = []
for t in df_train['text']:
  a = set(t)
  txt.append(a)

for t in df_test['text']:
  a = set(t)
  txt.append(a)


In [21]:
print(len(txt))

27971


In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
X = df_train['text'].values
X_test = df_test['text'].values

In [0]:
vectorizer = TfidfVectorizer(stop_words='english')  # removing stop words as well

In [25]:
# Example

corpus = ['This is the first document.','This document is the second document.','And this is the third one.','Is this the first document?']
a = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())

['document', 'second']


In [0]:
X = vectorizer.fit_transform(X)
X_test = vectorizer.transform(X_test)

In [27]:
print(len(vectorizer.get_feature_names()))

24748


In [28]:
print(X.shape)

(19579, 24748)


In [29]:
print(X_test.shape)

(8392, 24748)


In [0]:
# for y now
author_name = {'EAP':0,'HPL':1,'MWS':2}
y = (df_train['author'].map(author_name)).values 

In [31]:
print(y.shape, y[:5])

(19579,) [0 1 0 2 1]


In [0]:
# Coverting into test val and train set
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.1)


In [33]:
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

(17621, 24748) (1958, 24748) (17621,) (1958,)


## 4) Building and Applying the model

### 4.1) Lets try for XGBoost

In [0]:
import xgboost as xgb

In [0]:
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dval = xgb.DMatrix(data = X_val,label = y_val)
dtest = xgb.DMatrix(data=X_test)

In [0]:
eval_set = [(X_val,y_val)]

In [0]:
params = {'objective':'multi:softmax',
          'num_class':3,
          'eval_metric':'mlogloss'}

In [0]:
model = xgb.XGBClassifier(objective='multi:softmax',
                          num_class=3,
                          eval_metric='mlogloss',
                          max_depth=6,
                          n_estimators=1000,
                          learning_rate=0.2,
                          early_stopping_rounds=10,
                          n_jobs=-1
                          )

In [58]:
model.fit(X_train,y_train,eval_metric='mlogloss',eval_set=eval_set,verbose=True)

[0]	validation_0-mlogloss:1.07636
[1]	validation_0-mlogloss:1.06044
[2]	validation_0-mlogloss:1.04637
[3]	validation_0-mlogloss:1.03635
[4]	validation_0-mlogloss:1.02639
[5]	validation_0-mlogloss:1.01734
[6]	validation_0-mlogloss:1.00927
[7]	validation_0-mlogloss:1.00263
[8]	validation_0-mlogloss:0.99424
[9]	validation_0-mlogloss:0.988788
[10]	validation_0-mlogloss:0.982884
[11]	validation_0-mlogloss:0.9768
[12]	validation_0-mlogloss:0.972137
[13]	validation_0-mlogloss:0.967347
[14]	validation_0-mlogloss:0.962746
[15]	validation_0-mlogloss:0.958141
[16]	validation_0-mlogloss:0.953641
[17]	validation_0-mlogloss:0.949955
[18]	validation_0-mlogloss:0.946121
[19]	validation_0-mlogloss:0.942175
[20]	validation_0-mlogloss:0.938495
[21]	validation_0-mlogloss:0.934953
[22]	validation_0-mlogloss:0.931957
[23]	validation_0-mlogloss:0.928853
[24]	validation_0-mlogloss:0.925378
[25]	validation_0-mlogloss:0.922971
[26]	validation_0-mlogloss:0.919854
[27]	validation_0-mlogloss:0.916832
[28]	validati

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, early_stopping_rounds=10,
              eval_metric='mlogloss', gamma=0, learning_rate=0.2,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=None,
              n_estimators=1000, n_jobs=-1, nthread=None, num_class=3,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
              subsample=1, verbosity=1)

In [0]:
y_preds = model.predict_proba(X_test)

In [60]:
y_preds[:10]

array([[0.12332056, 0.14466359, 0.73201585],
       [0.8331536 , 0.13159879, 0.03524762],
       [0.39926383, 0.5744908 , 0.02624541],
       [0.65514195, 0.2472678 , 0.09759019],
       [0.8221283 , 0.11761667, 0.06025506],
       [0.53745675, 0.42011887, 0.04242436],
       [0.77236515, 0.17145441, 0.05618045],
       [0.01549937, 0.32452118, 0.65997946],
       [0.8111171 , 0.13894998, 0.04993295],
       [0.7314521 , 0.11580572, 0.15274216]], dtype=float32)

In [66]:
y_preds[0:10,0]

array([0.12332056, 0.8331536 , 0.39926383, 0.65514195, 0.8221283 ,
       0.53745675, 0.77236515, 0.01549937, 0.8111171 , 0.7314521 ],
      dtype=float32)

In [0]:
submission_file = pd.read_csv('drive/My Drive/Pytorch_DataSet/Spooky Authors/sample_submission.csv')

In [62]:
submission_file.head()

Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.403494,0.287808,0.308698
1,id24541,0.403494,0.287808,0.308698
2,id00134,0.403494,0.287808,0.308698
3,id27757,0.403494,0.287808,0.308698
4,id04081,0.403494,0.287808,0.308698


In [0]:
submission_file['EAP'] = y_preds[:,0]
submission_file['HPL'] = y_preds[:,1]
submission_file['MWS'] = y_preds[:,2]

In [68]:
submission_file.head()

Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.123321,0.144664,0.732016
1,id24541,0.833154,0.131599,0.035248
2,id00134,0.399264,0.574491,0.026245
3,id27757,0.655142,0.247268,0.09759
4,id04081,0.822128,0.117617,0.060255


In [0]:
submission_file.to_csv('spooky_authors.csv',index=False)