# Spooky Author Identification

## 1) Importing the libraries

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import nltk
import spacy
import re

## 2) Reading Input files

In [0]:
# Folder holding the competition CSVs; kept in one place so that moving the
# dataset only requires editing this line.
DATA_DIR = 'drive/My Drive/Pytorch_DataSet/Spooky Authors'

# Labelled training data and unlabelled test data.
df_train = pd.read_csv(DATA_DIR + '/train.csv')
df_test = pd.read_csv(DATA_DIR + '/test.csv')

In [3]:
# Preview the first rows of the training data.
df_train.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [4]:
# Column dtypes and non-null counts for the training data.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19579 entries, 0 to 19578
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      19579 non-null  object
 1   text    19579 non-null  object
 2   author  19579 non-null  object
dtypes: object(3)
memory usage: 459.0+ KB


In [5]:
# Preview the first rows of the test data.
df_test.head()

Unnamed: 0,id,text
0,id02310,"Still, as I urged our leaving Ireland with suc..."
1,id24541,"If a fire wanted fanning, it could readily be ..."
2,id00134,And when they had broken down the frail door t...
3,id27757,While I was thinking how I should possibly man...
4,id04081,I am not sure to what limit his knowledge may ...


In [6]:
# Column dtypes and non-null counts for the test data.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8392 entries, 0 to 8391
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      8392 non-null   object
 1   text    8392 non-null   object
dtypes: object(2)
memory usage: 131.2+ KB


In [7]:
# Summary statistics (count / unique / top / freq) for the training columns.
df_train.describe()

Unnamed: 0,id,text,author
count,19579,19579,19579
unique,19579,19579,3
top,id25820,"When any separation took place between us, it ...",EAP
freq,1,1,7900


In [8]:
# Summary statistics (count / unique / top / freq) for the test columns.
df_test.describe()

Unnamed: 0,id,text
count,8392,8392
unique,8392,8392
top,id15469,Years added to the strictness of their union.
freq,1,1


## 3) Data Cleaning

In this section we clean the text by:<br>
- Removing leading and trailing whitespace.
- Removing any non-alphabetic characters.
- Lowercasing all words.
- Removing punctuation marks.
- Removing stop words.


In [9]:
# Sanity check: every non-letter character (here, each digit) becomes a space.
re.sub('[^a-zA-Z]',' ','123')

'   '

In [10]:
# Sanity check: remove anything that is neither a word character nor
# whitespace, i.e. punctuation (including the apostrophe inside "Wit'h").
s = "string. Wit'h. Punctuation?"
re.sub(r'[^\w\s]','',s)

'string With Punctuation'

In [11]:
# Fetch the NLTK stop-word corpus used later for stop-word removal.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
# Fetch the 'punkt' tokenizer data; presumably required by word_tokenize
# below -- confirm against the NLTK documentation.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
from nltk.corpus import stopwords
#print(stopwords.words('english'))

In [0]:
from nltk.tokenize import word_tokenize 
# English stop words held as a set, used for the membership test in text_cleaning.
stop_words = set(stopwords.words('english'))

In [0]:
# lets write function for it.

def text_cleaning(text):
  """Clean a raw sentence and return its tokens with stop words removed.

  Steps: replace every character outside a-z/A-Z with a space, strip
  leading/trailing whitespace, lowercase, tokenize with ``word_tokenize``
  and drop tokens found in the module-level ``stop_words`` set.

  Args:
    text (str): raw input sentence.

  Returns:
    list[str]: lowercase tokens that are not stop words.
  """
  # Digits, punctuation and every other non-letter become spaces here, so
  # only letters and spaces remain and no separate punctuation pass is needed.
  text = re.sub(r'[^a-zA-Z]', ' ', text)
  text = text.strip().lower()
  word_tokens = word_tokenize(text)
  return [w for w in word_tokens if w not in stop_words]

In [0]:
# Clean the text column of both the train and the test frame.
# The column is overwritten, so it holds lists of tokens afterwards.
for frame in (df_train, df_test):
  frame['text'] = frame['text'].apply(text_cleaning)

In [21]:
# Inspect the token list produced for the first training row.
print(df_train['text'][0])

['process', 'however', 'afforded', 'means', 'ascertaining', 'dimensions', 'dungeon', 'might', 'make', 'circuit', 'return', 'point', 'whence', 'set', 'without', 'aware', 'fact', 'perfectly', 'uniform', 'seemed', 'wall']


Now that the text has been cleaned and tokenized, the last step is:<br>
- Creating a dictionary for the text.

In [0]:
# print(df_train['text'].head())

In [0]:
# print(df_test['text'].head())

In [0]:
# One set of unique tokens per row: train rows first, then test rows.
txt = [
  set(tokens)
  for column in (df_train['text'], df_test['text'])
  for tokens in column
]


In [27]:
# Number of token sets collected (one per train row plus one per test row).
print(len(txt))

27971
