In [1]:
import pandas as pd
import re

In [2]:
train = pd.read_csv("raw_data\\fulltrain.csv", header=None)
test = pd.read_csv("raw_data\\balancedtest.csv", header=None)

In [3]:
train

Unnamed: 0,0,1
0,1,"A little less than a decade ago, hockey fans w..."
1,1,The writers of the HBO series The Sopranos too...
2,1,Despite claims from the TV news outlet to offe...
3,1,After receiving 'subpar' service and experienc...
4,1,After watching his beloved Seattle Mariners pr...
...,...,...
48849,4,The ruling Kuomintang (KMT) has claimed owners...
48850,4,The Taipei city government has encouraged the ...
48851,4,President Ma Ying-jeou said Friday that a park...
48852,4,The families of the four people who were kille...


In [4]:
test

Unnamed: 0,0,1
0,1,When so many actors seem content to churn out ...
1,1,In what football insiders are calling an unex...
2,1,In a freak accident following Game 3 of the N....
3,1,North Koreas official news agency announced to...
4,1,The former Alaska Governor Sarah Palin would b...
...,...,...
2995,4,The Air Force mistakenly gave rival companies ...
2996,4,The United Nations climate chief on Friday cha...
2997,4,River Plate midfielder Diego Buonanotte has un...
2998,4,Lawmakers were on the brink Tuesday of exempti...


## 1. Obtain the basic features

In [5]:
#Change columns name
train.columns = ['Verdict','Text']

In [6]:
train

Unnamed: 0,Verdict,Text
0,1,"A little less than a decade ago, hockey fans w..."
1,1,The writers of the HBO series The Sopranos too...
2,1,Despite claims from the TV news outlet to offe...
3,1,After receiving 'subpar' service and experienc...
4,1,After watching his beloved Seattle Mariners pr...
...,...,...
48849,4,The ruling Kuomintang (KMT) has claimed owners...
48850,4,The Taipei city government has encouraged the ...
48851,4,President Ma Ying-jeou said Friday that a park...
48852,4,The families of the four people who were kille...


In [7]:
test

Unnamed: 0,0,1
0,1,When so many actors seem content to churn out ...
1,1,In what football insiders are calling an unex...
2,1,In a freak accident following Game 3 of the N....
3,1,North Koreas official news agency announced to...
4,1,The former Alaska Governor Sarah Palin would b...
...,...,...
2995,4,The Air Force mistakenly gave rival companies ...
2996,4,The United Nations climate chief on Friday cha...
2997,4,River Plate midfielder Diego Buonanotte has un...
2998,4,Lawmakers were on the brink Tuesday of exempti...


In [8]:
#Check the first 10 lines
test.head(10)

Unnamed: 0,0,1
0,1,When so many actors seem content to churn out ...
1,1,In what football insiders are calling an unex...
2,1,In a freak accident following Game 3 of the N....
3,1,North Koreas official news agency announced to...
4,1,The former Alaska Governor Sarah Palin would b...
5,1,With the first Presidential debate just two da...
6,1,"There are fans, and then there are super-fans...."
7,1,"With its landmark decisions this week, the Uni..."
8,1,Koch Industries is defending its acquisition o...
9,1,Republican lawmakers asked increasingly tough ...


In [9]:
#Change columns name
test.columns = ['Verdict','Text']

In [10]:
#Word count
train['word_count'] = train['Text'].apply(lambda x: len(str(x).split(" ")))
train[['Text', 'word_count']].head()

Unnamed: 0,Text,word_count
0,"A little less than a decade ago, hockey fans w...",147
1,The writers of the HBO series The Sopranos too...,123
2,Despite claims from the TV news outlet to offe...,706
3,After receiving 'subpar' service and experienc...,706
4,After watching his beloved Seattle Mariners pr...,174


In [11]:
#Char count
train['char_count'] = train['Text'].str.len()
train[['Text','char_count']].head()

Unnamed: 0,Text,char_count
0,"A little less than a decade ago, hockey fans w...",873
1,The writers of the HBO series The Sopranos too...,715
2,Despite claims from the TV news outlet to offe...,4443
3,After receiving 'subpar' service and experienc...,3913
4,After watching his beloved Seattle Mariners pr...,1058


In [12]:
#Average word length

def avg_word(sentence):
    words=sentence.split()
    return (sum(len(word) for word in words)/len(words))

train['avg_word'] = train['Text'].apply(lambda x:avg_word(x))
train[['Text','avg_word']].head()

Unnamed: 0,Text,avg_word
0,"A little less than a decade ago, hockey fans w...",4.979452
1,The writers of the HBO series The Sopranos too...,4.860656
2,Despite claims from the TV news outlet to offe...,5.302128
3,After receiving 'subpar' service and experienc...,4.550355
4,After watching his beloved Seattle Mariners pr...,5.115607


In [13]:
# The number of stop words
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
stop=stopwords.words('english')
train['stopwords']=train['Text'].apply(lambda sen:len([x for x in sen.split() if x in stop]))
train[['Text','stopwords']].head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Danny\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Text,stopwords
0,"A little less than a decade ago, hockey fans w...",46
1,The writers of the HBO series The Sopranos too...,43
2,Despite claims from the TV news outlet to offe...,219
3,After receiving 'subpar' service and experienc...,299
4,After watching his beloved Seattle Mariners pr...,59


In [14]:
#The number of special chars
train['hashtags']=train['Text'].apply(lambda sen:len([x for x in sen.split() if x.startswith("#")]))
train[['Text','hashtags']].head()

Unnamed: 0,Text,hashtags
0,"A little less than a decade ago, hockey fans w...",0
1,The writers of the HBO series The Sopranos too...,0
2,Despite claims from the TV news outlet to offe...,0
3,After receiving 'subpar' service and experienc...,0
4,After watching his beloved Seattle Mariners pr...,0


In [15]:
#The number of numerics
train['numerics']=train['Text'].apply(lambda sen:len([x for x in sen.split() if x.isdigit()]))
train[['Text','numerics']].head()

Unnamed: 0,Text,numerics
0,"A little less than a decade ago, hockey fans w...",0
1,The writers of the HBO series The Sopranos too...,1
2,Despite claims from the TV news outlet to offe...,20
3,After receiving 'subpar' service and experienc...,5
4,After watching his beloved Seattle Mariners pr...,0


In [16]:
#The number of upper vocab
train['upper']=train['Text'].apply(lambda sen:len([x for x in sen.split() if x.isupper()]))
train[['Text','upper']].head()

Unnamed: 0,Text,upper
0,"A little less than a decade ago, hockey fans w...",4
1,The writers of the HBO series The Sopranos too...,2
2,Despite claims from the TV news outlet to offe...,9
3,After receiving 'subpar' service and experienc...,13
4,After watching his beloved Seattle Mariners pr...,3


In [17]:
train.head()

Unnamed: 0,Verdict,Text,word_count,char_count,avg_word,stopwords,hashtags,numerics,upper
0,1,"A little less than a decade ago, hockey fans w...",147,873,4.979452,46,0,0,4
1,1,The writers of the HBO series The Sopranos too...,123,715,4.860656,43,0,1,2
2,1,Despite claims from the TV news outlet to offe...,706,4443,5.302128,219,0,20,9
3,1,After receiving 'subpar' service and experienc...,706,3913,4.550355,299,0,5,13
4,1,After watching his beloved Seattle Mariners pr...,174,1058,5.115607,59,0,0,3


## 2. Preprocessing for Text

In [18]:
#Delete HTML
from bs4 import BeautifulSoup
train['Text'] = train['Text'].apply(lambda x: BeautifulSoup(x,'html.parser').get_text())
test['Text'] = test['Text'].apply(lambda x: BeautifulSoup(x,'html.parser').get_text())
test['Text'].head()

0    When so many actors seem content to churn out ...
1     In what football insiders are calling an unex...
2    In a freak accident following Game 3 of the N....
3    North Koreas official news agency announced to...
4    The former Alaska Governor Sarah Palin would b...
Name: Text, dtype: object

In [19]:
#Remove emoji
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)
train['Text']=train['Text'].apply(lambda x: remove_emoji(x))
test['Text']=test['Text'].apply(lambda x: remove_emoji(x))

In [20]:
#Transform to lower letter
train['Text'] = train['Text'].apply(lambda x: x.lower())
test['Text'] = test['Text'].apply(lambda x: x.lower())
test['Text'].head()

0    when so many actors seem content to churn out ...
1     in what football insiders are calling an unex...
2    in a freak accident following game 3 of the n....
3    north koreas official news agency announced to...
4    the former alaska governor sarah palin would b...
Name: Text, dtype: object

In [21]:
#Remove punctuation
import re
train['Text'] = train['Text'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
test['Text'] = test['Text'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
test['Text'].head()

0    when so many actors seem content to churn out ...
1     in what football insiders are calling an unex...
2    in a freak accident following game 3 of the nb...
3    north koreas official news agency announced to...
4    the former alaska governor sarah palin would b...
Name: Text, dtype: object

In [22]:
#Substitute number
import inflect
def to_digit(digit):
    i = inflect.engine()
    if digit.isdigit():
        output = i.number_to_words(digit)
    else:
        output = digit
    return output
train['Text'] = train['Text'].apply(lambda x: to_digit(x))
test['Text'] = test['Text'].apply(lambda x: to_digit(x))
test['Text'].head()

0    when so many actors seem content to churn out ...
1     in what football insiders are calling an unex...
2    in a freak accident following game 3 of the nb...
3    north koreas official news agency announced to...
4    the former alaska governor sarah palin would b...
Name: Text, dtype: object

In [23]:
#Remove the stopwords
from nltk.corpus import stopwords
stop=stopwords.words('english')
train['Text']=train['Text'].apply(lambda sen:" ".join(x for x in sen.split() if x not in stop))
test['Text']=test['Text'].apply(lambda sen:" ".join(x for x in sen.split() if x not in stop))
test['Text'].head()

0    many actors seem content churn performances qu...
1    football insiders calling unexpectedly severe ...
2    freak accident following game 3 nba finals cle...
3    north koreas official news agency announced to...
4    former alaska governor sarah palin would bring...
Name: Text, dtype: object

In [24]:
#Remove the frequency words
freq=pd.Series(' '.join(train['Text']).split()).value_counts()[:10]
freq=list(freq.index)
train['Text']=train['Text'].apply(lambda sen:' '.join(x for x in sen.split() if x not in freq))
test['Text']=test['Text'].apply(lambda sen:' '.join(x for x in sen.split() if x not in freq))
test['Text'].head()

0    many actors seem content churn performances qu...
1    football insiders calling unexpectedly severe ...
2    freak accident following game 3 nba finals cle...
3    north koreas official news agency announced to...
4    former alaska governor sarah palin bring muchn...
Name: Text, dtype: object

In [25]:
# Remove the scarce word
freq = pd.Series(' '.join(train['Text']).split()).value_counts()[-10:]
freq = list(freq.index)
train['Text'] = train['Text'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
test['Text'] = test['Text'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
test['Text'].head()

0    many actors seem content churn performances qu...
1    football insiders calling unexpectedly severe ...
2    freak accident following game 3 nba finals cle...
3    north koreas official news agency announced to...
4    former alaska governor sarah palin bring muchn...
Name: Text, dtype: object

In [26]:
# #Correct the spelling errors
# from textblob import TextBlob
# train['Text'].apply(lambda x: str(TextBlob(x).correct()))
# test['Text'].apply(lambda x: str(TextBlob(x).correct()))
# test['Text'].head()

In [27]:
#Noise Removal
def text_cleaner(text):
    rules = [
        {r'>\s+': u'>'},  # remove spaces after a tag opens or closes
        {r'\s+': u' '},  # replace consecutive spaces
        {r'\s*<br\s*/?>\s*': u'\n'},  # newline after a <br>
        {r'</(div)\s*>\s*': u'\n'},  # newline after </p> and </div> and <h1/>...
        {r'</(p|h\d)\s*>\s*': u'\n\n'},  # newline after </p> and </div> and <h1/>...
        {r'<head>.*<\s*(/head|body)[^>]*>': u''},  # remove <head> to </head>
        {r'<a\s+href="([^"]+)"[^>]*>.*</a>': r'\1'},  # show links instead of texts
        {r'[ \t]*<[^<]*?/?>': u''},  # remove remaining tags
        {r'^\s+': u''}  # remove spaces at the beginning
    ]
    for rule in rules:
      for (k, v) in rule.items():
        regex = re.compile(k)
        text = regex.sub(v, text)
      text = text.rstrip()
    return text.lower()

train['Text']=train['Text'].apply(lambda x: text_cleaner(x))

In [28]:
#Stemming
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()
train['Text']=train['Text'].apply(lambda x:" ".join(ps.stem(word) for word in x.split()))
test['Text']=test['Text'].apply(lambda x:" ".join(ps.stem(word) for word in x.split()))
test['Text'].head()

0    mani actor seem content churn perform quick pa...
1    footbal insid call unexpectedli sever punish n...
2    freak accid follow game 3 nba final cleveland ...
3    north korea offici news agenc announc today mi...
4    former alaska governor sarah palin bring muchn...
Name: Text, dtype: object

In [29]:
#Lemmatization
from textblob import Word
import nltk
nltk.download('wordnet')
train['Text']=train['Text'].apply(lambda x:" ".join([Word(word).lemmatize() for word in x.split()]))
test['Text']=test['Text'].apply(lambda x:" ".join([Word(word).lemmatize() for word in x.split()]))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Danny\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [30]:
train['Text']

0        littl le decad ago hockey fan bless slate game...
1        writer hbo seri soprano took anoth dare storyt...
2        despit claim tv news outlet offer nonstop news...
3        receiv subpar servic experienc unusu long wait...
4        watch belov seattl marin prevail san diego pad...
                               ...                        
48849    rule kuomintang kmt claim ownership slush fund...
48850    taipei citi encourag rebuild lowtomidris resid...
48851    presid yingjeou friday park built commemor jap...
48852    famili four kill landslid nation freeway 3 las...
48853    ministri financ make public saturday name big ...
Name: Text, Length: 48854, dtype: object

# LSTM

### Word2Vec

In [31]:
from gensim.models import Word2Vec, KeyedVectors

In [33]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import re
import spacy
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import string
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.metrics import mean_squared_error

In [34]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(list(train['Text']), list(train['Verdict']), test_size=0.2)

In [36]:
X_valid

['accommod viewer nonstop demand season hardcor porn spice network roll pumpkin spice channel offer 24hour autumnthem pornograph film video world dirtiest slut seen take everi hole theyr appl pick northern massachusett fall foliag tour guid turn hot milf who readi parti spice publicist glenn fitzhugh monday press event proud give fan servic launch friday night soontob autumn porn classic pumpkin snatch cider indian cornhol youll love halloween spootakular featur ron jeremi frankenstein monster cock fitzhugh ad earli subscrib receiv free access payperview skanksgiv broadcast gobblegobbl 2 stuf',
 'economist believ sign could head recess job open last month findlayarea bob evan prompt delug 3 million job applic outofwork american restaur manag tom field confirm tuesday within three day place help want sign bob evan front entranc field reportedli receiv 800000 resum parttim hostess job newli avail posit offer health benefit minimumwag pay dress code mandat standard redandwhit bob evan ker

In [37]:
SIZE=20

# Dimension reduce using Word2Vec from k dimensions where
# k is number of unique words in corpus to size=30.
model = Word2Vec([x.split(" ") for x in X_train], size=SIZE, window=20, min_count=1, workers=30) # Train model.

# only need to save this
encoder = model.wv

# memory here can be freed
model = None

In [38]:
def process_article(text, encoding):
    l = [encoding[i] if i in encoding else np.zeros(SIZE) for i in text.split()]
    return np.vstack(l)

In [39]:
X_train = [process_article(x, encoder) for x in X_train]
X_valid = [process_article(x, encoder) for x in X_valid]

In [40]:
class ArticleDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = torch.LongTensor(y)
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return torch.tensor(self.X[idx]).float(), self.y[idx] - 1

In [41]:
train_dataset = ArticleDataset(X_train, y_train)

In [42]:
# to make actual batches we have to pad or truncate data somehow.
BATCHSIZE = 1
train_loader = DataLoader(ArticleDataset(X_train, y_train), batch_size=BATCHSIZE, shuffle=True)
val_loader = DataLoader(ArticleDataset(X_valid, y_valid), batch_size=BATCHSIZE)

In [410]:
# 100 = embedding dimension specified in word2vec

In [43]:
class LSTMModel(torch.nn.Module):
    def __init__(self, n_hidden):
        super().__init__()
        self.dropout = nn.Dropout(0.3)
        self.lstm = nn.LSTM(SIZE, n_hidden, batch_first=True)
        self.fc = nn.Linear(n_hidden, 4)
        
        self.n_layers = 1
        self.n_hidden = n_hidden
        self.hidden = self.init_hidden(BATCHSIZE)
    
    def forward(self, x):
        '''
        Input here should be already preprocessed into encoded form.
        '''    
        self.hidden = model.init_hidden(1)
        x = self.dropout(x)
        o, self.hidden = self.lstm(x, self.hidden)
        out = self.fc(self.hidden[-1])
        return out
    
    def init_hidden(self, batch_size):
        '''
        Initialize hidden state.
        Create two new tensors with sizes n_layers x batch_size x n_hidden,
        initialized to zero, for hidden state and cell state of LSTM
        Arguments:
            batch_size: batch size, an integer
        Returns:
            hidden: hidden state initialized
        '''
        weight = next(self.parameters()).data
        if train_on_gpu:
            hidden = (weight.new(batch_size, self.n_layers, self.n_hidden).zero_().cuda(),
                      weight.new(batch_size, self.n_layers, self.n_hidden).zero_().cuda())

        else:
            hidden = (weight.new(batch_size, self.n_layers, self.n_hidden).zero_(),
                      weight.new(batch_size, self.n_layers, self.n_hidden).zero_())

        return hidden

In [44]:
def count_parameters(model):
    params = [p.numel() for p in model.parameters() if p.requires_grad]
    for item in params:
        print(f'{item:>8}')
    print(f'________\n{sum(params):>8}')

In [45]:
def trainmodel(model, epochs=1, lr=0.001):
    model.cuda()
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y in train_loader:
            y_pred = model(x.cuda())[0]
            optimizer.zero_grad()
            loss = criterion(y_pred, y.cuda())
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()
            total += 1
        val_loss, val_acc, val_rmse, x = validation_metrics(model, val_loader)
        print("train loss %.3f, val loss %.3f, val accuracy %.3f, and val rmse %.3f" % (sum_loss/total, val_loss, val_acc, val_rmse))

In [46]:
def validation_metrics(model, val_loader):
    model.eval()
    model.cuda()
    correct = 0
    total = 0
    sum_loss = 0.0
    sum_rmse = 0.0
    preds = []
    
    for x, y in val_loader:
        y_pred = model(x.cuda())[0]
        y_pred = y_pred.cpu()
        loss = criterion(y_pred, y)
        pred = torch.max(y_pred, 1)[1]
        preds.append(pred)
        correct += (pred == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
        sum_rmse += np.sqrt(mean_squared_error(pred, y.unsqueeze(-1)))*y.shape[0]
    return sum_loss/total, correct/total, sum_rmse/total, preds

In [57]:
train_on_gpu = True
model = LSTMModel(20)
count_parameters(model)

    1600
    1600
      80
      80
      80
       4
________
    3444


In [58]:
criterion = nn.CrossEntropyLoss()
trainmodel(model)

train loss 0.413, val loss 0.273, val accuracy 0.909, and val rmse 0.192


### Test

In [59]:
test['Encoded'] = test['Text'].apply(lambda x: process_article(x, encoder))

In [60]:
test_loader = DataLoader(ArticleDataset(test['Encoded'], test['Verdict']), batch_size=BATCHSIZE)

In [61]:
a, b, c, d = validation_metrics(model, test_loader)

In [62]:
d = [x.item() + 1 for x in d]

In [63]:
test['Verdict']

0       1
1       1
2       1
3       1
4       1
       ..
2995    4
2996    4
2997    4
2998    4
2999    4
Name: Verdict, Length: 3000, dtype: int64

In [64]:
from sklearn.metrics import precision_recall_fscore_support

In [65]:
score2 = precision_recall_fscore_support(test['Verdict'], d, average='macro')

In [66]:
score2

(0.6719366130464278, 0.673, 0.6686305300467905, None)

## 3. TF-IDF + Logistic Regression

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Always start with these features. They work (almost) everytime!
tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')

# Fitting TF-IDF to both training and test sets (semi-supervised learning)
tfv.fit(list(train['Text']))
xtrain_tfv =  tfv.transform(train['Text']) 
# xvalid_tfv = tfv.transform(xvalid)
xtest_tfv = tfv.transform(test['Text'])

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
#Fitting a simple Logistic Regression on TFIDF
clf = LogisticRegression(C=1.0, solver='lbfgs',max_iter=3000)
clf.fit(xtrain_tfv, train['Verdict'])
predictions = clf.predict(xtest_tfv)

In [None]:
score = f1_score(test['Verdict'], predictions, average='macro')
print (score)

0.7271046886192852


In [None]:
xtrain_tfv.shape

(48854, 1064211)

In [None]:
from sklearn.metrics import precision_recall_fscore_support

In [None]:
score2 = precision_recall_fscore_support(test['Verdict'], predictions, average='macro')
print(score2)

(0.7491595685381429, 0.738, 0.7271046886192852, None)


In [None]:
score3 = precision_recall_fscore_support(test['Verdict'], predictions, average='micro')
print(score3)

(0.738, 0.738, 0.738, None)


In [None]:
score4 = precision_recall_fscore_support(test['Verdict'], predictions, average='weighted')
print(score4)

(0.7491595685381429, 0.738, 0.7271046886192853, None)
