In [1]:
import re    
import nltk  
import string
import warnings
import numpy as np
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 

# Show up to 200 characters per cell so long headlines are not truncated in displayed frames.
pd.set_option("display.max_colwidth", 200)
# Hide DeprecationWarning messages emitted by the libraries used below.
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# Load the labelled training set and the unlabelled test set from the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Report the shapes first, then the column names, train before test each time.
for frame in (train, test):
    print(frame.shape)
for frame in (train, test):
    print(frame.columns)

(24209, 4)
(2500, 3)
Index(['Unnamed: 0', 'article_link', 'headline', 'is_sarcastic'], dtype='object')
Index(['Unnamed: 0', 'article_link', 'headline'], dtype='object')


In [3]:
# Preview the first rows of the training data (link, headline text and the is_sarcastic label).
train.head()

Unnamed: 0.1,Unnamed: 0,article_link,headline,is_sarcastic
0,0,https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5,former versace store clerk sues over secret 'black code' for minority shoppers,0
1,1,https://www.huffingtonpost.com/entry/roseanne-revival-review_us_5ab3a497e4b054d118e04365,"the 'roseanne' revival catches up to our thorny political mood, for better and worse",0
2,2,https://local.theonion.com/mom-starting-to-fear-son-s-web-series-closest-thing-she-1819576697,mom starting to fear son's web series closest thing she will have to grandchild,1
3,3,https://politics.theonion.com/boehner-just-wants-wife-to-listen-not-come-up-with-alt-1819574302,"boehner just wants wife to listen, not come up with alternative debt-reduction ideas",1
4,4,https://www.huffingtonpost.com/entry/jk-rowling-wishes-snape-happy-birthday_us_569117c4e4b0cad15e64fdcb,j.k. rowling wishes snape happy birthday in the most magical way,0


In [4]:
# Class balance: number of training rows for each is_sarcastic label.
train["is_sarcastic"].value_counts()

0    13542
1    10667
Name: is_sarcastic, dtype: int64

# Data Cleaning

In [5]:
# Stack train and test (train rows first) so both receive the same text cleaning.
# DataFrame.append is deprecated and was removed in pandas 2.0; pd.concat is the
# supported equivalent. sort=False keeps the column order of `train` and avoids
# the column-sorting FutureWarning. Rows coming from `test` have no
# is_sarcastic value, so that column is NaN for them.
combi = pd.concat([train, test], ignore_index=True, sort=False)
print(combi.shape)
print(combi.columns)

(26709, 4)
Index(['Unnamed: 0', 'article_link', 'headline', 'is_sarcastic'], dtype='object')


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [6]:
# Discard the two columns that the rest of the notebook never reads.
unused_columns = ['Unnamed: 0', 'article_link']
combi = combi.drop(columns=unused_columns)
print(combi.shape)
combi.head()

(26709, 2)


Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'black code' for minority shoppers,0.0
1,"the 'roseanne' revival catches up to our thorny political mood, for better and worse",0.0
2,mom starting to fear son's web series closest thing she will have to grandchild,1.0
3,"boehner just wants wife to listen, not come up with alternative debt-reduction ideas",1.0
4,j.k. rowling wishes snape happy birthday in the most magical way,0.0


In [7]:
def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of the regex ``pattern`` removed.

    Args:
        input_txt: Text to clean.
        pattern: Regular expression (string or compiled pattern) whose
            matches are deleted from the text.

    Returns:
        The text with all non-overlapping matches of ``pattern`` replaced by
        the empty string. Text without any match is returned unchanged.
    """
    # Substitute with the pattern itself. Feeding each matched substring back
    # into re.sub as if it were a regex would misbehave on matches containing
    # metacharacters such as "$" or "(", fail for patterns with groups
    # (findall then yields tuples), and could strip pieces of longer matches.
    return re.sub(pattern, '', input_txt)

In [8]:
# Remove "@handle"-style tokens from every headline and store the result in a new column.
# The pattern is a raw string so "\w" reaches the regex engine as written instead of
# relying on an invalid string escape (which newer Python versions warn about).
combi['tidy_headline'] = np.vectorize(remove_pattern)(combi['headline'], r"@[\w]*")

In [9]:
# Replace every character that is not an ASCII letter or '#' with a space.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave the text unchanged.
combi['tidy_headline'] = combi['tidy_headline'].str.replace("[^a-zA-Z#]", " ", regex=True)

In [10]:
# removing short words: only tokens longer than 2 characters are kept (length <= 2 is dropped)
combi['tidy_headline'] = combi['tidy_headline'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))

In [11]:
# Tokenizing: split each cleaned headline on whitespace into a list of words.
tokenized_headline = combi['tidy_headline'].apply(str.split)

In [12]:
# Import only the stemmer class rather than `import *`, so the origin of
# PorterStemmer is explicit and the notebook namespace is not polluted.
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

# Stemming: replace every token in every headline with its Porter stem.
tokenized_headline = tokenized_headline.apply(lambda x: [stemmer.stem(i) for i in x]) # stemming

In [13]:
# Join each list of stemmed tokens back into one space-separated string.
# A single apply replaces the row-by-row Series assignment loop, which was slow
# and depended on the index labels being 0..n-1.
tokenized_headline = tokenized_headline.apply(' '.join)

combi['tidy_headline'] = tokenized_headline

In [14]:
# Concatenate all cleaned headlines into one corpus string.
all_words = ' '.join(combi['tidy_headline'])
# Preview only the beginning: echoing the whole corpus floods the cell output.
all_words[:1000]

'former versac store clerk sue over secret black code for minor shopper the roseann reviv catch our thorni polit mood for better and wors mom start fear son web seri closest thing she will have grandchild boehner just want wife listen not come with altern debt reduct idea rowl wish snape happi birthday the most magic way advanc the world women the fascin case for eat lab grown meat thi ceo will send your kid school you work for hi compani top snake handler leav sink huckabe campaign friday morn email insid trump presser for the age airlin passeng tackl man who rush cockpit bomb threat facebook reportedli work healthcar featur and app north korea prais trump and urg voter reject dull hillari actual cnn jeffrey lord ha been indefens for while barcelona hold huge protest support refuge nuclear bomb deton dure rehears for spider man music cosbi lawyer ask whi accus didn come forward smear legal team year ago stock analyst confus frighten boar market bloomberg program build better citi just

# Bag-of-Words (BoW) features

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import gensim
# Bag-of-words counts over the cleaned headlines: vocabulary capped at 1000 terms,
# English stop words removed, max_df / min_df filtering terms by document frequency.
# NOTE(review): the vectorizer is fit on train and test rows together (combi) — confirm this is intended.
bow_vectorizer = CountVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
bow = bow_vectorizer.fit_transform(combi['tidy_headline'])
# One row per headline in combi, one column per vocabulary term.
bow.shape

(26709, 1000)

# Word2Vec (W2V) embeddings

In [16]:
# Re-tokenize the cleaned, stemmed headlines: one list of tokens per headline.
tokenized_headline = combi['tidy_headline'].apply(lambda x: x.split()) # tokenizing

# Word2Vec model over all headlines (train and test rows of combi).
# NOTE(review): `size` is the gensim 3.x keyword for the vector dimension; gensim 4
# renamed it to `vector_size` — confirm which gensim version this notebook targets.
# NOTE(review): workers=2 — multi-threaded training may not be exactly repeatable
# even with a fixed seed; verify if exact reproducibility matters.
model_w2v = gensim.models.Word2Vec(
            tokenized_headline,
            size=200, # desired no. of features/independent variables 
            window=5, # context window size
            min_count=2,
            sg = 1, # 1 for skip-gram model
            hs = 0,
            negative = 10, # for negative sampling
            workers= 2, # no.of cores
            seed = 34)

# NOTE(review): the corpus was already passed to the constructor above, which
# presumably trains the model once; this call then adds 20 more epochs — confirm intended.
model_w2v.train(tokenized_headline, total_examples= len(combi['tidy_headline']), epochs=20)

4098648

In [17]:
# Inspect the learned vector for one stemmed token. Vectors are read through
# `.wv`; indexing the model object directly is deprecated and removed in gensim 4.
model_w2v.wv['shoot']

array([-0.12511332,  0.11777612,  0.09796016, -0.27913436, -0.98057705,
       -0.28638926,  0.1771073 , -0.42630872,  0.4621546 ,  0.06623623,
        0.07683232,  0.14618652,  0.14486255, -0.23125805,  0.06555056,
       -0.14593574, -0.5995805 , -0.36877313,  0.26268363,  0.45103145,
        0.24778166, -0.36614   , -0.00461038,  0.28543863, -0.0723221 ,
        0.07358675, -0.08821044, -0.0132716 , -0.11278228,  0.6290427 ,
        0.25734285,  0.00849721, -0.33464864, -0.4048819 ,  0.40205607,
       -0.20684934,  0.22297747, -0.10944162,  0.3296759 ,  0.42900002,
        0.2580317 ,  0.6556395 ,  0.5220587 ,  0.13242653,  0.43496183,
       -0.6908441 , -0.22461087,  0.4631681 ,  0.22002508, -0.9605873 ,
       -0.42160895, -0.7366161 ,  0.8458495 , -0.46645343,  0.3477492 ,
       -0.4563509 , -0.06616057,  0.5776974 ,  0.1465769 , -0.4386832 ,
        0.38673794,  0.269511  , -0.12971172,  0.16256593,  0.10626994,
       -0.31109202, -0.04613613, -0.35796127, -0.2540674 ,  0.17

In [18]:
# Number of dimensions of a word vector (read through `.wv`, since indexing the
# model object directly is deprecated and removed in gensim 4).
len(model_w2v.wv['shoot'])

200

In [19]:
def word_vector(tokens, size):
    """Average the Word2Vec vectors of ``tokens`` into a single feature row.

    Args:
        tokens: Iterable of word strings belonging to one document.
        size: Dimensionality of the vectors stored in the global ``model_w2v``.

    Returns:
        A ``(1, size)`` float array holding the mean vector of the tokens that
        are present in the model vocabulary, or all zeros when none of them is.
    """
    vec = np.zeros((1, size))
    count = 0
    for word in tokens:
        try:
            # Look vectors up through `.wv`: indexing the model object itself
            # is deprecated in gensim 3.x and removed in gensim 4.
            vec += model_w2v.wv[word].reshape((1, size))
        except KeyError:
            # Token is not in the model vocabulary; it does not contribute.
            continue
        count += 1
    if count:
        # Turn the running sum into a mean over the tokens that were found.
        vec /= count
    return vec

In [20]:
# One row per headline, each filled with its averaged 200-dimensional word vector.
n_docs = len(tokenized_headline)
wordvec_arrays = np.zeros((n_docs, 200))

for row, tokens in enumerate(tokenized_headline):
    wordvec_arrays[row, :] = word_vector(tokens, 200)

wordvec_df = pd.DataFrame(wordvec_arrays)
wordvec_df.shape

(26709, 200)

# Model: logistic regression on BoW features

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [22]:
# Extracting train and test BoW features
train_bow = bow[:24209,:]
test_bow = bow[24209:,:]
print(train_bow.shape)

train.shape

(24209, 1000)


(24209, 4)

In [23]:
# Hold out 30% of the labelled rows for validation; the fixed seed keeps the split repeatable.
labels = train['is_sarcastic']
xtrain_bow, xvalid_bow, ytrain, yvalid = train_test_split(
    train_bow,
    labels,
    test_size=0.3,
    random_state=42,
)

In [24]:
# Baseline classifier: logistic regression on the bag-of-words counts.
lreg = LogisticRegression()

lreg.fit(xtrain_bow, ytrain)

# Column 1 of predict_proba is the predicted probability of label 1 (sarcastic).
prediction = lreg.predict_proba(xvalid_bow)
# Predict label 1 when that probability is at least 0.3.
# The builtin `int` replaces `np.int`, an alias that was removed in NumPy 1.24.
prediction_int = (prediction[:, 1] >= 0.3).astype(int)

f1_score(yvalid, prediction_int)

0.7218355246197473

# XGBoost

In [26]:
import xgboost as xgb
# Split the averaged word vectors back into train and test rows. The boundary
# is taken from the number of training rows instead of a hard-coded count.
n_train = len(train)
train_w2v = wordvec_df.iloc[:n_train, :]
test_w2v = wordvec_df.iloc[n_train:, :]

# Reuse the rows chosen by the earlier train/validation split so the labels
# line up with the features. ytrain/yvalid carry index labels from `train`;
# using them with .iloc relies on `train` having the default 0..n-1 index, so
# that labels and positions coincide.
xtrain_w2v = train_w2v.iloc[ytrain.index, :]
xvalid_w2v = train_w2v.iloc[yvalid.index, :]

In [27]:
from xgboost import XGBClassifier
# Fit an XGBoost classifier on the averaged Word2Vec features of the training split.
# NOTE(review): this rebinds `xgb`, which the preceding cell bound to the xgboost
# module via `import xgboost as xgb`; after this line `xgb` is the fitted classifier.
xgb = XGBClassifier(max_depth=6, n_estimators=1000, nthread= 3).fit(xtrain_w2v, ytrain)

# Hard class predictions on the validation split, scored with F1.
prediction = xgb.predict(xvalid_w2v)
f1_score(yvalid, prediction)

0.7919886453240813