# SureStart Day 5: Action Item

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/news-headlines-dataset-for-sarcasm-detection/Sarcasm_Headlines_Dataset_v2.json
/kaggle/input/news-headlines-dataset-for-sarcasm-detection/Sarcasm_Headlines_Dataset.json


In [2]:
# Reading the data

import json

def parse_data(file):
    for l in open(file,'r'):
        yield json.loads(l)

data = list(parse_data('/kaggle/input/news-headlines-dataset-for-sarcasm-detection/Sarcasm_Headlines_Dataset.json'))
data[0]

{'article_link': 'https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5',
 'headline': "former versace store clerk sues over secret 'black code' for minority shoppers",
 'is_sarcastic': 0}

# Data Exploration

In [3]:
# Create dataframe
df = pd.read_json("../input/news-headlines-dataset-for-sarcasm-detection/Sarcasm_Headlines_Dataset.json", lines=True)
df.head()

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


In [4]:
df.shape

(26709, 3)

In [5]:
df['is_sarcastic'].value_counts()

0    14985
1    11724
Name: is_sarcastic, dtype: int64

In [6]:
# Import `train_test_split` from `sklearn.model_selection`
from sklearn.model_selection import train_test_split

# Import TfidfVectorizer: transforms text to feature vectors that can be used as input to estimator
from sklearn.feature_extraction.text import TfidfVectorizer

# Specify the data 
vectorizer = TfidfVectorizer(max_features=1000, use_idf=False)
headlines = [i['headline'] for i in data]

# Create x variable
X = vectorizer.fit_transform(headlines).toarray()

# Create y variable (target labels)
y=df['is_sarcastic'].values

# Split the data up in train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Build a Sequential Model

In [7]:
# Import `Sequential` from `keras.models`
from keras.models import Sequential

# Import `Dense` from `keras.layers`
from keras.layers import Dense

# Initialize the constructor
model = Sequential()

# Add an input layer 
model.add(Dense(32, activation='relu', input_shape=(1000,)))

# Add one hidden layer 
model.add(Dense(4, activation='relu'))

# Add an output layer 
model.add(Dense(1, activation='sigmoid'))

# Compile & Fit

In [8]:
# Compile the model 

# with adam optimizer and the binary_crossentropy loss function
# can monitor the accuracy during the training by passing ['accuracy'] to the metrics argument

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Model Summary
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 32)                32032     
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 132       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 5         
Total params: 32,169
Trainable params: 32,169
Non-trainable params: 0
_________________________________________________________________


In [9]:
# Fit the model

model.fit(X_train, y_train,epochs=25, batch_size=300, verbose=1)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7f832c549350>

# **Evaluate Model**

In [10]:
# Predict the labels (test the model)
y_pred = model.predict(X_test)

# Evaluate the model
score = model.evaluate(X_test, y_test,verbose=1)

# score is a list that holds the combination of the loss and the accuracy
print(score)

[0.4342542886734009, 0.8073754906654358]


In [11]:
# Import the modules for evaluation metrics from `sklearn.metrics`
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

# Confusion matrix
confusion_matrix(y_test, y_pred.round())

array([[2481,  515],
       [ 514, 1832]])

In [12]:
# Precision: a measure of a classifier’s exactness; the higher the precision, the more accurate the classifier
precision = precision_score(y_test, y_pred.round())

# Recall: a measure of a classifier’s completeness; the higher the recall, the more cases the classifier covers
recall = recall_score(y_test, y_pred.round())

# F1 score: a weighted average of precision and recall
f1_score = f1_score(y_test,y_pred.round())

# Print all values
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")

Precision: 0.7805709416276098
Recall: 0.7809036658141517
F1 Score: 0.7807372682718944
