# Clickbait Text Classification

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from joblib import dump

In [2]:
# Load the headline dataset from a CSV file in the working directory.
df = pd.read_csv('clickbait_data.csv')
# Bare last expression: render the frame to eyeball the columns and labels.
df

Unnamed: 0,headline,clickbait
0,Should I Get Bings,1
1,Which TV Female Friend Group Do You Belong In,1
2,"The New ""Star Wars: The Force Awakens"" Trailer...",1
3,"This Vine Of New York On ""Celebrity Big Brothe...",1
4,A Couple Did A Stunning Photo Shoot With Their...,1
...,...,...
31995,"To Make Female Hearts Flutter in Iraq, Throw a...",0
31996,"British Liberal Democrat Patsy Calton, 56, die...",0
31997,Drone smartphone app to help heart attack vict...,0
31998,"Netanyahu Urges Pope Benedict, in Israel, to D...",0


In [3]:
# Count missing values per column (isna is the canonical name; isnull is its alias).
df.isna().sum()

headline     0
clickbait    0
dtype: int64

In [4]:
# Distinct values present in the target column.
df['clickbait'].unique()

array([1, 0])

In [5]:
# Number of rows per label, to check how balanced the two classes are.
df['clickbait'].value_counts()

0    16001
1    15999
Name: clickbait, dtype: int64

In [6]:
# Features are the headline text column; the target is the clickbait label column.
X, y = df['headline'], df['clickbait']

In [7]:
# Hold out 33% of the rows for evaluation. Pinning random_state makes the
# shuffle -- and therefore every metric computed from this split -- identical
# on each Restart & Run All instead of changing from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [8]:
# Preview the training headlines (the index keeps the original row labels).
X_train

6253               Are You More Usher Or Justin Timberlake
19511      Shares in Blackstone Group fall below IPO price
231      The New Trailer For "Orange Is The New Black" ...
21722    Knicks Rout Grizzles, Extending Win Streak to ...
29901    New Zealand general election: National, Labour...
                               ...                        
24900    Samples being taken from poultry farms in East...
22857              California wildfires continue to spread
20524    Canadian court clears Stephen Truscott of 1959...
6285     13 Facts That Will Make You Too Aware Of Your ...
28368             Fourth 21 July bomb suspect held in Rome
Name: headline, Length: 21440, dtype: object

In [9]:
# Bag-of-words vectorizer created with scikit-learn's default settings.
count_vectorizer = CountVectorizer()

In [10]:
# Two-step form: fit() learns the vocabulary from the training headlines and
# returns the vectorizer itself, so transform() can be chained directly to
# convert those same headlines into a matrix of token counts.
X_train_count = count_vectorizer.fit(X_train).transform(X_train)

In [11]:
# One-step equivalent: fit_transform() fits and transforms in a single call.
# NOTE: this re-fits the same count_vectorizer object on X_train.
X_train_vect = count_vectorizer.fit_transform(X_train)

In [12]:
# Matrix dimensions: one row per training headline, one column per vocabulary term.
X_train_vect.shape

(21440, 18986)

In [13]:
# Transformer that re-weights a count matrix into TF-IDF values (default settings).
tfidf_transformer = TfidfTransformer()

In [14]:
# Fit the TF-IDF weighting on the training counts and apply it in one call.
X_train_tfidf = tfidf_transformer.fit_transform(X_train_vect)

In [15]:
# Standalone linear SVM for the manual, step-by-step workflow.
# NOTE: predictions and metrics in this notebook come from clf_pipeline, not this object.
svm = LinearSVC()

In [16]:
# Train the standalone SVM on the TF-IDF features of the training headlines.
svm.fit(X_train_tfidf, y_train)

In [17]:
# Bundle vectorization and classification so raw headline strings can be passed
# straight to fit()/predict(). TfidfVectorizer performs the CountVectorizer +
# TfidfTransformer steps in one estimator. The classifier is seeded so that the
# fitted (and later persisted) model is reproducible across runs.
clf_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('classify', LinearSVC(random_state=42)),
])

In [18]:
# Fit both pipeline steps on the raw training headlines and their labels.
clf_pipeline.fit(X_train, y_train)

In [19]:
# Predict labels for the held-out headlines using the fitted pipeline.
predictions = clf_pipeline.predict(X_test)

In [20]:
# Confusion matrix on the test set: rows are true labels, columns are predicted labels.
print(confusion_matrix(y_test, predictions))

[[5144  119]
 [ 148 5149]]


In [21]:
# Per-class precision/recall/F1 on the test set; print() keeps the report's line breaks.
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.97      0.98      0.97      5263
           1       0.98      0.97      0.97      5297

    accuracy                           0.97     10560
   macro avg       0.97      0.97      0.97     10560
weighted avg       0.97      0.97      0.97     10560



In [22]:
# Persist the fitted pipeline (vectorizer + classifier) to disk with joblib.
dump(clf_pipeline, 'svm_clickbait_clf.joblib')

['svm_clickbait_clf.joblib']

In [27]:
# Spot-check the pipeline on one hand-written headline; the input is wrapped in
# a list because predict() is called with a collection of documents.
indo = clf_pipeline.predict(['Powerful Earthquake Kills at Least 162 in Indonesia; Toll Expected to Mount'])

In [28]:
# Show the predicted label for that headline (presumably 0 = not clickbait,
# judging by the sample rows in the loaded data -- confirm against the dataset).
indo

array([0])