In [1]:
import os
# Location of the tab-separated headlines dataset, relative to this notebook.
dataset_filename = "../Datasets/clickbait-headlines.tsv"

# Report the on-disk size, converted from bytes to megabytes (two decimals).
size_in_bytes = os.path.getsize(dataset_filename)
size_in_mb = round(size_in_bytes/1024/1024, 2)
print("File: {} \nSize: {} MBs".format(dataset_filename, size_in_mb))

File: ../Datasets/clickbait-headlines.tsv 
Size: 0.55 MBs


In [2]:
import csv

# Parallel lists: data[i] is a headline and labels[i] is the label for it.
data = []
labels = []

# newline="" lets the csv module do its own newline handling, as its
# documentation recommends for files passed to csv.reader.
with open(dataset_filename, newline="") as f:
    reader = csv.reader(f, delimiter="\t")
    for line in reader:
        # Check the row before touching either list. Appending the headline
        # first and only then failing on a missing label would leave `data`
        # one element longer than `labels`, silently pairing every following
        # headline with the wrong label.
        if len(line) < 2:
            print("Skipping malformed row at line {}: {!r}".format(reader.line_num, line))
            continue
        data.append(line[0])
        labels.append(line[1])

# Quick look at the first few headline/label pairs.
print(data[:3])
print(labels[:3])

["Egypt's top envoy in Iraq confirmed killed", 'Carter: Race relations in Palestine are worse than apartheid', 'After Years Of Dutiful Service, The Shiba Who Ran A Tobacco Shop Retires']
['0', '0', '1']


In [3]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer on the headlines and transform them in one step.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(data)

# Show the shape of the resulting matrix, one item per output line.
print("The dimensions of our vectors:", vectors.shape, "- - -", sep="\n")


The dimensions of our vectors:
(10000, 13169)
- - -
CPU times: user 906 ms, sys: 203 ms, total: 1.11 s
Wall time: 1.06 s


In [4]:
print("The data type of our vectors")
print(type(vectors))
print("- - -")
print("The size of our vectors (MB):")
# NOTE: this counts only the array of stored values (vectors.data); the
# sparse format's index arrays are not included in the figure.
print(vectors.data.nbytes/1024/1024)
print("- - -")
print("The size of our vectors in dense format (MB):")
# Work out the dense footprint arithmetically (rows * columns * bytes per
# element) rather than calling todense(), which would allocate the entire
# dense matrix in memory just to read its size.
n_rows, n_cols = vectors.shape
print(n_rows * n_cols * vectors.dtype.itemsize/1024/1024)
print("- - - ")
print("Number of non zero elements in our vectors")
print(vectors.nnz)
print("- - -")

The data type of our vectors
<class 'scipy.sparse.csr.csr_matrix'>
- - -
The size of our vectors (MB):
0.6759414672851562
- - -
The size of our vectors in dense format (MB):
1004.7149658203125
- - - 
Number of non zero elements in our vectors
88597
- - -


In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. A fixed random_state makes the split
# (and every metric computed from it) the same on each run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    vectors, labels, test_size=0.2, random_state=42
)

print(X_train.shape)
print(X_test.shape)

(8000, 13169)
(2000, 13169)


In [6]:
%%time

from sklearn.svm import LinearSVC

# Train a linear SVM on the training split; fit() hands back the fitted
# estimator, so construction and training are chained.
svm_classifier = LinearSVC().fit(X_train, y_train)

# Predict a label for every row of the held-out test split.
predictions = svm_classifier.predict(X_test)

CPU times: user 38.9 ms, sys: 9.69 ms, total: 48.6 ms
Wall time: 54.1 ms


In [7]:
# Preview the first test examples. Each row prints the true label first and
# the model's prediction second, so the header names the columns in that order.
print("label, prediction")
# Slicing + zip stops early instead of raising IndexError when there are
# fewer than 10 test examples.
for label, prediction in zip(y_test[:10], predictions[:10]):
    print(label, prediction)

prediction, label
0 0
1 1
0 0
1 1
0 1
1 1
1 1
1 1
0 0
0 0


In [8]:
from sklearn.metrics import accuracy_score, classification_report

# Fraction of test examples whose predicted label equals the true label.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: {}\n".format(accuracy))

# Per-class precision, recall, F1 and support.
report = classification_report(y_test, predictions)
print(report)


Accuracy: 0.9635

              precision    recall  f1-score   support

           0       0.96      0.97      0.96       963
           1       0.97      0.96      0.96      1037

    accuracy                           0.96      2000
   macro avg       0.96      0.96      0.96      2000
weighted avg       0.96      0.96      0.96      2000

