In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,f1_score

import warnings
warnings.filterwarnings("ignore")
# seeding
np.random.seed(123)

In [2]:
# load data
train = pd.read_csv('train_dataset.csv')
test = pd.read_csv('test_dataset.csv')
submission = pd.read_csv('sample_submission_dataset.csv')

In [3]:
# show top five rows of train data
train.head() 

Unnamed: 0,headline,clickbait,ID
0,"Hussein enters ""not guilty"" plea at trial",0,84698cc7-8ae2-4ea3-a425-b7091561cee6
1,Iraq peace talks draw to a close in Finland,0,a4e35ca4-15fa-43e8-b68e-91457b23afee
2,British Premier Visits Northern Ireland,0,162991ee-ea2f-41ad-a753-649a68f54311
3,The Decline of Left-Handed First Basemen,0,2cd3aa32-6ec2-4af1-bd1d-560709066b8b
4,Who Said It: Donald Trump Or Kanye West,1,72553370-c348-4603-882b-39e04b610c39


In [4]:
# show top five rows of test data
test.head()

Unnamed: 0,ID,headline
0,5f99b099-c4db-4a02-9753-28c5e94a6b34,Israeli military launches airstrikes into Gaza...
1,3c413552-32c0-4000-a745-b4217fe427ca,Expelled' fair use upheld
2,71060e3b-bab0-4218-b1ce-8284ae46f6c3,31 Times Frankie Boyle's Twitter Was Out Of Co...
3,f0a03121-600f-4b69-b6db-989d0f3cf28a,What Does Your Zodiac Sign Say About Your Love...
4,456f7cfa-bdfe-45bd-9e88-7c4ae53eb4ba,Larson B ice-shelf collapse reveals exotic org...


In [6]:
# show top five rows of submision file
submission.head()

Unnamed: 0,ID,clickbait
0,84698cc7-8ae2-4ea3-a425-b7091561cee6,0
1,a4e35ca4-15fa-43e8-b68e-91457b23afee,1
2,162991ee-ea2f-41ad-a753-649a68f54311,0


In [7]:
# evalute Labels distribution
train.clickbait.value_counts()

0    15210
1    15190
Name: clickbait, dtype: int64

In [8]:
#split features and target from train data 
X = train["headline"]
y = train.clickbait.values

In [9]:
#test
final_test = test["headline"]

In [10]:
# Transform text data and the text is not cleaned
vectorizer = CountVectorizer()
X_vec = vectorizer.fit_transform(X)

#transform test
test_vec = vectorizer.transform(final_test)

In [11]:
n_folds = 5
random_state=42
F1_Scores = []
final_prediction=[]

kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state)
for train_index, test_index in kf.split(X_vec, y):
    X_train, X_test = X_vec[train_index], X_vec[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train the model
    clf = MultinomialNB(alpha = 0.1)
    clf.fit(X_train, y_train)

    # Make predictions and calculate f1_score
    y_pred = clf.predict(X_test)
    
    #predict on test
    test_preds=clf.predict(test_vec)
    F1_score = f1_score(y_test, y_pred)
    
    final_prediction.append(test_preds)
    F1_Scores.append(F1_score)

    print(f"Accuracy: {F1_score:.5f}")

Accuracy: 0.98027
Accuracy: 0.97819
Accuracy: 0.97890
Accuracy: 0.98121
Accuracy: 0.98011


In [12]:
average_accuracy = sum(F1_Scores) / len(F1_Scores)
print(f"f1 score: {average_accuracy:.3f}")

f1 score: 0.980


In [13]:
#stack final prediction and get it's mean
prediction=np.mean(np.column_stack(final_prediction),axis=1)

In [14]:
prediction.shape

(15200,)

In [15]:
submission.head()

Unnamed: 0,ID,clickbait
0,84698cc7-8ae2-4ea3-a425-b7091561cee6,0
1,a4e35ca4-15fa-43e8-b68e-91457b23afee,1
2,162991ee-ea2f-41ad-a753-649a68f54311,0


In [16]:
# create submission file
sub = pd.DataFrame({"ID": test['ID'], "clickbait": prediction.astype('int16')})

In [17]:
# show sample submissoin rows
sub.head() 

Unnamed: 0,ID,clickbait
0,5f99b099-c4db-4a02-9753-28c5e94a6b34,0
1,3c413552-32c0-4000-a745-b4217fe427ca,0
2,71060e3b-bab0-4218-b1ce-8284ae46f6c3,1
3,f0a03121-600f-4b69-b6db-989d0f3cf28a,1
4,456f7cfa-bdfe-45bd-9e88-7c4ae53eb4ba,0


In [18]:
sub.clickbait.value_counts()

0    7688
1    7512
Name: clickbait, dtype: int64

In [20]:
# save submission file 
sub.to_csv("Baseline_submission.csv",index=False) 