In [1]:
import jsonlines
import re
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn import linear_model
from sklearn.svm import LinearSVC
import random
# from tqdm import tqdm

In [2]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Steps, in order:
      1. Every character that is not a letter, digit, parenthesis, comma,
         '!', '?', apostrophe or backtick becomes a space.
      2. English clitics ('s, 've, n't, 're, 'd, 'll) are split off from the
         preceding word with a space.
      3. Commas become spaces; '!', '(', ')' and '?' are padded with spaces
         so they form separate tokens.
      4. Runs of two or more whitespace characters collapse to one space.
      5. The result is lower-cased.

    The punctuation tokens are emitted as the bare character. (Writing the
    replacement as a backslash followed by the character would put a literal
    backslash into the output and is an invalid string escape.)

    :param string: raw text to normalise.
    :return: the cleaned, lower-cased text. Leading/trailing whitespace is
        not stripped.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " 's", string)
    string = re.sub(r"\'ve", " 've", string)
    string = re.sub(r"n\'t", " n't", string)
    string = re.sub(r"\'re", " 're", string)
    string = re.sub(r"\'d", " 'd", string)
    string = re.sub(r"\'ll", " 'll", string)
    string = re.sub(r",", " ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    # No separate '@' removal is needed: the first substitution has already
    # replaced '@' (and every other disallowed character) with a space.
    string = re.sub(r"\s{2,}", " ", string)
    return string.lower()

In [3]:
# Read the raw instances and their ground-truth records from JSON Lines files.
# skip_invalid=True makes the reader skip lines it cannot decode as a dict
# instead of raising.
count = 0  # NOTE(review): never read in the visible cells; kept in case a later cell expects it
test_data = []  # placeholder; the real held-out rows are assigned by the split cell
with jsonlines.open('instances.jsonl') as reader:
    train_data = list(reader.iter(type=dict, skip_invalid=True))

# Shuffle with a dedicated, seeded generator so the later 90/10 split (and
# therefore every reported score) is the same on each Restart & Run All,
# without touching the global `random` state.
random.Random(0).shuffle(train_data)

with jsonlines.open('truth.jsonl') as reader:
    truth_data = list(reader.iter(type=dict, skip_invalid=True))

In [4]:
# Turn both record lists into frames and join them on the shared "id" key
# (default inner join, so only ids present in both files survive).
train_data_df = pd.DataFrame.from_dict(train_data)
truth_data_df = pd.DataFrame.from_dict(truth_data)
train = train_data_df.merge(truth_data_df, on="id")

In [5]:
# Empty accumulators: `df` will hold the cleaned training documents and `y`
# the matching binary labels.
df, y = [], []

In [6]:
# Positions (within a row of `train`) that are copied into each kept row,
# in this order.
# NOTE(review): selection is positional, so it depends on the column order of
# `train`; confirm these positions really correspond to the names given to
# `vals_df` below.
kept_positions = (2, 4, 5, 6, 7, 8, 9)

vals = train.values.tolist()

# Keep only rows whose column 1 is not an empty list.
final_vals = [
    [record[pos] for pos in kept_positions]
    for record in vals
    if record[1] != []
]

vals_df = pd.DataFrame(
    final_vals,
    columns=["postText", "targetCaptions", "targetParagraphs", "targetTitle", "targetKeywords",
             "targetDescription", "truthClass"],
)

In [7]:
# 90/10 split into training+validation rows and held-out test rows.
# Both halves are derived from `vals_df` (the frame built from the full,
# unsplit row list) rather than by re-slicing `final_vals` in place, so
# re-running this cell always yields the same split instead of shrinking the
# training set a further 10% each time.
all_rows = vals_df.values.tolist()
split_at = int(0.9 * len(all_rows))
test_data = all_rows[split_at:]
final_vals = all_rows[:split_at]  # Training and validation data

In [8]:
# Number of rows left for training/validation after the split.
train_row_count = len(final_vals)
print(train_row_count)

9275


In [9]:
# Binary training labels: 1 when the row's last field (position 6) is the
# string "clickbait", otherwise 0.
# The list is rebuilt from scratch rather than appended to, so re-running this
# cell cannot duplicate labels and break the row alignment with the features.
y = [1 if row[6] == "clickbait" else 0 for row in final_vals]

In [10]:
# Build one cleaned text document per training row from its first six fields.
# `df` is rebuilt from scratch here, so re-running the cell cannot append
# duplicate documents and desynchronise it from the labels.
df = []
for row in final_vals:
    pieces = []
    for position in range(6):
        field = row[position]
        if position in (2, 3):
            # Fields 2 and 3 are added as one item each.
            pieces.append(field)
        else:
            # Every other field is extended element by element
            # (presumably a list of strings — TODO confirm against the data).
            pieces += field
    # Each piece is cleaned and whitespace-normalised, then the pieces are
    # concatenated directly.
    # NOTE(review): no separator is inserted between pieces, so the last token
    # of one piece is fused with the first token of the next. The test-set
    # cell builds its documents the same way; change both together if this is
    # not intended.
    df.append("".join(" ".join(clean_str(piece).split()) for piece in pieces))

In [11]:
# Word-level bag-of-words counts with English stop words removed.
# lowercase=False because clean_str has already lower-cased every document.
vectorizer = CountVectorizer(
    input='content',
    analyzer='word',
    stop_words='english',
    lowercase=False,
)
X = vectorizer.fit_transform(df)

# Re-weight the raw counts with TF-IDF; the fitted transformer is reused
# later for the held-out documents.
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X)

# (documents, vocabulary size)
print(X_tfidf.shape)

(9275, 263538)


In [None]:
# NOTE(review): this cell carries no execution count, and `clf` is reassigned
# to a LinearSVC in the very next cell, so this estimator is never the one
# that gets fitted. Candidate for deletion.
clf = linear_model.LinearRegression()

In [12]:
# Linear support-vector classifier used for every fit/predict below;
# random_state is pinned to 0.
clf = LinearSVC(random_state=0)

In [13]:
# Fit the classifier on the TF-IDF training matrix and the 0/1 labels in `y`.
clf.fit(X_tfidf, y)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
     verbose=0)

In [20]:
# The cross-validation helpers live in sklearn.model_selection; the old
# sklearn.cross_validation module no longer exists in current scikit-learn.
from sklearn.model_selection import cross_val_predict, cross_val_score, cross_validate
from sklearn.metrics import accuracy_score

# Out-of-fold predictions for every training row using 5-fold
# cross-validation.
# NOTE(review): X_tfidf was vectorised on all training rows before the folds
# are formed, so vocabulary/IDF statistics are shared across folds.
predicted = cross_val_predict(clf, X_tfidf, y, cv=5)

In [21]:
# Share of training rows whose out-of-fold prediction matches the label.
cv_accuracy = accuracy_score(y, predicted.round())
print("5-fold Cross Validation Accuracy", cv_accuracy)

5-fold Cross Validation Accuracy 0.80948787062


In [16]:
# Binary labels for the held-out rows: 1 when the last field (position 6) is
# the string "clickbait", otherwise 0.
y_test = [1 if row[6] == "clickbait" else 0 for row in test_data]

In [17]:
# Build one cleaned text document per held-out row from its first six fields,
# using the same procedure as for the training rows.
df_test = []

for row in test_data:
    pieces = []
    for position in range(6):
        field = row[position]
        if position not in (2, 3):
            # Extended element by element
            # (presumably a list of strings — TODO confirm against the data).
            pieces += field
        else:
            # Fields 2 and 3 are added as one item each.
            pieces.append(field)
    # NOTE(review): cleaned pieces are concatenated with no separator, so
    # tokens at piece boundaries are fused. The training cell does the same;
    # change both together if this is not intended.
    cleaned = [" ".join(clean_str(piece).split()) for piece in pieces]
    df_test.append("".join(cleaned))

In [18]:
# Vectorise and classify the whole held-out set in one pass, reusing the
# vocabulary and IDF weights fitted on the training documents (transform
# only, no refit). Each row is transformed independently, so this gives the
# same labels as predicting one document at a time, but as a flat 1-D array
# rather than a list of one-element arrays.
test_X = vectorizer.transform(df_test)
X_test_tfidf = tfidf_transformer.transform(test_X)
predicted = clf.predict(X_test_tfidf)

In [19]:
# Fraction of held-out rows whose predicted label equals the true label.
scores = accuracy_score(y_test, predicted)
print("Test Data Accuracy ", scores)

Test Data Accuracy  0.80213385063
