In [4]:
import pandas as pd
import numpy as np
import re
from textblob import TextBlob
import nltk
import csv
from collections import Counter
from sklearn import datasets, linear_model, model_selection, svm, tree
import matplotlib.pylab as plt
from tqdm import tqdm
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

In [5]:
def read_data(path, delimiter='|'):
    data = []
    with open(path, 'r') as file:
        reader = csv.reader(file, delimiter=delimiter)
        for row in reader:
            data.append(row)
    return data

In [6]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')

[nltk_data] Downloading package punkt to C:\Users\Albin
[nltk_data]     Siriniqi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Albin Siriniqi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to C:\Users\Albin
[nltk_data]     Siriniqi\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [7]:
def extract(text):
    if len(text) > 30:
        features = {}

        features["punctuation_count"] = len(re.findall('[^\w\s\'\"]{3,}', text))

        tokens = nltk.word_tokenize(text)
        tokens_count = len(tokens)

        features["tokens_count"] = tokens_count

        pos_tag = nltk.pos_tag(tokens, tagset='universal')
        pos_tag = [i for i in pos_tag if (len(i[0]) != 1) or (i[1] == '.') or (i[0] == 'a')]

        all_words = [i[0] for i in pos_tag if i[1] != '.']

        length_sum = sum(len(w) for w in all_words)

        features["average_word_length"] = length_sum / len(all_words)

        tag_frequency_dist = nltk.FreqDist(tag for (word, tag) in pos_tag)

        features["tag_frequency_dist"] = tag_frequency_dist

        words = [w[0] for w in pos_tag if (w[1] in ['ADJECTIVE', 'ADVERB', 'NOUN', 'VERB', 'X']) and (len(w[0]) > 1) ]
        words_count = len(words)

        features["words_count"] = words_count

        features["unique_words_total"] = len(Counter(words).keys())

        uppercase_total = sum(1 for _ in filter(lambda str: str.isupper(), words))

        features["uppercase_fraction"] = uppercase_total/ words_count

        tb = TextBlob(text)
        features["polarity"] = tb.polarity
        features["subjectivity"] = tb.subjectivity

        return features

    else:
        return 0

In [16]:
def dataframe_extract(data, label):
    features = []

    for idx in tqdm(range(len(data))):
        article_features = extract(data[idx][1])
        selected_features = {
            'label': label,
            'tokens_count': article_features["tokens_count"],
            'words_count': article_features["words_count"], 
            'unique_words_count': article_features["unique_words_count"], 
            'polarity_score': article_features["polarity"], 
            'subjectivity_score': article_features["subjectivity"],
            'uppercase_fraction': article_features["uppercase_fraction"],
            'average_word_length': article_features["average_word_length"],
            'punctuation_count': article_feature['punctuation_count'],
            'adjective_frequenct': article_features["tag_fd"].freq("ADJECTIVE"), 
            'adv_freq': article_features["tag_frequency_dist"].freq("ADVERB"),
            'noun_freq': article_features["tag_frequency_dist"].freq("NOUN"),
            'verb_freq': article_features["tag_frequency_dist"].freq("VERB"),
            'other_freq': article_features["tag_frequency_dist"].freq("X"),
            'vocabulary_uniqueness': article_features["unique_words_total"]/article_features["words_count"]
        }

        data_features.append(selected_features)

    df = pd.DataFrame.from_dict(data_features)

    return df

In [17]:
real_data_test = read_data('Testing_dataset.csv')
real_data_train = read_data('Datasets/Training_dataset.csv')
fake_data_test = read_data('Datasets/zlo_test_dataset.csv')
fake_data_train = read_data('Datasets/zlo_train_dataset.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'Testing_dataset.csv'

In [18]:
df_real_test = dataframe_extract(real_data_test, 1)
df_real_train = dataframe_extract(real_data_train, 1)
df_fake_test = dataframe_extract(fake_data_test, -1)
df_fake_train = dataframe_extract(fake_data_train, -1)

NameError: name 'real_data_test' is not defined

In [19]:
test_dataframe = pd.concat([df_real_test,df_fake_test])
train_dataframe= pd.concat([df_real_train, df_fake_train])

NameError: name 'df_real_test' is not defined

In [20]:
dataframe = pd.concat([test_df, train_df], ignore_index = True)

NameError: name 'test_df' is not defined

In [22]:
X = np.array(training_dataframe[training_dataframe.columns[1:]])
Y = np.array(training_dataframe['label'])

testX = np.array(testing_dataframe[testing_dataframe.columns[1:]])
testY = np.array(testing_dataframe['label'])

L = 100
K = np.ones(Y.shape)

J = []

Y_scores = 0
train_errors = []
train_loss = []

test_errors = []
testY_scores = 0


complete_scores = []


for l in range(L):
    f_t = tree.DecisionTreeClassifier(max_depth=1).fit(X, Y, K)
    error_t = ((f_t.predict(X) != Y) * K).mean()
    if err_t > 0.5:
        print("No weak classifier found")
        break
    alpha_t = 0.5 * np.log((1 - error_t) / err_t)
    J.append((alpha_t, f_t))
    K *= np.exp(-alpha_t * Y * f_t.predict(X))

    Y_scores += alpha_t * f_t.predict(X)
    train_errors.append((np.sign(Y_scores) != Y).mean())
    train_loss.append(np.exp(-Y_scores * Y).mean())

    testY_scores += alpha_t * f_t.predict(testX)
    test_errors.append((np.sign(testY_scores) != testY).mean())
    complete_scores.append(testY_scores)


plt.figure()
plt.plot(train_errors, label="train")
plt.plot(test_errors, label="test")
plt.xlabel("AdaBoost")
plt.ylabel("Error rate")
_ = plt.legend()
test_errors = np.array(test_errors)
I = test_errors.argsort()
total_smallest_errors = (test_errors[I[0]])
total_errors = test_errors[-1]
True_Positive = ((np.sign(complete_scores[I[0]]) == 1) & (testY == 1)).sum()
False_Positive = ((np.sign(complete_scores[I[0]]) == 1) & (testY == -1)).sum()
True_Negative = ((np.sign(complete_scores[I[0]]) == -1) & (testY == -1)).sum()
False_Negative = ((np.sign(complete_scores[I[0]]) == -1) & (testY == 1)).sum()

print("Size of test datasets: %.2f" % (True_Positive + False_Positive + False_Negative + True_Negative))
print("True positives total: %.2f " % True_Positive)
print("False positives total: %.2f " % False_Positive)
print("True negatives total: %.2f " % True_Negative)
print("False negatives total: %.2f " % False_Negative)
print("Error: %.3f " % total_errors)
print("Smallest error: %.3f " % total_smallest_errors)
print("Train error: %.3f " % train_errors[-1])

NameError: name 'training_dataframe' is not defined