# Question 6

In [1]:
# Import necessary packages
import csv                               # csv reader
from sklearn.svm import LinearSVC
!pip install nltk
from nltk.classify import SklearnClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_fscore_support # to report on precision and recall
import numpy as np

In [2]:
def load_data(path):
    """Load data from a tab-separated file and append it to raw_data."""
    with open(path, encoding='utf-8') as f: #add 'encoding = utf-8' to original load_data function to avoid loading error
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
            if line[0] == "Id":  # skip header
                continue
            (label, text) = parse_data_line(line)
            raw_data.append((text, label))

def split_and_preprocess_data(percentage):
    """Split the data between train_data and test_data according to the percentage
    and performs the preprocessing."""
    num_samples = len(raw_data)
    num_training_samples = int((percentage * num_samples))
    for (text, label) in raw_data[:num_training_samples]:
        train_data.append((to_feature_vector(pre_process(text)),label))
    for (text, label) in raw_data[num_training_samples:]:
        test_data.append((to_feature_vector(pre_process(text)),label))

In [3]:
def convert_label(label):
    """Converts the multiple classes into two,
    making it a binary distinction between fake news and real."""
    #return label
    # Converting the multiclass labels to binary label
    labels_map = {
        'true': 'REAL',
        'mostly-true': 'REAL',
        'half-true': 'REAL',
        'false': 'FAKE',
        'barely-true': 'FAKE',
        'pants-fire': 'FAKE'
    }
    return labels_map[label]


def parse_data_line(data_line):
    # Should return a tuple of the label as just FAKE or REAL and the statement
    # e.g. (label, statement)
    
    # the function input (data_line) should be a list, then subset the list with index 1 to get the 'label' data
    label = convert_label(data_line[1])
    # the same approach can be applied to extract 'text' data with index 2
    text = data_line[2]
    return (label, text) #return the tuple (label, text), which has the same order as in the 'load_data' function

In [4]:
def pre_process(text): # optimal preprocess: str.split() + token normalization
    # Should return a list of tokens
    # DESCRIBE YOUR METHOD IN WORDS
    tokens_list = []
    token = text.split()
    for i in token:
        i = i.lower()
        tokens_list.append(i)
    return tokens_list

In [5]:
global_feature_dict = {} # A global dictionary of features

def to_feature_vector(tokens): # optimal feature extract: set min token frequency and add non-text count as new feature
    feature_dict = {} # Created an empty dictionary (local feature dict) to store the vocab
    # Should return a dictionary containing features as keys, and weights as values
    # DESCRIBE YOUR METHOD IN WORDS
    nontext_count = 0
    for i in tokens: # Loop the input tokens. 
        #For each token, look up in the global dictionary to see if it exists
        if i not in global_feature_dict: # If no, then assign value 1 for that token and add it to the global dictionary
            global_feature_dict[i] = 1
        else: # If yes, then take the values (weights) to add 1 more (values + 1) and update it into the global dictionary
            global_feature_dict[i] = float(global_feature_dict[i] + 1)
        #For each token, look up in the feature dictionary to see if it exists (same approach as above)
        if i not in feature_dict:  # If no, then assign value 1 for that token and add it to the feature dictionary
            feature_dict[i] = 1
        else: # If yes, then take the values (weights) to add 1 more (values + 1) and update it into the feature dictionary
            feature_dict[i] = float(feature_dict[i] + 1)
        if not i.isalpha():
            nontext_count += 1
    for a in feature_dict.copy():
        if global_feature_dict[a] < 2:
                del feature_dict[a]
    feature_dict['nontext_count'] = nontext_count
    return feature_dict # Return the feature dictionary

In [6]:
# TRAINING AND VALIDATING OUR CLASSIFIER

def train_classifier(data):
    #print("Training Classifier...")
    pipeline =  Pipeline([('svc', LinearSVC(C = 0.007964824120603023, max_iter=100000))])
    return SklearnClassifier(pipeline).train(data)

In [7]:
#solution
from sklearn.metrics import classification_report, accuracy_score
from random import shuffle # Import shuffle method

def cross_validate(dataset, folds):
    #shuffle(dataset) # Shuffle the input dataset
    results = [] # Create an empty list to store the result after running the function
    fold_size = int(len(dataset)/folds) + 1 # Set number of folds to run
    accuracy_rate = [] # Create list to store accuracy
    for i in range(0,len(dataset),int(fold_size)):
        # insert code here that trains and tests on the 10 folds of data in the dataset
        #print("Fold start on items %d - %d" % (i, i+fold_size))
        # FILL IN THE METHOD HERE
        # Set up validation dataset in cross-validation with its size being equal to 1 fold_size
        validation = dataset[i : i+fold_size]
        # Set up trainset in cross-validation by excluding the validation set from the input dataset, and take the rest
        train = dataset[:i] + dataset[i+fold_size:]
        # Train the classifier with the pipeline function created with the model initialiser (LinearSVC in this case)
        classifier = train_classifier(train)
        # Extract the ground-truth labels from the validation dataset
        yval_true = [t[1] for t in validation]
        # Implement the trained model into validation set and get predicted labels
        yval_pred = predict_labels([x[0] for x in validation], classifier)
        # Calculate precision, recall and fscore from the prediction, compared to ground truth
        final_scores = precision_recall_fscore_support(yval_true, yval_pred, average='weighted', zero_division=0) # evaluate
        # Calculate model accuracy
        accuracy = accuracy_calculate(yval_true, yval_pred)
        # Append the value to the list created at the top for storage
        results.append(final_scores)
        accuracy_rate.append(accuracy)
    # Convert list 'results' to array for easy further calculation
    cv_results = np.asarray(results)
    # Calculate the average precision score after k-fold time running
    avg_precision = np.mean(cv_results[:, 0], axis = 0)
    # Calculate the average recall score after k-fold time running
    avg_recall = np.mean(cv_results[:, 1], axis = 0)
    # Calculate the average f1 score after k-fold time running
    avg_fscore = np.mean(cv_results[:, 2], axis = 0)
    # Calculate the average accuracy after k-fold time running
    avg_accuracy = np.mean(accuracy_rate)
    print('\n')
    print('After folding throughout cross-val process, the average score of precision - recall - f1score - accuracy is: ')
    return avg_precision, avg_recall, avg_fscore, avg_accuracy #return all the values needed

In [8]:
# PREDICTING LABELS GIVEN A CLASSIFIER

def predict_labels(samples, classifier):
    """Assuming preprocessed samples, return their predicted labels from the classifier model."""
    return classifier.classify_many(samples)

def predict_label_from_raw(sample, classifier):
    """Assuming raw text, return its predicted label from the classifier model."""
    return classifier.classify(to_feature_vector(preProcess(reviewSample)))

In [9]:
# Calculate the model accuracy given a classifier
def accuracy_calculate(y_true, y_pred):
    correct = 0
    for a, b in zip(y_true, y_pred):
        if a==b: correct += 1
        else: pass
    return correct/len(y_true)

In [10]:
# MAIN

# loading reviews
# initialize global lists that will be appended to by the methods below
raw_data = []          # the filtered data from the dataset file
train_data = []        # the pre-processed training data as a percentage of the total dataset
test_data = []         # the pre-processed test data as a percentage of the total dataset


# references to the data files
data_file_path = 'fake_news.tsv'

# Do the actual stuff (i.e. call the functions we've made)
# We parse the dataset and put it in a raw data list
print("Now %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Preparing the dataset...",sep='\n')

load_data(data_file_path) 

# We split the raw dataset into a set of training data and a set of test data (80/20)
# You do the cross validation on the 80% (training data)
# We print the number of training samples and the number of features before the split
print("Now %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Preparing training and test data...",sep='\n')

split_and_preprocess_data(0.8)

# We print the number of training samples and the number of features after the split
print("After split, %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Training Samples: ", len(train_data), "Features: ", len(global_feature_dict), sep='\n')

Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 10240 rawData, 0 trainData, 0 testData
Preparing training and test data...
After split, 10240 rawData, 8192 trainData, 2048 testData
Training Samples: 
8192
Features: 
20066


In [11]:
cross_validate(train_data, 10)



After folding throughout cross-val process, the average score of precision - recall - f1score - accuracy is: 


(0.6020411143379807, 0.6080517842124233, 0.598107828827725, 0.6080517842124233)

## try adding 'subject'

In [12]:
def parse_data_line(data_line):
    # the function input (data_line) should be a list, then subset the list with index 1 to get the 'label' data
    label = convert_label(data_line[1])
    # the same approach can be applied to extract 'text' data with index 2
    text = data_line[2]
    subject = data_line[3]
    speaker = data_line[4]
    speaker_title = data_line[5]
    state = data_line[6]
    party = data_line[7]
    barely_true = data_line[8]
    false = data_line[9]
    half_true = data_line[10]
    mostly_true = data_line[11]
    pants_on_fire = data_line[12]
    context = data_line[13]
    return (label, text , subject) #return the tuple (label, text), which has the same order as in the 'load_data' function

def load_data(path):
    """Load data from a tab-separated file and append it to raw_data."""
    with open(path, encoding='utf-8') as f: #add 'encoding = utf-8' to original load_data function to avoid loading error
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
            if line[0] == "Id":  # skip header
                continue
            elif len(line) < 14:
                continue
            (label, text , a) = parse_data_line(line)
            raw_data.append((text, label, a))

def split_and_preprocess_data(percentage):
    """Split the data between train_data and test_data according to the percentage
    and performs the preprocessing."""
    num_samples = len(raw_data)
    num_training_samples = int((percentage * num_samples))
    for (text, label, a) in raw_data[:num_training_samples]:
        new_text = text + ' ' + a
        train_data.append((to_feature_vector(pre_process(new_text)),label))
    for (text, label, a) in raw_data[num_training_samples:]:
        new_text = text + ' ' + a
        test_data.append((to_feature_vector(pre_process(new_text)),label))
        
#MAIN
raw_data = []         
train_data = []        
test_data = []        

data_file_path = 'fake_news.tsv'

load_data(data_file_path) 

split_and_preprocess_data(0.8)

cross_validate(train_data, 10)



After folding throughout cross-val process, the average score of precision - recall - f1score - accuracy is: 


(0.6033591709839616,
 0.6093119542306534,
 0.5999688513222489,
 0.6093119542306534)

**Adding 'subject' improves the model performance ==> agree to add 'subject' as a new feature and use it in later test**

## try adding 'speaker'

In [14]:
def parse_data_line(data_line):
    # the function input (data_line) should be a list, then subset the list with index 1 to get the 'label' data
    label = convert_label(data_line[1])
    # the same approach can be applied to extract 'text' data with index 2
    text = data_line[2]
    subject = data_line[3]
    speaker = data_line[4]
    speaker_title = data_line[5]
    state = data_line[6]
    party = data_line[7]
    barely_true = data_line[8]
    false = data_line[9]
    half_true = data_line[10]
    mostly_true = data_line[11]
    pants_on_fire = data_line[12]
    context = data_line[13]
    return (label, text , subject, speaker) #return the tuple (label, text), which has the same order as in the 'load_data' function

def load_data(path):
    """Load data from a tab-separated file and append it to raw_data."""
    with open(path, encoding='utf-8') as f: #add 'encoding = utf-8' to original load_data function to avoid loading error
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
            if line[0] == "Id":  # skip header
                continue
            elif len(line) < 14:
                continue
            (label, text , a, b) = parse_data_line(line)
            raw_data.append((text, label, a, b))

def split_and_preprocess_data(percentage):
    """Split the data between train_data and test_data according to the percentage
    and performs the preprocessing."""
    num_samples = len(raw_data)
    num_training_samples = int((percentage * num_samples))
    for (text, label, a, b) in raw_data[:num_training_samples]:
        new_text = text + ' ' + a + ' ' + b
        train_data.append((to_feature_vector(pre_process(new_text)),label))
    for (text, label, a, b) in raw_data[num_training_samples:]:
        new_text = text + ' ' + a + ' ' + b
        test_data.append((to_feature_vector(pre_process(new_text)),label))
        
#MAIN
raw_data = []         
train_data = []        
test_data = []        

data_file_path = 'fake_news.tsv'

load_data(data_file_path) 

split_and_preprocess_data(0.8)

cross_validate(train_data, 10)



After folding throughout cross-val process, the average score of precision - recall - f1score - accuracy is: 


(0.6253115238232914,
 0.6297018970189703,
 0.6207918593539445,
 0.6297018970189702)

**Adding 'speaker' improves the model performance ==> agree to add 'speaker' as a new feature and use it in later test**

## try adding 'speaker job title'

In [15]:
def parse_data_line(data_line):
    # the function input (data_line) should be a list, then subset the list with index 1 to get the 'label' data
    label = convert_label(data_line[1])
    # the same approach can be applied to extract 'text' data with index 2
    text = data_line[2]
    subject = data_line[3]
    speaker = data_line[4]
    speaker_title = data_line[5]
    state = data_line[6]
    party = data_line[7]
    barely_true = data_line[8]
    false = data_line[9]
    half_true = data_line[10]
    mostly_true = data_line[11]
    pants_on_fire = data_line[12]
    context = data_line[13]
    return (label, text , subject, speaker, speaker_title) #return the tuple (label, text), which has the same order as in the 'load_data' function

def load_data(path):
    """Load data from a tab-separated file and append it to raw_data."""
    with open(path, encoding='utf-8') as f: #add 'encoding = utf-8' to original load_data function to avoid loading error
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
            if line[0] == "Id":  # skip header
                continue
            elif len(line) < 14:
                continue
            (label, text , a, b, c) = parse_data_line(line)
            raw_data.append((text, label, a, b, c))

def split_and_preprocess_data(percentage):
    """Split the data between train_data and test_data according to the percentage
    and performs the preprocessing."""
    num_samples = len(raw_data)
    num_training_samples = int((percentage * num_samples))
    for (text, label, a, b, c) in raw_data[:num_training_samples]:
        new_text = text + ' ' + a + ' ' + b + ' ' + c
        train_data.append((to_feature_vector(pre_process(new_text)),label))
    for (text, label, a, b, c) in raw_data[num_training_samples:]:
        new_text = text + ' ' + a + ' ' + b + ' ' + c
        test_data.append((to_feature_vector(pre_process(new_text)),label))
        
#MAIN
raw_data = []         
train_data = []        
test_data = []        

data_file_path = 'fake_news.tsv'

load_data(data_file_path) 

split_and_preprocess_data(0.8)

cross_validate(train_data, 10)



After folding throughout cross-val process, the average score of precision - recall - f1score - accuracy is: 


(0.625518037399394, 0.6300828063836195, 0.6217800944443649, 0.6300828063836192)

**Adding 'speaker_job_title' improves the model performance ==> agree to add 'speaker_job_title' as a new feature and use it in later test**

## try adding 'state_info"

In [16]:
def parse_data_line(data_line):
    # the function input (data_line) should be a list, then subset the list with index 1 to get the 'label' data
    label = convert_label(data_line[1])
    # the same approach can be applied to extract 'text' data with index 2
    text = data_line[2]
    subject = data_line[3]
    speaker = data_line[4]
    speaker_title = data_line[5]
    state = data_line[6]
    party = data_line[7]
    barely_true = data_line[8]
    false = data_line[9]
    half_true = data_line[10]
    mostly_true = data_line[11]
    pants_on_fire = data_line[12]
    context = data_line[13]
    return (label, text , subject, speaker, speaker_title , state) #return the tuple (label, text), which has the same order as in the 'load_data' function

def load_data(path):
    """Load data from a tab-separated file and append it to raw_data."""
    with open(path, encoding='utf-8') as f: #add 'encoding = utf-8' to original load_data function to avoid loading error
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
            if line[0] == "Id":  # skip header
                continue
            elif len(line) < 14:
                continue
            (label, text , a, b, c, d) = parse_data_line(line)
            raw_data.append((text, label, a, b, c, d))

def split_and_preprocess_data(percentage):
    """Split the data between train_data and test_data according to the percentage
    and performs the preprocessing."""
    num_samples = len(raw_data)
    num_training_samples = int((percentage * num_samples))
    for (text, label, a, b, c, d) in raw_data[:num_training_samples]:
        new_text = text + ' ' + a + ' ' + b + ' ' + c + ' ' + d
        train_data.append((to_feature_vector(pre_process(new_text)),label))
    for (text, label, a, b, c, d) in raw_data[num_training_samples:]:
        new_text = text + ' ' + a + ' ' + b + ' ' + c + ' ' + d
        test_data.append((to_feature_vector(pre_process(new_text)),label))
        
#MAIN
raw_data = []         
train_data = []        
test_data = []        

data_file_path = 'fake_news.tsv'

load_data(data_file_path) 

split_and_preprocess_data(0.8)

cross_validate(train_data, 10)



After folding throughout cross-val process, the average score of precision - recall - f1score - accuracy is: 


(0.6280273987469114,
 0.6324074074074074,
 0.6248137623173349,
 0.6324074074074074)

**Adding 'state_info' improves the model performance ==> agree to add 'state_info' as a new feature and use it in later test**

## try adding 'party affiliation'

In [17]:
def parse_data_line(data_line):
    # the function input (data_line) should be a list, then subset the list with index 1 to get the 'label' data
    label = convert_label(data_line[1])
    # the same approach can be applied to extract 'text' data with index 2
    text = data_line[2]
    subject = data_line[3]
    speaker = data_line[4]
    speaker_title = data_line[5]
    state = data_line[6]
    party = data_line[7]
    barely_true = data_line[8]
    false = data_line[9]
    half_true = data_line[10]
    mostly_true = data_line[11]
    pants_on_fire = data_line[12]
    context = data_line[13]
    return (label, text , subject, speaker, speaker_title , state, party) #return the tuple (label, text), which has the same order as in the 'load_data' function

def load_data(path):
    """Load data from a tab-separated file and append it to raw_data."""
    with open(path, encoding='utf-8') as f: #add 'encoding = utf-8' to original load_data function to avoid loading error
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
            if line[0] == "Id":  # skip header
                continue
            elif len(line) < 14:
                continue
            (label, text , a, b, c, d, e) = parse_data_line(line)
            raw_data.append((text, label, a, b, c, d ,e))

def split_and_preprocess_data(percentage):
    """Split the data between train_data and test_data according to the percentage
    and performs the preprocessing."""
    num_samples = len(raw_data)
    num_training_samples = int((percentage * num_samples))
    for (text, label, a, b, c, d, e) in raw_data[:num_training_samples]:
        new_text = text + ' ' + a + ' ' + b + ' ' + c + ' ' + d + ' ' + e
        train_data.append((to_feature_vector(pre_process(new_text)),label))
    for (text, label, a, b, c, d, e) in raw_data[num_training_samples:]:
        new_text = text + ' ' + a + ' ' + b + ' ' + c + ' ' + d + ' ' + e
        test_data.append((to_feature_vector(pre_process(new_text)),label))
        
#MAIN
raw_data = []         
train_data = []        
test_data = []        

data_file_path = 'fake_news.tsv'

load_data(data_file_path) 

split_and_preprocess_data(0.8)

cross_validate(train_data, 10)



After folding throughout cross-val process, the average score of precision - recall - f1score - accuracy is: 


(0.6291005653286841,
 0.6332625715146041,
 0.6265810659238757,
 0.6332625715146041)

**Adding 'party_affiliation' improves the model performance ==> agree to add 'party_affiliation' as a new feature and use it in later test**

## try adding 'total barely true counts'
this would be tokenized as 'barely_true', weighted by the value of 'total_barely_true_counts': **value * 'barely_true'**

In [18]:
def parse_data_line(data_line):
    # the function input (data_line) should be a list, then subset the list with index 1 to get the 'label' data
    label = convert_label(data_line[1])
    # the same approach can be applied to extract 'text' data with index 2
    text = data_line[2]
    subject = data_line[3]
    speaker = data_line[4]
    speaker_title = data_line[5]
    state = data_line[6]
    party = data_line[7]
    barely_true = data_line[8]
    false = data_line[9]
    half_true = data_line[10]
    mostly_true = data_line[11]
    pants_on_fire = data_line[12]
    context = data_line[13]
    return (label, text , subject, speaker, speaker_title , state, party, barely_true) #return the tuple (label, text), which has the same order as in the 'load_data' function

def load_data(path):
    """Load data from a tab-separated file and append it to raw_data."""
    with open(path, encoding='utf-8') as f: #add 'encoding = utf-8' to original load_data function to avoid loading error
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
            if line[0] == "Id":  # skip header
                continue
            elif len(line) < 14:
                continue
            (label, text , a, b, c, d, e, f) = parse_data_line(line)
            raw_data.append((text, label, a, b, c, d ,e, f))

def split_and_preprocess_data(percentage):
    """Split the data between train_data and test_data according to the percentage
    and performs the preprocessing."""
    num_samples = len(raw_data)
    num_training_samples = int((percentage * num_samples))
    for (text, label, a, b, c, d, e, f) in raw_data[:num_training_samples]:
        new_text = text + ' ' + a + ' ' + b + ' ' + c + ' ' + d + ' ' + e + ' ' + int(f)*'barely_true'
        train_data.append((to_feature_vector(pre_process(new_text)),label))
    for (text, label, a, b, c, d, e, f) in raw_data[num_training_samples:]:
        new_text = text + ' ' + a + ' ' + b + ' ' + c + ' ' + d + ' ' + e + ' ' + int(f)*'barely_true'
        test_data.append((to_feature_vector(pre_process(new_text)),label))
        
#MAIN
raw_data = []         
train_data = []        
test_data = []        

data_file_path = 'fake_news.tsv'

load_data(data_file_path) 

split_and_preprocess_data(0.8)

cross_validate(train_data, 10)



After folding throughout cross-val process, the average score of precision - recall - f1score - accuracy is: 


(0.6449585907368421, 0.648383017163505, 0.642911387555957, 0.648383017163505)

**Adding 'total_barely_true_counts' improves the model performance ==> agree to add 'total_barely_true_counts' as a new feature and use it in later test**

## try adding 'total false counts'
this would be tokenized as 'false', weighted by the value of 'total_false_counts': **value * 'false'**

In [19]:
def parse_data_line(data_line):
    # the function input (data_line) should be a list, then subset the list with index 1 to get the 'label' data
    label = convert_label(data_line[1])
    # the same approach can be applied to extract 'text' data with index 2
    text = data_line[2]
    subject = data_line[3]
    speaker = data_line[4]
    speaker_title = data_line[5]
    state = data_line[6]
    party = data_line[7]
    barely_true = data_line[8]
    false = data_line[9]
    half_true = data_line[10]
    mostly_true = data_line[11]
    pants_on_fire = data_line[12]
    context = data_line[13]
    return (label, text , subject, speaker, speaker_title , state, party, barely_true, false) #return the tuple (label, text), which has the same order as in the 'load_data' function

def load_data(path):
    """Load data from a tab-separated file and append it to raw_data."""
    with open(path, encoding='utf-8') as f: #add 'encoding = utf-8' to original load_data function to avoid loading error
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
            if line[0] == "Id":  # skip header
                continue
            elif len(line) < 14:
                continue
            (label, text , a, b, c, d, e, f, g) = parse_data_line(line)
            raw_data.append((text, label, a, b, c, d ,e, f, g))

def split_and_preprocess_data(percentage):
    """Split the data between train_data and test_data according to the percentage
    and performs the preprocessing."""
    num_samples = len(raw_data)
    num_training_samples = int((percentage * num_samples))
    for (text, label, a, b, c, d, e, f, g) in raw_data[:num_training_samples]:
        new_text = text + ' ' + a + ' ' + b + ' ' + c + ' ' + d + ' ' + e + ' ' + int(f)*'barely_true' + ' ' + int(g)*'false'
        train_data.append((to_feature_vector(pre_process(new_text)),label))
    for (text, label, a, b, c, d, e, f, g) in raw_data[num_training_samples:]:
        new_text = text + ' ' + a + ' ' + b + ' ' + c + ' ' + d + ' ' + e + ' ' + int(f)*'barely_true' + ' ' + int(g)*'false'
        test_data.append((to_feature_vector(pre_process(new_text)),label))
        
#MAIN
raw_data = []         
train_data = []        
test_data = []        

data_file_path = 'fake_news.tsv'

load_data(data_file_path) 

split_and_preprocess_data(0.8)

cross_validate(train_data, 10)



After folding throughout cross-val process, the average score of precision - recall - f1score - accuracy is: 


(0.6648219488955837, 0.6673215898825656, 0.662893803225596, 0.6673215898825655)

**Adding 'total_false_counts' improves the model performance ==> agree to add 'total_false_counts' as a new feature and use it in later test**

## try adding 'total half true counts'
this would be tokenized as 'half_true', weighted by the value of 'total_half_true_counts': **value * 'half_true'**

In [20]:
def parse_data_line(data_line):
    # the function input (data_line) should be a list, then subset the list with index 1 to get the 'label' data
    label = convert_label(data_line[1])
    # the same approach can be applied to extract 'text' data with index 2
    text = data_line[2]
    subject = data_line[3]
    speaker = data_line[4]
    speaker_title = data_line[5]
    state = data_line[6]
    party = data_line[7]
    barely_true = data_line[8]
    false = data_line[9]
    half_true = data_line[10]
    mostly_true = data_line[11]
    pants_on_fire = data_line[12]
    context = data_line[13]
    return (label, text , subject, speaker, speaker_title , state, party, barely_true, false, half_true) #return the tuple (label, text), which has the same order as in the 'load_data' function

def load_data(path):
    """Load data from a tab-separated file and append it to raw_data."""
    with open(path, encoding='utf-8') as f: #add 'encoding = utf-8' to original load_data function to avoid loading error
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
            if line[0] == "Id":  # skip header
                continue
            elif len(line) < 14:
                continue
            (label, text , a, b, c, d, e, f, g, h) = parse_data_line(line)
            raw_data.append((text, label, a, b, c, d ,e, f, g, h))

def split_and_preprocess_data(percentage):
    """Split the data between train_data and test_data according to the percentage
    and performs the preprocessing."""
    num_samples = len(raw_data)
    num_training_samples = int((percentage * num_samples))
    for (text, label, a, b, c, d, e, f, g, h) in raw_data[:num_training_samples]:
        new_text = text + ' ' + a + ' ' + b + ' ' + c + ' ' + d + ' ' + e + ' ' + int(f)*'barely_true' + ' ' + int(g)*'false' + ' ' + int(h)*'half_true'
        train_data.append((to_feature_vector(pre_process(new_text)),label))
    for (text, label, a, b, c, d, e, f, g, h) in raw_data[num_training_samples:]:
        new_text = text + ' ' + a + ' ' + b + ' ' + c + ' ' + d + ' ' + e + ' ' + int(f)*'barely_true' + ' ' + int(g)*'false' + ' ' + int(h)*'half_true'
        test_data.append((to_feature_vector(pre_process(new_text)),label))
        
#MAIN
raw_data = []         
train_data = []        
test_data = []        

data_file_path = 'fake_news.tsv'

load_data(data_file_path) 

split_and_preprocess_data(0.8)

cross_validate(train_data, 10)



After folding throughout cross-val process, the average score of precision - recall - f1score - accuracy is: 


(0.6803477441695904,
 0.6824510689551342,
 0.6787633537392791,
 0.6824510689551342)

**Adding 'total_half_true_counts' improves the model performance ==> agree to add 'total_half_true_counts' as a new feature and use it in later test**

## try adding 'total mostly true counts'
this would be tokenized as 'mostly_true', weighted by the value of 'total_mostly_true_counts': **value * 'mostly_true'**

In [21]:
def parse_data_line(data_line):
    # the function input (data_line) should be a list, then subset the list with index 1 to get the 'label' data
    label = convert_label(data_line[1])
    # the same approach can be applied to extract 'text' data with index 2
    text = data_line[2]
    subject = data_line[3]
    speaker = data_line[4]
    speaker_title = data_line[5]
    state = data_line[6]
    party = data_line[7]
    barely_true = data_line[8]
    false = data_line[9]
    half_true = data_line[10]
    mostly_true = data_line[11]
    pants_on_fire = data_line[12]
    context = data_line[13]
    return (label, text , subject, speaker, speaker_title , state, party, barely_true, false, half_true, mostly_true) #return the tuple (label, text), which has the same order as in the 'load_data' function

def load_data(path):
    """Load data from a tab-separated file and append it to raw_data."""
    with open(path, encoding='utf-8') as f: #add 'encoding = utf-8' to original load_data function to avoid loading error
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
            if line[0] == "Id":  # skip header
                continue
            elif len(line) < 14:
                continue
            (label, text , a, b, c, d, e, f, g, h, i) = parse_data_line(line)
            raw_data.append((text, label, a, b, c, d ,e, f, g, h, i))

def split_and_preprocess_data(percentage):
    """Split the data between train_data and test_data according to the percentage
    and performs the preprocessing."""
    num_samples = len(raw_data)
    num_training_samples = int((percentage * num_samples))
    for (text, label, a, b, c, d, e, f, g, h, i) in raw_data[:num_training_samples]:
        new_text = text + ' ' + a + ' ' + b + ' ' + c + ' ' + d + ' ' + e + ' ' + int(f)*'barely_true' + ' ' + int(g)*'false' + ' ' + int(h)*'half_true' + ' ' + int(i)*'mostly_true'
        train_data.append((to_feature_vector(pre_process(new_text)),label))
    for (text, label, a, b, c, d, e, f, g, h, i) in raw_data[num_training_samples:]:
        new_text = text + ' ' + a + ' ' + b + ' ' + c + ' ' + d + ' ' + e + ' ' + int(f)*'barely_true' + ' ' + int(g)*'false' + ' ' + int(h)*'half_true' + ' ' + int(i)*'mostly_true'
        test_data.append((to_feature_vector(pre_process(new_text)),label))
        
#MAIN
raw_data = []         
train_data = []        
test_data = []        

data_file_path = 'fake_news.tsv'

load_data(data_file_path) 

split_and_preprocess_data(0.8)

cross_validate(train_data, 10)



After folding throughout cross-val process, the average score of precision - recall - f1score - accuracy is: 


(0.6951906957819507, 0.6967359229147847, 0.693008206784248, 0.6967359229147847)

**Adding 'total_mostly_true_counts' improves the model performance ==> agree to add 'total_mostly_true_counts' as a new feature and use it in later test**

## try adding 'total pants on fire counts'
this would be tokenized as 'pants_on_fire', weighted by the value of 'total_pants_on_fire_counts': **value * 'pants_on_fire'**

In [28]:
def parse_data_line(data_line):
    # the function input (data_line) should be a list, then subset the list with index 1 to get the 'label' data
    label = convert_label(data_line[1])
    # the same approach can be applied to extract 'text' data with index 2
    text = data_line[2]
    subject = data_line[3]
    speaker = data_line[4]
    speaker_title = data_line[5]
    state = data_line[6]
    party = data_line[7]
    barely_true = data_line[8]
    false = data_line[9]
    half_true = data_line[10]
    mostly_true = data_line[11]
    pants_on_fire = data_line[12]
    context = data_line[13]
    return (label, text , subject, speaker, speaker_title , state, party, barely_true, false, half_true, mostly_true, pants_on_fire) #return the tuple (label, text), which has the same order as in the 'load_data' function

def load_data(path):
    """Load data from a tab-separated file and append it to raw_data."""
    with open(path, encoding='utf-8') as f: #add 'encoding = utf-8' to original load_data function to avoid loading error
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
            if line[0] == "Id":  # skip header
                continue
            elif len(line) < 14:
                continue
            (label, text , a, b, c, d, e, f, g, h, i, m) = parse_data_line(line)
            raw_data.append((text, label, a, b, c, d ,e, f, g, h, i, m))

def split_and_preprocess_data(percentage):
    """Split the data between train_data and test_data according to the percentage
    and performs the preprocessing."""
    num_samples = len(raw_data)
    num_training_samples = int((percentage * num_samples))
    for (text, label, a, b, c, d, e, f, g, h, i, m) in raw_data[:num_training_samples]:
        new_text = text + ' ' + a + ' ' + b + ' ' + c + ' ' + d + ' ' + e + ' ' + int(f)*'barely_true' + ' ' + int(g)*'false' + ' ' + int(h)*'half_true' + ' ' + int(i)*'mostly_true' + ' ' + int(m)*'pants_on_fire'
        train_data.append((to_feature_vector(pre_process(new_text)),label))
    for (text, label, a, b, c, d, e, f, g, h, i, m) in raw_data[num_training_samples:]:
        new_text = text + ' ' + a + ' ' + b + ' ' + c + ' ' + d + ' ' + e + ' ' + int(f)*'barely_true' + ' ' + int(g)*'false' + ' ' + int(h)*'half_true' + ' ' + int(i)*'mostly_true' + ' ' + int(m)*'pants_on_fire'
        test_data.append((to_feature_vector(pre_process(new_text)),label))
        
#MAIN
raw_data = []         
train_data = []        
test_data = []        

data_file_path = 'fake_news.tsv'

load_data(data_file_path) 

split_and_preprocess_data(0.8)

cross_validate(train_data, 10)



After folding throughout cross-val process, the average score of precision - recall - f1score - accuracy is: 


(0.7027500582915027,
 0.7041960252935862,
 0.7011088368027659,
 0.7041960252935862)

**Adding 'total_pants_on_fire_counts' improves the model performance ==> agree to add 'total_pants_on_fire_counts' as a new feature and use it in later test**

## try adding 'context'

In [29]:
def parse_data_line(data_line):
    # the function input (data_line) should be a list, then subset the list with index 1 to get the 'label' data
    label = convert_label(data_line[1])
    # the same approach can be applied to extract 'text' data with index 2
    text = data_line[2]
    subject = data_line[3]
    speaker = data_line[4]
    speaker_title = data_line[5]
    state = data_line[6]
    party = data_line[7]
    barely_true = data_line[8]
    false = data_line[9]
    half_true = data_line[10]
    mostly_true = data_line[11]
    pants_on_fire = data_line[12]
    context = data_line[13]
    return (label, text , subject, speaker, speaker_title , state, party, barely_true, false, half_true, mostly_true, pants_on_fire, context) #return the tuple (label, text), which has the same order as in the 'load_data' function

def load_data(path):
    """Load data from a tab-separated file and append it to raw_data."""
    with open(path, encoding='utf-8') as f: #add 'encoding = utf-8' to original load_data function to avoid loading error
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
            if line[0] == "Id":  # skip header
                continue
            elif len(line) < 14:
                continue
            (label, text , a, b, c, d, e, f, g, h, i, m, n) = parse_data_line(line)
            raw_data.append((text, label, a, b, c, d ,e, f, g, h, i, m, n))

def split_and_preprocess_data(percentage):
    """Split the data between train_data and test_data according to the percentage
    and performs the preprocessing."""
    num_samples = len(raw_data)
    num_training_samples = int((percentage * num_samples))
    for (text, label, a, b, c, d, e, f, g, h, i, m, n) in raw_data[:num_training_samples]:
        new_text = text + ' ' + a + ' ' + b + ' ' + c + ' ' + d + ' ' + e + ' ' + int(f)*'barely_true' + ' ' + int(g)*'false' + ' ' + int(h)*'half_true' + ' ' + int(i)*'mostly_true' + ' ' + int(m)*'pants_on_fire' + ' ' + n
        train_data.append((to_feature_vector(pre_process(new_text)),label))
    for (text, label, a, b, c, d, e, f, g, h, i, m, n) in raw_data[num_training_samples:]:
        new_text = text + ' ' + a + ' ' + b + ' ' + c + ' ' + d + ' ' + e + ' ' + int(f)*'barely_true' + ' ' + int(g)*'false' + ' ' + int(h)*'half_true' + ' ' + int(i)*'mostly_true' + ' ' + int(m)*'pants_on_fire' + ' ' + n
        test_data.append((to_feature_vector(pre_process(new_text)),label))
        
#MAIN
raw_data = []         
train_data = []        
test_data = []        

data_file_path = 'fake_news.tsv'

load_data(data_file_path) 

split_and_preprocess_data(0.8)

cross_validate(train_data, 10)



After folding throughout cross-val process, the average score of precision - recall - f1score - accuracy is: 


(0.701559412165166, 0.7033212887684432, 0.700293969178744, 0.7033212887684432)

**Adding 'context' does not improve the model performance ==> skip 'context'**

# Final Model
## After testing around, I decide to add these attributes as new features, besides the existing ones:
1. subject
2. speaker
3. speaker_job_title
4. state_info
5. party_affiliation
6. total_barely_true_counts
7. total_false_counts
8. total_half_true_counts
9. total_mostly_true_counts
10. total_pants_on_fire_counts

In [30]:
#final model with selected features and modified preprocessing and adjusted parameters (taken from question 5)
def parse_data_line(data_line):
    # the function input (data_line) should be a list, then subset the list with index 1 to get the 'label' data
    label = convert_label(data_line[1])
    # the same approach can be applied to extract 'text' data with index 2
    text = data_line[2]
    subject = data_line[3]
    speaker = data_line[4]
    speaker_title = data_line[5]
    state = data_line[6]
    party = data_line[7]
    barely_true = data_line[8]
    false = data_line[9]
    half_true = data_line[10]
    mostly_true = data_line[11]
    pants_on_fire = data_line[12]
    context = data_line[13]
    return (label, text , subject, speaker, speaker_title , state, party, barely_true, false, half_true, mostly_true, pants_on_fire) #return the tuple (label, text), which has the same order as in the 'load_data' function

def load_data(path):
    """Load data from a tab-separated file and append it to raw_data."""
    with open(path, encoding='utf-8') as f: #add 'encoding = utf-8' to original load_data function to avoid loading error
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
            if line[0] == "Id":  # skip header
                continue
            elif len(line) < 14:
                continue
            (label, text , a, b, c, d, e, f, g, h, i, m) = parse_data_line(line)
            raw_data.append((text, label, a, b, c, d ,e, f, g, h, i, m))

def split_and_preprocess_data(percentage):
    """Split the data between train_data and test_data according to the percentage
    and performs the preprocessing."""
    num_samples = len(raw_data)
    num_training_samples = int((percentage * num_samples))
    for (text, label, a, b, c, d, e, f, g, h, i, m) in raw_data[:num_training_samples]:
        new_text = text + ' ' + a + ' ' + b + ' ' + c + ' ' + d + ' ' + e + ' ' + int(f)*'barely_true' + ' ' + int(g)*'false' + ' ' + int(h)*'half_true' + ' ' + int(i)*'mostly_true' + ' ' + int(m)*'pants_on_fire'
        train_data.append((to_feature_vector(pre_process(new_text)),label))
    for (text, label, a, b, c, d, e, f, g, h, i, m) in raw_data[num_training_samples:]:
        new_text = text + ' ' + a + ' ' + b + ' ' + c + ' ' + d + ' ' + e + ' ' + int(f)*'barely_true' + ' ' + int(g)*'false' + ' ' + int(h)*'half_true' + ' ' + int(i)*'mostly_true' + ' ' + int(m)*'pants_on_fire'
        test_data.append((to_feature_vector(pre_process(new_text)),label))
        
#MAIN
raw_data = []         
train_data = []        
test_data = []        

data_file_path = 'fake_news.tsv'

load_data(data_file_path) 

split_and_preprocess_data(0.8)

cross_validate(train_data, 10)



After folding throughout cross-val process, the average score of precision - recall - f1score - accuracy is: 


(0.7027500582915027,
 0.7041960252935862,
 0.7011088368027659,
 0.7041960252935862)

# Apply the model on test set to evaluate the performance

In [31]:
# Finally, check the accuracy of your classifier by training on all the traning data
# and testing on the test set
# Will only work once all functions are complete
functions_complete = True  # set to True once you're happy with your methods for cross val
if functions_complete:
    print(test_data[0])   # have a look at the first test data instance
    classifier = train_classifier(train_data)  # train the classifier
    test_true = [t[1] for t in test_data]   # get the ground-truth labels from the data
    test_pred = predict_labels([x[0] for x in test_data], classifier)  # classify the test data to get predicted labels
    final_scores = precision_recall_fscore_support(test_true, test_pred, average='weighted') # evaluate
    print("Done training!")
    print("Precision: %f\nRecall: %f\nF Score:%f" % final_scores[:3])

({'the': 2.0, 'bush': 1, 'tax': 1, 'cuts': 1, 'helped': 1, 'to': 1, 'create': 1, 'a': 1, 'substantial': 1, 'part': 1, 'of': 1, 'deficit.': 1, 'bush-administration,deficit,taxes': 1, 'dennis-kucinich': 1, 'u.s.': 1, 'representative': 1, 'ohio': 1, 'democrat': 1, 'barely_true': 1, 'falsefalsefalse': 1, 'half_truehalf_truehalf_truehalf_true': 1, 'mostly_truemostly_truemostly_truemostly_truemostly_truemostly_true': 1, 'nontext_count': 7}, 'REAL')
Done training!
Precision: 0.684335
Recall: 0.685059
F Score:0.681368
