## Question 1

In [99]:
# Import necessary packages
import csv                               # csv reader
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_fscore_support # to report on precision and recall
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk import PorterStemmer
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources used by the experiments in this notebook.
nltk.download('punkt')      # presumably the models behind word_tokenize — confirm
nltk.download('stopwords')  # stop-word lists read through nltk.corpus.stopwords
nltk.download('wordnet')    # presumably the data behind WordNetLemmatizer — confirm
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## First, reuse all the functions created in questions 1-4

In [42]:
def load_data(path):
    """Load statements from a tab-separated file and append them to raw_data.

    Every usable row is parsed with parse_data_line and stored in the
    module-level raw_data list as a (text, label) tuple. The header row
    (first cell equal to "Id") and completely empty rows are skipped.

    Args:
        path: location of the tab-separated data file; it is read as UTF-8.
    """
    # The encoding is given explicitly so loading does not depend on the platform default.
    with open(path, encoding='utf-8') as f:
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
            # csv.reader yields an empty list for a blank line; indexing it
            # would raise IndexError, so blank rows are skipped with the header.
            if not line or line[0] == "Id":
                continue
            (label, text) = parse_data_line(line)
            raw_data.append((text, label))

def split_and_preprocess_data(percentage):
    """Fill train_data and test_data from raw_data.

    The first int(percentage * len(raw_data)) rows go to train_data and the
    remaining rows to test_data. Each row is stored as
    (to_feature_vector(pre_process(text)), label).

    Args:
        percentage: share of raw_data used for training, e.g. 0.8.
    """
    cut = int(percentage * len(raw_data))
    # Training rows are converted before test rows; the order is kept because
    # to_feature_vector updates global_feature_dict on every call.
    for target, rows in ((train_data, raw_data[:cut]), (test_data, raw_data[cut:])):
        for text, label in rows:
            target.append((to_feature_vector(pre_process(text)), label))

In [43]:
def convert_label(label):
    """Collapse the six original classes into the binary labels 'REAL' and 'FAKE'.

    Args:
        label: one of 'true', 'mostly-true', 'half-true', 'false',
            'barely-true' or 'pants-fire'.

    Returns:
        'REAL' for the first three labels, 'FAKE' for the last three.

    Raises:
        KeyError: if label is not one of the six known classes.
    """
    real_labels = ('true', 'mostly-true', 'half-true')
    fake_labels = ('false', 'barely-true', 'pants-fire')
    binary_of = dict.fromkeys(real_labels, 'REAL')
    binary_of.update(dict.fromkeys(fake_labels, 'FAKE'))
    return binary_of[label]


def parse_data_line(data_line):
    """Extract (label, statement) from one row of the data file.

    Args:
        data_line: the row as a list of cells; cell 1 holds the original
            class label and cell 2 the statement text.

    Returns:
        A (label, text) tuple where label has been reduced to 'REAL' or
        'FAKE' by convert_label. load_data unpacks the tuple in this order.
    """
    binary_label = convert_label(data_line[1])
    return (binary_label, data_line[2])

In [44]:
global_feature_dict = {} # token counts accumulated over every call to to_feature_vector

def to_feature_vector(tokens):
    """Turn a token list into a bag-of-words dictionary.

    As a side effect every token is also counted in global_feature_dict.

    Args:
        tokens: iterable of tokens for one statement.

    Returns:
        dict mapping each token to its count within tokens. A first
        occurrence is stored as the int 1, later increments as floats.
    """
    feature_dict = {}
    for token in tokens:
        # The same counting rule applies to the corpus-wide and the per-statement dictionary.
        for counts in (global_feature_dict, feature_dict):
            counts[token] = float(counts[token] + 1) if token in counts else 1
    return feature_dict

In [45]:
# TRAINING AND VALIDATING OUR CLASSIFIER

def train_classifier(data):
    """Train a LinearSVC with default settings on labelled feature dictionaries.

    Args:
        data: list of (feature_dict, label) tuples.

    Returns:
        The result of SklearnClassifier(...).train(data), used elsewhere in
        this notebook as the trained classifier.
    """
    svc_pipeline = Pipeline([('svc', LinearSVC())])
    wrapped = SklearnClassifier(svc_pipeline)
    return wrapped.train(data)

In [46]:
#cross validate ==> remove shuffle to make the result consistent for comparison
from sklearn.metrics import classification_report, accuracy_score
from random import shuffle # Import shuffle method

def cross_validate(dataset, folds):
    """Run k-fold cross-validation and return the averaged scores.

    The dataset is cut into consecutive folds in its given order (no
    shuffling, so runs stay comparable). Each fold serves once as the
    validation set while the classifier is trained on the remaining rows.

    Args:
        dataset: list of (feature_dict, label) tuples.
        folds: requested number of folds.

    Returns:
        (avg_precision, avg_recall, avg_fscore, avg_accuracy), each averaged
        over the folds; precision/recall/F-score use average='weighted'.

    Raises:
        ValueError: if folds is not positive or dataset is empty.
    """
    if folds <= 0:
        raise ValueError("folds must be a positive number")
    if not dataset:
        raise ValueError("dataset must not be empty")
    # Ceiling division. The previous int(len/folds) + 1 made folds one item too
    # large whenever the length divided evenly, which yields fewer (or badly
    # unbalanced) folds than requested, e.g. 20 items / 10 folds -> 7 folds.
    fold_size = int(-(-len(dataset) // folds))
    results = []        # one precision_recall_fscore_support result per fold
    accuracy_rate = []  # one accuracy value per fold
    for i in range(0, len(dataset), fold_size):
        # Current fold is held out for validation, everything else is training data.
        validation = dataset[i : i+fold_size]
        train = dataset[:i] + dataset[i+fold_size:]
        classifier = train_classifier(train)
        yval_true = [t[1] for t in validation]
        yval_pred = predict_labels([x[0] for x in validation], classifier)
        final_scores = precision_recall_fscore_support(yval_true, yval_pred, average='weighted', zero_division=0)
        accuracy = accuracy_calculate(yval_true, yval_pred)
        results.append(final_scores)
        accuracy_rate.append(accuracy)
    # Columns 0, 1 and 2 hold precision, recall and F-score of each fold.
    cv_results = np.asarray(results)
    avg_precision = np.mean(cv_results[:, 0], axis = 0)
    avg_recall = np.mean(cv_results[:, 1], axis = 0)
    avg_fscore = np.mean(cv_results[:, 2], axis = 0)
    avg_accuracy = np.mean(accuracy_rate)
    print('\n')
    print('After folding throughout cross-val process, the average score of precision - recall - f1score - accuracy is: ')
    return avg_precision, avg_recall, avg_fscore, avg_accuracy

In [47]:
# PREDICTING LABELS GIVEN A CLASSIFIER

def predict_labels(samples, classifier):
    """Return the classifier's predictions for already pre-processed samples.

    Args:
        samples: the feature representations to classify.
        classifier: object exposing classify_many(samples).

    Returns:
        Whatever classifier.classify_many(samples) returns.
    """
    predictions = classifier.classify_many(samples)
    return predictions

def predict_label_from_raw(sample, classifier):
    """Classify one raw statement.

    The text is tokenized with pre_process, converted with to_feature_vector
    (which also updates global_feature_dict) and passed to the classifier.

    Args:
        sample: raw statement text.
        classifier: object exposing classify(feature_dict).

    Returns:
        The label returned by classifier.classify.
    """
    # The previous body referenced the undefined names preProcess and
    # reviewSample, so every call failed with NameError.
    return classifier.classify(to_feature_vector(pre_process(sample)))

In [48]:
# Calculate the model accuracy given a classifier
def accuracy_calculate(y_true, y_pred):
    """Return the fraction of predictions that equal the ground-truth label.

    Args:
        y_true: sequence of ground-truth labels.
        y_pred: sequence of predicted labels, same length as y_true.

    Returns:
        Number of matching positions divided by len(y_true); 0.0 when both
        sequences are empty.

    Raises:
        ValueError: if the two sequences differ in length (zip would
            otherwise silently ignore the surplus items).
    """
    if len(y_true) != len(y_pred):
        raise ValueError("y_true and y_pred must have the same length")
    if len(y_true) == 0:
        # Avoid ZeroDivisionError when there is nothing to score.
        return 0.0
    correct = sum(1 for a, b in zip(y_true, y_pred) if a == b)
    return correct / len(y_true)

# Test different tokenizing method

## Original method from previous questions (used for benchmarking): use string split (same func and operations performed in question 1 - 4)

In [50]:
# MAIN

# loading reviews
# initialize global lists that will be appended to by the methods below
raw_data = []          # (text, label) tuples appended by load_data
train_data = []        # (feature_dict, label) tuples for the training share, appended by split_and_preprocess_data
test_data = []         # (feature_dict, label) tuples for the remaining share, appended by split_and_preprocess_data

# Input: a string of one statement
def pre_process(text):
    """Baseline tokenizer: split the statement on runs of whitespace.

    Args:
        text: one statement as a string.

    Returns:
        list of tokens; punctuation and letter case are left untouched.
    """
    return text.split()

# references to the data files
data_file_path = 'fake_news.tsv'

# Report the container sizes before anything is loaded.
print("Now %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Preparing the dataset...",sep='\n')

# Parse the dataset file into raw_data.
load_data(data_file_path)

# raw_data is filled now; the train/test containers are still empty.
print("Now %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Preparing training and test data...",sep='\n')

# 80% of the rows become pre-processed training data, the remaining 20% test data.
split_and_preprocess_data(0.8)

# Sizes after the split; the feature count is the number of distinct tokens
# recorded in global_feature_dict by to_feature_vector.
print("After split, %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Training Samples: ", len(train_data), "Features: ", len(global_feature_dict), sep='\n')
print('\n')
# This cell benchmarks the original str.split tokenizer; the label used to
# announce word_tokenize, which the pre_process of this cell does not use.
print('Cross validation result with the original tokenizing method using str.split')
# Cross-validation only sees the training share.
cross_validate(train_data, 10)

Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 10240 rawData, 0 trainData, 0 testData
Preparing training and test data...
After split, 10240 rawData, 8192 trainData, 2048 testData
Training Samples: 
8192
Features: 
21678


Cross validation result  with new tokenizing method using word_tokenize from nltk package


After folding throughout cross-val process, the average score of precision - recall - f1score - accuracy is: 


(0.5684319703304088, 0.5683827946653851, 0.5681037779509728, 0.568382794665385)

## Modify pre_process function: use word_tokenize method from nltk package, instead of str split

In [52]:
# Reset the module-level lists that load_data and split_and_preprocess_data
# append to, so this run does not accumulate rows from earlier cells.
raw_data = []
train_data = []
test_data = []

# word_tokenize is the nltk tokenizer compared against str.split in this experiment
from nltk.tokenize import word_tokenize
def pre_process(text):
    """Tokenize the statement with nltk's word_tokenize.

    Args:
        text: one statement as a string.

    Returns:
        A new list holding the tokens produced by word_tokenize(text).
    """
    return list(word_tokenize(text))

data_file_path = 'fake_news.tsv'

# Reload the raw rows, rebuild the 80/20 split with the pre_process defined in
# this cell, then run 10-fold cross-validation on the training share only.
load_data(data_file_path)

split_and_preprocess_data(0.8)

cross_validate(train_data, 10)



After folding throughout cross-val process, the average score of precision - recall - f1score - accuracy is: 


(0.5646281013006186,
 0.5641241139012376,
 0.5641779103549835,
 0.5641241139012376)

In [102]:
import re

In [127]:
# Scratch check of the regex tokenizer on a single record.
# Each raw_data entry is a (text, label) tuple, so str() of it also feeds the
# label and the tuple punctuation to the splitter (hence tokens such as 'FAKE'
# and the empty strings in the output).
# Only the last record is tokenized: the earlier loop split every record,
# recompiled the pattern each time and discarded all results but the final one.
token = re.compile(r'\W').split(str(raw_data[-1])) if raw_data else []
token

['',
 '',
 'The',
 'Department',
 'of',
 'Veterans',
 'Affairs',
 'has',
 'a',
 'manual',
 'out',
 'there',
 'telling',
 'our',
 'veterans',
 'stuff',
 'like',
 '',
 '',
 'Are',
 'you',
 'really',
 'of',
 'value',
 'to',
 'your',
 'community',
 '',
 '',
 'You',
 'know',
 '',
 'encouraging',
 'them',
 'to',
 'commit',
 'suicide',
 '',
 '',
 '',
 '',
 'FAKE',
 '',
 '']

In [110]:
import re
# Reset the module-level lists that load_data and split_and_preprocess_data
# append to, so this run does not accumulate rows from earlier cells.
raw_data = []
train_data = []
test_data = []

# NOTE(review): word_tokenize is imported here, but the pre_process defined in
# this cell splits with a regular expression and never calls it.
from nltk.tokenize import word_tokenize
def pre_process(text):
    """Tokenize by splitting on every single non-word character.

    Args:
        text: one statement; it is converted with str() before splitting.

    Returns:
        list of the non-empty pieces between non-word characters.
    """
    return [piece for piece in re.split(r'\W', str(text)) if piece != '']

data_file_path = 'fake_news.tsv'

# Reload the raw rows, rebuild the 80/20 split with the regex pre_process, then
# run 10-fold cross-validation on the training share only.
# NOTE(review): this cell's execution count (110) is later than the final-model
# cell (98), so the recorded score was presumably produced with the later
# train_classifier / to_feature_vector definitions and may not be comparable
# with the neighbouring experiments — confirm with Restart & Run All.
load_data(data_file_path)

split_and_preprocess_data(0.8)

cross_validate(train_data, 10)



After folding throughout cross-val process, the average score of precision - recall - f1score - accuracy is: 


(0.6076188402394452, 0.6128210981617206, 0.604708478073426, 0.6128210981617206)

**With the word_tokenize method, the performance does not improve, so we skip this modification, keep the traditional str.split(), and proceed to the next experiment**

## Tokenizing by str split and remove non-text feature (punctuation, special characters, digits, etc.)

In [55]:
# Reset the module-level lists that load_data and split_and_preprocess_data
# append to, so this run does not accumulate rows from earlier cells.
raw_data = []
train_data = []
test_data = []

def pre_process(text):
    """Split on whitespace and keep only purely alphabetic tokens.

    A token with any digit or punctuation attached (e.g. 'word,') is dropped
    entirely because str.isalpha() is False for it.

    Args:
        text: one statement as a string.

    Returns:
        list of the whitespace-separated tokens for which isalpha() is True.
    """
    return [word for word in text.split() if word.isalpha()]

# Reload the raw rows (data_file_path comes from an earlier cell), rebuild the
# 80/20 split with the pre_process defined in this cell, then run 10-fold
# cross-validation on the training share only.
load_data(data_file_path)

split_and_preprocess_data(0.8)

cross_validate(train_data, 10)



After folding throughout cross-val process, the average score of precision - recall - f1score - accuracy is: 


(0.560150056869122, 0.5609365613360567, 0.5600316778462293, 0.5609365613360567)

**Removing non-text features does not improve the score, so we will skip this**

## Tokenizing by str split and remove stopwords (too common words)

In [56]:
# Reset the module-level lists that load_data and split_and_preprocess_data
# append to, so this run does not accumulate rows from earlier cells.
raw_data = []
train_data = []
test_data = []

from nltk.corpus import stopwords

# Input: a string of one statement
# Built once instead of on every call: the set was previously rebuilt from
# stopwords.words('english') for each statement passed to pre_process.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

def pre_process(text):
    """Split on whitespace and drop English stop words.

    The stop-word test is case-insensitive, but the surviving tokens keep
    their original letter case.

    Args:
        text: one statement as a string.

    Returns:
        list of tokens whose lowercase form is not an English stop word.
    """
    return [word for word in text.split() if word.lower() not in _ENGLISH_STOPWORDS]

data_file_path = 'fake_news.tsv'

# Reload the raw rows, rebuild the 80/20 split with the pre_process defined in
# this cell, then run 10-fold cross-validation on the training share only.
load_data(data_file_path)

split_and_preprocess_data(0.8)

cross_validate(train_data, 10)



After folding throughout cross-val process, the average score of precision - recall - f1score - accuracy is: 


(0.5643471719646004,
 0.5641048900636789,
 0.5639305289758509,
 0.5641048900636789)

**With stopword removal, the performance does not improve, so we will not keep this**

## Tokenizing by str split + normalise words (lowercase)

In [65]:
# Reset the module-level lists that load_data and split_and_preprocess_data
# append to, so this run does not accumulate rows from earlier cells.
raw_data = []
train_data = []
test_data = []


# Input: a string of one statement
def pre_process(text):
    """Split on whitespace and lowercase every token.

    Args:
        text: one statement as a string.

    Returns:
        list of lowercased tokens, in their original order.
    """
    return [word.lower() for word in text.split()]

data_file_path = 'fake_news.tsv'

# Reload the raw rows, rebuild the 80/20 split with the pre_process defined in
# this cell, then run 10-fold cross-validation on the training share only.
load_data(data_file_path)

split_and_preprocess_data(0.8)

cross_validate(train_data, 10)



After folding throughout cross-val process, the average score of precision - recall - f1score - accuracy is: 


(0.5696076277893829, 0.569471945212063, 0.569173100600496, 0.569471945212063)

**Normalization improves the model performance, incorporate this into pre-processing stage**

## Tokenizing by str split and normalization + lemmatise words

In [61]:
# Reset the module-level lists that load_data and split_and_preprocess_data
# append to, so this run does not accumulate rows from earlier cells.
raw_data = []
train_data = []
test_data = []

from nltk.stem import WordNetLemmatizer

# Input: a string of one statement
def pre_process(text):
    """Split on whitespace, lowercase each token, then lemmatize it.

    Args:
        text: one statement as a string.

    Returns:
        list with WordNetLemmatizer().lemmatize applied to every lowercased token.
    """
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(word.lower()) for word in text.split()]

data_file_path = 'fake_news.tsv'

# Reload the raw rows, rebuild the 80/20 split with the pre_process defined in
# this cell, then run 10-fold cross-validation on the training share only.
load_data(data_file_path)

split_and_preprocess_data(0.8)

cross_validate(train_data, 10)



After folding throughout cross-val process, the average score of precision - recall - f1score - accuracy is: 


(0.5680802033679234,
 0.5682836717529736,
 0.5678002929891937,
 0.5682836717529737)

**Lemmatization does not improve score, hence not including it in the model**

## Tokenizing by str split and normalization + stem words

In [62]:
# Reset the module-level lists that load_data and split_and_preprocess_data
# append to, so this run does not accumulate rows from earlier cells.
raw_data = []
train_data = []
test_data = []

from nltk import PorterStemmer

# Input: a string of one statement
def pre_process(text):
    """Split on whitespace, lowercase each token, then stem it.

    Args:
        text: one statement as a string.

    Returns:
        list with PorterStemmer().stem applied to every lowercased token.
    """
    porter = PorterStemmer()
    return [porter.stem(word.lower()) for word in text.split()]

data_file_path = 'fake_news.tsv'

# Reload the raw rows, rebuild the 80/20 split with the pre_process defined in
# this cell, then run 10-fold cross-validation on the training share only.
load_data(data_file_path)

split_and_preprocess_data(0.8)

cross_validate(train_data, 10)



After folding throughout cross-val process, the average score of precision - recall - f1score - accuracy is: 


(0.5658962877881699,
 0.5645914934518803,
 0.5649028886590568,
 0.5645914934518803)

**Stemming does not improve score, hence not including it in the model**

## Try to incorporate every method above: tokenizing by str split, removing stopwords (too common words), and normalising/lemmatising/stemming words

In [63]:
# Reset the module-level lists that load_data and split_and_preprocess_data
# append to, so this run does not accumulate rows from earlier cells.
raw_data = []
train_data = []
test_data = []

from nltk.corpus import stopwords
from nltk import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Input: a string of one statement
# Built once instead of on every call: the set was previously rebuilt from
# stopwords.words('english') for each statement passed to pre_process.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

def pre_process(text):
    """Combine all normalisation steps tried so far.

    The statement is split on whitespace; tokens whose lowercase form is an
    English stop word are dropped; the rest are lowercased, stemmed with
    PorterStemmer and finally lemmatized with WordNetLemmatizer.

    Args:
        text: one statement as a string.

    Returns:
        list of the normalised tokens, in their original order.
    """
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    tokens_list = []
    for word in text.split():
        lowered = word.lower()
        if lowered in _ENGLISH_STOPWORDS:
            continue
        # Stemming runs before lemmatization, as in the original experiment.
        tokens_list.append(lemmatizer.lemmatize(stemmer.stem(lowered)))
    return tokens_list

data_file_path = 'fake_news.tsv'

# Reload the raw rows, rebuild the 80/20 split with the pre_process defined in
# this cell, then run 10-fold cross-validation on the training share only.
load_data(data_file_path)

split_and_preprocess_data(0.8)

cross_validate(train_data, 10)



After folding throughout cross-val process, the average score of precision - recall - f1score - accuracy is: 


(0.5616198497963472,
 0.5606902559173376,
 0.5607683748550507,
 0.5606902559173376)

**With all methods included, the performance of the model still does not improve; hence, the best preprocessing method so far is:**
- *Tokenizing by the traditional str.split(), using any whitespace as separator (default setting)*
- *Implementing normalization, converting all tokens into lowercase form*

# LinearSVC tuning

## Go with optimal tokenizing method + Tune the regularizing cost model hyperparameter C

In [66]:
from sklearn.model_selection import GridSearchCV

# Coarse search over the regularisation strength C, spanning several orders of magnitude.
parameters = {'C': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-10, 0.25, 0.5, 1, 2, 2.5, 5, 10, 20, 25, 30, 50, 100]}
svc =  LinearSVC(max_iter=100000) # raised max_iter to avoid convergence warnings
svc_cv = GridSearchCV(svc, parameters, cv = 10) # 10-fold search over the grid above
# train_data holds (feature_dict, label) tuples, so the search is fitted through
# nltk's SklearnClassifier wrapper rather than by calling svc_cv.fit directly.
# NOTE(review): train_data is whatever the most recently executed split cell
# produced — confirm it is the lowercase-tokenizer split.
SklearnClassifier(svc_cv).train(train_data)
print('Best hyperparameter setting: {0}.'.format(svc_cv.best_estimator_))
print('Average accuracy across folds of best hyperparameter setting: {0}.'.format(svc_cv.best_score_))

Best hyperparameter setting: LinearSVC(C=0.01, max_iter=100000).
Average accuracy across folds of best hyperparameter setting: 0.6058343012001549.


In [74]:
# Try again with more detailed C within a suitable range

# Finer search: 200 evenly spaced C values between 0.1 and 0.001.
parameters = {'C': np.linspace(1e-1, 1e-3, 200).tolist()}
svc =  LinearSVC(max_iter=100000)
svc_cv = GridSearchCV(svc, parameters, cv = 10)
# train_data holds (feature_dict, label) tuples, so the search is fitted through
# nltk's SklearnClassifier wrapper rather than by calling svc_cv.fit directly.
SklearnClassifier(svc_cv).train(train_data)
print('Best hyperparameter setting: {0}.'.format(svc_cv.best_estimator_))
print('Average accuracy across folds of best hyperparameter setting: {0}.'.format(svc_cv.best_score_))

Best hyperparameter setting: LinearSVC(C=0.007964824120603023, max_iter=100000).
Average accuracy across folds of best hyperparameter setting: 0.608519461568242.


**With the optimal C hyperparameter, the model has significantly improved**

## Go with optimal tokenizing method + Use the optimal model hyperparameter C + set class_weight hyperparameter to 'balanced' 

In [75]:
# Try with balanced class _weight

# Single-point "grid": the C found above, used only to obtain a 10-fold score
# for the class_weight='balanced' variant.
parameters = {'C': [0.007964824120603023], 'max_iter': [100000]}
svc =  LinearSVC(class_weight='balanced')
svc_cv = GridSearchCV(svc, parameters, cv = 10)
# train_data holds (feature_dict, label) tuples, so the search is fitted through
# nltk's SklearnClassifier wrapper rather than by calling svc_cv.fit directly.
SklearnClassifier(svc_cv).train(train_data)
print('Best hyperparameter setting: {0}.'.format(svc_cv.best_estimator_))
print('Average accuracy across folds of best hyperparameter setting: {0}.'.format(svc_cv.best_score_))

Best hyperparameter setting: LinearSVC(C=0.007964824120603023, class_weight='balanced', max_iter=100000).
Average accuracy across folds of best hyperparameter setting: 0.5996091307066916.


In [78]:
#applied best parameter gained from GridSearchCV above: C = 0.007964824120603023, with optimal preprocessing method
# Reset the module-level lists that load_data and split_and_preprocess_data
# append to, so this run does not accumulate rows from earlier cells.
raw_data = []
train_data = []
test_data = []


# Input: a string of one statement
def pre_process(text):
    """Chosen tokenizer: whitespace split followed by lowercasing.

    Args:
        text: one statement as a string.

    Returns:
        list of lowercased tokens, in their original order.
    """
    return list(map(str.lower, text.split()))

def train_classifier(data):
    """Train a LinearSVC using the C value selected by the grid search.

    Args:
        data: list of (feature_dict, label) tuples.

    Returns:
        The result of SklearnClassifier(...).train(data), used elsewhere in
        this notebook as the trained classifier.
    """
    tuned_svc = LinearSVC(C = 0.007964824120603023)
    return SklearnClassifier(Pipeline([('svc', tuned_svc)])).train(data)

data_file_path = 'fake_news.tsv'

# Reload the raw rows, rebuild the 80/20 split, then run 10-fold
# cross-validation on the training share with the train_classifier of this cell.
load_data(data_file_path)

split_and_preprocess_data(0.8)

cross_validate(train_data, 10)



After folding throughout cross-val process, the average score of precision - recall - f1score - accuracy is: 


(0.5994161458195213,
 0.6057419199807762,
 0.5959581834212833,
 0.6057419199807762)

**Balanced class_weight seems not to be a good choice**

# Feature extract

## Try adding controls on feature selection: minimum token frequency of 2, with the most recent pre_process and the optimal model hyperparameter C
The features are currently unigrams, which have a long-tailed distribution; hence, setting a minimum token frequency is worth considering.

In [88]:
# set min frequency 
global_feature_dict = {} # token counts accumulated over every call to to_feature_vector

def to_feature_vector(tokens):
    """Bag-of-words features with a minimum-frequency filter.

    Every token is counted both in global_feature_dict (side effect) and in
    a per-statement dictionary. Tokens whose global count is below 2 are
    then left out of the result.

    The filter uses the running global count at the time of the call, so a
    token is only dropped from statements processed before its second
    occurrence overall; later statements keep it.

    Args:
        tokens: iterable of tokens for one statement.

    Returns:
        dict mapping the retained tokens to their count within tokens.
    """
    feature_dict = {}
    for token in tokens:
        for counts in (global_feature_dict, feature_dict):
            counts[token] = float(counts[token] + 1) if token in counts else 1
    return {token: weight for token, weight in feature_dict.items()
            if global_feature_dict[token] >= 2}

In [89]:
#applied best parameter gained from GridSearchCV above
# Reset the module-level lists that load_data and split_and_preprocess_data
# append to, so this run does not accumulate rows from earlier cells.
raw_data = []
train_data = []
test_data = []


# Input: a string of one statement
def pre_process(text):
    """Chosen tokenizer: whitespace split followed by lowercasing.

    Args:
        text: one statement as a string.

    Returns:
        list of lowercased tokens, in their original order.
    """
    return [word.lower() for word in text.split()]

def train_classifier(data):
    """Train a LinearSVC using the C value selected by the grid search.

    Args:
        data: list of (feature_dict, label) tuples.

    Returns:
        The result of SklearnClassifier(...).train(data), used elsewhere in
        this notebook as the trained classifier.
    """
    tuned_svc = LinearSVC(C = 0.007964824120603023)
    return SklearnClassifier(Pipeline([('svc', tuned_svc)])).train(data)

data_file_path = 'fake_news.tsv'

# Reload the raw rows, rebuild the 80/20 split with the current
# to_feature_vector, then run 10-fold cross-validation on the training share.
load_data(data_file_path)

split_and_preprocess_data(0.8)

cross_validate(train_data, 10)



After folding throughout cross-val process, the average score of precision - recall - f1score - accuracy is: 


(0.601818669157328, 0.6078114862429412, 0.5980890168376389, 0.6078114862429412)

**The score improves, so we keep the minimum token frequency of 2**

## Try adding length of the sentence, with most updated preprocess/optimal model hyperparameter C/min.frequency >2

In [82]:
global_feature_dict = {} # token counts accumulated over every call to to_feature_vector

def to_feature_vector(tokens):
    """Bag-of-words features with a minimum-frequency filter plus statement length.

    Every token is counted both in global_feature_dict (side effect) and in
    a per-statement dictionary. Tokens whose running global count is below 2
    at the time of the call are left out. The number of tokens is then
    stored under the key 'sent_len' (overwriting a token of that name).

    Args:
        tokens: sized collection of tokens for one statement.

    Returns:
        dict mapping retained tokens to their count, plus 'sent_len'.
    """
    local_counts = {}
    for token in tokens:
        for counts in (global_feature_dict, local_counts):
            counts[token] = float(counts[token] + 1) if token in counts else 1
    feature_dict = {token: weight for token, weight in local_counts.items()
                    if global_feature_dict[token] >= 2}
    feature_dict['sent_len'] = float(len(tokens))
    return feature_dict

In [85]:
#applied best parameter gained from GridSearchCV above
# Reset the module-level lists that load_data and split_and_preprocess_data
# append to, so this run does not accumulate rows from earlier cells.
raw_data = []
train_data = []
test_data = []


def pre_process(text):
    """Chosen tokenizer: whitespace split followed by lowercasing.

    Args:
        text: one statement as a string.

    Returns:
        list of lowercased tokens, in their original order.
    """
    return list(map(str.lower, text.split()))

def train_classifier(data):
    """Train a LinearSVC using the tuned C and a raised iteration limit.

    Args:
        data: list of (feature_dict, label) tuples.

    Returns:
        The result of SklearnClassifier(...).train(data), used elsewhere in
        this notebook as the trained classifier.
    """
    tuned_svc = LinearSVC(C = 0.007964824120603023, max_iter=100000)
    return SklearnClassifier(Pipeline([('svc', tuned_svc)])).train(data)

data_file_path = 'fake_news.tsv'

# Reload the raw rows, rebuild the 80/20 split with the current
# to_feature_vector, then run 10-fold cross-validation on the training share.
load_data(data_file_path)

split_and_preprocess_data(0.8)

cross_validate(train_data, 10)



After folding throughout cross-val process, the average score of precision - recall - f1score - accuracy is: 


(0.5997576306801301, 0.6059870239096479, 0.596319947846663, 0.6059870239096479)

**The result does not improve after adding the text length as a new feature, so we do not add the sentence length**

## Try adding count of non-text word, with most updated preprocess/optimal model hyperparameter C/min.frequency >2

In [90]:
global_feature_dict = {} # token counts accumulated over every call to to_feature_vector

def to_feature_vector(tokens):
    """Bag-of-words features with a minimum-frequency filter plus a non-text count.

    Every token is counted both in global_feature_dict (side effect) and in
    a per-statement dictionary. Tokens whose running global count is below 2
    at the time of the call are left out. The number of tokens for which
    str.isalpha() is False (counted over all tokens, including filtered
    ones) is stored under the key 'nontext_count'.

    Args:
        tokens: iterable of string tokens for one statement.

    Returns:
        dict mapping retained tokens to their count, plus 'nontext_count'.
    """
    local_counts = {}
    nontext_count = 0
    for token in tokens:
        for counts in (global_feature_dict, local_counts):
            counts[token] = float(counts[token] + 1) if token in counts else 1
        if token.isalpha():
            continue
        nontext_count += 1
    feature_dict = {token: weight for token, weight in local_counts.items()
                    if global_feature_dict[token] >= 2}
    feature_dict['nontext_count'] = nontext_count
    return feature_dict

In [91]:
#applied best parameter gained from GridSearchCV above
# Reset the module-level lists that load_data and split_and_preprocess_data
# append to, so this run does not accumulate rows from earlier cells.
raw_data = []
train_data = []
test_data = []


def pre_process(text):
    """Chosen tokenizer: whitespace split followed by lowercasing.

    Args:
        text: one statement as a string.

    Returns:
        list of lowercased tokens, in their original order.
    """
    return [word.lower() for word in text.split()]

def train_classifier(data):
    """Train a LinearSVC using the tuned C and a raised iteration limit.

    Args:
        data: list of (feature_dict, label) tuples.

    Returns:
        The result of SklearnClassifier(...).train(data), used elsewhere in
        this notebook as the trained classifier.
    """
    tuned_svc = LinearSVC(C = 0.007964824120603023, max_iter=100000)
    return SklearnClassifier(Pipeline([('svc', tuned_svc)])).train(data)

data_file_path = 'fake_news.tsv'

# Reload the raw rows, rebuild the 80/20 split with the current
# to_feature_vector, then run 10-fold cross-validation on the training share.
load_data(data_file_path)

split_and_preprocess_data(0.8)

cross_validate(train_data, 10)



After folding throughout cross-val process, the average score of precision - recall - f1score - accuracy is: 


(0.6020411143379807, 0.6080517842124233, 0.598107828827725, 0.6080517842124233)

**After adding the non-text count, the model is slightly better than before, so I decided to keep this feature**

## After testing around, below is what has been successfully tested and has helped to improve the score
1. Tokenizing by the traditional str.split(), with separator = any whitespace
2. Token normalization
3. Adding minimum frequency for token: min = 2
4. In the model hyperparameters, setting the regularizing cost hyperparameter C = 0.007964824120603023 (optimal C after tuning with GridSearchCV)
5. Adding non-text count as a feature

# Final model with optimal preprocessing method after testing different scenarios

In [98]:
#applied best parameter gained from GridSearchCV above
# Reset the module-level lists that load_data and split_and_preprocess_data
# append to, so the final run does not accumulate rows from earlier cells.
raw_data = []
train_data = []
test_data = []


def pre_process(text): # optimal preprocess: str.split() + token normalization
    """Final tokenizer: whitespace split followed by lowercasing.

    Args:
        text: one statement as a string.

    Returns:
        list of lowercased tokens, in their original order.
    """
    return list(map(str.lower, text.split()))

def train_classifier(data): # optimal version: set C ~ 0.008
    """Train the final LinearSVC (tuned C, raised iteration limit).

    Args:
        data: list of (feature_dict, label) tuples.

    Returns:
        The result of SklearnClassifier(...).train(data), used elsewhere in
        this notebook as the trained classifier.
    """
    tuned_svc = LinearSVC(C = 0.007964824120603023, max_iter=100000)
    final_pipeline = Pipeline([('svc', tuned_svc)])
    return SklearnClassifier(final_pipeline).train(data)

global_feature_dict = {} # token counts accumulated over every call to to_feature_vector

def to_feature_vector(tokens): # optimal feature extract: set min token frequency and add non-text count as new feature
    """Final feature extractor: filtered bag of words plus a non-text count.

    Every token is counted both in global_feature_dict (side effect) and in
    a per-statement dictionary. Tokens whose running global count is below 2
    at the time of the call are removed. The number of tokens for which
    str.isalpha() is False (counted over all tokens) is stored under the
    key 'nontext_count'.

    Args:
        tokens: iterable of string tokens for one statement.

    Returns:
        dict mapping retained tokens to their count, plus 'nontext_count'.
    """
    feature_dict = {}
    nontext_count = 0
    for token in tokens:
        global_feature_dict[token] = (
            float(global_feature_dict[token] + 1) if token in global_feature_dict else 1
        )
        feature_dict[token] = float(feature_dict[token] + 1) if token in feature_dict else 1
        if token.isalpha():
            continue
        nontext_count += 1
    # Collect the rare tokens first, then drop them, so the dictionary is not
    # modified while it is being iterated.
    rare_tokens = [token for token in feature_dict if global_feature_dict[token] < 2]
    for token in rare_tokens:
        del feature_dict[token]
    feature_dict['nontext_count'] = nontext_count
    return feature_dict

data_file_path = 'fake_news.tsv'

# Reload the raw rows, rebuild the 80/20 split with the final pre_process and
# to_feature_vector, then run 10-fold cross-validation on the training share.
load_data(data_file_path)

split_and_preprocess_data(0.8)

cross_validate(train_data, 10)



After folding throughout cross-val process, the average score of precision - recall - f1score - accuracy is: 


(0.6020411143379807, 0.6080517842124233, 0.598107828827725, 0.6080517842124233)

# Apply the model on test set to evaluate the performance

In [97]:
# Finally, check the accuracy of your classifier by training on all the traning data
# and testing on the test set
# Will only work once all functions are complete
# NOTE(review): this cell's execution count (97) is lower than that of the
# final-model cell (98), so the recorded output presumably came from an
# earlier kernel state — confirm with Restart & Run All.
functions_complete = True  # set to True once you're happy with your methods for cross val
if functions_complete:
    print(test_data[0])   # have a look at the first test data instance
    classifier = train_classifier(train_data)  # train on the whole training share (no fold held out)
    test_true = [t[1] for t in test_data]   # ground-truth labels: second element of each (features, label) tuple
    test_pred = predict_labels([x[0] for x in test_data], classifier)  # predictions from the feature dicts
    # Unlike cross_validate, zero_division is not passed here.
    final_scores = precision_recall_fscore_support(test_true, test_pred, average='weighted') # evaluate
    print("Done training!")
    # Only the first three entries (precision, recall, F-score) are formatted.
    print("Precision: %f\nRecall: %f\nF Score:%f" % final_scores[:3])

({'we': 1, 'have': 1, 'invested': 1, 'record': 1, 'funding': 1, 'in': 1, 'protecting': 1, 'our': 1, 'environment.': 1, 'nontext_count': 1}, 'FAKE')
Done training!
Precision: 0.601904
Recall: 0.605469
F Score:0.596231
