In [20]:
# pos tagging for fake news statement
## use pos-tagging to build features
import pandas as pd
import numpy as np
from pycorenlp import StanfordCoreNLP
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
import nltk

# import utility libraries
import util
import preprocessing
import importlib
import string

# libraries for model testing and selection
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn import linear_model
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

In [5]:
import nlp_util

In [6]:
# Re-import the already-imported preprocessing module so that edits made to
# preprocessing.py on disk take effect without restarting the kernel.
importlib.reload(preprocessing)

NLP_Task ready to use.


<module 'preprocessing' from 'D:\\UIC\\Fall 2018\\Statistical NLP\\Project\\jurat-aldo-project\\cs-521-project.git\\source\\CS-521-PROJECT\\preprocessing.py'>

# Load training files

In [7]:
# Fetch the three dataset file references, then parse the training and the
# validation TSV (in that order). te_file -- presumably the test split -- is
# unpacked here but not parsed in this cell.
tr_file, va_file, te_file = util.load_files()
tr_dict, va_dict = (
    util.tsv_to_dict(tsv_file=tsv_path) for tsv_path in (tr_file, va_file)
)

# Preprocess data

In [8]:
# Build three cumulative cleaning variants of the statements. Each variant is
# computed for the training split first and the validation split second.

# Variant 1: punctuation removed.
tr_data_no_punctuation, va_data_no_punctuation = (
    preprocessing.clean_text(sentences=split['statement'], remove_punctuation=True)
    for split in (tr_dict, va_dict)
)

# Variant 2: punctuation removed and text lower-cased.
tr_data_no_punct_upper, va_data_no_punct_upper = (
    preprocessing.clean_text(sentences=split['statement'], remove_punctuation=True, lower_case=True)
    for split in (tr_dict, va_dict)
)

# Variant 3: punctuation removed, text lower-cased and stop words filtered.
tr_data_no_punct_upper_stopw, va_data_no_punct_upper_stopw = (
    preprocessing.clean_text(sentences=split['statement'], remove_punctuation=True, lower_case=True, stop_words=True)
    for split in (tr_dict, va_dict)
)

# Extracting POS tags grouped by unigrams, bigrams and trigrams

In [10]:
# Every extract_POS call yields a (unigram, bigram, trigram) triple. Each
# statement below runs it for the training data first and the validation data
# second, so the progress messages appear in the same order as before.

# Raw statements (no preprocessing).
(unigram_pos, bigrams_pos, trigram_pos), (unigram_pos_va, bigrams_pos_va, trigram_pos_va) = (
    preprocessing.extract_POS(statements)
    for statements in (tr_dict['statement'], va_dict['statement'])
)

# Punctuation removed.
(unigram_pos_no_p, bigrams_pos_no_p, trigram_pos_no_p), (unigram_pos_no_p_va, bigrams_pos_no_p_va, trigram_pos_no_p_va) = (
    preprocessing.extract_POS(statements)
    for statements in (tr_data_no_punctuation, va_data_no_punctuation)
)

# Punctuation removed, lower-cased.
(unigram_pos_no_p_u, bigrams_pos_no_p_u, trigram_pos_no_p_u), (unigram_pos_no_p_u_va, bigrams_pos_no_p_u_va, trigram_pos_no_p_u_va) = (
    preprocessing.extract_POS(statements)
    for statements in (tr_data_no_punct_upper, va_data_no_punct_upper)
)

# Punctuation removed, lower-cased, stop words filtered.
(unigram_pos_no_p_u_sw, bigrams_pos_no_p_u_sw, trigram_pos_no_p_u_sw), (unigram_pos_no_p_u_sw_va, bigrams_pos_no_p_u_sw_va, trigram_pos_no_p_u_sw_va) = (
    preprocessing.extract_POS(statements)
    for statements in (tr_data_no_punct_upper_stopw, va_data_no_punct_upper_stopw)
)


Extracting POS Tags
Finished
Extracting POS Tags
Finished
Extracting POS Tags
Finished
Extracting POS Tags
Finished
Extracting POS Tags
Finished
Extracting POS Tags
Finished
Extracting POS Tags
Finished
Extracting POS Tags
Finished


In [11]:
# Reload nlp_util so on-disk edits are picked up, then create the NLP_Task
# instance whose UniquePosTags method is used by the following cell.
importlib.reload(nlp_util)
nlp_task = nlp_util.NLP_Task()

NLP_Task ready to use.


In [12]:
# Unique unigram POS tags per preprocessing variant. Each statement produces the
# (training, validation) pair for one variant, training first.

# Raw statements.
unigram_list_tr, unigram_list_va = (
    nlp_task.UniquePosTags(tags) for tags in (unigram_pos, unigram_pos_va)
)

# Punctuation removed.
unigram_list_tr_no_p, unigram_list_va_no_p = (
    nlp_task.UniquePosTags(tags) for tags in (unigram_pos_no_p, unigram_pos_no_p_va)
)

# Punctuation removed, lower-cased.
unigram_list_tr_no_p_u, unigram_list_va_no_p_u = (
    nlp_task.UniquePosTags(tags) for tags in (unigram_pos_no_p_u, unigram_pos_no_p_u_va)
)

# Punctuation removed, lower-cased, stop words filtered.
unigram_list_tr_no_p_u_sw, unigram_list_va_no_p_u_sw = (
    nlp_task.UniquePosTags(tags) for tags in (unigram_pos_no_p_u_sw, unigram_pos_no_p_u_sw_va)
)

# Create new labels grouping different classes in the dataset

In [13]:
# Single source of truth for collapsing the six truthfulness labels:
#   1  -> 'false', 'pants-fire', 'barely-true'
#  -1  -> 'true', 'mostly-true'
#   0  -> 'half-true' (filtered out below)
# Defined once so the training and validation labels cannot drift apart.
LABEL_VALUES = {
    'false': 1,
    'true': -1,
    'pants-fire': 1,
    'barely-true': 1,
    'half-true': 0,
    'mostly-true': -1,
}

binary_labels_tr = np.array(preprocessing.create_labels(labels=tr_dict['label'], label_values=LABEL_VALUES))
binary_labels_va = np.array(preprocessing.create_labels(labels=va_dict['label'], label_values=LABEL_VALUES))

# Positions of the statements whose collapsed label is non-zero; these index
# lists are later used to select the rows and labels of the binary task.
tr_indexes = [position for position, label in enumerate(binary_labels_tr) if label != 0]
va_indexes = [position for position, label in enumerate(binary_labels_va) if label != 0]

In [19]:
# use CountVectorizer to turn the POS-tag output into features for training the ML models
## uni_vocabulary: vocabulary=['cd','jjr','vb','jjs','nnp','in','vbp','to','rb','vbg','md','jj','dt']
## bi_vocabulary: vocabulary=['NNP NNP', 'IN DT', 'JJR IN', 'CD NN', 'IN CD', 'CD NNS', 'DT JJS']
## tri_vocabulary: ['NNP NNP NNP','CD NN IN','VBZ NNP NNP','IN DT NN','IN DT JJ','NN IN DT','<s> VBZ NNP','JJR IN CD','NNP NNP VBD','DT JJ CD','NNS IN DT']

# Train machine learning models on differently preprocessed data

In [65]:
# Start with two independent, empty pruned-feature lists.
pruned_features_1, pruned_features_2 = [], []

In [84]:
# NOTE: this cell calls PreparePOSDataForTraining and DecisionTreeFeaturesSelector,
# which are defined in cells further down the notebook -- run those cells first.

# One entry per preprocessing variant, kept in the same order in every list so
# that each training set stays paired with its validation set, its title and the
# tag vocabulary extracted from that same training variant.
# (The bigrams_pos* lists can be substituted for the unigram ones in the same way.)
training_data = [unigram_pos, unigram_pos_no_p, unigram_pos_no_p_u, unigram_pos_no_p_u_sw]
testing_data = [unigram_pos_va, unigram_pos_no_p_va, unigram_pos_no_p_u_va, unigram_pos_no_p_u_sw_va]
vocabularies = [unigram_list_tr, unigram_list_tr_no_p, unigram_list_tr_no_p_u, unigram_list_tr_no_p_u_sw]
testing_title = ['RAW DATA', 'NO PUNCTUATION', 'NO PUNCTUATION LOWER CASE', 'NO PUNCTUATION LC STOP WORDS']

# Only the 'NO PUNCTUATION' variant is evaluated; widen this slice to run more.
selected = slice(1, 2)
trda = training_data[selected]
teda = testing_data[selected]
tetil = testing_title[selected]
vocabs = vocabularies[selected]

# The labels do not depend on how the tags are vectorised, so slice them once.
Ytr = binary_labels_tr[tr_indexes]
Yte = binary_labels_va[va_indexes]

# (banner printed before the run, value passed as use_built_in_vectors)
vectorizer_modes = [
    ("Testing with user defined vectors", False),
    ("Testing with sklearn built in vectors", True),
]

for title, tr_tags, te_tags, vocabulary in zip(tetil, trda, teda, vocabs):
    print("TRAINING POSTags:", title)
    for banner, use_built_in in vectorizer_modes:
        print(banner)
        Xtr, Xte = PreparePOSDataForTraining(
            tr_tags,
            te_tags,
            use_built_in_vectors=use_built_in,
            user_defined_vocabulary=vocabulary,
        )
        # Keep only the rows that take part in the binary task.
        Xtr = Xtr[tr_indexes]
        Xte = Xte[va_indexes]
        # Alternatives for this step: FeatureSelector(Xtr, Xte, Ytr, Yte, vocabulary, max_depth=6, threshold=0.00001)
        # or TrainModels(Xtr, Xte, Ytr, Yte).
        DecisionTreeFeaturesSelector(Xtr, Xte, Ytr, Yte)
    

TRAINING POSTags: NO PUNCTUATION
Testing with user defined vectors
Accuracy:  0.6438223938223938
             precision    recall  f1-score   support

         -1       0.59      0.42      0.49       420
          1       0.67      0.80      0.73       616

avg / total       0.63      0.64      0.63      1036

VBZ 0.03675686719754296
DT 0.029988763672937353
NNPS 0.029744948243489285
VBP 0.035678250862883995
JJ 0.01673452984379914
IN 0.034637454287928984
WRB 0.008787453029583624
VBD 0.019837055476423143
PRP 0.01720870742459847
RP 0.00309903014464449
WDT 0.015923495335051833
VB 0.09304912862253181
NNP 0.06951052888157801
VBG 0.04064254276118422
PRP$ 0.005857735168362604
VBN 0.02718188325028323
CD 0.22206490656861025
RB 0.07659126256463986
WP 0.004375101380674572
JJS 0.07049077766738678
JJR 0.1096812091659285
EX 0.0175591632631623
RBS 0.00695613784919607
FW 0.003139243335096094
LS 0.004503824002482545
['VBZ', 'DT', 'NNPS', 'VBP', 'JJ', 'IN', 'WRB', 'VBD', 'PRP', 'RP', 'WDT', 'VB', 'NNP', 

In [43]:
def DecisionTreeFeaturesSelector(Xtr, Xte, Ytr, Yte, max_depths=range(1, 10), random_state=None):
    """Sweep decision-tree depths and print the test performance of each tree.

    For every depth a fresh DecisionTreeClassifier is fitted on (Xtr, Ytr) and
    evaluated on (Xte, Yte); the depth, the accuracy and the
    classification report are printed. Nothing is returned.

    Parameters
    ----------
    Xtr, Xte : feature matrices handed to ``fit`` / ``predict``.
    Ytr, Yte : label vectors; ``Yte`` is compared element-wise with the
        predictions, so it should be a NumPy array.
    max_depths : iterable of int, optional
        Depths to try. Defaults to 1..9, the previously hard-coded sweep.
    random_state : optional
        Forwarded to DecisionTreeClassifier so a run can be reproduced.
        ``None`` (the default) keeps scikit-learn's unseeded behaviour.
    """
    for depth in max_depths:
        print('max_depth', depth)
        clf = DecisionTreeClassifier(max_depth=depth, random_state=random_state)
        clf.fit(Xtr, Ytr)
        pred = clf.predict(Xte)
        print('Accuracy: ', np.mean(Yte == pred))
        print(classification_report(Yte, pred))

In [83]:
def FeatureSelector(Xtr, Xte, Ytr, Yte,feature_list,max_depth = 1,threshold=0.1, random_state=None):
    """Fit one decision tree and split the features by their importance.

    The tree is fitted on (Xtr, Ytr) and evaluated on (Xte, Yte); accuracy and
    the classification report are printed. Every feature whose importance is
    ``>= threshold`` is printed together with its importance, followed by the
    list of all such feature names.

    Parameters
    ----------
    Xtr, Xte : feature matrices handed to ``fit`` / ``predict``.
    Ytr, Yte : label vectors; ``Yte`` is compared element-wise with the
        predictions, so it should be a NumPy array.
    feature_list : sequence
        Name of each feature column, indexed by column position.
    max_depth : int
        Depth limit of the tree.
    threshold : float
        Minimum importance for a feature to be kept.
    random_state : optional
        Forwarded to DecisionTreeClassifier for reproducible trees; ``None``
        keeps the unseeded default.

    Returns
    -------
    numpy boolean array
        ``True`` for the features that were NOT kept (importance strictly below
        ``threshold``). This is the exact complement of the printed list, so a
        feature sitting exactly on the threshold is no longer reported as both
        kept and pruned.
    """
    clf = DecisionTreeClassifier(max_depth=max_depth, random_state=random_state)
    clf.fit(Xtr, Ytr)
    pred = clf.predict(Xte)
    print('Accuracy: ', np.mean(Yte == pred))
    print(classification_report(Yte, pred))

    importances = clf.feature_importances_
    kept_positions = np.flatnonzero(importances >= threshold)

    pos_Tags = []
    for position in kept_positions:
        pos_Tags.append(feature_list[position])
        print(feature_list[position], importances[position])
    print(pos_Tags)

    # Strict comparison: the pruned mask must not overlap the kept features.
    return importances < threshold
        
    

# Function to vectorize data

In [28]:
#use_built_in_vectors => if True use CountVectorizer, otherwise it uses the version of pos_vectors in preprocessing
def PreparePOSDataForTraining(training_data, testing_data, use_built_in_vectors = False, user_defined_vocabulary=None):
    """Vectorise the POS-tag sequences of the training and the testing set.

    Parameters
    ----------
    training_data, testing_data : iterables of tag sequences
        In the CountVectorizer path every sequence is joined with spaces, so its
        items must be strings.
    use_built_in_vectors : bool
        Falsy: use ``preprocessing.pos_vectors``. Truthy: use a binary
        scikit-learn ``CountVectorizer``.
    user_defined_vocabulary : optional
        Falsy path: passed to ``pos_vectors`` as ``vector_dictionary``; the
        dictionary it returns for the training data is reused for the testing
        data. Truthy path: lower-cased and used as the fixed vocabulary; when
        ``None`` the vocabulary is learned from the training data.

    Returns
    -------
    (tr_feats, te_feats)
        Dense arrays in the CountVectorizer path; otherwise whatever
        ``preprocessing.pos_vectors`` returns.
    """
    if not use_built_in_vectors:
        tr_feats, returned_dict = preprocessing.pos_vectors(
            training_data, vector_dictionary=user_defined_vocabulary, return_dictionary=True
        )
        te_feats = preprocessing.pos_vectors(testing_data, vector_dictionary=returned_dict)
        return tr_feats, te_feats

    vocabulary = None
    if user_defined_vocabulary is not None:
        vocabulary = [tag.lower() for tag in user_defined_vocabulary]

    training_str = [" ".join(tags) for tags in training_data]
    testing_str = [" ".join(tags) for tags in testing_data]

    # NOTE(review): CountVectorizer's default token pattern only keeps tokens made
    # of two or more word characters, so tags such as 'PRP$' or punctuation tags
    # are altered or dropped in this path -- confirm that this is intended.
    vectorizer = CountVectorizer(vocabulary=vocabulary, binary=True)
    tr_feats = vectorizer.fit_transform(training_str).toarray()
    # Encode the testing data with the vectoriser fitted on the training data.
    # This yields the same columns as building a second vectoriser from the
    # feature names, without calling get_feature_names(), which newer
    # scikit-learn releases no longer provide.
    te_feats = vectorizer.transform(testing_str).toarray()
    return tr_feats, te_feats

# Training Different Models 

In [16]:
def TrainModels(Xtrain, Xtest, Ytrain, Ytest, n_splits=10, random_state=None):
    """Cross-validate a fixed set of classifiers and report their test metrics.

    For each model the mean accuracy over a shuffled K-fold split of the
    training data is computed; the model is then refitted on the complete
    training data and a classification report is produced for
    (Xtest, Ytest). All results are printed once every model has run.
    Nothing is returned.

    Parameters
    ----------
    Xtrain, Ytrain : training features / labels. Both are indexed with the
        index arrays produced by ``KFold.split`` (``Xtrain[idx, :]``,
        ``Ytrain[idx]``), so they must support NumPy-style indexing.
    Xtest, Ytest : held-out features / labels for the classification report.
    n_splits : int, optional
        Number of cross-validation folds (default 10, the previous constant).
    random_state : optional
        Seed for the fold shuffling and for every model that accepts a
        ``random_state`` argument (all but MultinomialNB). ``None`` (default)
        keeps the previous unseeded, non-reproducible behaviour.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # NOTE(review): loss='log' is the spelling used by older scikit-learn; newer
    # releases renamed it to 'log_loss' and eventually dropped 'log'. Adjust
    # to match the installed version.
    model_specs = [
        (DecisionTreeClassifier, {'max_depth': 1, 'criterion': 'entropy'}),
        (LogisticRegression, {}),
        (linear_model.SGDClassifier, {'loss': 'log', 'penalty': 'l2', 'max_iter': 1000}),
        (MultinomialNB, {}),
        (RandomForestClassifier, {}),
        (SVC, {}),
    ]

    results = []  # (model name, mean CV accuracy, classification report)
    for Model, param in model_specs:
        if random_state is not None and Model is not MultinomialNB:
            param = dict(param, random_state=random_state)

        total = 0
        for train_indices, test_indices in kf.split(Xtrain):
            reg = Model(**param)
            reg.fit(Xtrain[train_indices, :], Ytrain[train_indices])
            predictions = reg.predict(Xtrain[test_indices, :])
            total += accuracy_score(Ytrain[test_indices], predictions)
        accuracy = total / kf.n_splits

        # Refit on all training data before scoring the held-out set.
        reg = Model(**param)
        reg.fit(Xtrain, Ytrain)
        report = classification_report(Ytest, reg.predict(Xtest))
        results.append((Model.__name__, accuracy, report))

    for name, accuracy, report in results:
        print("Accuracy score of {0}: {1}".format(name, accuracy))
        print("accuracy metrics for {0}:\n{1}".format(name, report))

#print("Accuracy score of {0}: {1}: {2}: {3}: {4}".format(accuracy_list[0], accuracy_list[]))
#print("accuracy metrics for {0}: {1}: {2}: {3}: {4}".format(Model.__name__, classification_report(y_test, predictions)))