# In [7]:
# POS tagging for fake-news statements
## use POS tagging to build features


import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from pycorenlp import StanfordCoreNLP
from sklearn import linear_model
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

## Client handle for the Stanford CoreNLP server expected at this URL
CORENLP_SERVER_URL = 'http://localhost:9000'
nlp = StanfordCoreNLP(CORENLP_SERVER_URL)

# Importing the dataset
# The column names are passed to read_csv together with header=None so that the
# first line of train.tsv is kept as a data row. (With the default header
# handling that line is consumed as a header and then discarded when the column
# names are assigned.)
# NOTE(review): assumes train.tsv has no header line - confirm against the file.
DATASET_COLUMNS = [
    'statement_ID', 'label', 'statement', 'subject', 'speaker', 'job_title',
    'state_info', 'pantry_affiliation', 'barely_true_cnt', 'false_cnt',
    'half_true_cnt', 'mostly_true_cnt', 'pants_on_fire_cnt', 'context',
]
dataset = pd.read_csv('train.tsv', delimiter='\t', encoding='utf-8',
                      header=None, names=DATASET_COLUMNS)


# Only has a visible effect when run as a notebook cell (shows the first rows).
dataset.head()

# Number of rows and the list of column names.
total_rows = len(dataset.index)
cols = list(dataset.columns.values)

# Cleaning the texts
# Every statement is normalised the same way: each non-letter character becomes
# a space, the text is lower-cased, and runs of whitespace are collapsed.
# The column is iterated directly (positionally) instead of being looked up with
# a 0..n-1 counter, so the result does not depend on the index labels.
_NON_LETTER = re.compile('[^a-zA-Z]')

corpus = []
for statement in dataset['statement']:
    # A missing statement (NaN) is not a string; keep an empty text for that row
    # so corpus stays aligned one-to-one with the dataset rows.
    raw_text = '' if pd.isna(statement) else str(statement)
    corpus.append(' '.join(_NON_LETTER.sub(' ', raw_text).lower().split()))

train_data = corpus
## define key-value
# Collect the fields used downstream under short keys.
# The metadata is read from `dataset`, using the column names assigned when the
# file was loaded: train_data is a plain list of cleaned strings and cannot be
# indexed by column name.
data_dict = dict()

data_dict['ids'] = dataset['statement_ID'].values
data_dict['labels'] = dataset['label'].values
data_dict['statements'] = dataset['statement'].values
data_dict['subjects'] = dataset['subject'].values
data_dict['contexts'] = dataset['context'].values
# Cleaned statement texts; 'text' is the key the POS-tagging loop iterates over.
data_dict['text'] = train_data
# Process each statement
# Obtain the part-of-speech (POS) tag of every word in the statement, then
# derive POS unigrams, bigrams and trigrams from the tag sequence.


def _pos_ngrams(tags, n):
    """Return the space-joined n-grams of size *n* over the tag list *tags*.

    The result is empty when *tags* holds fewer than *n* items.
    """
    return [' '.join(tags[start:start + n])
            for start in range(len(tags) - n + 1)]


# One entry per statement in each of these lists.
word_pos_list = list()
pos_list = list()
unigram_pos_list = list()
bigram_pos_list = list()
trigram_pos_list = list()

for sent_id, txt in enumerate(data_dict['text']):
    print('Procesing sentence number {0}: {1}'.format(sent_id, txt))
    output = nlp.annotate(txt, properties={
        'annotators': 'tokenize,pos,parse',
        'outputFormat': 'json'
    })
    if not isinstance(output, dict):
        # NOTE(review): pycorenlp is expected to hand back the raw response text
        # instead of a dict when the reply is not valid JSON (e.g. a server
        # error) - confirm for the installed version. Fail with the server's
        # message rather than with an opaque indexing error below.
        raise RuntimeError(
            'CoreNLP did not return JSON for sentence number {0}: {1!r}'.format(sent_id, output))

    word_pos_parts = list()   # "word/TAG " pieces for the whole statement
    result_pos = list()       # all tags of the statement, '<s>' before each sentence
    unigram_pos = list()
    bigram_pos = list()
    trigram_pos = list()
    for o in output['sentences']:
        # Tags of this sentence only, prefixed with the sentence-start marker.
        sentence_pos = ['<s>']
        for t in o['tokens']:
            word_pos_parts.append('{0}/{1} '.format(t['word'], t['pos']))
            sentence_pos.append('{0}'.format(t['pos']))
        result_pos.extend(sentence_pos)
        # n-grams are built from the current sentence's tags only, so a
        # statement with several sentences does not re-count the n-grams of its
        # earlier sentences.
        unigram_pos.extend(_pos_ngrams(sentence_pos, 1))
        bigram_pos.extend(_pos_ngrams(sentence_pos, 2))
        trigram_pos.extend(_pos_ngrams(sentence_pos, 3))

    word_pos_list.append(''.join(word_pos_parts))
    pos_list.append(result_pos)
    unigram_pos_list.append(unigram_pos)
    bigram_pos_list.append(bigram_pos)
    trigram_pos_list.append(trigram_pos)

## Keep the per-statement POS outputs next to the other fields
data_dict['word_pos'] = word_pos_list
data_dict['pos'] = pos_list
data_dict['unigram'] = unigram_pos_list
data_dict['bigrams'] = bigram_pos_list
data_dict['trigrams'] = trigram_pos_list

## Attach them to the DataFrame as new columns.
## The bigram column is named 'bigram_pos' (singular, like 'unigram_pos' and
## 'trigram_pos'), which is the name looked up when the features are vectorized.
dataset['Word POS'] = data_dict['word_pos']
dataset['POS'] = data_dict['pos']
dataset['unigram_pos'] = data_dict['unigram']
dataset['bigram_pos'] = data_dict['bigrams']
dataset['trigram_pos'] = data_dict['trigrams']

## use CountVectorizer to turn the POS n-gram lists into count features for the ML models


def _identity_analyzer(ngrams):
    """Return the pre-computed n-gram list unchanged (used as CountVectorizer analyzer)."""
    return ngrams


# Each row of the *_pos columns already holds a list of n-gram strings, so the
# analyzer just returns that list and every n-gram is counted as one feature.
# The default analyzer expects raw text and would lower-case and re-tokenise it.
cv = CountVectorizer(analyzer=_identity_analyzer)

# cv is refitted for each feature family; after these lines it holds the
# vocabulary of the trigram features.
pos_uni_feats = cv.fit_transform(dataset['unigram_pos']).toarray()
pos_big_feats = cv.fit_transform(dataset['bigram_pos']).toarray()
pos_trig_feats = cv.fit_transform(dataset['trigram_pos']).toarray()

# 80/20 split of the unigram features; the target is the 'label' column.
X_train, X_test, y_train, y_test = train_test_split(
    pos_uni_feats, dataset['label'], train_size=0.8, random_state=123)

## fit logistic regression on the training features and report on the held-out split
LR = LogisticRegression().fit(X=X_train, y=y_train)
y_pred = LR.predict(X_test)
print(
    "accuracy metrics for logistic regression classifier:\n",
    classification_report(y_test, y_pred),
)

### train features on multinomial naive bayes classifier
# MultinomialNB wrapped in a one-vs-rest scheme.
# OneVsRestClassifier lives in sklearn.multiclass and is imported with the other
# scikit-learn names at the top of the file.
nb_cly = OneVsRestClassifier(MultinomialNB())
nb_cly = nb_cly.fit(X=X_train, y=y_train)
y_pred = nb_cly.predict(X_test)
print("accuracy metrics for Multinomial naive bayes classifier:\n",
      classification_report(y_test, y_pred))

### fit a support vector classifier and report on the held-out split

svm_params = dict(C=1, kernel='rbf', degree=3, gamma='auto', random_state=None)
svm_clf = SVC(**svm_params).fit(X=X_train, y=y_train)
y_pred = svm_clf.predict(X_test)
print(
    "accuracy metrics for support vector machine classifier:\n",
    classification_report(y_test, y_pred),
)

### fit a linear model trained with stochastic gradient descent and report on the held-out split
sgd_clf = linear_model.SGDClassifier(max_iter=1000).fit(X=X_train, y=y_train)
y_pred = sgd_clf.predict(X_test)
print(
    "accuracy metrics for stochastic gardient desent classifier:\n",
    classification_report(y_test, y_pred),
)

### train features on random forest
# max_features='sqrt' is the explicit spelling of what the 'auto' alias selected
# for classifiers; recent scikit-learn releases no longer accept 'auto'.
# NOTE(review): the class_weight keys (1, 2, 3) have to be actual label values
# found in y_train - confirm they match, otherwise fit() will reject them.
randFor_clf = RandomForestClassifier(n_estimators=2, criterion='gini', max_features='sqrt',
                                     class_weight={1: .9, 2: .5, 3: .01})
randFor_clf = randFor_clf.fit(X=X_train, y=y_train)
y_pred = randFor_clf.predict(X_test)
print("accuracy metrics for random forest classifier:\n", classification_report(y_test, y_pred))


##----------- More compact solution for training several empirical ML classifier models -----------##
### use k-fold cross validation
## K-fold cross validation on the training set
numFolds = 10
kf = KFold(n_splits=numFolds, shuffle=True)

# 80/20 split of the unigram features; the target is the 'label' column.
X_train, X_test, y_train, y_test = train_test_split(
    pos_uni_feats, dataset['label'], train_size=0.8, random_state=123)
Y = dataset['label']

# KFold.split yields positional indices. y_train is a pandas Series that keeps
# the original row labels after the split, so take a plain array for indexing.
y_train_values = np.asarray(y_train)

# Constructor keyword arguments, one dict per entry of Models (same order).
# NOTE(review): the class_weight keys (1, 2, 3) have to be actual label values
# found in y_train - confirm they match.
# NOTE(review): newer scikit-learn releases spell the SGD loss 'log_loss'
# instead of 'log' - adjust to the installed version.
params = [
    {},                                                    # LogisticRegression
    {'loss': 'log', 'penalty': 'l2', 'max_iter': 1000},    # SGDClassifier
    {},                                                    # MultinomialNB
    {'class_weight': {1: .9, 2: .5, 3: .01}},              # RandomForestClassifier
    {},                                                    # SVC
]
Models = [LogisticRegression, SGDClassifier, MultinomialNB, RandomForestClassifier, SVC]

for param, Model in zip(params, Models):
    # Mean accuracy over the folds; both parts of every fold come from the
    # training split, so the held-out test split stays untouched here.
    total = 0
    for train_indices, test_indices in kf.split(X_train):
        train_X = X_train[train_indices, :]
        train_Y = y_train_values[train_indices]
        test_X = X_train[test_indices, :]
        test_Y = y_train_values[test_indices]
        reg = Model(**param)
        reg.fit(train_X, train_Y)
        predictions = reg.predict(test_X)
        total += accuracy_score(test_Y, predictions)

    accuracy = total / numFolds
    print("Accuracy score of {0}: {1}".format(Model.__name__, accuracy))

    # Per-class report: refit on the whole training split and predict the
    # held-out test split, so predictions line up with y_test.
    reg = Model(**param)
    reg.fit(X_train, y_train)
    predictions = reg.predict(X_test)
    print("accuracy metrics for {0}: {1}".format(Model.__name__, classification_report(y_test, predictions)))

# Output captured from the notebook cell above (not part of the script):
#   from numpy.core.umath_tests import inner1d
#
# FileNotFoundError: File b'train.tsv' does not exist