# STAT 696 Final Project - Live Demo

# Summary

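This notebook contains the code for the in-class live demo of our fake-news classifier: it loads a sample article, rebuilds the features the pre-trained model expects, and predicts whether the article is fake news.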

# Import Modules & Data

In [1]:
import pandas as pd
import numpy as np
import joblib
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest

In [2]:
# Build a one-row DataFrame holding the raw text of the demo article.
# The single 'text' column is the source for the cleaned-text, PoS-tag and
# punctuation features derived in the cells below.
articles_demo = pd.DataFrame({'text': ['The shipping off of American pork to China during a pandemic caused by the Chinese as the UN predicts famines of biblical proportions shouldn’t sit well with any American. Meanwhile, China just reported another case of African swine fever, just one in a dozen cases in the last 2 months, devastating the Chinese herd and increasing demand from U.S. pig farms. And now, the Chinese Coronavirus threatens American food producers. China\'s Coronavirus is nothing short of an act of war disguised in no-fault propaganda, putting millions out of work and decimating small businesses while bringing our education system to a standstill and undermining our food supply while China openly celebrates the death of American Democracy.']})
# Display the frame to confirm the article loaded.
articles_demo

Unnamed: 0,text
0,The shipping off of American pork to China dur...


# Clean Text

In [3]:
# define a function to clean text data

# Lookup table used to convert the contraction fragments picked up by
# word_tokenize() (e.g. "n't", "'ve") and a few informal spellings into full words.
# NOTE(review): the multi-word expansions ('going to', 'got to', 'want to')
# contain a space, so clean_text() rejects them in its r'^\w+$' check and the
# token is dropped rather than expanded.
contractions = {
    "n't": 'not',
    "'ve": 'have',
    "'s": 'is', # note that this will include possessive nouns
    'gonna': 'going to',
    'gotta': 'got to',
    "'d": 'would',
    "'ll": 'will',
    "'re": 'are',
    "'m": 'am',
    'wanna': 'want to'
}

# to convert nltk_pos tags to wordnet-compatible PoS tags
def convert_pos_wordnet(tag):
    """Translate an NLTK PoS tag into the matching WordNet PoS constant.

    Only the first character of ``tag`` (upper-cased) is inspected: 'J' maps
    to adjective, 'N' to noun, 'V' to verb and 'R' to adverb.

    Returns the corresponding ``wordnet`` constant, or ``None`` when the tag
    starts with any other character.
    """
    initial = tag[0].upper()
    wordnet_by_initial = {
        'J': wordnet.ADJ,
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
        'R': wordnet.ADV,
    }
    # .get() yields None for initials without a WordNet counterpart.
    return wordnet_by_initial.get(initial)

# First letters of the PoS tags that survive text normalization: clean_text()
# keeps a token only if the first character of its tag appears in this string.
# Assuming the Penn Treebank tags emitted by nltk.pos_tag, this keeps foreign
# words (F), adjectives (J), modals (M), nouns (N), adverbs/particles (R),
# verbs (V) and wh-words (W). Everything else is dropped: coordinating
# conjunctions, cardinal digits, determiners, existential there,
# prepositions/subordinating conjunctions, list markers, predeterminers,
# possessive endings, personal pronouns, possessive pronouns, "to", and
# interjections.
valid_tags_abbr = 'FJMNRVW'

def clean_text(str_list, lemmatize = True):
    """Normalize raw documents into space-joined strings of content words.

    Each document is processed as follows:

    1. URLs/domains, e-mail addresses and political representative tags such
       as ``(D-CA)`` are removed (the pattern also removes any other dotted
       token of the form ``word.word``).
    2. The text is tokenized and every token is PoS-tagged on its own.
    3. Underscores and hyphens are stripped from the token and known
       contractions are expanded via ``contractions``.
    4. A token is dropped when it has fewer than 2 characters, is not made up
       solely of word characters, or when the first letter of its PoS tag is
       not in ``valid_tags_abbr``.
    5. Surviving tokens are optionally lemmatized, using the WordNet PoS for
       adjectives, nouns, verbs and adverbs and the lemmatizer default
       otherwise.

    Parameters
    ----------
    str_list : iterable of str
        Raw documents.
    lemmatize : bool, default True
        Whether to lemmatize the surviving tokens.

    Returns
    -------
    list of str
        One cleaned string per input document, in input order.
    """
    # Created once per call and reused for every token.
    lemmatizer = WordNetLemmatizer() if lemmatize else None
    clean_list = []

    for text in str_list:
        # to drop any internet domains, email addresses, or political rep. "tags"
        text = re.sub(r'(https?://)?\w+@?\w+(\.\w+)+|\([DRI]-[A-Z]{2}\)', '', text)
        clean_words = []

        for word in word_tokenize(text):
            # Tag the untouched token by itself (no sentence context). This is
            # kept deliberately so the output matches the per-token tagging
            # used elsewhere in this file.
            PoS_tag = pos_tag([word])[0][1]
            word = re.sub(r'[_-]', '', word)

            # to change contractions to full word form
            word = contractions.get(word, word)

            # drop words with fewer than 2 characters and any punctuation "words"
            if len(word) <= 1 or not re.match(r'^\w+$', word):
                continue

            # drop words not in approved set of PoS tags (defined above)
            tag_initial = PoS_tag[0].upper()
            if tag_initial not in valid_tags_abbr:
                continue

            if lemmatize:
                if tag_initial in 'JNVR':
                    word = lemmatizer.lemmatize(word, convert_pos_wordnet(PoS_tag))
                else:
                    word = lemmatizer.lemmatize(word)

            clean_words.append(word)

        # Local name differs from the function name to avoid shadowing it.
        clean_list.append(' '.join(clean_words))

    return clean_list

In [4]:
# Add a 'clean_txt' column holding the normalized version of each raw article
# (clean_text() lemmatizes by default).
articles_demo['clean_txt'] = clean_text(articles_demo['text'])

# PoS Tags

In [5]:
# Build a parallel column in which every token of the raw text is replaced by
# its PoS tag. Tokens are looked up in `contractions` first and then tagged
# one at a time; the tags of one article are joined with single spaces.
pos_column = [
    ' '.join(
        pos_tag([contractions.get(token, token)])[0][1]
        for token in word_tokenize(article)
    )
    for article in articles_demo['text']
]
articles_demo['PoS_tags'] = pos_column

# Sentiment Scorer

In [6]:
# Load the sentiment lexicon spreadsheet; the cells below read its 'Entry',
# 'Positiv' and 'Negativ' columns.
# NOTE(review): the path is relative, so the file must be in the current
# working directory.
hi4_df = pd.read_excel('inquireraugmented.xls')

In [7]:
# Split the lexicon: rows whose 'Positiv' cell holds the marker "Positiv" are
# the positive words, rows whose 'Negativ' cell holds "Negativ" the negative ones.
pos_df = hi4_df.loc[hi4_df['Positiv'] == "Positiv"]
neg_df = hi4_df.loc[hi4_df['Negativ'] == "Negativ"]

In [8]:
# Pull the raw 'Entry' values of each subset into plain Python lists.
pos_list = pos_df['Entry'].tolist()
neg_list = neg_df['Entry'].tolist()

In [9]:
# Normalize the positive entries to bare upper-case words: every character
# outside A-Z is stripped, and the set removes the duplicates this creates.
# str() guards against entries that were not read from the spreadsheet as
# text, matching how the negative entries are handled.
pos_word = {re.sub(r'[^A-Z]', "", str(entry)) for entry in pos_list}

In [10]:
# Normalize the negative entries to bare upper-case words: each entry is
# coerced to str, every character outside A-Z is stripped, and the set
# removes the duplicates this creates.
neg_word = {re.sub(r'[^A-Z]', "", str(entry)) for entry in neg_list}

In [11]:
def sentiment_scorer(text_input, in_list):
    """Return the fraction of tokens in ``text_input`` found in ``in_list``.

    The text is upper-cased and split on single spaces, so consecutive spaces
    produce empty tokens that still count toward the denominator. Every
    occurrence of a listed word counts as a hit.

    Parameters
    ----------
    text_input : str
        Space-separated text to score.
    in_list : iterable of str
        Upper-case words to look for.

    Returns
    -------
    float
        Number of matching tokens divided by the total number of tokens.
    """
    lexicon = set(in_list)
    tokens = text_input.upper().split(' ')
    hits = sum(1 for token in tokens if token in lexicon)
    return hits / len(tokens)

In [12]:
# Share of cleaned-text tokens that appear in the positive lexicon.
articles_demo['pos_sent'] = articles_demo['clean_txt'].apply(sentiment_scorer, args=(pos_word,))

In [13]:
# Share of cleaned-text tokens that appear in the negative lexicon.
articles_demo['neg_sent'] = articles_demo['clean_txt'].apply(sentiment_scorer, args=(neg_word,))

In [14]:
# Net sentiment: positive share minus negative share, so a value below zero
# means more negative-lexicon hits than positive ones.
articles_demo['net_sent'] = articles_demo['pos_sent'] - articles_demo['neg_sent']

# Punctuation Vectorizer

In [15]:
# creating CV matrix for punctuation(!?:;)
class Punc_Vec:
    """Count the punctuation marks ``!``, ``?``, ``:`` and ``;`` in raw text.

    Thin wrapper around ``CountVectorizer``. Every constructor argument is
    stored on the instance and forwarded unchanged to the wrapped vectorizer
    (``self.v``). Before the wrapped vectorizer sees a document, every
    character other than the four tracked marks is removed.

    Attributes
    ----------
    v : CountVectorizer
        The wrapped vectorizer.
    c : list or None
        Documents passed to the most recent ``fit`` / ``fit_transform``.
    f, t, f_t, g, params
        Results of the most recent ``fit``, ``transform``, ``fit_transform``,
        ``get_features`` and ``get_params`` calls (``None`` until called).
    """

    def __init__(self,
                 analyzer = 'char',
                 binary = False,
                 decode_error = 'strict',
                 dtype = np.int64,
                 encoding = 'utf-8',
                 input = 'content',
                 lowercase = False,
                 max_df = 1.0,
                 min_df = 1,
                 ngram_range = (1,1),
                 max_features = None,
                 strip_accents = None,
                 preprocessor = None,
                 tokenizer = None,
                 stop_words = None,
                 vocabulary = None,
                 # Raw string: without the r-prefix "\b" is a backspace
                 # character and "\w" is an invalid escape sequence.
                 token_pattern = r'(?u)\b\w\w+\b'):
        self.analyzer = analyzer
        self.binary = binary
        self.decode_error = decode_error
        self.dtype = dtype
        self.encoding = encoding
        self.input = input
        self.lowercase = lowercase
        self.max_df = max_df
        self.min_df = min_df
        self.ngram_range = ngram_range
        self.max_features = max_features
        self.strip_accents = strip_accents
        self.preprocessor = preprocessor
        self.tokenizer = tokenizer
        self.stop_words = stop_words
        self.vocabulary = vocabulary
        self.token_pattern = token_pattern
        self.c = None
        self.v = CountVectorizer(analyzer = self.analyzer,
                                 binary = self.binary,
                                 decode_error = self.decode_error,
                                 dtype = self.dtype,
                                 encoding = self.encoding,
                                 input = self.input,
                                 lowercase = self.lowercase,
                                 max_df = self.max_df,
                                 min_df = self.min_df,
                                 ngram_range = self.ngram_range,
                                 max_features = self.max_features,
                                 strip_accents = self.strip_accents,
                                 preprocessor = self.preprocessor,
                                 tokenizer = self.tokenizer,
                                 stop_words = self.stop_words,
                                 vocabulary = self.vocabulary,
                                 token_pattern = self.token_pattern)
        self.params = None
        self.f = None
        self.t = None
        self.f_t = None
        self.g = None

    @staticmethod
    def _punctuation_only(documents):
        """Return each document reduced to its ``!?:;`` characters."""
        return [re.sub(r"[^!?:;]", "", doc) for doc in documents]

    def get_params(self, deep = True):
        """Return (and cache in ``self.params``) the wrapped vectorizer's parameters."""
        self.params = self.v.get_params(deep)
        return(self.params)

    def fit(self, corpus, y = None):
        """Learn the punctuation vocabulary from ``corpus``.

        ``corpus`` must provide ``.tolist()`` yielding strings; ``y`` is
        ignored. Returns ``self`` so the call can be chained.
        """
        self.c = corpus.tolist()
        self.f = self.v.fit(self._punctuation_only(self.c))
        return self

    def transform(self, corpus):
        """Return the punctuation count matrix for ``corpus``.

        Documents are stripped to punctuation first, exactly as in ``fit`` and
        ``fit_transform``, so n-grams longer than one character are built from
        the same kind of text the vocabulary was learned on.
        """
        self.t = self.v.transform(self._punctuation_only(corpus.tolist()))
        return(self.t)

    def fit_transform(self, corpus, y = None):
        """Learn the vocabulary from ``corpus`` and return its count matrix (``y`` is ignored)."""
        self.c = corpus.tolist()
        self.f_t = self.v.fit_transform(self._punctuation_only(self.c))
        return self.f_t

    def get_features(self):
        """Return the learned feature names as a list.

        Uses ``get_feature_names_out`` when the wrapped vectorizer has it and
        falls back to ``get_feature_names`` otherwise, because newer
        scikit-learn releases removed the latter.
        """
        vectorizer = self.v
        if hasattr(vectorizer, 'get_feature_names_out'):
            self.g = vectorizer.get_feature_names_out().tolist()
        else:
            self.g = vectorizer.get_feature_names()
        return(self.g)

# Set up matrix

In [16]:
# Display the frame to inspect the engineered columns before selecting model inputs.
articles_demo

Unnamed: 0,text,clean_txt,PoS_tags,pos_sent,neg_sent,net_sent
0,The shipping off of American pork to China dur...,shipping American pork China pandemic cause Ch...,DT NN IN IN JJ NN TO NNP IN DT NN VBN IN DT JJ...,0.055556,0.138889,-0.083333


In [17]:
# Keep only the four columns passed on to the transformer; the intermediate
# pos_sent / neg_sent scores are dropped.
# NOTE(review): presumably this column layout matches the one the pre-trained
# transformer was fitted on — confirm.
articles_demo = articles_demo[['clean_txt', 'PoS_tags', 'text', 'net_sent']]

# Feature Engineering

In [18]:
# Feature-engineering layout.
#
# A ColumnTransformer routes each dataframe column to its own transformer:
#   'text'      -> punctuation counts (Punc_Vec)
#   'clean_txt' -> TF-IDF, then a FeatureUnion that concatenates NMF topics
#                  with the K best individual TF-IDF terms
#   'PoS_tags'  -> PoS tag counts (CountVectorizer)
# Any remaining column ('net_sent') is passed through untouched.
#
# NOTE(review): the transformer built here is unfitted; the prediction cells
# load an already fitted one from 'column_transformer.joblib' instead.

# Sizes of the two 'clean_txt' feature groups.
topics = 100     # number of NMF components (topics)
top_terms = 20   # number of words from BoW kept by SelectKBest

# Vectorizers, one per input column.
puncvec = Punc_Vec()
countvec = CountVectorizer(lowercase=False)  # tags are not lower-cased
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_df=0.90,
    min_df=2,
    max_features=20000,
)

# Dimensionality reduction and feature selection, applied side by side to the
# TF-IDF matrix so both topics and important words are kept.
nmf = NMF(n_components=topics, random_state=42)
kbest = SelectKBest(k=top_terms)
featUn = FeatureUnion(transformer_list=[
    ('dim_red', nmf),
    ('feat_sel', kbest),
])

# Clean-text branch: vectorize, then run the union above.
cleanTextPipe = Pipeline(steps=[
    ('vectorize', tfidf),
    ('dimRed_plus_featSel', featUn),
])

# Final column router.
sep_cols = ColumnTransformer(
    transformers=[
        ('vectorize_punctuation', puncvec, 'text'),
        ('clean_text_pipe', cleanTextPipe, 'clean_txt'),
        ('vectorize_PoS', countvec, 'PoS_tags'),
    ],
    remainder='passthrough',
)

In [19]:
# Load pre-trained column transformer
# NOTE: joblib.load unpickles arbitrary Python objects, so only load files
# that come from a trusted source.
# NOTE(review): presumably the pickled transformer contains a Punc_Vec
# instance, in which case the class must be defined before this call — confirm.
fake_news_transformer = joblib.load('column_transformer.joblib')

In [20]:
# Transform the dataframe into the feature matrix for prediction.
# Only transform() is called here; the loaded transformer is assumed to be
# already fitted.
X = fake_news_transformer.transform(articles_demo)

In [21]:
# Load pre-trained model
# NOTE: joblib.load unpickles arbitrary Python objects, so only load files
# that come from a trusted source.
fake_news_classifier = joblib.load('binaryEstimator_fakeNews.joblib')

In [22]:
# Predict probability "fake news": row 0 is the first (here only) article and
# column 1 is the probability of the classifier's second class.
# NOTE(review): presumably that second class is the "fake news" label — confirm
# against fake_news_classifier.classes_.
fake_news_classifier.predict_proba(X)[0][1]

0.576

In [23]:
# Predict the class label for every row of X (a single article in this demo).
fake_news_classifier.predict(X)

array([1])