## SemEval 2019 Task 4 - Extra Preprocessing Steps Exploration

Jonathan Miller and Negar Adyaniyazdi, VCU, CMSC516, Fall 2018

Goal: Basic exploratory text analysis on a random sample of 10,000 observations

In [1]:
import pandas as pd

DATA_PATH = '../data/'
DATA_PROCESSED_PATH = DATA_PATH + 'processed/'

df = pd.read_csv(DATA_PROCESSED_PATH + 'train10000_0.csv')

In [2]:
df.head()

Unnamed: 0,id,published-at,title,article_text,hyperpartisan,bias,url,labeled-by,preprocessed_text
0,1049192,2003-05-12,"Time Inc.'s Huey thinks outside the box, makes...","Time Inc.'s Huey thinks outside the box, makes...",False,least,https://poynter.org/news/time-incs-huey-thinks...,publisher,time inc huey think outside box make enemy new...
1,923492,,Texas group to open southern NM abortion clinic,Texas group to open southern NM abortion clini...,False,least,https://abqjournal.com/456723/texas-group-to-o...,publisher,texas group open southern nm abortion clinic l...
2,1277816,,Texas woman pleads guilty to sex trafficking 9...,Texas woman pleads guilty to sex trafficking 9...,False,least,https://abqjournal.com/1044515/texas-woman-ple...,publisher,texas woman plead guilty sex traffic year old ...
3,777348,2018-01-11,"17 dead in California mudslides, more than a d...","17 dead in California mudslides, more than a d...",False,least,https://apnews.com/amp/67ec5e87bbb74130b7ddedc...,publisher,dead california mudslide dozen miss montecito ...
4,89424,,County needs comprehensive effort to generate ...,County needs comprehensive effort to generate ...,False,least,https://abqjournal.com/309627/county-needs-com...,publisher,county need comprehensive effort generate need...


In [3]:
df['hyperpartisan'].value_counts(normalize=True)

True     0.50005
False    0.49995
Name: hyperpartisan, dtype: float64

In [4]:
df['bias'].value_counts(normalize=True)

least           0.277228
right           0.250025
left            0.250025
left-center     0.141214
right-center    0.081508
Name: bias, dtype: float64

In [9]:
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')

df["tokens"] = df["preprocessed_text"].apply(tokenizer.tokenize)

In [11]:
list_labels = df["hyperpartisan"].tolist()

In [10]:
import gensim
import numpy as np
from sklearn.model_selection import train_test_split

In [6]:
DATA_EXTERNAL_PATH = DATA_PATH + 'external/'
word2vec_path = DATA_EXTERNAL_PATH + "GoogleNews-vectors-negative300.bin.gz"
word2vec = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [12]:
def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    if len(tokens_list)<1:
        return np.zeros(k)
    if generate_missing:
        vectorized = [vector[word] if word in vector else np.random.rand(k) for word in tokens_list]
    else:
        vectorized = [vector[word] if word in vector else np.zeros(k) for word in tokens_list]
    length = len(vectorized)
    summed = np.sum(vectorized, axis=0)
    averaged = np.divide(summed, length)
    return averaged

def get_word2vec_embeddings(vectors, clean_questions, generate_missing=False):
    embeddings = clean_questions['tokens'].apply(lambda x: get_average_word2vec(x, vectors, 
                                                                                generate_missing=generate_missing))
    return list(embeddings)

In [13]:
embeddings = get_word2vec_embeddings(word2vec, df)
X_train_word2vec, X_test_word2vec, y_train_word2vec, y_test_word2vec = train_test_split(embeddings, list_labels, 
                                                                                        test_size=0.2, random_state=40)

In [22]:
import sys
sys.path.append('../src/models/')

%load_ext autoreload
%autoreload 1

import models
%aimport models

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [26]:
import pandas
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

def run_models(model_list, X_train, X_test, y_train, y_test, random_state):

    # Set random state
    random_state = random_state
    
    # Convenience translation dictionary for printing
    model_dict ={
        'nb' : 'Multinomial Naive Bayes',
        'lr' : 'Logistic Regression',
        'gb' : 'Gradient Boosting Classifier'
    }

    # Initialize best model variables
    best_model = ''
    best_model_type = ''
    best_accuracy = 0
    
    # Iterate over list of model types
    for model_type in model_list:

        # Naive Bayes
        if model_type == 'nb':
            clf = MultinomialNB(alpha=0.1).fit(X_train, y_train)

        # Logistic Regression
        elif model_type == 'lr':
            clf = LogisticRegression(C=30.0, class_weight='None', solver='newton-cg')
            clf.fit(X_train, y_train)

        # Gradient Boosting
        elif model_type == 'gb':
            clf = GradientBoostingClassifier(learning_rate=0.7, max_depth=6, max_leaf_nodes=None, min_samples_leaf= 3, min_samples_split=2).fit(X_train, y_train)
        else:
            raise ValueError("No model type provided")   

        # Get predictions and evaluate     
        predicted = clf.predict(X_test)
        print(model_dict[model_type])
        accuracy = evaluate_model(predicted, y_test)

        # Update best performing model if necessary
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = clf
            best_model_type = model_type

    # Print best results
    print('Best model is {} with an accuracy score of {:.4f}'.format(model_dict[best_model_type], best_accuracy))

    # Return best model and type
    return best_model, best_model_type

# Evaluate models. Print classification report with precision, recall, f1, print accuracy, and return accuracy
def evaluate_model(predicted, y_test):
    print(classification_report(y_test, predicted))
    accuracy = accuracy_score(y_test, predicted)
    print('Accuracy: {:.4f}'.format(accuracy))
    return accuracy

  from numpy.core.umath_tests import inner1d


In [27]:
run_models(['nb', 'lr'], X_train_word2vec, X_test_word2vec, y_train_word2vec, y_test_word2vec, random_state=42)

ValueError: Input X must be non-negative

In [28]:
X_train_word2vec

[array([-1.48600430e-02, -3.20465597e-02, -2.74206331e-02,  8.68283929e-02,
        -8.42495452e-02, -7.84764608e-03,  5.45698802e-02, -7.07665083e-02,
         9.70888562e-02,  9.06085756e-02, -1.06692022e-01, -7.76193237e-02,
        -3.26221975e-02,  6.65444268e-03, -7.80176247e-02,  1.08377041e-01,
         2.63190036e-02,  5.86478000e-02, -1.04003652e-02, -9.12278408e-02,
        -3.81098896e-02,  4.01069578e-02,  2.73655425e-02, -8.79118178e-03,
         5.99166955e-02,  2.11317677e-02, -6.55036174e-02,  9.78654650e-02,
         2.60248990e-02, -2.09937074e-02,  8.65527683e-03, -1.86563449e-02,
        -5.37103865e-02, -2.87829929e-03, -1.60543315e-02, -2.94879850e-02,
        -1.75384098e-03, -1.85674879e-02,  5.51749929e-02,  5.17160797e-02,
         8.36123996e-02, -1.16714986e-02,  1.17476569e-01,  1.65255695e-02,
        -5.56103855e-02, -9.32307773e-02, -2.19681644e-02, -2.44263034e-02,
        -5.01473067e-02,  2.53158654e-02,  2.46969689e-02, -5.96296522e-03,
         4.2