In [22]:
import pandas as pd
import numpy as np

import re
import string

import nltk
from nltk import word_tokenize
from sklearn import svm
from nltk.tag import StanfordNERTagger
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import words
from nltk.tag import pos_tag
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [23]:
'''
Workflow should be as follows:
1. Read in data frame
2. Lowercase all letters (regex)
3. Address punctuation marks (regex)
?. Remove non-English titles (?)
4. Break headlines into single words (tokenize)
5. Find and remove/minimize words that are for semantics (stop words)
6. Find similar words and bin together (stemming)
''';

In [24]:
# Reference: http://zwmiller.com/projects/nlp_pipeline.html
# Reference: https://github.com/ZWMiller/nlp_pipe_manager/blob/master/nlp_pipeline_manager/nlp_preprocessor.py
# Reference: https://towardsdatascience.com/a-practitioners-guide-to-natural-language-processing-part-i-processing-understanding-text-9f4abfd13e72

class nlp_pipe:
    """Cleaning and vectorization pipeline for one text column of a DataFrame.

    ``cleaner()`` runs the cleaning steps in a fixed order and leaves the
    result in ``self.dataframe``; ``transform()`` fits the vectorizer on the
    cleaned column and stores the dense matrix in ``self.dataframe_vect``.

    Note: the frame passed to ``__init__`` is not copied. ``__init__`` and the
    early cleaning steps (number/punctuation removal, duplicate dropping)
    modify that same object in place.
    """

    # Initialize the class
    def __init__(self, vectorizer, stemmer, lemmatizer, tokenizer, dataframe, column='Title'):
        """Store the pipeline components and coerce ``column`` to ``str``.

        Parameters
        ----------
        vectorizer : object with ``fit_transform`` (used by ``transform``).
        stemmer : object with ``stem(word)`` (used by ``stem_words``).
        lemmatizer : object with ``lemmatize(word)`` (used by ``lemmatize_words``).
        tokenizer : callable mapping a string to a list of tokens.
        dataframe : pandas.DataFrame holding the text column.
        column : name of the text column, ``'Title'`` by default.
        """
        self.vectorizer = vectorizer
        self.tokenizer = tokenizer
        self.lemmatizer = lemmatizer
        self.stemmer = stemmer
        self.dataframe = dataframe
        self.column = column
        # Every later step assumes strings (regex, split), so coerce up front.
        self.dataframe[self.column] = self.dataframe[self.column].apply(str)

    ######################################################################

    # Create a cleaning method (aka fit) that will use several functions in order
    def cleaner(self):
        """Run the cleaning steps in order; the result is ``self.dataframe``."""
        # Sentiment is scored first, on the untouched headline text.
        self.vader_sentiment()
        self.dataframe = self._remove_numbers(self.dataframe, self.column)
        self.dataframe = self._punctuation(self.dataframe, self.column)
        self.dataframe = self._dropduplicates(self.dataframe, self.column)
        self.real_words() # Check if it's a real word and then remove if not
        self.remove_single_letter() # Remove words shorter than 3 characters
        #self.autocorrect() # Takes a very long time to run
        self.tokenize_words()
        self.remove_short_headlines() # Remove headlines with 3 or fewer tokens
        #self.lemmatize_words()
        #self.stem_words()
        #self.named_entities()
        self.dataframe = self._join_words(self.dataframe, self.column)
        self.remove_headlines_specific_words()
        # Empty headlines become NaN so that they are dropped with the rest.
        self.dataframe[self.column] = self.dataframe[self.column].replace('', np.nan)
        # Reassign instead of inplace=True: no in-place mutation of a filtered frame.
        self.dataframe = self.dataframe.dropna(subset=[self.column])

    ########## Functions that 'cleaner' will call ##########
    @staticmethod
    def _remove_numbers(dataframe, column):
        """Delete every word containing a digit; mutates and returns ``dataframe``."""
        # Raw string: '\w' / '\d' in a plain literal are invalid escape sequences.
        digit_word = r'\w*\d\w*'
        dataframe[column] = dataframe[column].map(lambda text: re.sub(digit_word, '', text))
        return dataframe

    @staticmethod
    def _punctuation(dataframe, column):
        """Replace each run of non-alphanumeric characters with a single space.

        Case is left untouched. Mutates and returns ``dataframe``.
        """
        dataframe[column] = dataframe[column].map(lambda text: re.sub('[^A-Za-z0-9]+', ' ', text))
        return dataframe

    @staticmethod
    def _dropduplicates(dataframe, column):
        """Drop rows whose ``column`` value repeats an earlier row (in place)."""
        dataframe.drop_duplicates(subset=column, keep='first', inplace=True)
        return dataframe

    @staticmethod
    def _join_words(dataframe, column):
        """Join each token list back into one space-separated string."""
        dataframe[column] = dataframe[column].map(' '.join)
        return dataframe

    def vader_sentiment(self):
        """Add the four VADER polarity scores of each headline as new columns."""
        analyzer = SentimentIntensityAnalyzer()
        # Score each headline once and fan the result out to the four columns.
        scores = self.dataframe[self.column].map(analyzer.polarity_scores)
        targets = (('pos', 'Positive_Sentiment'),
                   ('neg', 'Negative_Sentiment'),
                   ('neu', 'Neutral_Sentiment'),
                   ('compound', 'Compound_Sentiment'))
        for key, out_column in targets:
            self.dataframe[out_column] = scores.map(lambda score, key=key: score[key])

    def tokenize_words(self):
        """Replace each headline string with the tokenizer's token list."""
        self.dataframe[self.column] = self.dataframe[self.column].map(self.tokenizer)

    def stem_words(self):
        """Stem every token; expects the column to hold token lists."""
        self.dataframe[self.column] = self.dataframe[self.column].map(
            lambda tokens: [self.stemmer.stem(token) for token in tokens])

    def lemmatize_words(self):
        """Lemmatize every token; expects the column to hold token lists."""
        self.dataframe[self.column] = self.dataframe[self.column].map(
            lambda tokens: [self.lemmatizer.lemmatize(token) for token in tokens])

    def named_entities(self,
                       model_path='/usr/share/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                       jar_path='/usr/share/stanford-ner/stanford-ner.jar'):
        """Replace each token list with the Stanford NER tagger's output.

        ``model_path`` and ``jar_path`` default to the previously hard-coded
        install locations and can be overridden for other machines.
        """
        tagger = StanfordNERTagger(model_path, jar_path, encoding='utf-8')
        self.dataframe[self.column] = self.dataframe[self.column].map(tagger.tag)

    def real_words(self):
        """Drop alphabetic tokens that are not in the nltk ``words`` corpus.

        Tokens that are not purely alphabetic are kept as they are.
        """
        vocabulary = set(nltk.corpus.words.words())

        def keep_known(text):
            return " ".join(token for token in nltk.wordpunct_tokenize(text)
                            if token.lower() in vocabulary or not token.isalpha())

        self.dataframe[self.column] = self.dataframe[self.column].map(keep_known)

    def remove_single_letter(self, min_chars=3):
        """Drop words shorter than ``min_chars`` characters.

        With the default of 3 this removes one- and two-letter words, which is
        what the method has always done despite its name.
        """
        self.dataframe[self.column] = self.dataframe[self.column].map(
            lambda text: ' '.join(word for word in text.split() if len(word) >= min_chars))

    def remove_short_headlines(self, min_length=3):
        """Keep only rows whose entry is longer than ``min_length``.

        After ``tokenize_words`` the entry is a token list, so the default
        drops headlines with three or fewer tokens. The index is then reset
        to 0..n-1.
        """
        # Lengths stay in a local Series, so no helper column is ever added
        # to the frame (it may still be the caller's object at this point).
        lengths = self.dataframe[self.column].map(len)
        # reset_index returns a new frame; it has to be assigned to take effect.
        self.dataframe = self.dataframe[lengths > min_length].reset_index(drop=True)

    def remove_headlines_specific_words(self):
        """Drop headlines containing 'onion' or 'Onion' (other casings are kept)."""
        mentions = self.dataframe[self.column].str.contains('onion|Onion', na=False)
        # .copy() so later column assignments do not write into a slice.
        self.dataframe = self.dataframe[~mentions].copy()

    def autocorrect(self):
        # Autocorrects words based on Levenshtein distance (takes __ minutes to run)
        self.dataframe[self.column] = self.dataframe[self.column].map(
            lambda text: ''.join(TextBlob(text).correct()))


    ######################################################################

    # Create a transform method (aka vectorization)

    def transform(self):
        """Fit the vectorizer on the cleaned column and store a dense frame.

        The result goes to ``self.dataframe_vect``, one column per feature.
        """
        matrix = self.vectorizer.fit_transform(self.dataframe[self.column])
        # get_feature_names() was removed in scikit-learn 1.2; prefer the
        # replacement and fall back only for older releases.
        if hasattr(self.vectorizer, 'get_feature_names_out'):
            feature_names = self.vectorizer.get_feature_names_out()
        else:
            feature_names = self.vectorizer.get_feature_names()
        self.dataframe_vect = pd.DataFrame(matrix.toarray(), columns=feature_names)

    ######################################################################

In [30]:
# Load the Onion title list; the pipeline below cleans its 'Title' column.
df = pd.read_csv('data/2020/onion_title_list_2020.csv')
# Row count before any cleaning, for comparison with the shape after cleaner().
len(df)

1414

In [31]:
# Build the pipeline around `df`. The frame is passed by reference, so the
# constructor's str-coercion of 'Title' is applied to `df` itself.
# The stemmer and lemmatizer are only stored: cleaner() has its stem/lemmatize
# calls commented out. The vectorizer is used only by transform().
nlp = nlp_pipe(dataframe = df,
               column = 'Title',
               tokenizer = nltk.word_tokenize,
               vectorizer = TfidfVectorizer(stop_words='english'),
               stemmer = SnowballStemmer("english"),
               lemmatizer = WordNetLemmatizer())

In [32]:
# Run every cleaning step; the result (including the four *_Sentiment columns
# added by vader_sentiment) is left in nlp.dataframe.
nlp.cleaner()

In [33]:
# (rows, columns) remaining after cleaning.
nlp.dataframe.shape

(1080, 6)

In [34]:
# Persist the cleaned frame; index=False keeps the row index out of the CSV.
nlp.dataframe.to_csv('data/2020/onion_clean_2020.csv', index=False)

### Merging dataframes

In [1]:
# Requires the import cell at the top of the notebook to have been run first;
# running this cell on a fresh kernel raises NameError for `pd`.
# NOTE(review): notonion_clean_2020.csv is not written anywhere in this part of
# the notebook -- presumably produced by the same pipeline elsewhere; confirm.
df_onion = pd.read_csv('data/2020/onion_clean_2020.csv')
df_notonion = pd.read_csv('data/2020/notonion_clean_2020.csv')

NameError: name 'pd' is not defined

In [36]:
# Stack the cleaned Onion and not-Onion frames into a single frame.
frames = [df_onion, df_notonion]

# ignore_index=True gives the merged frame a fresh 0..n-1 index. Without it
# each input keeps its own row labels, so labels repeat and label-based
# lookups on df_merge return more than one row.
df_merge = pd.concat(frames, ignore_index=True)

df_merge.to_csv('data/2020/merge_clean_2020.csv', index = False)