## Text Preprocessing

In [85]:
import re
import numpy as np
import pandas as pd

from tqdm import tqdm_notebook as tqdm
import unicodedata

import nltk
import contractions
import inflect
from nltk import word_tokenize, sent_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

# One seed value, reused for NumPy's global RNG and kept available for later cells.
rand_seed = 101
np.random.seed(rand_seed)

In [86]:
%%time
# Both CSVs are parsed with the same settings; only the scraped file is row-capped.
ROOT = "./csv"
_read_kwargs = dict(encoding='utf-8', low_memory=False, parse_dates=False)

df_nbc = pd.read_csv(ROOT + "/tweets.csv", **_read_kwargs)
df_scraped = pd.read_csv(ROOT + "/scraped_tweets.csv", nrows=2000000, **_read_kwargs)

Wall time: 11.3 s


In [87]:
# Remove the 'posted' column from the NBC frame. errors='ignore' keeps this cell
# re-runnable: `del df_nbc['posted']` raises KeyError the second time it executes.
df_nbc = df_nbc.drop(columns=['posted'], errors='ignore')

print(df_nbc.shape)
print(df_scraped.shape)

(203451, 15)
(2000000, 15)


In [88]:
# Ensure all columns have the same datatypes in both frames.
# Missing values in the numeric columns become 0 so they can be stored as int64.
nbc_int_columns = [
    'user_id',
    'retweet_count',
    'favorite_count',
    'tweet_id',
    'retweeted_status_id',
    'created_at',
    'in_reply_to_status_id',
]
df_nbc[nbc_int_columns] = df_nbc[nbc_int_columns].fillna(0).astype(np.int64)

df_nbc[['user_key', 'text']] = df_nbc[['user_key', 'text']].astype('str')

# bool(NaN) is True, so casting straight to bool silently turns every missing
# 'retweeted' flag into True. Treat a missing flag as False before casting.
# NOTE(review): False is an assumption for "unknown" -- confirm it suits the analysis.
df_nbc['retweeted'] = df_nbc['retweeted'].fillna(False).astype('bool')

scraped_int_columns = ['retweeted_status_id', 'in_reply_to_status_id']
df_scraped[scraped_int_columns] = (
    df_scraped[scraped_int_columns].fillna(0).astype(np.int64)
)

print(df_nbc.dtypes)
print(df_scraped.dtypes)

# True when both frames have the same dtype in every column position.
print(list(df_nbc.dtypes) == list(df_scraped.dtypes))

user_id                   int64
user_key                 object
created_at                int64
created_str              object
retweet_count             int64
retweeted                  bool
favorite_count            int64
text                     object
tweet_id                  int64
source                   object
hashtags                 object
expanded_urls            object
mentions                 object
retweeted_status_id       int64
in_reply_to_status_id     int64
dtype: object
user_id                   int64
user_key                 object
created_at                int64
created_str              object
retweet_count             int64
retweeted                  bool
favorite_count            int64
text                     object
tweet_id                  int64
source                   object
hashtags                 object
expanded_urls            object
mentions                 object
retweeted_status_id       int64
in_reply_to_status_id     int64
dtype: object
True


In [89]:
# Preview the first rows of both frames, separated by a run of blank lines.
print(df_nbc.head(), "\n\n\n", df_scraped.head(), sep="\n")

      user_id         user_key     created_at          created_str  \
0  2532611755        kathiemrr  1488207240000  2017-02-27 14:54:00   
1  2531159968   traceyhappymom  1471272620000  2016-08-15 14:50:20   
2           0    evewebster373  1435701369000  2015-06-30 21:56:09   
3  4840551713      blacktolive  1474013088000  2016-09-16 08:04:48   
4  1694026190  jacquelinisbest  1474227985000  2016-09-18 19:46:25   

   retweet_count  retweeted  favorite_count  \
0              0       True               0   
1              0       True               0   
2              0       True               0   
3             18      False              17   
4              0      False               0   

                                                text            tweet_id  \
0    #ThingsDoneByMistake kissing auntie in the lips  836227891897651200   
1  RT @mc_derpin: #TheOlderWeGet the more pessimi...  765198948239810560   
2  RT @dmataconis: Ready To Feel Like A Failure? ...  61600230657274

### Prepare the data

What we need to do is this:

- Attach a class label to the NBC dataset
- The class of the scraped dataset is unknown, so leave it unset for now
- Merge a subset of the NBC and scraped datasets into a new dataset that will be our training set
- All rows left out of the merged subset will become the test set

In [90]:
# Every NBC tweet gets class 1. The scraped tweets are unlabelled, so a copy of
# that frame carries NaN in its class column.
df_nbc['class'] = 1
df_test = df_scraped.assign(**{'class': np.nan})

# Stack both frames into a single frame with a fresh 0..n-1 index.
# Planned next steps: train/test/val split, then apply classifiers.
df = pd.concat([df_nbc, df_test], ignore_index=True)
print(df.head())
print(df.dtypes)
print(len(df))

      user_id         user_key     created_at          created_str  \
0  2532611755        kathiemrr  1488207240000  2017-02-27 14:54:00   
1  2531159968   traceyhappymom  1471272620000  2016-08-15 14:50:20   
2           0    evewebster373  1435701369000  2015-06-30 21:56:09   
3  4840551713      blacktolive  1474013088000  2016-09-16 08:04:48   
4  1694026190  jacquelinisbest  1474227985000  2016-09-18 19:46:25   

   retweet_count  retweeted  favorite_count  \
0              0       True               0   
1              0       True               0   
2              0       True               0   
3             18      False              17   
4              0      False               0   

                                                text            tweet_id  \
0    #ThingsDoneByMistake kissing auntie in the lips  836227891897651200   
1  RT @mc_derpin: #TheOlderWeGet the more pessimi...  765198948239810560   
2  RT @dmataconis: Ready To Feel Like A Failure? ...  61600230657274

- Iterate through each row
- Tokenize text
- Normalize text
- Stem and lemmatize text
- Save entire row to new spreadsheet

In [91]:
import csv

# Start the output file from scratch with a header row: the merged frame's
# columns followed by the three derived text columns.
headers = [*df.columns.values, 'tokenized_text', 'stem_text', 'lemma_text']
with open(".\\csv\\mergedtweets.csv", 'w', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=headers, lineterminator='\n')
    writer.writeheader()

In [92]:
# Shared NLTK TweetTokenizer instance; handle_text() uses it to split a tweet's text into tokens.
tt = TweetTokenizer()

def preprocess_text(tokenized_text):
    """
    Normalize a list of tweet tokens before stemming/lemmatizing.

    Accepts a list of strings and, for each token in order:
    - strips a leading ``b`` / ``b'`` from the first token only
    - removes every "x" followed by up to two hex characters
    - expands contractions
    - drops non-ascii characters
    - converts to lowercase
    - blanks out tokens that start with http:// or https://
    - removes punctuation
    - spells out all-digit tokens as words
    - skips a lone 'n', tokens reduced to nothing, and English stopwords

    Returns a new list with the surviving tokens; the input list is not modified.
    """
    # Load the stopword list once per call instead of once per token; a set
    # also makes each membership test constant-time.
    stop_words = set(stopwords.words('english'))

    new_words = []
    for position, word in enumerate(tokenized_text):
        # Strip the 'b' prefix from the token that is actually first. Comparing
        # by value (word == tokenized_text[0]) would also hit any later token
        # that happens to equal the first one.
        if position == 0:
            if word == 'b':
                continue
            if word[:2] == "b'":
                word = word[2:]

        # NOTE(review): this pattern deletes *any* "x" plus up to two following
        # hex characters, so ordinary words containing "x" are altered too
        # (e.g. "example" -> "emple"). Left unchanged; confirm whether it
        # should be narrowed.
        word = re.sub(r"(x[abcdef0-9]{0,2})?", '', word)

        # replace contractions
        # NOTE(review): an expansion such as "do not" stays a single token, so
        # the stopword check below will not match its parts -- confirm intent.
        word = contractions.fix(word)

        # remove non-ascii characters
        word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')

        # convert to lowercase
        word = word.lower()

        # remove links
        word = re.sub(r'^https?:\/\/.*[\r\n]*', '', word)

        # remove punctuation (the previous `!= '' or != None` guard was always
        # true, so the stripped form was always used)
        word = re.sub(r'[^\w\s]', '', word)

        # replace numbers with textual representation
        word = replace_numbers(word)

        # skip over errant 'n'
        if word == 'n':
            continue

        # Links and punctuation-only tokens end up empty; do not emit '' tokens.
        if not word:
            continue

        # skip over all stopwords
        if word not in stop_words:
            new_words.append(word)
    return new_words


def remove_unicode(word):
    """Return `word` with every "x" plus up to two following hex characters (0-9, a-f) deleted."""
    return re.sub(r"(x[abcdef0-9]{0,2})?", '', word)

def replace_contractions(word):
    """Return the result of running `word` through ``contractions.fix``."""
    return contractions.fix(word)

def remove_non_ascii(word):
    """Decompose `word` with NFKD, then return it with every non-ASCII character discarded."""
    decomposed = unicodedata.normalize('NFKD', word)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

def remove_links(word):
    """Blank out the http:// or https:// URL that `word` starts with, if there is one."""
    return re.sub(r'^https?:\/\/.*[\r\n]*', '', word)

def remove_punctuation(word):
    """
    Strip every character that is neither a word character nor whitespace.

    Returns the stripped string, or None when nothing is left.
    """
    stripped = re.sub(r'[^\w\s]', '', word)
    return stripped or None
    
def replace_numbers(word):
    """
    Spell out an all-digit token in words.

    Returns the ``inflect`` textual form of `word` when ``word.isdigit()`` is
    true; any other token (including the empty string) is returned unchanged.
    """
    if not word.isdigit():
        return word
    # The original had an unreachable `new_words.append(...)` after the return,
    # referencing a name that does not exist in this function; it is removed.
    return inflect.engine().number_to_words(word)

def stem_words(words):
    """Return the Lancaster stem of every token in `words`, in the same order."""
    stemmer = LancasterStemmer()
    return [stemmer.stem(token) for token in words]

def lemmatize_words(words):
    """Return the WordNet lemma of every token in `words`, treating each token as a verb."""
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token, pos='v') for token in words]

def handle_text(row):
    """
    Build the three derived text values for one row.

    Reads row['text'] and returns [tokens, stems, lemmas]: the raw tokens from
    the tweet tokenizer, then the stems and the lemmas of the preprocessed tokens.
    """
    tokenized = tt.tokenize(row['text'])
    cleaned = preprocess_text(tokenized)
    return [tokenized, stem_words(cleaned), lemmatize_words(cleaned)]
    
def write_to_csv(row):
    """Append `row` as one comma-delimited record to the merged-tweets CSV."""
    with open(".\\csv\\mergedtweets.csv", 'a', encoding='utf-8', newline='') as handle:
        csv.writer(handle, delimiter=',').writerow(row)

In [94]:
# samples = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 518, 5901, 9102, 300005, 401509, 567891, 991858, 991500, 991918]

def analyze():
    """
    Run the text pipeline over every row of `df` and append the results to the merged CSV.

    For each row, the original column values are written followed by the three
    values returned by handle_text(). The file is opened once, in append mode,
    so whatever is already in it is kept.
    """
    with open(".\\csv\\mergedtweets.csv", 'a', encoding = 'utf-8', newline = '') as file:
        writer = csv.writer(file, delimiter = ',')
        # iterrows() is a generator with no length, so pass the row count
        # explicitly; without it the bar cannot show a percentage or an ETA.
        for index, row in tqdm(df.iterrows(), total=len(df)):
            augmented = handle_text(row)
            values = list(row.values)
            values.extend(augmented)
            writer.writerow(values)

analyze()



