In [126]:
import os
import re
import pandas as pd
import contractions

from glob import glob
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.models.phrases import Phraser, Phrases

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [116]:
# Load the previously trained word vectors from disk and list the vocabulary
# entries ranked closest to 'screen' by most_similar_cosmul.
# NOTE(review): the file is named model.bin but is read with binary=False
# (i.e. as a text-format word2vec file) — confirm this matches how it was saved.
model = KeyedVectors.load_word2vec_format('ycc_dataset_lg/w2vec_model/model.bin', binary=False)
model.most_similar_cosmul('screen')

[('dash', 0.8780766725540161),
 ('display', 0.8719692826271057),
 ('dashboard', 0.8458210229873657),
 ('button', 0.8425780534744263),
 ('infotainment', 0.8397122025489807),
 ('camera', 0.8348522782325745),
 ('touch_screen', 0.8328131437301636),
 ('phone', 0.8215099573135376),
 ('tablet', 0.808016836643219),
 ('touchscreen', 0.8049045205116272)]

In [117]:
def df_label_corpusid(file_name):
    """Derive the row labels for a data file from its path.

    The path is expected to look like ``downloads/<search_query>/<video_id>.json``:
    the parent directory name is the search query and the file name, minus a
    trailing ``.json``, is the video id.

    The previous implementation used the unescaped, unanchored regex ``.json``,
    which deleted *any* character followed by ``json`` anywhere in the name
    (corrupting ids that contain "json"), and split on a hard-coded ``/``,
    which fails for OS-native separators. ``os.path`` handles both.

    Parameters
    ----------
    file_name : str or path-like
        Path of the JSON file (converted with ``str``).

    Returns
    -------
    tuple of (str, str)
        ``(search_query, video_id)``.

    Raises
    ------
    ValueError
        If the path has no parent directory or no file name to label with.
    """
    path = os.path.normpath(str(file_name))
    search_query = os.path.basename(os.path.dirname(path))
    video_id = os.path.basename(path)
    # Strip only a real trailing extension, never an inner "json" substring.
    if video_id.endswith('.json'):
        video_id = video_id[:-len('.json')]
    if not search_query or not video_id:
        raise ValueError(
            'expected a path like <search_query>/<video_id>.json, got %r' % (file_name,))
    return search_query, video_id

# Read each per-video JSON-lines file and tag its rows with the search query
# and video id recovered from the file path.
corpus_large = []
for file in glob('downloads/cartrends/*.json'):
    corpus = pd.read_json(file, encoding='utf-8', lines=True)
    search_query, video_id = df_label_corpusid(file)
    corpus_large.append(
        corpus.assign(search_query=search_query, video_id=video_id))

# Stack the per-file frames into a single corpus with a fresh 0..n-1 index.
corpus = pd.DataFrame(pd.concat(corpus_large, ignore_index=True))
corpus.text.head()

0    Happy New Year!\n\n⬇️Scotty’s Top DIY Tools:\r...
1    Hi Scotty I recently bought a 95 Camry automat...
2    I really dislike those new grills as well.  Ha...
3                             HAPPY NEW YEAR SCOTTY!!﻿
4    Scotty Kilmer what do u think of a 2003 Chevy ...
Name: text, dtype: object

In [118]:
import string
from itertools import groupby
from nltk.stem.wordnet import WordNetLemmatizer

def trim_whitespaces(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is dropped as a side effect of
    ``str.split()``. Values that are not strings (for example ``None`` or a
    float NaN from a DataFrame column) are returned unchanged, as before, but
    via an explicit type check instead of a bare ``except: pass`` that also
    swallowed unrelated errors such as ``KeyboardInterrupt``.

    Parameters
    ----------
    text : str or any
        Value to clean.

    Returns
    -------
    str or any
        The whitespace-normalized string, or ``text`` itself if it is not a str.
    """
    if not isinstance(text, str):
        return text
    return " ".join(text.split())

def remove_duplicate_words(word):
    """Strip punctuation, then drop words that repeat back-to-back.

    Only consecutive, exactly equal tokens are merged; a word that reappears
    later in the sentence is kept. Tokens are re-joined with single spaces.
    """
    # Translation table that deletes every ASCII punctuation character.
    strip_punct = str.maketrans('', '', string.punctuation)
    tokens = word.translate(strip_punct).split()

    kept = []
    for token in tokens:
        # Keep a token only when it differs from the one just before it.
        if not kept or kept[-1] != token:
            kept.append(token)
    return ' '.join(kept)

def reduce_words_with_repeated_chars(text):
    """Collapse characters repeated three or more times and tidy whitespace.

    Any word character (letter, digit or underscore) that occurs three or more
    times in a row is reduced to a single occurrence, e.g. ``loooong`` ->
    ``long``. Be careful: abbreviations such as ``AAA`` become ``A``.
    Afterwards, runs of one repeated whitespace character are replaced by a
    single space and the ends of the string are stripped.

    The earlier version inserted ``repr()``-quoted ``???`` placeholders and
    removed them again with ``str.replace``; that also deleted any literal
    ``'???`` / ``???'`` already present in the input and re-scanned the text
    once per match. A single backreference substitution has neither problem.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    # (\w)\1{2,}: one word character followed by at least two copies of itself.
    text = re.sub(r'(\w)\1{2,}', r'\1', text)
    # Two or more of the *same* whitespace character become one space.
    text = re.sub(r'(\s)\1{1,}', ' ', text)
    return text.strip()

def standardize_text(df, header_name):
    """Strip URLs and mentions, drop unusual characters and lowercase a text column.

    Steps, in order:
      1. remove tokens starting with ``http`` (URLs);
      2. remove any leftover literal ``http``;
      3. remove ``@`` followed by non-whitespace (mentions);
      4. replace every character outside ``A-Za-z0-9(),!?@'`"_`` and newline
         with a space;
      5. replace any remaining ``@`` with ``at``;
      6. lowercase.

    ``regex=True`` is passed explicitly: since pandas 2.0
    ``Series.str.replace`` treats the pattern as a literal string by default,
    which silently turned every step above into a no-op.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column. It is modified in place.
    header_name : str
        Name of the string column to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``df[header_name]`` replaced.
    """
    cleaned = df[header_name]
    cleaned = cleaned.str.replace(r"http\S+", "", regex=True)
    cleaned = cleaned.str.replace(r"http", "", regex=True)
    cleaned = cleaned.str.replace(r"@\S+", "", regex=True)
    cleaned = cleaned.str.replace(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ", regex=True)
    cleaned = cleaned.str.replace(r"@", "at", regex=True)
    df[header_name] = cleaned.str.lower()
    return df

def lemmatize_text(text_corpus, stopwords):
    """Lemmatize a sequence of tokens and join them into one string.

    When ``stopwords`` is truthy, tokens contained in it are skipped before
    lemmatizing; otherwise every token is kept. The lemmas are joined with
    single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    if stopwords:
        kept = (token for token in text_corpus if token not in stopwords)
    else:
        kept = text_corpus
    return ' '.join(map(lemmatizer.lemmatize, kept))

def normalize_text(df, header_name, stopwords=False):
    """Reduce a text column to letters only, tokenize it and lemmatize it.

    Steps, in order:
      1. optionally build a stop-word collection with ``build_stopwords``;
      2. replace every non-letter character with a space;
      3. split each string on whitespace into a list of tokens;
      4. pass the tokens to ``lemmatize_text``, which returns a single string.

    Fixes over the previous version:
      * ``stop_words`` was only assigned when ``stopwords`` was truthy, so the
        default call ``normalize_text(df, col)`` raised ``NameError``. It is
        now always bound (``None`` means "no stop-word filtering").
      * ``regex=True`` is passed explicitly, because pandas >= 2.0 treats the
        pattern of ``Series.str.replace`` as a literal string by default.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column. It is modified in place.
    header_name : str
        Name of the string column to normalize.
    stopwords : bool, default False
        Whether to build a stop-word collection and filter tokens with it.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``df[header_name]`` holding the
        lemmatized strings.
    """
    # Original note: build_stopwords adds the 20 most uncommon|common words to
    # the stop-word list (helper defined outside this cell — not verified here).
    stop_words = build_stopwords(df, header_name) if stopwords else None

    column = df[header_name].str.replace(r"[^a-zA-Z]", " ", regex=True)
    # NOTE(review): after the replacement above only letters and spaces remain,
    # so the next two patterns (which need '&', ';' or a literal backslash)
    # can no longer match anything. Kept unchanged for parity; confirm whether
    # they were meant to run *before* the letters-only step.
    column = column.str.replace(r"&lt;/?.*?&gt;", " &lt;&gt; ", regex=True)
    column = column.str.replace(r"(\\d|\\W)+", " ", regex=True)

    tokens = column.str.split()
    df[header_name] = tokens.apply(
        lambda words: lemmatize_text(words, stopwords=stop_words))
    return df

In [119]:
def replace_contractions(df, header_name):
    """Trim whitespace in a text column, then expand its contractions.

    Whitespace is normalized first with ``trim_whitespaces`` and the result is
    then passed through ``contractions.fix``. The column is updated in place
    and the same frame is returned.
    """
    # Order matters: whitespace is cleaned up before contractions are expanded.
    for step in (trim_whitespaces, contractions.fix):
        df[header_name] = df[header_name].apply(step)
    return df
# Preprocessing pipeline for the comment corpus; the order of the steps matters.
#
# 1. Trim whitespace, then expand contractions. This previously called
#    `remove_whitespace`, a name that is not defined in this notebook (it only
#    worked through leftover kernel state), and repeated the body of
#    `replace_contractions` inline. Use the helper instead.
corpus = replace_contractions(corpus, 'text')

# 2. standardize text (URLs, mentions, unusual characters, lowercase)
corpus = standardize_text(corpus, 'text')
# 3. normalize and lemmatize text
corpus = normalize_text(corpus, 'text')
# 4. remove consecutive duplicate words from the corpus
corpus['text'] = corpus['text'].apply(remove_duplicate_words)
# 5. reduce characters repeated 3+ times in a row
corpus['text'] = corpus['text'].apply(reduce_words_with_repeated_chars)
# save the processed corpus for inspection
corpus.to_csv('final_test.csv')

In [90]:
import string
from itertools import groupby

def remove_duplicate_words(word):
    """
    Strip punctuation, then collapse words repeated back-to-back.

    Only consecutive, exactly equal words are merged, so apply this after the
    rest of the text preprocessing.

    EXAMPLE:
        >>> 0: 'hey! you are wrong very wrong! wrong!'
        >>> df['text'].apply(lambda x: remove_duplicate_words(x))
        >>> 0: 'hey you are wrong very wrong'
    RETURNS:
        String with no word repeated in sequence
    """
    # Delete every ASCII punctuation character in one translate() pass.
    without_punct = word.translate(str.maketrans('', '', string.punctuation))
    # groupby() batches consecutive equal tokens; keep one key per batch.
    return ' '.join(token for token, _run in groupby(without_punct.split()))

# NOTE: This function does the job of removing duplicate words
#corpus['no_duplicates'] = corpus['text'].apply(lambda x: remove_duplicate_words(x))

>- Before removing duplicate words `HEY! THA'S IT you are WRONG VERY WRONG! WRONG! ABOUT (BMW)`

>- After removing duplicate words `HEY THAS IT you are WRONG VERY WRONG ABOUT BMW`

**NOTE**

Words must match each other exactly (`word_a == word_b`), and only consecutive repeats are removed, so this function should be applied at the end of preprocessing, in this order:
    
- removing contractions
    
- punctuation, lowercase, removing (tags, special characters, and digits)
    
- lemmatization

- remove duplicate words at this step

In [106]:
def reduce_words_with_repeated_chars(text):
    """Original Code from Aurana
    Reduces any word character repeated 3 or more times in a row to a single
    character. Useful to replace words such as loooooooong by long. Be
    careful, as it can change abbreviations such as AAA to a single A.
    Afterwards, runs of one repeated whitespace character are replaced by a
    single space and the ends of the string are stripped.

    The earlier version inserted repr()-quoted '???' placeholders and removed
    them again with str.replace; that also deleted any literal '??? / ???'
    sequence already present in the input and re-scanned the text once per
    match. A single backreference substitution has neither problem.

    USAGE
    -----
    df['text'] = df['text'].apply(
            lambda x: reduce_words_with_repeated_chars(x))

    RETURNS
    -------
    str : the cleaned text
    """
    # (\w)\1{2,}: one word character followed by at least two copies of itself.
    text = re.sub(r'(\w)\1{2,}', r'\1', text)
    # Two or more of the *same* whitespace character become one space.
    text = re.sub(r'(\s)\1{1,}', ' ', text)
    # Remove spaces at the beginning and the end of the string.
    return text.strip()

In [None]:
def lemmatize_text(text_corpus):
    """Lemmatize each token of ``text_corpus`` and join the lemmas with spaces."""
    lemmatizer = WordNetLemmatizer()
    return ' '.join(map(lemmatizer.lemmatize, text_corpus))

def normalize_text(df, header_name):
    """Reduce a text column to letters only, tokenize it and lemmatize it.

    Does the following in order
    * replaces every non-letter character (punctuation, digits, special
      characters) with a space
    * converts each string to a list of tokens by splitting on whitespace
    * lemmatizes the tokens and joins them back into one string

    Note: this function does NOT lowercase the text; lowercase beforehand if
    needed.

    ``regex=True`` is passed explicitly, because pandas >= 2.0 treats the
    pattern of ``Series.str.replace`` as a literal string by default, which
    silently disabled every replacement below.

    The column is modified in place and the same ``df`` is returned.
    """
    column = df[header_name].str.replace(r"[^a-zA-Z]", " ", regex=True)
    # NOTE(review): after the replacement above only letters and spaces remain,
    # so the next two patterns (which need '&', ';' or a literal backslash)
    # can no longer match anything. Kept unchanged for parity; confirm whether
    # they were meant to run *before* the letters-only step.
    column = column.str.replace(r"&lt;/?.*?&gt;", " &lt;&gt; ", regex=True)
    column = column.str.replace(r"(\\d|\\W)+", " ", regex=True)
    tokens = column.str.split()
    df[header_name] = tokens.apply(lambda words: lemmatize_text(words))
    return df