In [1]:
import pandas as pd
import re
import string

import nltk
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

import spacy

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Load the spaCy 'en_core_web_sm' pipeline once, at module level; clean_string's
# 'Spacy' option calls it through this global `nlp` name.
nlp = spacy.load('en_core_web_sm')

In [2]:
# Load the raw frame from a pickle in the working directory.
# NOTE(review): unpickling can execute arbitrary code -- only load 'tesla.pkl'
# if it comes from a trusted source.
df = pd.read_pickle('tesla.pkl')

In [3]:
# Peek at one raw 'Discussion' entry (index 0) to see what the cleaning step has to handle.
df['Discussion'][0]

'Tesla’s forum provides an online meeting space for owners and enthusiasts to exchange ideas that are entertaining, helpful and useful. We encourage you to participate and only ask that you be respectful of others. Don’t post messages that are obscene, vulgar, hateful, sexual in nature, infringe on the proprietary rights of others, or impersonate or misrepresent yourself or other individuals, including Tesla employees. Only post material which you own or for which you have received a copyright license. Whatever you post, we reserve the right to copy and use. We also reserve the right to edit or delete your post as well as suspend your account.All posts represent the author’s views and not of Tesla’s.'

In [4]:
def clean_string(text, stem="None"):
    """Normalize a free-text string into space-separated, filtered tokens.

    Steps, in order: lowercase; turn line breaks into spaces; map the
    typographic apostrophe (’) to a plain one; delete ASCII punctuation;
    drop English stop words (the NLTK list plus 'hi' and 'im'); strip every
    digit-bearing run of word characters from each token and drop tokens
    left empty; optionally stem/lemmatize; re-join with single spaces.

    Args:
        text: The raw string to clean.
        stem: Which normalizer to apply to the surviving tokens:
            'Stem'  -> nltk ``PorterStemmer``,
            'Lem'   -> nltk ``WordNetLemmatizer``,
            'Spacy' -> ``lemma_`` of each token produced by the module-level
                       ``nlp`` pipeline.
            Any other value (including the default ``"None"``) leaves the
            tokens unchanged.

    Returns:
        The cleaned text: tokens separated by single spaces, with no leading
        or trailing whitespace.
    """
    # Lowercase first so the stop-word comparison below is case-insensitive.
    text = text.lower()

    # Line breaks become spaces (not ''), otherwise the last word of one line
    # fuses with the first word of the next ("order\nplaced" -> "orderplaced").
    text = text.replace('\n', ' ')

    # Change out the odd apostrophe so it is removed with the other punctuation.
    text = text.replace("’", "'")

    # Remove punctuation.
    # NOTE: characters are deleted, not replaced by a space, so text with no
    # space after a period (e.g. "account.All") collapses into one token.
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove stop words; a set gives O(1) membership tests per token.
    stop_words = set(nltk.corpus.stopwords.words("english"))
    stop_words.update(('hi', 'im'))
    tokens = [word for word in text.split() if word not in stop_words]

    # Remove numbers (raw string: '\w' / '\d' are regex escapes, not str escapes).
    tokens = [re.sub(r'\w*\d\w*', '', word) for word in tokens]
    # Tokens that were entirely numeric are now '' -- drop them so they do not
    # leave stray spaces in the result or whitespace tokens in the spaCy input.
    tokens = [word for word in tokens if word]

    # Stem or lemmatize.
    if stem == 'Stem':
        stemmer = PorterStemmer()
        normalized = [stemmer.stem(word) for word in tokens]
    elif stem == 'Lem':
        lemmatizer = WordNetLemmatizer()
        normalized = [lemmatizer.lemmatize(word) for word in tokens]
    elif stem == 'Spacy':
        doc = nlp(' '.join(tokens))
        normalized = [token.lemma_ for token in doc]
    else:
        normalized = tokens

    # split()/join collapses any repeated whitespace and trims both ends.
    return ' '.join(' '.join(normalized).split())

In [5]:
# Clean every post with spaCy lemmatization; Series.apply forwards the extra
# keyword argument (stem='Spacy') to clean_string for each element.
df['Discussion_Clean'] = df['Discussion'].apply(clean_string, stem='Spacy')

In [6]:
# Spot-check the cleaned text at index 0 against the raw entry displayed earlier.
df['Discussion_Clean'][0]

'teslas forum provide online meeting space owner enthusiast exchange idea entertain helpful useful encourage participate ask respectful other do not post message obscene vulgar hateful sexual nature infringe proprietary right other impersonate misrepresent individual include tesla employee post material receive copyright license whatever post reserve right copy use also reserve right edit delete post well suspend accountall post represent author view tesla'

In [7]:
# Spot-check the cleaned text at index 1.
df['Discussion_Clean'][1]

'order tesla last night still receive confirmation'

In [8]:
# Spot-check the cleaned text at index 2.
df['Discussion_Clean'][2]

'situation email get confirmation soon hopefully reservation also receive sure busy many new reservation even check credit card company charge patient'

In [9]:
# Spot-check the cleaned text at index 3.
df['Discussion_Clean'][3]

'ignore'

In [10]:
# Save the frame (including the 'Discussion_Clean' column) to 'tesla_clean.pkl'.
df.to_pickle('tesla_clean.pkl')