In [None]:
import nltk
import pandas as pd
import numpy as np

import re

from tqdm import tqdm

def clean_str(string):
    """
    Normalize raw text into lowercase, single-space-separated tokens.

    Adapted from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Processing order:
      1. Every character other than ASCII letters, digits, ( ) , ! ? ' and `
         is replaced by a space (this includes newlines and tabs).
      2. The contraction suffixes 've, n't, 're, 'd and 'll are separated
         from the preceding text by a space. There is no rule for 's, so it
         stays attached to its word.
      3. Commas, exclamation marks and parentheses act as token separators
         and are dropped.
      4. Question marks and apostrophes are deleted without leaving a space
         (so "what's" becomes "whats" and "don't" becomes "do nt").
      5. Runs of spaces collapse to one, surrounding spaces are stripped and
         the result is lowercased.

    The backtick is the only punctuation character that survives.

    Args:
        string: the text to clean; must already be a string.

    Returns:
        The cleaned, lowercased string ("" if nothing survives).
    """
    # Whitelist filter: after this step the only whitespace left is ' '.
    text = re.sub(r"[^A-Za-z0-9(),!?'`]", " ", string)

    # Split contractions off their word. The order matters and matches the
    # upstream implementation.
    for suffix in ("'ve", "n't", "'re", "'d", "'ll"):
        text = text.replace(suffix, " " + suffix)

    # These characters only ever served as separators: they end up removed,
    # leaving a gap that is collapsed below.
    text = re.sub(r"[,!()]", " ", text)

    # These are removed outright, gluing their neighbours together.
    text = re.sub(r"[?']", "", text)

    # split()/join collapses runs of spaces and trims both ends in one go.
    return " ".join(text.split()).lower()

In [None]:
# Location of the raw corpus CSV, given relative to the notebook's working directory.
corpus_path = 'kickstarter_corpus.csv'
# Load the corpus; later cells read the 'title', 'blurb' and 'full_text' columns from df.
df = pd.read_csv(corpus_path)

In [None]:
# Preview the first few rows of the freshly loaded corpus.
df.head()

In [None]:
# Build one cleaned text per row from its title, blurb and full text.
# Missing fields are treated as empty strings: formatting a NaN directly
# would inject the literal token "nan" into the cleaned text.
cleaned_texts = []
for fields in tqdm(zip(df['title'], df['blurb'], df['full_text']), total=len(df)):
    text = ' '.join('' if pd.isnull(field) else '{}'.format(field) for field in fields)
    # split()/join keeps the stored value as single-space-separated tokens.
    cleaned_texts.append(' '.join(clean_str(text).split()))

# Assign the whole column at once instead of issuing one df.loc write per row;
# this is much faster and also creates the column when the frame is empty.
df['cleaned_words'] = cleaned_texts

In [None]:
from nltk.corpus import stopwords

porter = nltk.PorterStemmer()

# Load the stop-word list once and keep it in a set: calling
# stopwords.words("english") for every token re-evaluates the list each time
# and makes every membership test a linear scan.
english_stopwords = set(stopwords.words("english"))

# Drop stop words from each cleaned text and stem the remaining tokens.
stemmed_texts = []
for cleaned in tqdm(df['cleaned_words'], total=len(df)):
    stems = [porter.stem(word) for word in cleaned.split() if word not in english_stopwords]
    stemmed_texts.append(' '.join(stems))

# Single column assignment instead of one df.loc write per row.
df['stemmed_words'] = stemmed_texts

In [None]:
# Preview the frame again; the cells above add the 'cleaned_words' and 'stemmed_words' columns.
df.head()

In [None]:
# Report the column names, then the number of missing values in each column.
# print(...) with a single argument behaves identically on Python 2 and 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(df.columns)
for col_name in df.columns:
    # .isnull().sum() counts the nulls in pandas rather than element by
    # element in a Python-level sum().
    print('{}: {}'.format(col_name, df[col_name].isnull().sum()))

In [None]:
import csv

# Persist the enriched corpus next to the input file, without the index column.
# Quoting is set to QUOTE_NONNUMERIC and a backslash is used as the escape character.
df.to_csv(
    'kickstarter_corpus_cleaned.csv',
    escapechar="\\",
    quoting=csv.QUOTE_NONNUMERIC,
    index=False
)