In [2]:
import pandas as pd 
import ast
import re
import numpy as np
import nltk
from nltk import word_tokenize
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.util import ngrams
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [3]:
# Load the merged Kickstarter CSV into a DataFrame; the path is relative to
# the directory the notebook is run from.
df = pd.read_csv("./data/Kickstarter_merged.csv")

In [4]:
# Convert 'video' to a binary indicator: 1 when a value is present, 0 when it
# is missing. Vectorized equivalent of applying `0 if pd.isnull(x) else 1`
# row by row; the dtype is pinned to int64 so it does not vary by platform.
df['has_video'] = df['video'].notna().astype('int64')
df['has_video'].value_counts()

1    1404
0     728
Name: has_video, dtype: int64

NLP features

In [5]:
# Text cleaning for: rewards, description, description story, description risks


def clean_text(df):
    """Add cleaned-text columns for rewards, description, story and risks.

    The frame is modified in place (four columns are added or overwritten)
    and the same object is returned:

    * ``rewards_processed``            <- ``rewards``
    * ``description_processed``        <- ``description``
    * ``description_story_processed``  <- ``description_story``
    * ``description_risks_processed``  <- ``description_risks``

    Rewards cells are expected to hold the string form of a list of dicts,
    each with a ``'rewards'`` key. The reward texts are lower-cased, each is
    followed by a single space, and punctuation is removed. The three
    description columns have CR/LF replaced by spaces and punctuation
    removed; their case is left unchanged.

    Missing cells (``None``, ``NaN``, ``pd.NA``) and blank rewards strings
    yield an empty string instead of raising or producing the token "nan".

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``rewards``, ``description``,
        ``description_story`` and ``description_risks``.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the four ``*_processed`` columns set.

    Raises
    ------
    ValueError, SyntaxError
        From ``ast.literal_eval`` when a rewards cell is not a valid literal.
    KeyError
        When a reward entry has no ``'rewards'`` key, or a required column
        is absent from ``df``.
    """
    # Everything that is neither a word character nor whitespace counts as
    # punctuation. This already removes "/" so no separate "//" pass is needed.
    punctuation = re.compile(r'[^\w\s]')

    def is_missing(value):
        # Scalar-only check; avoids calling pd.isna on containers, which
        # would return an array rather than a bool.
        return (
            value is None
            or value is pd.NA
            or (isinstance(value, float) and value != value)
        )

    def process_rewards(corpus):
        corpus_processed = []
        for row in corpus:
            if is_missing(row) or (isinstance(row, str) and not row.strip()):
                corpus_processed.append("")
                continue

            if isinstance(row, str):
                # Escaped "\n" sequences inside the serialized list become
                # spaces before the literal is parsed.
                entries = ast.literal_eval(row.replace("\\n", " "))
            else:
                # Already a parsed list of reward dicts.
                entries = row

            text = "".join(entry['rewards'].lower() + ' ' for entry in entries)
            corpus_processed.append(punctuation.sub('', text))

        return corpus_processed

    def process_description_story(corpus):
        corpus_processed = []
        for row in corpus:
            text = "" if is_missing(row) else str(row)
            text = text.replace("\r", " ").replace("\n", " ")
            corpus_processed.append(punctuation.sub('', text))

        return corpus_processed

    df["rewards_processed"] = process_rewards(df["rewards"])
    df["description_processed"] = process_description_story(df["description"])
    df["description_story_processed"] = process_description_story(df["description_story"])
    df["description_risks_processed"] = process_description_story(df["description_risks"])

    return df


In [8]:
class LemmatizeTokenizer:
    """Callable that splits text into tokens and lemmatizes each one.

    Tokens come from NLTK's ``word_tokenize``; each token is passed through
    a ``WordNetLemmatizer`` created once per instance.
    """

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()

    def __call__(self, text):
        """Return the list of lemmatized tokens for ``text``."""
        lemmatize = self.lemmatizer.lemmatize
        return list(map(lemmatize, word_tokenize(text)))

In [12]:
def generate_nlp_features(df, max_features=100, min_df=10, ngram_range=(1, 3)):
    """Append TF-IDF features of ``rewards_processed`` to ``df``.

    A ``TfidfVectorizer`` (lemmatizing tokenizer, English stop words) is
    fitted on the ``rewards_processed`` column and the resulting dense
    matrix is merged onto ``df`` by index. The new columns are labelled with
    integers ``0 .. k-1`` where ``k`` is the vocabulary size (at most
    ``max_features``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``rewards_processed`` text column. The frame is not
        modified. Its index is assumed to be unique, since the features are
        merged on the index.
    max_features : int, default 100
        Keep only the top ``max_features`` terms ordered by term frequency
        across the corpus.
    min_df : int or float, default 10
        Ignore terms whose document frequency is below this threshold
        (number of documents, or proportion if a float).
    ngram_range : tuple of int, default (1, 3)
        Range of n-gram sizes to extract (unigrams up to trigrams by default).

    Returns
    -------
    pandas.DataFrame
        A new frame: the columns of ``df`` followed by the TF-IDF columns,
        one row per row of ``df``.
    """
    # Rewards
    vect_rewards = TfidfVectorizer(
        tokenizer=LemmatizeTokenizer(),
        lowercase=True,
        analyzer='word',
        ngram_range=ngram_range,
        max_features=max_features,
        min_df=min_df,
        stop_words=stopwords.words('english'),  # remove stopwords
    )

    rewards_tfidf_array = vect_rewards.fit_transform(df["rewards_processed"]).toarray()

    # Reuse df's own index so the index-based merge pairs every row with its
    # features. A default 0..n-1 index would drop or misalign rows whenever
    # df has been filtered, shuffled or split.
    rewards_tfidf = pd.DataFrame(rewards_tfidf_array, index=df.index)

    return pd.merge(df, rewards_tfidf, left_index=True, right_index=True)

In [13]:
# Smoke test of the full pipeline: clean the text columns, then append the
# rewards TF-IDF features and display the result.
# Note: clean_text writes the *_processed columns onto df itself, while the
# frame returned by generate_nlp_features is only displayed here, not stored.
generate_nlp_features(clean_text(df))



Unnamed: 0,id,name,description,description_story,description_risks,rewards,category,pledged,goal,deadline,...,90,91,92,93,94,95,96,97,98,99
0,1249154571,Bunny Care Clinic Pin and Apparel Collection,A small collection of Bunny themed enamel pins...,Story\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\...,We try our best to keep everything on schedule...,"[{'rewards': 'Pledge without a reward'}, {'rew...",art/illustration,14115.0,700,2022-10-04 01:25:20,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.134940,0.065067,0.065067,0.00000
1,1276054891,Hustle: A Singaporean Card Game,Hustle: A Singaporean Card Game is a funny and...,Story\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\...,We want to be frank and honest with all our ge...,"[{'rewards': 'Pledge without a reward'}, {'rew...",games/tabletop games,6.0,50,2022-10-07 13:08:05,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.00000
2,236207086,"Neovide, Waterless One-Stop Sous Vide Cooker",No more water containers and vacuum bags. With...,Story\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\...,"With our years of experience with products, ou...","[{'rewards': 'Pledge without a reward'}, {'rew...",technology,289082.0,10000,2022-10-06 13:00:04,...,0.0,0.000000,0.097312,0.0,0.0,0.0,0.135118,0.000000,0.000000,0.00000
3,2128144913,Lit Cafe,Little Toasts of Happiness,StoryHi! \r\nLit Cafe is a small space to prov...,The concept is to offer affordable local food ...,"[{'rewards': 'Pledge without a reward'}, {'rew...",food/spaces,170.0,12000,2022-10-02 13:09:00,...,0.0,0.049346,0.000000,0.0,0.0,0.0,0.058426,0.077474,0.077474,0.00000
4,891970407,Runway Tarot & Golden Journey Tarot,When the fashion week come into Tarot.\r\nThis...,Story\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\...,COLLABORATION\r\nThis is the third time that b...,"[{'rewards': 'Pledge without a reward'}, {'rew...",art/painting,33599.0,6800,2022-10-05 13:57:35,...,0.0,0.026532,0.089082,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2127,1350729336,The Daily Market Bunch. Fresh flowers delivere...,Help me make flowers in Singapore affordable a...,StoryHello!The Daily Market Bunch is going to ...,"In order to offer flowers at these prices, I h...",[{'rewards': 'Pledge S$ 5 or more\n\nAbout US$...,crafts,0.0,1000,2016-11-16 12:26:18,...,0.0,0.185950,0.346852,0.0,0.0,0.0,0.055041,0.072985,0.072985,0.00000
2128,1730172851,HALFOLD - Wallet / Card Keeper (Canceled),"A wallet/card keeper, hand-folded from leather...",Story \r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n \r\n\r\...,We feel that the risks for the HALFOLD card ke...,[{'rewards': 'Pledge S$ 40 or more\n\nAbout $2...,design/product design,1606.0,1500,2016-11-10 17:47:27,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.164019,0.054373,0.054373,0.00000
2129,490511999,The Record Apps (virtual karaoke & guitar trac...,Record App Studio features karaoke singing fro...,Story\r\n\r\n\r\nWhy we created this? To inven...,Record App Studio by The RockParody(RP) Design...,[{'rewards': 'Pledge S$ 7 or more\n\nAbout US$...,music/world music,0.0,350000,2016-12-10 09:56:05,...,0.0,0.070801,0.158479,0.0,0.0,0.0,0.069857,0.092632,0.092632,0.73966
2130,909975656,The Countdowner: Love Is More Than Happily Eve...,Reinventing the romance genre where 'Happily E...,Story\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\nBook 1 Sy...,The three books are almost done. I've finished...,"[{'rewards': ""Pledge S$ 2 or more\n\nAbout $2\...",publishing/fiction,0.0,5000,2016-10-24 08:00:49,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.115615,0.153309,0.153309,0.00000
