In [94]:
import pandas as pd 
import ast
import re
import numpy as np
import nltk
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.util import ngrams
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


In [95]:
# Load the merged Kickstarter export and keep the raw frame untouched;
# all downstream processing happens on the working copy.
# (A bare `df.head(3)` used to sit between these two lines; as it was not the
# last expression of the cell its result was discarded, so it was removed.)
df = pd.read_csv("./data/Kickstarter_merged.csv")
df_copy = df.copy()

In [96]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
df_copy.describe()

Unnamed: 0,id,pledged,goal,update_count,backers_count,is_starrable,spotlight,staff_pick
count,2132.0,2132.0,2132.0,2132.0,2132.0,2132.0,2132.0,2132.0
mean,1077279000.0,13784.319747,66046.2,5.312383,119.403846,0.009381,0.432458,0.045966
std,622544700.0,41074.680798,2166310.0,8.216737,547.130104,0.096422,0.495533,0.209461
min,1454620.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
25%,537066300.0,147.75,1200.0,0.0,4.0,0.0,0.0,0.0
50%,1105835000.0,1594.21,5000.0,2.0,23.0,0.0,0.0,0.0
75%,1622623000.0,9547.25,15000.0,8.0,88.0,0.0,1.0,0.0
max,2145266000.0,515413.0,100000000.0,67.0,18768.0,1.0,1.0,1.0


In [97]:
# Preview the first five rows of the working copy.
df_copy.head()

Unnamed: 0,id,name,description,description_story,description_risks,rewards,category,pledged,goal,deadline,...,spotlight,staff_pick,video,creator_name,creator_url,url,created_at,published_at,launched_at,link
0,1249154571,Bunny Care Clinic Pin and Apparel Collection,A small collection of Bunny themed enamel pins...,Story\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\...,We try our best to keep everything on schedule...,"[{'rewards': 'Pledge without a reward'}, {'rew...",art/illustration,14115.0,700,2022-10-04 01:25:20,...,0,0,,Labutori,https://www.kickstarter.com/profile/labutori,https://www.kickstarter.com/projects/labutori/...,2022-05-25 03:28:55+00:00,2022-09-09 01:25:20+00:00,2022-09-09 01:25:20+00:00,https://www.kickstarter.com/projects/labutori/...
1,1276054891,Hustle: A Singaporean Card Game,Hustle: A Singaporean Card Game is a funny and...,Story\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\...,We want to be frank and honest with all our ge...,"[{'rewards': 'Pledge without a reward'}, {'rew...",games/tabletop games,6.0,50,2022-10-07 13:08:05,...,0,0,https://v2.kickstarter.com/1662721387-XBw1i2Sj...,Hustle Singapore,https://www.kickstarter.com/profile/hustlesg,https://www.kickstarter.com/projects/hustlesg/...,2022-08-20 09:52:01+00:00,2022-09-07 13:08:05+00:00,2022-09-07 13:08:05+00:00,https://www.kickstarter.com/projects/hustlesg/...
2,236207086,"Neovide, Waterless One-Stop Sous Vide Cooker",No more water containers and vacuum bags. With...,Story\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\...,"With our years of experience with products, ou...","[{'rewards': 'Pledge without a reward'}, {'rew...",technology,289082.0,10000,2022-10-06 13:00:04,...,0,1,https://v2.kickstarter.com/1662723490-gvkAMr9s...,The Space Tech,https://www.kickstarter.com/profile/thespacetech,https://www.kickstarter.com/projects/thespacet...,2022-06-30 09:28:52+00:00,2022-09-06 13:00:04+00:00,2022-09-06 13:00:04+00:00,https://www.kickstarter.com/projects/thespacet...
3,2128144913,Lit Cafe,Little Toasts of Happiness,StoryHi! \r\nLit Cafe is a small space to prov...,The concept is to offer affordable local food ...,"[{'rewards': 'Pledge without a reward'}, {'rew...",food/spaces,170.0,12000,2022-10-02 13:09:00,...,0,0,,Kay En,https://www.kickstarter.com/profile/hosum,https://www.kickstarter.com/projects/hosum/ho-...,2022-08-30 08:28:52+00:00,2022-09-06 04:29:02+00:00,2022-09-06 04:29:02+00:00,https://www.kickstarter.com/projects/hosum/ho-...
4,891970407,Runway Tarot & Golden Journey Tarot,When the fashion week come into Tarot.\r\nThis...,Story\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\...,COLLABORATION\r\nThis is the third time that b...,"[{'rewards': 'Pledge without a reward'}, {'rew...",art/painting,33599.0,6800,2022-10-05 13:57:35,...,0,0,https://v2.kickstarter.com/1662720057-aeWt13h6...,Eugene Leong,https://www.kickstarter.com/profile/locationtarot,https://www.kickstarter.com/projects/locationt...,2022-07-18 14:48:29+00:00,2022-09-05 13:57:35+00:00,2022-09-05 13:57:35+00:00,https://www.kickstarter.com/projects/locationt...


In [98]:
# process rewards string
def process_rewards(corpus):
    """Flatten serialized reward lists into one lowercase string per row.

    Each string element of ``corpus`` must be the text form of a Python list
    of dicts that each carry a ``'rewards'`` entry, e.g.
    ``"[{'rewards': 'Pledge without a reward'}, ...]"``.

    Parameters
    ----------
    corpus : iterable
        Serialized reward lists. Non-string entries (for example NaN for a
        missing value) yield an empty string instead of raising.

    Returns
    -------
    list of str
        One entry per input row: the lowercased reward texts, each followed
        by a single space, with every ``//`` removed.

    Raises
    ------
    ValueError, SyntaxError
        If a string row cannot be parsed as a Python literal.
    """
    corpus_processed = []
    for row in corpus:
        if not isinstance(row, str):
            # Missing value: there is no reward text to extract.
            corpus_processed.append("")
            continue

        # Escaped newlines (a backslash followed by "n") are turned into
        # spaces before the literal is parsed.
        reward_entries = ast.literal_eval(row.replace("\\n", " "))

        # Every reward text keeps a trailing space, matching the historical
        # output format of this column.
        row_processed = "".join(
            entry['rewards'].lower() + ' ' for entry in reward_entries
        )
        corpus_processed.append(row_processed.replace("//", ''))

    return corpus_processed

# Store the flattened reward text next to the raw "rewards" column.
df_copy["rewards_processed"] = process_rewards(df_copy["rewards"])


In [99]:
# Inspect the flattened, lowercased reward strings.
df_copy["rewards_processed"]

0       pledge without a reward pledge s$ 12 or more  ...
1       pledge without a reward pledge s$ 2 or more  a...
2       pledge without a reward pledge s$ 2 or more  a...
3       pledge without a reward pledge s$ 7 or more  a...
4       pledge without a reward pledge s$ 2 or more  a...
                              ...                        
2127    pledge s$ 5 or more  about us$ 4    a thank yo...
2128    pledge s$ 40 or more  about $29    handmade by...
2129    pledge s$ 7 or more  about us$ 6    city rawke...
2130    pledge s$ 2 or more  about $2    every dollar ...
2131    pledge s$ 50 or more  about us$ 36    the onli...
Name: rewards_processed, Length: 2132, dtype: object

In [100]:
# Inspect the raw short descriptions (this column is used without cleanup).
df_copy["description"]

0       A small collection of Bunny themed enamel pins...
1       Hustle: A Singaporean Card Game is a funny and...
2       No more water containers and vacuum bags. With...
3                              Little Toasts of Happiness
4       When the fashion week come into Tarot.\r\nThis...
                              ...                        
2127    Help me make flowers in Singapore affordable a...
2128    A wallet/card keeper, hand-folded from leather...
2129    Record App Studio features karaoke singing fro...
2130    Reinventing the romance genre where 'Happily E...
2131    The Online Deals is a location based discount ...
Name: description, Length: 2132, dtype: object

In [101]:
# process description story
def process_description_story(corpus):
    r"""Replace carriage returns and newlines in free-text rows with spaces.

    Parameters
    ----------
    corpus : iterable
        Text rows. Missing values (``None``/NaN) become an empty string;
        any other non-string value is converted with ``str``.

    Returns
    -------
    list of str
        One cleaned string per input row. Every ``\r`` and every ``\n`` is
        replaced by one space, so a ``\r\n`` pair becomes two spaces.
    """
    corpus_processed = []
    for row in corpus:
        if pd.isna(row):
            # str(nan) would inject the literal word "nan" into the text and,
            # from there, into any vocabulary built on this column.
            corpus_processed.append("")
            continue

        corpus_processed.append(str(row).replace("\r", " ").replace("\n", " "))

    return corpus_processed

# Apply the same line-break cleanup to both long-form text columns and store
# each result under "<column>_processed".
for text_column in ("description_story", "description_risks"):
    df_copy[text_column + "_processed"] = process_description_story(df_copy[text_column])

In [102]:
# Spot-check the cleaned story text for the row labelled 1.
df_copy["description_story_processed"][1]

"Story                                                                                                                                                                Support Us!  Back HUSTLE! and be one of the very first to own an exclusive digital version of a Singaporean card game about the hustle culture.\xa0  Music credits: Bensound Royalty Free Music  The Backstory  Hi there, my name is Ah Long, and I’m the lead creator of HUSTLE!  I would deeply appreciate your help and support to get this passion project started. Your contributions will go a long way toward developing the local board game community, alongside helping me to improve the product's artwork, quality and potential physical launch.\xa0  Do note that due to logistical challenges, we are currently providing backers an e-copy of the game for you to print, cut out and play on your own. We hope to be able to produce physical copies down the road.  •The concept of 'Hustle: A Singaporean Card Game' arose when I was considerin

In [103]:
# Spot-check the cleaned risks text for the row labelled 1.
df_copy["description_risks_processed"][1]

'We want to be frank and honest with all our generous backers so that you get the most genuine experience with our product.      FUNDING   Kickstarter has an "All-or-Nothing" policy where the campaign gets $0 funding if the campaign doesn\'t reach its target. It would mean so much to the HUSTLE team if you could help to spread and share the word about this game so it becomes a reality for you (and for us).  The HUSTLE team genuinely believes in this game being able to start new conversations about what it means to be financially independent in Singapore in this day and age. The risk we are taking with Kickstarter is that if we are unable to be fully funded, we are not able to get the funding needed.'

In [104]:
# Columns carried into the split: the raw columns plus the three engineered
# text columns.
# NOTE(review): 'state' is also the target below, so it is present in X as
# well — confirm it is never used as a model input.
feature_columns = [
    'id', 'name', 'description', 'description_story', 'description_risks',
    'rewards', 'category', 'pledged', 'goal', 'deadline', 'location',
    'state', 'faq_count', 'update_count', 'backers_count', 'is_starrable',
    'spotlight', 'staff_pick', 'video', 'creator_name', 'creator_url',
    'url', 'created_at', 'published_at', 'launched_at', 'link',
    'rewards_processed', "description_story_processed", "description_risks_processed",
]
X = df_copy[feature_columns]
y = df_copy[["state"]]  # double brackets keep the target as a one-column DataFrame

# Shuffled 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2022,
    shuffle=True,
)

In [105]:
# Sanity check: (rows, columns) of the train and test feature frames.
print(X_train.shape, X_test.shape)

(1492, 29) (640, 29)


Pipeline:

In [106]:
# Text columns -> TF-IDF features, target -> one-hot encoding.
# Every encoder in this cell is fitted on the training split only.

# Shared TF-IDF settings (TfidfVectorizer = CountVectorizer + TfidfTransformer).
TFIDF_PARAMS = dict(
    analyzer='word',
    ngram_range=(1, 3),  # unigrams, bigrams and trigrams
    max_features=100,  # keep only the top 100 terms ordered by term frequency across the corpus
    min_df=2,  # drop terms that occur in fewer than 2 documents
    stop_words=stopwords.words('english'),  # remove English stopwords
)

# Unfitted template kept under the original name. It is deliberately NOT used
# for fitting: `fit` returns the estimator itself, so re-fitting one shared
# instance made every `tfidf_fit_*` name point at the same object, which only
# remembered the vocabulary of the column fitted last.
vect = TfidfVectorizer(**TFIDF_PARAMS)


def fit_tfidf(texts):
    """Fit a fresh TF-IDF vectorizer on ``texts``.

    Parameters
    ----------
    texts : iterable of str
        Documents to learn the vocabulary from and to transform.

    Returns
    -------
    tuple
        ``(vectorizer, features)``: the fitted ``TfidfVectorizer`` (reuse it
        to transform other data with the same vocabulary) and the dense
        array of TF-IDF features for ``texts``.
    """
    vectorizer = TfidfVectorizer(**TFIDF_PARAMS)
    features = vectorizer.fit_transform(texts).toarray()
    return vectorizer, features


## X
# tfidf

# rewards
rewards_processed = X_train["rewards_processed"]
tfidf_fit_rewards, rewards_tfidf_array = fit_tfidf(rewards_processed)

# description
description = X_train["description"]
tfidf_fit_description, description_tfidf_array = fit_tfidf(description)

# description story
description_story_processed = X_train["description_story_processed"]
tfidf_fit_description_story_processed, description_story_processed_tfidf_array = fit_tfidf(
    description_story_processed
)

# description risks
description_risks_processed = X_train["description_risks_processed"]
tfidf_fit_description_risks_processed, description_risks_processed_tfidf_array = fit_tfidf(
    description_risks_processed
)

## y
# one hot encoding for `state`
one_hot_fit_state = OneHotEncoder().fit(y_train)
one_hot_state = one_hot_fit_state.transform(y_train).toarray()




In [107]:
# Shape and preview of the TF-IDF features built from the reward text.
print(rewards_tfidf_array.shape)
rewards_tfidf_array

(1492, 100)


array([[0.05011934, 0.12703754, 0.        , ..., 0.09768141, 0.04017431,
        0.        ],
       [0.20505155, 0.        , 0.50049325, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.45397432, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.04905705, 0.        , 0.02993482, ..., 0.02048808, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.09096673, 0.        ,
        0.11809267],
       [0.01748569, 0.0221605 , 0.        , ..., 0.02921074, 0.        ,
        0.01896064]])

In [108]:
# Shape and preview of the TF-IDF features built from the short description.
print(description_tfidf_array.shape)
description_tfidf_array

(1492, 100)


array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.67961909, 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [109]:
# Shape and preview of the TF-IDF features built from the story text.
print(description_story_processed_tfidf_array.shape)
description_story_processed_tfidf_array

(1492, 100)


array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.06163125, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.11540745,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.02486179,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [110]:
# Shape and preview of the TF-IDF features built from the risks text.
print(description_risks_processed_tfidf_array.shape)
description_risks_processed_tfidf_array

(1492, 100)


array([[0.        , 0.        , 0.14317997, ..., 0.        , 0.        ,
        0.1614124 ],
       [0.        , 0.3417851 , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.08620492, 0.        ,
        0.        ],
       [0.        , 0.2040057 , 0.        , ..., 0.38557564, 0.        ,
        0.23995107]])

In [111]:
# One-hot encoded training targets (one column per distinct `state` value).
one_hot_state

array([[0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       ...,
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.]])

In [112]:
# Count vectorizer: n-gram counts for the reward text, using the same
# vocabulary settings as the TF-IDF cell.
vect = CountVectorizer(
    analyzer='word',
    ngram_range=(1, 3),  # unigrams, bigrams and trigrams
    max_features=100,  # keep only the top 100 terms ordered by term frequency across the corpus
    min_df=2,  # drop terms that occur in fewer than 2 documents
    stop_words=stopwords.words('english'),  # remove English stopwords
)
count_fit = vect.fit(X_train["rewards_processed"])

# Transform with the CountVectorizer fitted just above. This line used to call
# `tfidf_fit`, a name that no cell shown here defines: on a fresh kernel it
# raises NameError, and with a leaked kernel variable it would silently return
# features from a different vectorizer.
rewards_count_array = count_fit.transform(X_train["rewards_processed"]).toarray()

**Ignore everything below this line.**

In [113]:
# Tokenizer for the flattened reward text.
tokenizer = TreebankWordTokenizer()

# Stopwords: NLTK's English list plus project-specific extras.
stop = stopwords.words('english')
custom_stopwords = ['$']
stop.extend(custom_stopwords)

# Membership is tested once per token, so use a hashed set for the lookup.
# `stop` itself stays a list for any later cell that expects one.
stop_set = frozenset(stop)

# Tokenize and drop stopwords in a single pass over the column.
X_train["rewards_tokens"] = X_train["rewards_processed"].apply(
    lambda text: [token for token in tokenizer.tokenize(text) if token not in stop_set]
)


In [114]:
# POS tagging
from nltk.corpus import wordnet

def get_part_of_speech_tags(token):
    """Return the WordNet part-of-speech constant for a single token.

    The token is tagged on its own with ``nltk.pos_tag``. The upper-cased
    first letter of the resulting tag selects adjective (J), noun (N),
    verb (V) or adverb (R); any other tag falls back to noun.
    """
    wordnet_pos_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }

    # pos_tag returns [(token, tag)]; only the first letter of the tag is used.
    tagged = nltk.pos_tag([token])
    raw_tag = tagged[0][1]
    initial = raw_tag[0].upper()

    if initial in wordnet_pos_by_initial:
        return wordnet_pos_by_initial[initial]
    return wordnet.NOUN

# Lemmatize every reward token, passing the POS looked up for that token.
lemmatizer = WordNetLemmatizer()


def _lemmatize_tokens(tokens):
    """Return the lemma of each token in ``tokens``, in the same order."""
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatizer.lemmatize(token, get_part_of_speech_tags(token)))
    return lemmas


X_train["rewards_tokens_lemmatized_POS"] = X_train["rewards_tokens"].apply(_lemmatize_tokens)


In [115]:
# Build bigram and trigram tuples from the lemmatized reward tokens.
lemmatized_tokens = X_train["rewards_tokens_lemmatized_POS"]
X_train["rewards_tokens_lemmatized_POS_bigrams"] = lemmatized_tokens.apply(
    lambda tokens: list(ngrams(tokens, 2))
)
X_train["rewards_tokens_lemmatized_POS_trigrams"] = lemmatized_tokens.apply(
    lambda tokens: list(ngrams(tokens, 3))
)


In [116]:
# #stemming
# ps = PorterStemmer()
# X_train["rewards_tokens"] = X_train["rewards_tokens"].apply(lambda x : [ps.stem(word) for word in x])

In [117]:
# Re-display the TF-IDF matrix for the reward text.
rewards_tfidf_array

array([[0.05011934, 0.12703754, 0.        , ..., 0.09768141, 0.04017431,
        0.        ],
       [0.20505155, 0.        , 0.50049325, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.45397432, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.04905705, 0.        , 0.02993482, ..., 0.02048808, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.09096673, 0.        ,
        0.11809267],
       [0.01748569, 0.0221605 , 0.        , ..., 0.02921074, 0.        ,
        0.01896064]])