In [9]:
import pandas as pd 
import ast
import re
import numpy as np
import nltk
from textblob import TextBlob
from nltk import word_tokenize
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.util import ngrams
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import spacy
import pyLDAvis.gensim_models
import en_core_web_md
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaMulticore
from gensim.models import CoherenceModel

In [5]:
df = pd.read_csv("./data/Kickstarter_merged.csv")

In [3]:
# convert 'video' to a binary categorical variable
df['video'].value_counts()
df['has_video'] = df['video'].apply(lambda x: 0 if pd.isnull(x) else 1)
df['has_video'].value_counts()

1    1404
0     728
Name: has_video, dtype: int64

NLP features

In [17]:
# Text cleaning for: rewards, description, description story, description risks

def clean_text(df):
    def process_rewards(corpus):
    
        corpus_processed = []
        for row in corpus:
            row_processed = ""
            row = row.replace("\\n", " ")
            row = ast.literal_eval(row)

            for dict in row:
                row_processed += dict['rewards'].lower() + ' '
            
            
            row_processed = row_processed.replace("//",'')
            row_processed = re.sub(r'[^\w\s]', '', row_processed) # remove punctuation
            corpus_processed.append(row_processed)

        return corpus_processed
    
    def process_description_story(corpus):
        corpus_processed = []
        for row in corpus:
            row = str(row)
            row_processed = row.replace("\r", " " )
            row_processed = row_processed.replace("\n", " " )
            row_processed = re.sub(r'[^\w\s]', '', row_processed) # remove punctuation
            corpus_processed.append(row_processed if not pd.isnull(row_processed) else "") # handle NA

        return corpus_processed

    df["rewards_processed"] = process_rewards(df["rewards"])
    df["description_processed"] = process_description_story(df["description"])
    df["description_story_processed"] = process_description_story(df["description_story"])
    df["description_risks_processed"] = process_description_story(df["description_risks"])

    return df


In [5]:
class LemmatizeTokenizer(object):
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
    def __call__(self, text):
        return [self.lemmatizer.lemmatize(word) for word in word_tokenize(text)]

In [6]:
def generate_nlp_features(df):
    
    # Rewards

    vect_rewards = TfidfVectorizer( 
        tokenizer=LemmatizeTokenizer(),
        lowercase=True,
        analyzer='word', 
        ngram_range=(1,3), # unigram, bigram and trigram 
        max_features=100, # vocabulary that only consider the top max_features ordered by term frequency across the corpus
        min_df=10, # minimum word frequency required to be in model
        stop_words=stopwords.words('english') # remove stopwords
        )

    rewards_processed = pd.Series(df["rewards_processed"])
    tfidf_fit_rewards = vect_rewards.fit(rewards_processed)
    rewards_tfidf_array = tfidf_fit_rewards.transform(rewards_processed).toarray()
    rewards_tfidf_df = pd.DataFrame(rewards_tfidf_array)
    rewards_tfidf_df.columns = list(map(lambda x : "rewards_" + str(x), rewards_tfidf_df.columns))
    df = pd.merge(df, rewards_tfidf_df , left_index=True, right_index=True)
    

    # Description

    vect_description = TfidfVectorizer( 
        tokenizer=LemmatizeTokenizer(),
        lowercase=True,
        analyzer='word', 
        ngram_range=(1,3), # unigram, bigram and trigram 
        max_features=100, # vocabulary that only consider the top max_features ordered by term frequency across the corpus
        min_df=10, # minimum word frequency required to be in model
        stop_words=stopwords.words('english') # remove stopwords
        )

    description = pd.Series(df["description_processed"])
    tfidf_fit_description = vect_description.fit(description)
    description_tfidf_array = tfidf_fit_description.transform(description).toarray()
    description_tfidf_df = pd.DataFrame(description_tfidf_array)
    description_tfidf_df.columns = list(map(lambda x : "description_" + str(x), description_tfidf_df.columns))
    df = pd.merge(df, description_tfidf_df , left_index=True, right_index=True)


    # Description Story

    vect_description_story = TfidfVectorizer( 
        tokenizer=LemmatizeTokenizer(),
        lowercase=True,
        analyzer='word', 
        ngram_range=(1,3), # unigram, bigram and trigram 
        max_features=100, # vocabulary that only consider the top max_features ordered by term frequency across the corpus
        min_df=10, # minimum word frequency required to be in model
        stop_words=stopwords.words('english') # remove stopwords
        )
    
    description_story_processed = pd.Series(df["description_story_processed"])
    tfidf_fit_description_story_processed = vect_description_story.fit(description_story_processed)
    description_story_processed_tfidf_array = tfidf_fit_description_story_processed.transform(description_story_processed).toarray()
    description_story_tfidf_df = pd.DataFrame(description_story_processed_tfidf_array)
    description_story_tfidf_df.columns = list(map(lambda x : "description_story_" + str(x), description_story_tfidf_df.columns))
    df = pd.merge(df, description_story_tfidf_df , left_index=True, right_index=True)


    # Description Risks

    vect_description_risks = TfidfVectorizer( 
        tokenizer=LemmatizeTokenizer(),
        lowercase=True,
        analyzer='word', 
        ngram_range=(1,3), # unigram, bigram and trigram 
        max_features=100, # vocabulary that only consider the top max_features ordered by term frequency across the corpus
        min_df=10, # minimum word frequency required to be in model
        stop_words=stopwords.words('english') # remove stopwords
        )

    description_risks_processed = pd.Series(df["description_risks_processed"])
    tfidf_fit_description_risks_processed = vect_description_risks.fit(description_risks_processed)
    description_risks_processed_tfidf_array = tfidf_fit_description_risks_processed.transform(description_risks_processed).toarray()
    description_risks_tfidf_df = pd.DataFrame(description_risks_processed_tfidf_array)
    description_risks_tfidf_df.columns = list(map(lambda x : "description_risks_" + str(x), description_risks_tfidf_df.columns))
    df = pd.merge(df, description_risks_tfidf_df , left_index=True, right_index=True)

    return df

In [43]:
# test
df = generate_nlp_features(clean_text(df))



In [44]:
for x in df.columns:
    print(x)

id
name
description
description_story
description_risks
rewards
category
pledged
goal
deadline
location
state
faq_count
update_count
backers_count
is_starrable
spotlight
staff_pick
video
creator_name
creator_url
url
created_at
published_at
launched_at
link
has_video
rewards_processed
description_processed
description_story_processed
description_risks_processed
rewards_0
rewards_1
rewards_2
rewards_3
rewards_4
rewards_5
rewards_6
rewards_7
rewards_8
rewards_9
rewards_10
rewards_11
rewards_12
rewards_13
rewards_14
rewards_15
rewards_16
rewards_17
rewards_18
rewards_19
rewards_20
rewards_21
rewards_22
rewards_23
rewards_24
rewards_25
rewards_26
rewards_27
rewards_28
rewards_29
rewards_30
rewards_31
rewards_32
rewards_33
rewards_34
rewards_35
rewards_36
rewards_37
rewards_38
rewards_39
rewards_40
rewards_41
rewards_42
rewards_43
rewards_44
rewards_45
rewards_46
rewards_47
rewards_48
rewards_49
rewards_50
rewards_51
rewards_52
rewards_53
rewards_54
rewards_55
rewards_56
rewards_57
rewards_5

**Rewards Features**

In [7]:
def create_rewards_tiers(df):
    df["reward_tiers"] = df["rewards"].apply(lambda x : len(ast.literal_eval(x)))
    df = move_reward_tiers(df)
    return df

def create_all_reward_amount(df):
    df["all_reward_amount"] = np.empty((len(df), 0)).tolist()

    for i in range(len(df)):
        all_reward_amount = []
        dict_list = ast.literal_eval(df.iloc[i, 5]) # Converts rewards column into dictionary

        for dict in dict_list:
            values_string = str(dict.values())
            reward_title = re.search(r"Pledge S\$ \d{1,3}(,\d{1,3})? or more", values_string) # Search for all reward titles

            if reward_title is not None:
                reward_amount = re.search(r"\d{1,3}(,\d{1,3})?", reward_title.group()) # Search for only the digits in reward amount
                if reward_amount is not None:
                    all_reward_amount.append(reward_amount.group())
            else:
                all_reward_amount.append(0) # If no reward title is found, add 0
        df["all_reward_amount"][i] = all_reward_amount
    df = move_all_reward_amount(df)
    return df

# Rearange reward_tiers column to the right of rewards
def move_reward_tiers(df):
    cols = df.columns.tolist()
    cols = cols[:6] + [cols[-1]] + cols[6:-1]
    df = df[cols]
    return df

# Rearange all_reward_amount column to the right of reward_tiers
def move_all_reward_amount(df):
    cols = df.columns.tolist()
    cols = cols[:7] + [cols[-1]] + cols[7:-1]
    df = df[cols]
    return df

In [46]:
# Create reward tiers feature
df = create_rewards_tiers(df)
df.head()


Unnamed: 0,id,name,description,description_story,description_risks,rewards,reward_tiers,category,pledged,goal,...,description_risks_90,description_risks_91,description_risks_92,description_risks_93,description_risks_94,description_risks_95,description_risks_96,description_risks_97,description_risks_98,description_risks_99
0,1249154571,Bunny Care Clinic Pin and Apparel Collection,A small collection of Bunny themed enamel pins...,Story\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\...,We try our best to keep everything on schedule...,"[{'rewards': 'Pledge without a reward'}, {'rew...",12,art/illustration,14115.0,700,...,0.0,0.296369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1276054891,Hustle: A Singaporean Card Game,Hustle: A Singaporean Card Game is a funny and...,Story\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\...,We want to be frank and honest with all our ge...,"[{'rewards': 'Pledge without a reward'}, {'rew...",2,games/tabletop games,6.0,50,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.140887,0.0
2,236207086,"Neovide, Waterless One-Stop Sous Vide Cooker",No more water containers and vacuum bags. With...,Story\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\...,"With our years of experience with products, ou...","[{'rewards': 'Pledge without a reward'}, {'rew...",8,technology,289082.0,10000,...,0.140606,0.0,0.142717,0.0,0.137243,0.0,0.0,0.0,0.0,0.133257
3,2128144913,Lit Cafe,Little Toasts of Happiness,StoryHi! \r\nLit Cafe is a small space to prov...,The concept is to offer affordable local food ...,"[{'rewards': 'Pledge without a reward'}, {'rew...",7,food/spaces,170.0,12000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,891970407,Runway Tarot & Golden Journey Tarot,When the fashion week come into Tarot.\r\nThis...,Story\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\...,COLLABORATION\r\nThis is the third time that b...,"[{'rewards': 'Pledge without a reward'}, {'rew...",15,art/painting,33599.0,6800,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.35206,0.0


In [8]:
# Create all reward amount feature
# If lowest reward amount is not 0, the project is either already fully funded or cancelled.
# Rewards should be sorted in ascending order, any amount to the right and less than the max means reward is no longer available.
df = create_all_reward_amount(df)
df.head()

Unnamed: 0,id,name,description,description_story,description_risks,rewards,category,all_reward_amount,pledged,goal,...,staff_pick,video,creator_name,creator_url,url,created_at,published_at,launched_at,link,has_video
0,1249154571,Bunny Care Clinic Pin and Apparel Collection,A small collection of Bunny themed enamel pins...,Story\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nHello! My...,We try our best to keep everything on schedule...,"[{'rewards': 'Pledge without a reward'}, {'rew...",art/illustration,"[0, 12, 14, 20, 25, 32, 35, 78, 80, 85, 155, 130]",14115.0,700,...,0,,Labutori,https://www.kickstarter.com/profile/labutori,https://www.kickstarter.com/projects/labutori/...,2022-05-25 03:28:55+00:00,2022-09-09 01:25:20+00:00,2022-09-09 01:25:20+00:00,https://www.kickstarter.com/projects/labutori/...,0
1,1276054891,Hustle: A Singaporean Card Game,Hustle: A Singaporean Card Game is a funny and...,Story\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,We want to be frank and honest with all our ge...,"[{'rewards': 'Pledge without a reward'}, {'rew...",games/tabletop games,"[0, 2]",6.0,50,...,0,https://v2.kickstarter.com/1662721387-XBw1i2Sj...,Hustle Singapore,https://www.kickstarter.com/profile/hustlesg,https://www.kickstarter.com/projects/hustlesg/...,2022-08-20 09:52:01+00:00,2022-09-07 13:08:05+00:00,2022-09-07 13:08:05+00:00,https://www.kickstarter.com/projects/hustlesg/...,1
2,236207086,"Neovide, Waterless One-Stop Sous Vide Cooker",No more water containers and vacuum bags. With...,Story\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSous vide...,"With our years of experience with products, ou...","[{'rewards': 'Pledge without a reward'}, {'rew...",technology,"[0, 2, 407, 505, 814, 293, 350, 463]",289082.0,10000,...,1,https://v2.kickstarter.com/1662723490-gvkAMr9s...,The Space Tech,https://www.kickstarter.com/profile/thespacetech,https://www.kickstarter.com/projects/thespacet...,2022-06-30 09:28:52+00:00,2022-09-06 13:00:04+00:00,2022-09-06 13:00:04+00:00,https://www.kickstarter.com/projects/thespacet...,1
3,2128144913,Lit Cafe,Little Toasts of Happiness,StoryHi! \nLit Cafe is a small space to provid...,The concept is to offer affordable local food ...,"[{'rewards': 'Pledge without a reward'}, {'rew...",food/spaces,"[0, 7, 12, 30, 38, 108, 168]",170.0,12000,...,0,,Kay En,https://www.kickstarter.com/profile/hosum,https://www.kickstarter.com/projects/hosum/ho-...,2022-08-30 08:28:52+00:00,2022-09-06 04:29:02+00:00,2022-09-06 04:29:02+00:00,https://www.kickstarter.com/projects/hosum/ho-...,0
4,891970407,Runway Tarot & Golden Journey Tarot,When the fashion week come into Tarot.\nThis p...,Story\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,COLLABORATION\nThis is the third time that bot...,"[{'rewards': 'Pledge without a reward'}, {'rew...",art/painting,"[0, 2, 80, 85, 90, 94, 99, 150, 160, 165, 165,...",33599.0,6800,...,0,https://v2.kickstarter.com/1662720057-aeWt13h6...,Eugene Leong,https://www.kickstarter.com/profile/locationtarot,https://www.kickstarter.com/projects/locationt...,2022-07-18 14:48:29+00:00,2022-09-05 13:57:35+00:00,2022-09-05 13:57:35+00:00,https://www.kickstarter.com/projects/locationt...,1


In [None]:
'''
Polarity is float which lies in the range of [-1,1] where 1 means positive statement and -1 means a negative statement. Subjective sentences generally refer to personal opinion, emotion or judgment whereas objective refers to factual information. Subjectivity is also a float which lies in the range of [0,1].
'''
def generate_sentiment_features():
    df = df.dropna(subset=['description_story_processed', 'description_risks_processed', 'description_proocessed', 'rewards_processed']) # NOTE: put at top with other dropnas from other features?
    df["description_story_polarity"] = df["description_story_processed"].apply(lambda x: 
                   TextBlob(x).sentiment.polarity)
    df["description_story_subjectivity"] = df["description_story_processed"].apply(lambda x: 
                   TextBlob(x).sentiment.subjectivity)
    df["description_polarity"] = df["description_proocessed"].apply(lambda x: 
                   TextBlob(x).sentiment.polarity)
    df["description_subjectivity"] = df["description_proocessed"].apply(lambda x: 
                   TextBlob(x).sentiment.subjectivity)
    df["description_risks_polarity"] = df["description_risks_processed"].apply(lambda x: 
                   TextBlob(x).sentiment.polarity)
    df["description_risks_subjectivity"] = df["description_risks_processed"].apply(lambda x: 
                   TextBlob(x).sentiment.subjectivity)
    df["rewards_polarity"] = df["rewards_processed"].apply(lambda x: 
                   TextBlob(x).sentiment.polarity)
    df["rewards_subjectivity"] = df["rewards_processed"].apply(lambda x: 
                   TextBlob(x).sentiment.subjectivity)           

In [None]:
def generate word_count_features(): # omitted description due to word limit, word count likely similar for all projects
    df['description_story_word_count'] = df["description_story_processed"].apply(lambda x: len(str(x).split(" ")))
    df['description_risks_word_count'] = df["description_risks_processed"].apply(lambda x: len(str(x).split(" ")))
    df['rewards_word_count'] = df["rewards_processed"].apply(lambda x: len(str(x).split(" ")))

hold out on topic modelling first because it is unsupervised algorithm

In [30]:
# df = clean_text(df)
# #generate BOW on description_story 
# #Our spaCy model:
# nlp = en_core_web_md.load()
# # Tags I want to remove from the text
# removal= ['ADV','PRON','CCONJ','PUNCT','PART','DET','ADP','SPACE', 'NUM', 'SYM']
# tokens = []

# for story in nlp.pipe(df['description_story_processed']):
#    proj_tok = [token.lemma_.lower() for token in story if token.pos_ not in removal and not token.is_stop and token.is_alpha]
#    tokens.append(proj_tok)
    
# dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=1000)
# # No_below: Tokens that appear in less than 5 documents are filtered out.
# # No_above: Tokens that appear in more than 50% of the total corpus are also removed as default.
# # Keep_n: We limit ourselves to the top 1000 most frequent tokens (default is 100.000). Set to ‘None’ if you want to keep all.

# df['story_tokens'] = tokens
# print(len(df))
# print(len(tokens))

# df['story_tokens']
# dictionary = Dictionary(df['story_tokens'])

# corpus = [dictionary.doc2bow(doc) for doc in df['story_tokens']]

# lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, iterations=50, num_topics=10, workers = 4, passes=10)

## Combine all feature generating functions

In [None]:
result = clean_text(df)
