In [1]:
import pandas as pd 
import ast
import re
import numpy as np
import nltk
from textblob import TextBlob
from nltk import word_tokenize
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.util import ngrams
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import spacy
import pyLDAvis.gensim_models
import en_core_web_md
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaMulticore
from gensim.models import CoherenceModel

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

  from scipy.linalg.special_matrices import triu
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ivankoh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/ivankoh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ivankoh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/ivankoh/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
df = pd.read_csv("./data/kickstarter_train.csv")

In [3]:
# convert 'video' to a binary categorical variable
df['video'].value_counts()
df['has_video'] = df['video'].apply(lambda x: 0 if pd.isnull(x) else 1)
df['has_video'].value_counts()

1    1015
0     542
Name: has_video, dtype: int64

NLP features

In [4]:
# Text cleaning for: rewards, description, description story, description risks

def clean_text(df):
    def process_rewards(corpus):
    
        corpus_processed = []
        for row in corpus:
            row_processed = ""
            row = row.replace("\\n", " ")
            row = ast.literal_eval(row)

            for dict in row:
                row_processed += dict['rewards'].lower() + ' '
            
            
            row_processed = row_processed.replace("//",'')
            row_processed = re.sub(r'[^\w\s]', '', row_processed) # remove punctuation
            corpus_processed.append(row_processed)

        return corpus_processed
    
    def process_description_story(corpus):
        corpus_processed = []
        for row in corpus:
            row = str(row)
            row_processed = row.replace("\r", " " )
            row_processed = row_processed.replace("\n", " " )
            row_processed = re.sub(r'[^\w\s]', '', row_processed) # remove punctuation
            corpus_processed.append(row_processed if not pd.isnull(row_processed) else "") # handle NA

        return corpus_processed

    df["rewards_processed"] = process_rewards(df["rewards"])
    df["description_processed"] = process_description_story(df["description"])
    df["description_story_processed"] = process_description_story(df["description_story"])
    df["description_risks_processed"] = process_description_story(df["description_risks"])

    return df


In [5]:
class LemmatizeTokenizer(object):
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
    def __call__(self, text):
        return [self.lemmatizer.lemmatize(word) for word in word_tokenize(text)]

In [6]:
def generate_nlp_features(df):
    
    # Rewards

    vect_rewards = TfidfVectorizer( 
        tokenizer=LemmatizeTokenizer(),
        lowercase=True,
        analyzer='word', 
        ngram_range=(1,3), # unigram, bigram and trigram 
        max_features=100, # vocabulary that only consider the top max_features ordered by term frequency across the corpus
        min_df=10, # minimum word frequency required to be in model
        stop_words=stopwords.words('english') # remove stopwords
        )

    rewards_processed = pd.Series(df["rewards_processed"])
    tfidf_fit_rewards = vect_rewards.fit(rewards_processed)
    rewards_tfidf_array = tfidf_fit_rewards.transform(rewards_processed).toarray()
    rewards_tfidf_df = pd.DataFrame(rewards_tfidf_array)
    rewards_tfidf_df.columns = list(map(lambda x : "rewards_" + str(x), rewards_tfidf_df.columns))
    df = pd.merge(df, rewards_tfidf_df , left_index=True, right_index=True)
    

    # Description

    vect_description = TfidfVectorizer( 
        tokenizer=LemmatizeTokenizer(),
        lowercase=True,
        analyzer='word', 
        ngram_range=(1,3), # unigram, bigram and trigram 
        max_features=100, # vocabulary that only consider the top max_features ordered by term frequency across the corpus
        min_df=10, # minimum word frequency required to be in model
        stop_words=stopwords.words('english') # remove stopwords
        )

    description = pd.Series(df["description_processed"])
    tfidf_fit_description = vect_description.fit(description)
    description_tfidf_array = tfidf_fit_description.transform(description).toarray()
    description_tfidf_df = pd.DataFrame(description_tfidf_array)
    description_tfidf_df.columns = list(map(lambda x : "description_" + str(x), description_tfidf_df.columns))
    df = pd.merge(df, description_tfidf_df , left_index=True, right_index=True)


    # Description Story

    vect_description_story = TfidfVectorizer( 
        tokenizer=LemmatizeTokenizer(),
        lowercase=True,
        analyzer='word', 
        ngram_range=(1,3), # unigram, bigram and trigram 
        max_features=100, # vocabulary that only consider the top max_features ordered by term frequency across the corpus
        min_df=10, # minimum word frequency required to be in model
        stop_words=stopwords.words('english') # remove stopwords
        )
    
    description_story_processed = pd.Series(df["description_story_processed"])
    tfidf_fit_description_story_processed = vect_description_story.fit(description_story_processed)
    description_story_processed_tfidf_array = tfidf_fit_description_story_processed.transform(description_story_processed).toarray()
    description_story_tfidf_df = pd.DataFrame(description_story_processed_tfidf_array)
    description_story_tfidf_df.columns = list(map(lambda x : "description_story_" + str(x), description_story_tfidf_df.columns))
    df = pd.merge(df, description_story_tfidf_df , left_index=True, right_index=True)


    # Description Risks

    vect_description_risks = TfidfVectorizer( 
        tokenizer=LemmatizeTokenizer(),
        lowercase=True,
        analyzer='word', 
        ngram_range=(1,3), # unigram, bigram and trigram 
        max_features=100, # vocabulary that only consider the top max_features ordered by term frequency across the corpus
        min_df=10, # minimum word frequency required to be in model
        stop_words=stopwords.words('english') # remove stopwords
        )

    description_risks_processed = pd.Series(df["description_risks_processed"])
    tfidf_fit_description_risks_processed = vect_description_risks.fit(description_risks_processed)
    description_risks_processed_tfidf_array = tfidf_fit_description_risks_processed.transform(description_risks_processed).toarray()
    description_risks_tfidf_df = pd.DataFrame(description_risks_processed_tfidf_array)
    description_risks_tfidf_df.columns = list(map(lambda x : "description_risks_" + str(x), description_risks_tfidf_df.columns))
    df = pd.merge(df, description_risks_tfidf_df , left_index=True, right_index=True)

    return df

In [7]:
# test
#df = generate_nlp_features(clean_text(df))

In [8]:
for x in df.columns:
    print(x)

id
name
description
description_story
description_risks
rewards
category
pledged
goal
deadline
location
state
faq_count
update_count
backers_count
spotlight
staff_pick
video
launched_at
has_video


**Rewards Features**

In [9]:
def create_rewards_tiers(df):
    df["reward_tiers"] = df["rewards"].apply(lambda x : len(ast.literal_eval(x)))
    df = move_reward_tiers(df)
    return df

def create_all_reward_amount(df):
    df["all_reward_amount"] = np.empty((len(df), 0)).tolist()

    for i in range(len(df)):
        all_reward_amount = []
        dict_list = ast.literal_eval(df.iloc[i, 5]) # Converts rewards column into dictionary

        for dict in dict_list:
            values_string = str(dict.values())
            reward_title = re.search(r"Pledge S\$ \d{1,3}(,\d{1,3})? or more", values_string) # Search for all reward titles

            if reward_title is not None:
                reward_amount = re.search(r"\d{1,3}(,\d{1,3})?", reward_title.group()) # Search for only the digits in reward amount
                if reward_amount is not None:
                    all_reward_amount.append(reward_amount.group())
            else:
                all_reward_amount.append(0) # If no reward title is found, add 0
        df["all_reward_amount"][i] = all_reward_amount
    df = move_all_reward_amount(df)
    return df

# Rearange reward_tiers column to the right of rewards
def move_reward_tiers(df):
    cols = df.columns.tolist()
    cols = cols[:6] + [cols[-1]] + cols[6:-1]
    df = df[cols]
    return df

# Rearange all_reward_amount column to the right of reward_tiers
def move_all_reward_amount(df):
    cols = df.columns.tolist()
    cols = cols[:7] + [cols[-1]] + cols[7:-1]
    df = df[cols]
    return df

In [10]:
# Create reward tiers feature
# df = create_rewards_tiers(df)
# df.head()


In [11]:
# Create all reward amount feature
# If lowest reward amount is not 0, the project is either already fully funded or cancelled.
# Rewards should be sorted in ascending order, any amount to the right and less than the max means reward is no longer available.
# df = create_all_reward_amount(df)
# df.head()

In [12]:
'''
Polarity is float which lies in the range of [-1,1] where 1 means positive statement and -1 means a negative statement. Subjective sentences generally refer to personal opinion, emotion or judgment whereas objective refers to factual information. Subjectivity is also a float which lies in the range of [0,1].
'''
def generate_sentiment_features(df):
    df = df.dropna(subset=['description_story_processed', 'description_risks_processed', 'description_processed', 'rewards_processed']) # NOTE: put at top with other dropnas from other features?
    df["description_story_polarity"] = df["description_story_processed"].apply(lambda x: 
                   TextBlob(x).sentiment.polarity)
    df["description_story_subjectivity"] = df["description_story_processed"].apply(lambda x: 
                   TextBlob(x).sentiment.subjectivity)
    df["description_polarity"] = df["description_processed"].apply(lambda x: 
                   TextBlob(x).sentiment.polarity)
    df["description_subjectivity"] = df["description_processed"].apply(lambda x: 
                   TextBlob(x).sentiment.subjectivity)
    df["description_risks_polarity"] = df["description_risks_processed"].apply(lambda x: 
                   TextBlob(x).sentiment.polarity)
    df["description_risks_subjectivity"] = df["description_risks_processed"].apply(lambda x: 
                   TextBlob(x).sentiment.subjectivity)
    df["rewards_polarity"] = df["rewards_processed"].apply(lambda x: 
                   TextBlob(x).sentiment.polarity)
    df["rewards_subjectivity"] = df["rewards_processed"].apply(lambda x: 
                   TextBlob(x).sentiment.subjectivity)          
    return df

In [13]:
def generate_word_count_features(df): # omitted description due to word limit, word count likely similar for all projects
    df['description_story_word_count'] = df["description_story_processed"].apply(lambda x: len(str(x).split(" ")))
    df['description_risks_word_count'] = df["description_risks_processed"].apply(lambda x: len(str(x).split(" ")))
    df['rewards_word_count'] = df["rewards_processed"].apply(lambda x: len(str(x).split(" ")))
    return df

when to drop one of the OHE columns: 
https://stats.stackexchange.com/questions/231285/dropping-one-of-the-columns-when-using-one-hot-encoding

In [21]:
def ohe(col, df):
    enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
    enc.fit(df[col].unique().reshape(-1, 1))
    transformed = enc.transform(df[col].to_numpy().reshape(-1, 1))
    #Create a Pandas DataFrame of the hot encoded column
    ohe_df = pd.DataFrame(transformed, columns=enc.get_feature_names_out())
    #concat with original data
    data = pd.concat([df, ohe_df], axis=1).drop([col], axis=1)
    return data

hold out on topic modelling first because it is unsupervised algorithm

In [15]:
# df = clean_text(df)
# #generate BOW on description_story 
# #Our spaCy model:
# nlp = en_core_web_md.load()
# # Tags I want to remove from the text
# removal= ['ADV','PRON','CCONJ','PUNCT','PART','DET','ADP','SPACE', 'NUM', 'SYM']
# tokens = []

# for story in nlp.pipe(df['description_story_processed']):
#    proj_tok = [token.lemma_.lower() for token in story if token.pos_ not in removal and not token.is_stop and token.is_alpha]
#    tokens.append(proj_tok)
    
# dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=1000)
# # No_below: Tokens that appear in less than 5 documents are filtered out.
# # No_above: Tokens that appear in more than 50% of the total corpus are also removed as default.
# # Keep_n: We limit ourselves to the top 1000 most frequent tokens (default is 100.000). Set to ‘None’ if you want to keep all.

# df['story_tokens'] = tokens
# print(len(df))
# print(len(tokens))

# df['story_tokens']
# dictionary = Dictionary(df['story_tokens'])

# corpus = [dictionary.doc2bow(doc) for doc in df['story_tokens']]

# lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, iterations=50, num_topics=10, workers = 4, passes=10)

## Combine all feature generating functions

In [22]:
result = clean_text(df)
result = generate_nlp_features(result)
result = create_rewards_tiers(result)
result = create_all_reward_amount(result)
result = generate_sentiment_features(result)
result = generate_word_count_features(result)
result = ohe('category', result)
result = ohe('location', result)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["all_reward_amount"][i] = all_reward_amount


In [23]:
result

Unnamed: 0,id,name,description,description_story,description_risks,rewards,reward_tiers,all_reward_amount,pledged,goal,...,x0_technology/web,x0_theater/comedy,x0_theater/experimental,x0_theater/immersive,x0_theater/musical,x0_theater/plays,"x0_Orchard, Singapore","x0_Queenstown, Singapore","x0_Sembawang, Singapore","x0_Singapore, Singapore"
0,1691565384,Make 100: City maps weaved of cassette tapes,Personalised a map that contains curated music...,Story\n\n\n\n\n\n\n\n\n\n\n\n\n\nYou'll need a...,We thank all of our cassette tape donors who c...,[{'rewards': 'Pledge S$ 90 or more\n\nAbout $6...,4,"[90, 175, 249, 250]",2138.00,800,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,320949924,Retrograde Hard Enamel Pins,A series of Enamel Pins based on the combinati...,StoryA series of pins based on a set of illust...,- The possibility of minor changes in details ...,[{'rewards': 'Pledge S$ 10 or more\n\nAbout US...,9,"[10, 17, 17, 34, 34, 50, 50, 64, 64]",10476.00,1200,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1956852023,Owl-Carina: The Sound of Wings | MAKE 100,Cute handmade owl ocarinas and whistles. Potte...,Story\n\n\nOwl-carina\n\n\n\n\nWhat can these ...,Shipping and delivery might break the pieces. ...,[{'rewards': 'Pledge S$ 2 or more\n\nAbout $2\...,5,"[2, 24, 39, 68, 129]",7316.00,100,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1565012664,Alohomora - Magical Automated Safety Gate Unlo...,"Alohomora, an automated unlocking module for b...",,,"[{'rewards': ""Pledge S$ 2 or more\n\nAbout $2\...",7,"[2, 5, 10, 25, 39, 49, 89]",2437.00,3000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1255089705,Vario WW1 1918 Trench Watch,Vario's 3rd watch collection inspired by WW1 p...,StoryAll watches will come with a strap and ba...,We've had numerous successful crowdfunding cam...,[{'rewards': 'Pledge S$ 20 or more\n\nAbout US...,25,"[20, 32, 340, 340, 340, 340, 340, 340, 340, 34...",297149.03,30000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1552,730883734,Pokepuffs: RGB Starters Enamel Pins,A collection of Hard Enamel Pins featuring cut...,Story** ALL DESIGNS ARE UNLOCKED!**\n\n\n\n\n\...,There remains the possibility of delays in pro...,[{'rewards': 'Pledge S$ 12 or more\n\nAbout US...,17,"[12, 24, 36, 48, 60, 72, 84, 96, 105, 138, 173...",10478.00,300,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1553,1520786796,HOROCD - The Perfect Cleaning Solution for all...,The horophile’s solution to clean and shiny wa...,Story\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nWe are pr...,HOROCD has undergone intensive concoction and ...,[{'rewards': 'Pledge S$ 48 or more\n\nAbout $3...,9,"[48, 58, 68, 128, 158, 198, 258, 960, 1,780]",3614.00,2000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1554,209727311,"From where to wear, go afield with BOLDR Exped...",The BOLDR Expedition Swiss Automatic field wat...,,,[{'rewards': 'Pledge S$ 539 or more\n\nAbout $...,14,"[539, 539, 539, 539, 539, 579, 579, 579, 579, ...",229928.00,50000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1555,820148443,Kamen Rider Armor Sticker set,A sticker set of Kamen Rider Zio,StoryThis year we want to do at least one big ...,Prices might change. Singapore is going throug...,[{'rewards': 'Pledge S$ 10 or more\n\nAbout $8...,4,"[10, 10, 15, 25]",151.00,100,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## Export

In [24]:
import time 

result.to_csv(f'data/kickstarter_train_final_{time.strftime("%Y%m%d-%H%M%S")}.csv', index=False)

In [19]:
#since there could be more than one categories for each project, create new features to split the categories.
# df['new_category'] = df.category.str.split("/", expand=False)
# split_cat = pd.DataFrame(df['new_category'].tolist(), columns=['category1', 'category2'])
# #each project should at least have 1 category, 'category2' can be "None". 
# #'category1' being the main category for the project.
# df = pd.concat([df, split_cat], axis=1)
# df.head()