In [75]:
import pandas as pd 
import ast
import re
import numpy as np
import nltk
from textblob import TextBlob
from nltk import word_tokenize
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.util import ngrams
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import spacy
import pyLDAvis.gensim_models
# import en_core_web_md
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaMulticore
from gensim.models import CoherenceModel

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bandy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bandy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bandy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\bandy\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [76]:
df = pd.read_csv("./data/kickstarter_train.csv")

In [77]:
# convert 'video' to a binary categorical variable
df['video'].value_counts()
df['has_video'] = df['video'].apply(lambda x: 0 if pd.isnull(x) else 1)
df['has_video'].value_counts()

df['faq_count'] = df['faq_count'].apply(lambda x: float(x.replace(",", ""))) # transfer over to feature eng

## NLP features

In [78]:
# Text cleaning for: rewards, description, description story, description risks

def clean_text(df):
    def process_rewards(corpus):
    
        corpus_processed = []
        for row in corpus:
            row_processed = ""
            row = row.replace("\\n", " ")
            row = ast.literal_eval(row)

            for dict in row:
                row_processed += dict['rewards'].lower() + ' '
            
            
            row_processed = row_processed.replace("//",'')
            row_processed = re.sub(r'[^\w\s]', '', row_processed) # remove punctuation
            corpus_processed.append(row_processed)

        return corpus_processed
    
    def process_description_story(corpus):
        corpus_processed = []
        for row in corpus:
            row = str(row)
            row_processed = row.replace("\r", " " )
            row_processed = row_processed.replace("\n", " " )
            row_processed = re.sub(r'[^\w\s]', '', row_processed) # remove punctuation
            corpus_processed.append(row_processed if not pd.isnull(row_processed) else "") # handle NA

        return corpus_processed

    df["rewards_processed"] = process_rewards(df["rewards"])
    df["description_processed"] = process_description_story(df["description"])
    df["description_story_processed"] = process_description_story(df["description_story"])
    df["description_risks_processed"] = process_description_story(df["description_risks"])

    return df


In [79]:
class LemmatizeTokenizer(object):
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
    def __call__(self, text):
        return [self.lemmatizer.lemmatize(word) for word in word_tokenize(text)]

In [80]:
# need to return the vectorizer to use on the test data to transform it 
def generate_nlp_features(df):
    
    # Rewards

    vect_rewards = TfidfVectorizer( 
        tokenizer=LemmatizeTokenizer(),
        lowercase=True,
        analyzer='word', 
        ngram_range=(1,3), # unigram, bigram and trigram 
        max_features=100, # vocabulary that only consider the top max_features ordered by term frequency across the corpus
        min_df=10, # minimum word frequency required to be in model
        stop_words=stopwords.words('english') # remove stopwords
        )

    rewards_processed = pd.Series(df["rewards_processed"])
    tfidf_fit_rewards = vect_rewards.fit(rewards_processed)
    rewards_tfidf_array = tfidf_fit_rewards.transform(rewards_processed).toarray()
    rewards_tfidf_df = pd.DataFrame(rewards_tfidf_array)
    rewards_tfidf_df.columns = list(map(lambda x : "rewards_" + str(x), rewards_tfidf_df.columns))
    df = pd.merge(df, rewards_tfidf_df , left_index=True, right_index=True)
    

    # Description

    vect_description = TfidfVectorizer( 
        tokenizer=LemmatizeTokenizer(),
        lowercase=True,
        analyzer='word', 
        ngram_range=(1,3), # unigram, bigram and trigram 
        max_features=100, # vocabulary that only consider the top max_features ordered by term frequency across the corpus
        min_df=10, # minimum word frequency required to be in model
        stop_words=stopwords.words('english') # remove stopwords
        )

    description = pd.Series(df["description_processed"])
    tfidf_fit_description = vect_description.fit(description)
    description_tfidf_array = tfidf_fit_description.transform(description).toarray()
    description_tfidf_df = pd.DataFrame(description_tfidf_array)
    description_tfidf_df.columns = list(map(lambda x : "description_" + str(x), description_tfidf_df.columns))
    df = pd.merge(df, description_tfidf_df , left_index=True, right_index=True)


    # Description Story

    vect_description_story = TfidfVectorizer( 
        tokenizer=LemmatizeTokenizer(),
        lowercase=True,
        analyzer='word', 
        ngram_range=(1,3), # unigram, bigram and trigram 
        max_features=100, # vocabulary that only consider the top max_features ordered by term frequency across the corpus
        min_df=10, # minimum word frequency required to be in model
        stop_words=stopwords.words('english') # remove stopwords
        )
    
    description_story_processed = pd.Series(df["description_story_processed"])
    tfidf_fit_description_story_processed = vect_description_story.fit(description_story_processed)
    description_story_processed_tfidf_array = tfidf_fit_description_story_processed.transform(description_story_processed).toarray()
    description_story_tfidf_df = pd.DataFrame(description_story_processed_tfidf_array)
    description_story_tfidf_df.columns = list(map(lambda x : "description_story_" + str(x), description_story_tfidf_df.columns))
    df = pd.merge(df, description_story_tfidf_df , left_index=True, right_index=True)


    # Description Risks

    vect_description_risks = TfidfVectorizer( 
        tokenizer=LemmatizeTokenizer(),
        lowercase=True,
        analyzer='word', 
        ngram_range=(1,3), # unigram, bigram and trigram 
        max_features=100, # vocabulary that only consider the top max_features ordered by term frequency across the corpus
        min_df=10, # minimum word frequency required to be in model
        stop_words=stopwords.words('english') # remove stopwords
        )

    description_risks_processed = pd.Series(df["description_risks_processed"])
    tfidf_fit_description_risks_processed = vect_description_risks.fit(description_risks_processed)
    description_risks_processed_tfidf_array = tfidf_fit_description_risks_processed.transform(description_risks_processed).toarray()
    description_risks_tfidf_df = pd.DataFrame(description_risks_processed_tfidf_array)
    description_risks_tfidf_df.columns = list(map(lambda x : "description_risks_" + str(x), description_risks_tfidf_df.columns))
    df = pd.merge(df, description_risks_tfidf_df , left_index=True, right_index=True)

    return (df, tfidf_fit_rewards, tfidf_fit_description, tfidf_fit_description_story_processed, tfidf_fit_description_risks_processed)


def nlp_transform_test(df, tfidf_fit_rewards, tfidf_fit_description, tfidf_fit_description_story_processed, tfidf_fit_description_risks_processed):
    rewards_tfidf_array = tfidf_fit_rewards.transform(df['rewards_processed']).toarray()
    rewards_tfidf_df = pd.DataFrame(rewards_tfidf_array)
    rewards_tfidf_df.columns = list(map(lambda x : "rewards_" + str(x), rewards_tfidf_df.columns))
    df = pd.merge(df, rewards_tfidf_df , left_index=True, right_index=True)
    
    description_tfidf_array = tfidf_fit_description.transform(df['description_processed']).toarray()
    description_tfidf_df = pd.DataFrame(description_tfidf_array)
    description_tfidf_df.columns = list(map(lambda x : "description_" + str(x), description_tfidf_df.columns))
    df = pd.merge(df, description_tfidf_df , left_index=True, right_index=True)
    
    description_story_processed_tfidf_array = tfidf_fit_description_story_processed.transform(df['description_story_processed']).toarray()
    description_story_tfidf_df = pd.DataFrame(description_story_processed_tfidf_array)
    description_story_tfidf_df.columns = list(map(lambda x : "description_story_" + str(x), description_story_tfidf_df.columns))
    df = pd.merge(df, description_story_tfidf_df , left_index=True, right_index=True)
    
    description_risks_processed_tfidf_array = tfidf_fit_description_risks_processed.transform(df['description_risks_processed']).toarray()
    description_risks_tfidf_df = pd.DataFrame(description_risks_processed_tfidf_array)
    description_risks_tfidf_df.columns = list(map(lambda x : "description_risks_" + str(x), description_risks_tfidf_df.columns))
    df = pd.merge(df, description_risks_tfidf_df , left_index=True, right_index=True)
    
    return df


In [81]:
for x in df.columns:
    print(x)

id
name
description
description_story
description_risks
rewards
category
pledged
goal
deadline
location
state
faq_count
update_count
backers_count
spotlight
staff_pick
video
launched_at
has_video


## Rewards Features

<font color="red"><strong>I dont think list can be a input to ML</strong></font>

In [82]:
def create_rewards_tiers(df):
    df["reward_tiers"] = df["rewards"].apply(lambda x : len(ast.literal_eval(x)))
    df = move_reward_tiers(df)
    return df

def create_min_max_reward(df):
    df["min_reward"] = 0
    df["max_reward"] = 0

    for i in range(len(df)):
        all_reward_amount = []
        dict_list = ast.literal_eval(df.iloc[i, 5]) # Converts rewards column into dictionary

        for dict in dict_list:
            values_string = str(dict.values())
            reward_title = re.search(r"Pledge S\$ \d{1,3}(,\d{1,3})? or more", values_string) # Search for all reward titles

            if reward_title is not None:
                reward_amount = re.search(r"\d{1,3}(,\d{1,3})?", reward_title.group()) # Search for only the digits in reward amount
                if reward_amount is not None:
                    integer_amount = re.sub("[^\d\.]", "", reward_amount.group())
                    all_reward_amount.append(int(integer_amount))
            else:
                all_reward_amount.append(0) # If no reward title is found, add 0
                
        if len(all_reward_amount) > 0:
            df["min_reward"][i] = min(all_reward_amount)
            df["max_reward"][i] = max(all_reward_amount)   
    df = move_min_max_reward(df)
    return df

# Rearange reward_tiers column to the right of rewards
def move_reward_tiers(df):
    cols = df.columns.tolist()
    cols = cols[:6] + [cols[-1]] + cols[6:-1]
    df = df[cols]
    return df

# Rearange min and max reward columns to the right of reward_tiers
def move_min_max_reward(df):
    cols = df.columns.tolist()
    cols = cols[:7] + cols[-2:] + cols[7:-2]
    df = df[cols]
    return df

  integer_amount = re.sub("[^\d\.]", "", reward_amount.group())


## Sentiment Features

In [84]:
'''
Polarity is float which lies in the range of [-1,1] where 1 means positive statement and -1 means a negative statement. Subjective sentences generally refer to personal opinion, emotion or judgment whereas objective refers to factual information. Subjectivity is also a float which lies in the range of [0,1].
'''
def generate_sentiment_features(df):
    df = df.dropna(subset=['description_story_processed', 'description_risks_processed', 'description_processed', 'rewards_processed']) # NOTE: put at top with other dropnas from other features?
    df["description_story_polarity"] = df["description_story_processed"].apply(lambda x: 
                   TextBlob(x).sentiment.polarity)
    df["description_story_subjectivity"] = df["description_story_processed"].apply(lambda x: 
                   TextBlob(x).sentiment.subjectivity)
    df["description_polarity"] = df["description_processed"].apply(lambda x: 
                   TextBlob(x).sentiment.polarity)
    df["description_subjectivity"] = df["description_processed"].apply(lambda x: 
                   TextBlob(x).sentiment.subjectivity)
    df["description_risks_polarity"] = df["description_risks_processed"].apply(lambda x: 
                   TextBlob(x).sentiment.polarity)
    df["description_risks_subjectivity"] = df["description_risks_processed"].apply(lambda x: 
                   TextBlob(x).sentiment.subjectivity)
    df["rewards_polarity"] = df["rewards_processed"].apply(lambda x: 
                   TextBlob(x).sentiment.polarity)
    df["rewards_subjectivity"] = df["rewards_processed"].apply(lambda x: 
                   TextBlob(x).sentiment.subjectivity)          
    return df

In [85]:
def generate_word_count_features(df): # omitted description due to word limit, word count likely similar for all projects
    df['description_story_word_count'] = df["description_story_processed"].apply(lambda x: len(str(x).split(" ")))
    df['description_risks_word_count'] = df["description_risks_processed"].apply(lambda x: len(str(x).split(" ")))
    df['rewards_word_count'] = df["rewards_processed"].apply(lambda x: len(str(x).split(" ")))
    return df

## One-hot Encoding of Categorical Variables

when to drop one of the OHE columns: 
https://stats.stackexchange.com/questions/231285/dropping-one-of-the-columns-when-using-one-hot-encoding

In [86]:
def ohe_transform(enc, col, df):
    transformed = enc.transform(df[col].to_numpy().reshape(-1, 1))
    #Create a Pandas DataFrame of the hot encoded column
    ohe_df = pd.DataFrame(transformed, columns=enc.get_feature_names_out())
    #concat with original data
    data = pd.concat([df, ohe_df], axis=1)#.drop([col], axis=1)
    return data

def ohe_fit(col, df):
    enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
    enc.fit(df[col].unique().reshape(-1, 1))
    return (ohe_transform(enc, col, df), enc)

hold out on topic modelling first because it is unsupervised algorithm

## Combine all feature generating functions

In [87]:
result = create_rewards_tiers(df)
result = create_min_max_reward(result)
result = clean_text(result)
result, tfidf_fit_rewards, tfidf_fit_description, tfidf_fit_description_story_processed, tfidf_fit_description_risks_processed = generate_nlp_features(result)
result = generate_sentiment_features(result)
result = generate_word_count_features(result)
result, category_encoder = ohe_fit('category', result) #use encoder to fit train data
result, location_encoder = ohe_fit('location', result) #use encoder to fit train data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["min_reward"] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["max_reward"] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["min_reward"][i] = min(all_reward_amount)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#

## Apply same feature engineering on Test Data

In [88]:
test = pd.read_csv("./data/kickstarter_test.csv")

test = clean_text(test)
test['has_video'] = test['video'].apply(lambda x: 0 if pd.isnull(x) else 1)
test['faq_count'] = test['faq_count'].apply(lambda x: float(x.replace(",", ""))) # transfer over to feature eng
#test = generate_nlp_features(test)
test = nlp_transform_test(test, tfidf_fit_rewards, tfidf_fit_description, tfidf_fit_description_story_processed, tfidf_fit_description_risks_processed)
test = create_rewards_tiers(test)
test = create_min_max_reward(test)
test = generate_sentiment_features(test)
test = generate_word_count_features(test)

#encoders
test = ohe_transform(category_encoder, 'category', test)
test = ohe_transform(location_encoder, 'location', test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["min_reward"][i] = min(all_reward_amount)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["max_reward"][i] = max(all_reward_amount)


In [89]:
print(len(test.columns))
print(len(result.columns))

for i in result.columns:
    if i not in test.columns:
        print(i)

563
563


## Export

In [90]:
import time 

result.to_csv(f'data/train/kickstarter_train_final_{time.strftime("%Y%m%d-%H%M%S")}.csv', index=False)
test.to_csv(f'data/test/kickstarter_test_final_{time.strftime("%Y%m%d-%H%M%S")}.csv', index=False)

In [91]:
#since there could be more than one categories for each project, create new features to split the categories.
# df['new_category'] = df.category.str.split("/", expand=False)
# split_cat = pd.DataFrame(df['new_category'].tolist(), columns=['category1', 'category2'])
# #each project should at least have 1 category, 'category2' can be "None". 
# #'category1' being the main category for the project.
# df = pd.concat([df, split_cat], axis=1)
# df.head()