In [64]:
import pandas as pd 
import ast
import re
import numpy as np
import nltk
from nltk import word_tokenize
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.util import ngrams
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


In [65]:
df = pd.read_csv("./data/Kickstarter_merged.csv")
df.head(3)
df_copy = df.copy()

In [66]:
df_copy.describe()

Unnamed: 0,id,pledged,goal,update_count,backers_count,is_starrable,spotlight,staff_pick
count,2132.0,2132.0,2132.0,2132.0,2132.0,2132.0,2132.0,2132.0
mean,1077279000.0,13784.319747,66046.2,5.312383,119.403846,0.009381,0.432458,0.045966
std,622544700.0,41074.680798,2166310.0,8.216737,547.130104,0.096422,0.495533,0.209461
min,1454620.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
25%,537066300.0,147.75,1200.0,0.0,4.0,0.0,0.0,0.0
50%,1105835000.0,1594.21,5000.0,2.0,23.0,0.0,0.0,0.0
75%,1622623000.0,9547.25,15000.0,8.0,88.0,0.0,1.0,0.0
max,2145266000.0,515413.0,100000000.0,67.0,18768.0,1.0,1.0,1.0


In [67]:
df_copy.head()

Unnamed: 0,id,name,description,description_story,description_risks,rewards,category,pledged,goal,deadline,...,spotlight,staff_pick,video,creator_name,creator_url,url,created_at,published_at,launched_at,link
0,1249154571,Bunny Care Clinic Pin and Apparel Collection,A small collection of Bunny themed enamel pins...,Story\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\...,We try our best to keep everything on schedule...,"[{'rewards': 'Pledge without a reward'}, {'rew...",art/illustration,14115.0,700,2022-10-04 01:25:20,...,0,0,,Labutori,https://www.kickstarter.com/profile/labutori,https://www.kickstarter.com/projects/labutori/...,2022-05-25 03:28:55+00:00,2022-09-09 01:25:20+00:00,2022-09-09 01:25:20+00:00,https://www.kickstarter.com/projects/labutori/...
1,1276054891,Hustle: A Singaporean Card Game,Hustle: A Singaporean Card Game is a funny and...,Story\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\...,We want to be frank and honest with all our ge...,"[{'rewards': 'Pledge without a reward'}, {'rew...",games/tabletop games,6.0,50,2022-10-07 13:08:05,...,0,0,https://v2.kickstarter.com/1662721387-XBw1i2Sj...,Hustle Singapore,https://www.kickstarter.com/profile/hustlesg,https://www.kickstarter.com/projects/hustlesg/...,2022-08-20 09:52:01+00:00,2022-09-07 13:08:05+00:00,2022-09-07 13:08:05+00:00,https://www.kickstarter.com/projects/hustlesg/...
2,236207086,"Neovide, Waterless One-Stop Sous Vide Cooker",No more water containers and vacuum bags. With...,Story\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\...,"With our years of experience with products, ou...","[{'rewards': 'Pledge without a reward'}, {'rew...",technology,289082.0,10000,2022-10-06 13:00:04,...,0,1,https://v2.kickstarter.com/1662723490-gvkAMr9s...,The Space Tech,https://www.kickstarter.com/profile/thespacetech,https://www.kickstarter.com/projects/thespacet...,2022-06-30 09:28:52+00:00,2022-09-06 13:00:04+00:00,2022-09-06 13:00:04+00:00,https://www.kickstarter.com/projects/thespacet...
3,2128144913,Lit Cafe,Little Toasts of Happiness,StoryHi! \r\nLit Cafe is a small space to prov...,The concept is to offer affordable local food ...,"[{'rewards': 'Pledge without a reward'}, {'rew...",food/spaces,170.0,12000,2022-10-02 13:09:00,...,0,0,,Kay En,https://www.kickstarter.com/profile/hosum,https://www.kickstarter.com/projects/hosum/ho-...,2022-08-30 08:28:52+00:00,2022-09-06 04:29:02+00:00,2022-09-06 04:29:02+00:00,https://www.kickstarter.com/projects/hosum/ho-...
4,891970407,Runway Tarot & Golden Journey Tarot,When the fashion week come into Tarot.\r\nThis...,Story\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\...,COLLABORATION\r\nThis is the third time that b...,"[{'rewards': 'Pledge without a reward'}, {'rew...",art/painting,33599.0,6800,2022-10-05 13:57:35,...,0,0,https://v2.kickstarter.com/1662720057-aeWt13h6...,Eugene Leong,https://www.kickstarter.com/profile/locationtarot,https://www.kickstarter.com/projects/locationt...,2022-07-18 14:48:29+00:00,2022-09-05 13:57:35+00:00,2022-09-05 13:57:35+00:00,https://www.kickstarter.com/projects/locationt...


In [68]:
# process rewards string
from dataclasses import replace


def process_rewards(corpus):
    
    corpus_processed = []
    for row in corpus:
        row_processed = ""
        row = row.replace("\\n", " ")
        row = ast.literal_eval(row)

        for dict in row:
            row_processed += dict['rewards'].lower() + ' '
        
        
        row_processed = row_processed.replace("//",'')
        row_processed = re.sub(r'[^\w\s]', '', row_processed) # remove punctuation
        corpus_processed.append(row_processed)

    return corpus_processed

df_copy["rewards_processed"] = process_rewards(df_copy["rewards"])


In [69]:
df_copy["rewards_processed"]

0       pledge without a reward pledge s 12 or more  a...
1       pledge without a reward pledge s 2 or more  ab...
2       pledge without a reward pledge s 2 or more  ab...
3       pledge without a reward pledge s 7 or more  ab...
4       pledge without a reward pledge s 2 or more  ab...
                              ...                        
2127    pledge s 5 or more  about us 4    a thank you ...
2128    pledge s 40 or more  about 29    handmade by d...
2129    pledge s 7 or more  about us 6    city rawker ...
2130    pledge s 2 or more  about 2    every dollar co...
2131    pledge s 50 or more  about us 36    the online...
Name: rewards_processed, Length: 2132, dtype: object

In [70]:
# process description story
def process_description_story(corpus):
    corpus_processed = []
    for row in corpus:
        row = str(row)
        row_processed = row.replace("\r", " " )
        row_processed = row_processed.replace("\n", " " )
        row_processed = re.sub(r'[^\w\s]', '', row_processed) # remove punctuation
        corpus_processed.append(row_processed)

    return corpus_processed

df_copy["description_processed"] = process_description_story(df_copy["description"])
df_copy["description_story_processed"] = process_description_story(df_copy["description_story"])
df_copy["description_risks_processed"] = process_description_story(df_copy["description_risks"])

In [71]:
df_copy["description_processed"]

0       A small collection of Bunny themed enamel pins...
1       Hustle A Singaporean Card Game is a funny and ...
2       No more water containers and vacuum bags With ...
3                              Little Toasts of Happiness
4       When the fashion week come into Tarot  This pr...
                              ...                        
2127    Help me make flowers in Singapore affordable a...
2128    A walletcard keeper handfolded from leather Fu...
2129    Record App Studio features karaoke singing fro...
2130    Reinventing the romance genre where Happily Ev...
2131    The Online Deals is a location based discount ...
Name: description_processed, Length: 2132, dtype: object

In [72]:
df_copy["description_story_processed"][1]

'Story                                                                                                                                                                Support Us  Back HUSTLE and be one of the very first to own an exclusive digital version of a Singaporean card game about the hustle culture\xa0  Music credits Bensound Royalty Free Music  The Backstory  Hi there my name is Ah Long and Im the lead creator of HUSTLE  I would deeply appreciate your help and support to get this passion project started Your contributions will go a long way toward developing the local board game community alongside helping me to improve the products artwork quality and potential physical launch\xa0  Do note that due to logistical challenges we are currently providing backers an ecopy of the game for you to print cut out and play on your own We hope to be able to produce physical copies down the road  The concept of Hustle A Singaporean Card Game arose when I was considering the impact of the pa

In [73]:
df_copy["description_risks_processed"][1]

'We want to be frank and honest with all our generous backers so that you get the most genuine experience with our product      FUNDING   Kickstarter has an AllorNothing policy where the campaign gets 0 funding if the campaign doesnt reach its target It would mean so much to the HUSTLE team if you could help to spread and share the word about this game so it becomes a reality for you and for us  The HUSTLE team genuinely believes in this game being able to start new conversations about what it means to be financially independent in Singapore in this day and age The risk we are taking with Kickstarter is that if we are unable to be fully funded we are not able to get the funding needed'

In [74]:
X, y = df_copy[['id', 'name', 'description', 'description_story', 'description_risks',
       'rewards', 'category', 'pledged', 'goal', 'deadline', 'location',
       'state', 'faq_count', 'update_count', 'backers_count', 'is_starrable',
       'spotlight', 'staff_pick', 'video', 'creator_name', 'creator_url',
       'url', 'created_at', 'published_at', 'launched_at', 'link', 'rewards_processed', "description_processed", "description_story_processed", "description_risks_processed"]], df_copy[["state"]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2022)

In [75]:
print(X_train.shape, X_test.shape)

(1492, 30) (640, 30)


Pipeline:

In [76]:
## y
# one hot encoding for `state`
one_hot_fit_state = OneHotEncoder().fit(y_train)
one_hot_state = one_hot_fit_state.transform(y_train).toarray()


In [77]:
one_hot_state

array([[0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       ...,
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.]])

In [78]:
class LemmatizeTokenizer(object):
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
    def __call__(self, text):
        return [self.lemmatizer.lemmatize(word) for word in word_tokenize(text)]

In [79]:
# Rewards

vect_rewards = TfidfVectorizer( 
    tokenizer=LemmatizeTokenizer(),
    lowercase=True,
    analyzer='word', 
    ngram_range=(1,3), # unigram, bigram and trigram 
    max_features=100, # vocabulary that only consider the top max_features ordered by term frequency across the corpus
    min_df=10, # minimum word frequency required to be in model
    stop_words=stopwords.words('english') # remove stopwords
    )

rewards_processed = pd.Series(X_train["rewards_processed"])
tfidf_fit_rewards = vect_rewards.fit(rewards_processed)
rewards_tfidf_array = tfidf_fit_rewards.transform(rewards_processed).toarray()



In [80]:
print(rewards_tfidf_array.shape)
print(vect_rewards.get_feature_names())
rewards_tfidf_array

(1492, 100)
['0', '0 backer', '0 backer pledge', '1', '1 backer', '10', '100', '2', '20', '2017', '2017 ship', '2017 ship anywhere', '2018', '2018 ship', '2018 ship anywhere', '2019', '2019 ship', '2019 ship anywhere', '2020', '2020 ship', '2020 ship anywhere', '2021', '2021 ship', '2021 ship anywhere', '2022', '2022 ship', '3', '4', '5', '50', '6', '8', 'add', 'addons', 'addons pledge', 'anywhere', 'anywhere world', 'anywhere world 0', 'available', 'backer', 'backer limited', 'backer pledge', 'bird', 'book', 'campaign', 'card', 'choice', 'choose', 'copy', 'dec', 'deck', 'delivery', 'delivery dec', 'design', 'digital', 'early', 'early bird', 'edition', 'enamel', 'enamel pin', 'estimated', 'estimated delivery', 'estimated delivery dec', 'exclusive', 'free', 'get', 'includes', 'kickstarter', 'le', 'le estimated', 'le estimated delivery', 'left', 'limited', 'may', 'one', 'pack', 'pin', 'pledge', 'price', 'receive', 'retail', 'retail price', 'reward', 'save', 'set', 'ship', 'ship anywhere'



array([[0.03355258, 0.03357599, 0.        , ..., 0.04115338, 0.04115338,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.48582346],
       [0.10749644, 0.10757144, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.02834937, 0.02836915, 0.03591917, ..., 0.03477147, 0.03477147,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [81]:
# Description

vect_description = TfidfVectorizer( 
    tokenizer=LemmatizeTokenizer(),
    lowercase=True,
    analyzer='word', 
    ngram_range=(1,3), # unigram, bigram and trigram 
    max_features=100, # vocabulary that only consider the top max_features ordered by term frequency across the corpus
    min_df=10, # minimum word frequency required to be in model
    stop_words=stopwords.words('english') # remove stopwords
    )

description = pd.Series(X_train["description_processed"])
tfidf_fit_description = vect_description.fit(description)
description_tfidf_array = tfidf_fit_description.transform(description).toarray()

In [82]:
print(description_tfidf_array.shape)
print(vect_description.get_feature_names())
description_tfidf_array

(1492, 100)
['100', '2', 'adventure', 'affordable', 'animal', 'app', 'art', 'bag', 'based', 'book', 'brand', 'bring', 'card', 'card game', 'character', 'child', 'collection', 'create', 'creating', 'cute', 'day', 'deck', 'design', 'designed', 'device', 'different', 'dream', 'enamel', 'enamel pin', 'everyday', 'experience', 'fashion', 'feature', 'featuring', 'film', 'first', 'food', 'friend', 'fun', 'game', 'get', 'go', 'good', 'hand', 'handmade', 'hard', 'hard enamel', 'hard enamel pin', 'help', 'home', 'inspired', 'leather', 'let', 'life', 'light', 'little', 'love', 'made', 'make', 'making', 'material', 'movement', 'music', 'natural', 'need', 'never', 'new', 'one', 'organic', 'people', 'pin', 'platform', 'play', 'player', 'playing', 'project', 'quality', 'series', 'set', 'simple', 'singapore', 'singaporean', 'sticker', 'story', 'style', 'sustainable', 'tarot', 'time', 'travel', 'u', 'unique', 'use', 'wallet', 'want', 'watch', 'way', 'without', 'woman', 'world', 'year']


array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.45420252, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [83]:
pd.merge(X_train, pd.DataFrame(description_tfidf_array), left_index=True, right_index=True) 


Unnamed: 0,id,name,description,description_story,description_risks,rewards,category,pledged,goal,deadline,...,90,91,92,93,94,95,96,97,98,99
955,706124465,Arch Legends TCG,A trading card game where players immerse in o...,Story\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\...,The Arch Legends project took over two years t...,[{'rewards': 'Pledge S$ 70 or more\n\nAbout $5...,games/playing cards,26147.00,100000,2022-06-23 11:59:17,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
197,716949770,Editable RPG Dungeon Tile Maps in Microsoft Po...,Tile Map Templates for your RPG adventure and ...,Story\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n If you k...,We have tested both the font and Microsoft Pow...,[{'rewards': 'Pledge S$ 3 or more\n\nAbout $3\...,games/gaming hardware,310.10,100,2021-08-26 04:42:24,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
724,1590447427,Looking For A New Leather Watch Strap? Start D...,Customizable hand made leather watch straps in...,Story\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\...,As all straps are hand made in our studio and ...,[{'rewards': 'Pledge S$ 95 or more\n\nAbout $6...,fashion/accessories,2191.00,2000,2018-06-21 17:43:37,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1112,1769605617,Usual Suspects - Designer Toys That Make a Dif...,Handcrafted toys to represent the world we liv...,StoryPhew!.. What a year it has been.. \r\nWe ...,"Due to COVID-19 and its restrictions, we had d...","[{'rewards': ""Pledge S$ 29 or more\n\nAbout $2...",design/toys,2206.00,60000,2021-01-15 16:07:23,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1391,264886056,Dave the mat,A picnic mat that can be laid down in seconds ...,Story\r\n\r\nDave The Mat\r\nDave is a picnic ...,Product may be slightly off with the measureme...,[{'rewards': 'Pledge S$ 10 or more\n\nAbout $8...,design/product design,788.00,4000,2019-03-07 14:44:17,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
951,662646532,Unhinged Extravagance: Not Your Expected MOBA,MOBA games tend to have the same type of chara...,Story\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\nMOBA game...,Gamepleton and Unhinged Extravagance are owned...,"[{'rewards': ""Pledge S$ 2 or more\n\nAbout US$...",games/video games,55.00,33000,2022-07-09 01:57:24,...,0.0,0.0,0.0,0.0,0.490666,0.0,0.0,0.0,0.0,0.0
624,1226242515,The First Reversible Leather Belt with Micro-a...,Craftmark is the first premium 2-in-1 leather ...,,,[{'rewards': 'Pledge S$ 5 or more\n\nAbout $4\...,design/product design,34775.00,16473,2019-02-18 03:00:00,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
173,1050704977,No Brainer #1-2 : A Solo Comix Anthology,Issue 2 of a solo anthology series featuring s...,StoryNo-Brainer #2! \r\n Welcome back for the...,The book is all done and ready for printing. ...,[{'rewards': 'Pledge S$ 2 or more\n\nAbout $2\...,comics/comic books,2549.00,800,2021-10-01 13:04:03,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1244,1769301520,Stampede - A Game of Racing Critters and Anima...,It’s simple: pick an animal and attempt to sur...,Story\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\...,Risks and challenges\r\nFor all further querie...,[{'rewards': 'Pledge S$ 2 or more\n\nAbout $2\...,games/tabletop games,3980.00,10000,2020-02-18 11:00:00,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0


In [84]:
# Description Story

vect_description_story = TfidfVectorizer( 
    tokenizer=LemmatizeTokenizer(),
    lowercase=True,
    analyzer='word', 
    ngram_range=(1,3), # unigram, bigram and trigram 
    max_features=100, # vocabulary that only consider the top max_features ordered by term frequency across the corpus
    min_df=10, # minimum word frequency required to be in model
    stop_words=stopwords.words('english') # remove stopwords
    )

description_story_processed = pd.Series(X_train["description_story_processed"])
tfidf_fit_description_story_processed = vect_description_story.fit(description_story_processed)
description_story_processed_tfidf_array = tfidf_fit_description_story_processed.transform(description_story_processed).toarray()



In [85]:
print(description_story_processed_tfidf_array.shape)
print(vect_description_story.get_feature_names())
description_story_processed_tfidf_array

(1492, 100)
['0000', '1', '2', '3', 'able', 'also', 'around', 'art', 'available', 'back', 'backer', 'bag', 'best', 'book', 'campaign', 'card', 'case', 'come', 'content', 'cost', 'create', 'day', 'deck', 'design', 'designed', 'different', 'end', 'even', 'every', 'experience', 'feature', 'first', 'free', 'game', 'get', 'give', 'go', 'goal', 'good', 'ha', 'hand', 'help', 'hope', 'kickstarter', 'leather', 'life', 'like', 'look', 'love', 'made', 'make', 'making', 'many', 'material', 'may', 'much', 'need', 'new', 'one', 'part', 'people', 'pin', 'play', 'player', 'please', 'pledge', 'price', 'product', 'production', 'project', 'quality', 'reward', 'see', 'set', 'shipping', 'singapore', 'size', 'sound', 'story', 'strap', 'stretch', 'support', 'take', 'time', 'two', 'u', 'use', 'used', 'using', 'wa', 'want', 'watch', 'way', 'well', 'work', 'world', 'would', 'x', 'year', 'youll']




array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.3192699 , 0.05841054, ..., 0.54401594, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.24373262, 0.20268622, ..., 0.22023757, 0.0217347 ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [86]:
# Description Risks

vect_description_risks = TfidfVectorizer( 
    tokenizer=LemmatizeTokenizer(),
    lowercase=True,
    analyzer='word', 
    ngram_range=(1,3), # unigram, bigram and trigram 
    max_features=100, # vocabulary that only consider the top max_features ordered by term frequency across the corpus
    min_df=10, # minimum word frequency required to be in model
    stop_words=stopwords.words('english') # remove stopwords
    )

description_risks_processed = pd.Series(X_train["description_risks_processed"])
tfidf_fit_description_risks_processed = vect_description_risks.fit(description_risks_processed)
description_risks_processed_tfidf_array = tfidf_fit_description_risks_processed.transform(description_risks_processed).toarray()


In [87]:
print(description_risks_processed_tfidf_array.shape)
print(vect_description_risks.get_feature_names())
description_risks_processed_tfidf_array

(1492, 100)
['1', '2', 'able', 'already', 'also', 'always', 'backer', 'best', 'book', 'campaign', 'challenge', 'change', 'colour', 'come', 'company', 'confident', 'control', 'cost', 'country', 'custom', 'date', 'delay', 'deliver', 'delivery', 'design', 'different', 'due', 'end', 'ensure', 'every', 'experience', 'first', 'fund', 'funding', 'game', 'get', 'go', 'goal', 'good', 'ha', 'help', 'however', 'issue', 'keep', 'kickstarter', 'know', 'like', 'made', 'make', 'manufacturer', 'manufacturing', 'many', 'material', 'may', 'might', 'month', 'much', 'need', 'new', 'one', 'order', 'package', 'partner', 'people', 'pin', 'please', 'possible', 'printing', 'problem', 'process', 'product', 'production', 'project', 'prototype', 'quality', 'ready', 'reward', 'risk', 'shipping', 'singapore', 'still', 'supplier', 'support', 'sure', 'take', 'team', 'thank', 'time', 'timeline', 'try', 'u', 'unforeseen', 'update', 'wa', 'way', 'well', 'work', 'working', 'would', 'year']


array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.14324263],
       [0.        , 0.        , 0.38895092, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.2660757 , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.07796348, 0.        ,
        0.        ],
       [0.        , 0.        , 0.2189543 , ..., 0.41382884, 0.        ,
        0.22626965]])

In [88]:
# Count vectorizer

# tokenize, one hot encoding, count vector
vect = CountVectorizer( 
    analyzer='word', 
    ngram_range=(1,3), # unigram, bigram and trigram 
    max_features=100, # vocabulary that only consider the top max_features ordered by term frequency across the corpus
    min_df=2, # minimum word frequency required to be in model
    stop_words=stopwords.words('english') # remove stopwords
    )
count_fit = vect.fit(pd.Series(X_train["rewards_processed"]))
rewards_count_array = count_fit.transform(pd.Series(X_train["rewards_processed"])).toarray()

**Ignore below stuff**

In [89]:
# #tokenize
# tokenizer = TreebankWordTokenizer()
# X_train["rewards_tokens"] = X_train["rewards_processed"].apply(lambda x : tokenizer.tokenize(x))

# # remove stopwords

# stop = stopwords.words('english')
# custom_stopwords = ['$']
# for word in custom_stopwords:
#     stop.append(word)
    
# X_train["rewards_tokens"] = X_train["rewards_tokens"].apply(lambda x : [word for word in x if word not in stop])


In [90]:
# # POS tagging
# from nltk.corpus import wordnet

# def get_part_of_speech_tags(token):
    
#     """Maps POS tags to first character of the pos tagging returned by pos_tag.
#     We are focussing on Verbs, Nouns, Adjectives and Adverbs; otherwise return NOUN"""

#     tag_dict = {"J": wordnet.ADJ,
#                 "N": wordnet.NOUN,
#                 "V": wordnet.VERB,
#                 "R": wordnet.ADV}
    
#     tag = nltk.pos_tag([token])[0][1][0].upper()
    
#     return tag_dict.get(tag, wordnet.NOUN)

# # lemmatize
# lemmatizer = WordNetLemmatizer()
# X_train["rewards_tokens_lemmatized_POS"] = X_train["rewards_tokens"].apply(lambda x : [lemmatizer.lemmatize(word, get_part_of_speech_tags(word)) for word in x])


In [91]:
# # n-gram
# X_train["rewards_tokens_lemmatized_POS_bigrams"] = X_train["rewards_tokens_lemmatized_POS"].apply(lambda x : list(ngrams(x, 2)))
# X_train["rewards_tokens_lemmatized_POS_trigrams"] = X_train["rewards_tokens_lemmatized_POS"].apply(lambda x : list(ngrams(x, 3)))


In [92]:
# #stemming
# ps = PorterStemmer()
# X_train["rewards_tokens"] = X_train["rewards_tokens"].apply(lambda x : [ps.stem(word) for word in x])