In [1]:
import pandas as pd 
import numpy as np
from __future__ import unicode_literals
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
# Reading in the data
kick_data = pd.read_csv("/Users/briankalinowski/Kickstarter-Data-Analysis/Kick_funded.csv",encoding="utf-8")
del kick_data['Unnamed: 0']

In [3]:
kick_data.head()

Unnamed: 0,pledged_amount,pitch,by,category,currency,goal,location,backers,backers_by_tier,pledge_tier,title,url
0,8782571.0,\nThis is a card game for people who are into ...,Elan Lee,Tabletop Games,usd,10000.0,"Los Angeles, CA",219382,"[15505, 202934, 200, 5]","[20.0, 35.0, 100.0, 500.0]",Exploding Kittens,/projects/elanlee/exploding-kittens
1,6465690.0,"\nAn unusually addicting, high-quality desk to...",Matthew and Mark McLachlan,Product Design,usd,15000.0,"Denver, CO",154926,"[788, 250, 43073, 21796, 41727, 21627, 12215, ...","[1.0, 14.0, 19.0, 19.0, 35.0, 35.0, 79.0, 79.0...",Fidget Cube: A Vinyl Desk Toy,/projects/antsylabs/fidget-cube-a-vinyl-desk-toy
2,5408916.0,\nBring Reading Rainbow’s library of interacti...,LeVar Burton & Reading Rainbow,Web,usd,1000000.0,"Los Angeles, CA",105857,"[19639, 14343, 9136, 2259, 5666, 24512, 4957, ...","[5.0, 10.0, 25.0, 30.0, 35.0, 50.0, 75.0, 100....","Bring Reading Rainbow Back for Every Child, Ev...",/projects/readingrainbow/bring-reading-rainbow...
3,5702153.0,\nUPDATED: This is it. We're making a Veronica...,Rob Thomas,Narrative Film,usd,2000000.0,"San Diego, CA",91585,"[5938, 8423, 11509, 22997, 23227, 1865, 7260, ...","[1.0, 10.0, 25.0, 35.0, 50.0, 75.0, 100.0, 150...",The Veronica Mars Movie Project,/projects/559914737/the-veronica-mars-movie-pr...
4,3336371.0,"\nAn adventure game from Tim Schafer, Double F...",Double Fine and 2 Player Productions,Video Games,usd,400000.0,"San Francisco, CA",87142,"[47946, 24636, 1090, 11530, 900, 148, 100, 10, 4]","[15.0, 30.0, 60.0, 100.0, 250.0, 500.0, 1000.0...",Double Fine Adventure,/projects/doublefine/double-fine-adventure


# Analysis using SpaCy

In [4]:
from spacy.en import English
nlp_toolkit = English()
nlp_toolkit

<spacy.en.English at 0x11aae7e10>

In [5]:
# Noun chunks out of the title

nouns = []
for title in kick_data.title:
    n_doc = nlp_toolkit(title)
    for np in n_doc.noun_chunks:
        nouns.append(np)



In [6]:
# Get an idea of what ent types are in the titles.
entities = []

for title in kick_data.title:
    doc = nlp_toolkit(title)
    for ent in doc.ents:
        entities.append(ent.label_)

In [7]:
# most of the entities are Person or ORG, but there are some others as well. 
pd.Series(entities).unique()


array([u'PERSON', u'ORG', u'LANGUAGE', u'CARDINAL', u'WORK_OF_ART', u'GPE',
       u'TIME', u'ORDINAL', u'MONEY', u'DATE', u'FAC', u'PERCENT', u'NORP',
       u'PRODUCT', u'LOC', u'EVENT', u'LAW', u'QUANTITY'], dtype=object)

In [8]:
#the most comon entities. 
def entites(title):
    parsed = nlp_toolkit(title)
    has_org = any([word.ent_type_ == 'ORG' for word in parsed])
    has_person = any([word.ent_type_ == 'PERSON' for word in parsed])
    return has_org and has_person

kick_data['entities'] = kick_data['title'].fillna('').map(entites)  
kick_data[kick_data['entities']][['title']].head(10)

Unnamed: 0,title
52,Remix Mini - The World's First True Android PC.
74,Purple\xae Pillow: The World's First No-Pressu...
120,Reaper Miniatures Bones 3: The Search for Mr. ...
141,ZANO - Autonomous. Intelligent. Swarming. Nano...
173,"Bobby, the Best Anti Theft backpack by XD Design"
178,Smart Herb Garden by Click & Grow
236,Toejam and Earl: Back in the Groove
238,Smart Parka - The World's First Complete Winte...
242,G-RO: Revolutionary Carry-on Luggage
360,JUMP Cable by Native Union


In [9]:
def references_language(text):
    parsed = nlp_toolkit(text)
    return any([word.ent_type_ == 'LANGUAGE' for word in parsed])

kick_data['title_language'] = kick_data['title'].fillna('').map(references_language)  
kick_data['pitch_language'] = kick_data['pitch'].fillna('').map(references_language)

In [10]:
def references_cardinal(text):
    parsed = nlp_toolkit(text)
    return any([word.ent_type_ == 'CARDINAL' for word in parsed])

kick_data['title_cardinal'] = kick_data['title'].fillna('').map(references_cardinal)  
kick_data['pitch_cardinal'] = kick_data['pitch'].fillna('').map(references_cardinal)  

In [11]:
def references_art(text):
    parsed = nlp_toolkit(text)
    return any([word.ent_type_ == 'WORK_OF_ART' for word in parsed])

kick_data['title_art'] = kick_data['title'].fillna('').map(references_art)  
kick_data['pitch_art'] = kick_data['pitch'].fillna('').map(references_art)  

In [12]:
def references_gpe(text):
    parsed = nlp_toolkit(text)
    return any([word.ent_type_ == 'GPE' for word in parsed])

kick_data['title_gpe'] = kick_data['title'].fillna('').map(references_gpe)  
kick_data['pitch_gpe'] = kick_data['pitch'].fillna('').map(references_gpe) 

In [13]:
def references_time(text):
    parsed = nlp_toolkit(text)
    return any([word.ent_type_ == 'TIME' for word in parsed])

kick_data['title_time'] = kick_data['title'].fillna('').map(references_time)  
kick_data['pitch_time'] = kick_data['pitch'].fillna('').map(references_time)  

In [14]:
def references_money(text):
    parsed = nlp_toolkit(text)
    return any([word.ent_type_ == 'MONEY' for word in parsed])

kick_data['title_money'] = kick_data['title'].fillna('').map(references_money)  
kick_data['pitch_money'] = kick_data['pitch'].fillna('').map(references_money)  

In [15]:
def references_date(text):
    parsed = nlp_toolkit(text)
    return any([word.ent_type_ == 'DATE' for word in parsed])

kick_data['title_date'] = kick_data['title'].fillna('').map(references_date)  
kick_data['pitch_date'] = kick_data['pitch'].fillna('').map(references_date)  

In [16]:
def references_fac(text):
    parsed = nlp_toolkit(text)
    return any([word.ent_type_ == 'FAC' for word in parsed])

kick_data['title_fac'] = kick_data['title'].fillna('').map(references_fac)  
kick_data['pitch_fac'] = kick_data['pitch'].fillna('').map(references_fac)  

In [17]:
def references_norp(text):
    parsed = nlp_toolkit(text)
    return any([word.ent_type_ == 'NORP' for word in parsed])

kick_data['title_norp'] = kick_data['title'].fillna('').map(references_norp)  
kick_data['pitch_norp'] = kick_data['pitch'].fillna('').map(references_norp)  

In [18]:
def references_product(text):
    parsed = nlp_toolkit(text)
    return any([word.ent_type_ == 'PRODUCT' for word in parsed])

kick_data['title_product'] = kick_data['title'].fillna('').map(references_product)  
kick_data['pitch_product'] = kick_data['pitch'].fillna('').map(references_product)  

In [19]:
def references_loc(text):
    parsed = nlp_toolkit(text)
    return any([word.ent_type_ == 'LOC' for word in parsed])

kick_data['title_loc'] = kick_data['title'].fillna('').map(references_loc)  
kick_data['pitch_loc'] = kick_data['pitch'].fillna('').map(references_loc)  

In [20]:
def references_event(text):
    parsed = nlp_toolkit(text)
    return any([word.ent_type_ == 'EVENT' for word in parsed])

kick_data['title_event'] = kick_data['title'].fillna('').map(references_event)  
kick_data['pitch_event'] = kick_data['pitch'].fillna('').map(references_event)  

In [21]:
def references_law(text):
    parsed = nlp_toolkit(text)
    return any([word.ent_type_ == 'LAW' for word in parsed])

kick_data['title_law'] = kick_data['title'].fillna('').map(references_law)  
kick_data['pitch_law'] = kick_data['pitch'].fillna('').map(references_law)  

In [22]:
def references_quantity(text):
    parsed = nlp_toolkit(text)
    return any([word.ent_type_ == 'QUANTITY' for word in parsed])

kick_data['title_quantity'] = kick_data['title'].fillna('').map(references_quantity)  
kick_data['pitch_quantity'] = kick_data['pitch'].fillna('').map(references_quantity)

In [23]:
pd.crosstab(kick_data['entities'], kick_data['category'])


category,3D Printing,Academic,Accessories,Action,Animation,Anthologies,Apparel,Apps,Architecture,Art,...,Vegan,Video,Video Games,Wearables,Web,Webcomics,Webseries,World Music,Young Adult,Zines
entities,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
False,23,1,20,1,48,23,30,17,2,14,...,5,1,511,63,13,33,31,2,1,1
True,0,0,0,0,1,0,2,0,0,1,...,0,0,13,1,0,0,1,0,0,0


In [24]:
pd.crosstab(kick_data['pitch_art'], kick_data['category'])


category,3D Printing,Academic,Accessories,Action,Animation,Anthologies,Apparel,Apps,Architecture,Art,...,Vegan,Video,Video Games,Wearables,Web,Webcomics,Webseries,World Music,Young Adult,Zines
pitch_art,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
False,22,1,20,1,44,22,32,16,2,15,...,5,1,510,64,13,33,24,2,1,1
True,1,0,0,0,5,1,0,1,0,0,...,0,0,14,0,0,0,8,0,0,0


In [25]:
pd.crosstab(kick_data['pitch_product'], kick_data['category'])



category,3D Printing,Academic,Accessories,Action,Animation,Anthologies,Apparel,Apps,Architecture,Art,...,Vegan,Video,Video Games,Wearables,Web,Webcomics,Webseries,World Music,Young Adult,Zines
pitch_product,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
False,23,1,20,1,48,23,32,17,2,15,...,5,1,511,64,13,33,31,2,1,1
True,0,0,0,0,1,0,0,0,0,0,...,0,0,13,0,0,0,1,0,0,0


In [26]:
kick_data[kick_data['pitch_art']][['title']].head(10)

Unnamed: 0,title
42,"Theatre Is Evil: the album, art book and tour"
55,System Shock
138,Tak: A Beautiful Game
157,The Name of the Wind Playing Cards
264,"""Chug"" from the team that brought you ""Three S..."
338,The Resistance - Hostile Intent & Hidden Agenda
339,Small World 2: The Return!
341,Masters Of Anatomy
355,StoneTether - The Smallest Tracking Device at ...
380,Twilight Struggle Digital Edition


In [27]:
kick_data[kick_data['entities']][['title']].head(10)

Unnamed: 0,title
52,Remix Mini - The World's First True Android PC.
74,Purple\xae Pillow: The World's First No-Pressu...
120,Reaper Miniatures Bones 3: The Search for Mr. ...
141,ZANO - Autonomous. Intelligent. Swarming. Nano...
173,"Bobby, the Best Anti Theft backpack by XD Design"
178,Smart Herb Garden by Click & Grow
236,Toejam and Earl: Back in the Groove
238,Smart Parka - The World's First Complete Winte...
242,G-RO: Revolutionary Carry-on Luggage
360,JUMP Cable by Native Union


In [28]:
kick_data.columns

Index([u'pledged_amount', u'pitch', u'by', u'category', u'currency', u'goal',
       u'location', u'backers', u'backers_by_tier', u'pledge_tier', u'title',
       u'url', u'entities', u'title_language', u'pitch_language',
       u'title_cardinal', u'pitch_cardinal', u'title_art', u'pitch_art',
       u'title_gpe', u'pitch_gpe', u'title_time', u'pitch_time',
       u'title_money', u'pitch_money', u'title_date', u'pitch_date',
       u'title_fac', u'pitch_fac', u'title_norp', u'pitch_norp',
       u'title_product', u'pitch_product', u'title_loc', u'pitch_loc',
       u'title_event', u'pitch_event', u'title_law', u'pitch_law',
       u'title_quantity', u'pitch_quantity'],
      dtype='object')

In [29]:
kick_data[kick_data['entites']][['title']].head(20)

KeyError: u'entites'

In [None]:
#Extracting the nouns chunks from each of the campaings.

nouns = []
for cell in kick_data.pitch:
    n_doc = nlp_toolkit(cell)
    for np in n_doc.noun_chunks:
        nouns.append(np)



In [None]:
titles = ['title'].fillna('')

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features = 1000, 
                             ngram_range=(1, 2), 
                             stop_words='english',
                             binary=True)

# Use `fit` to learn the vocabulary of the titles
vectorizer.fit(titles)

# Use `tranform` to generate the sample X word matrix - one column per feature (word or n-grams)
X = vectorizer.transform(titles)
X