## import libraries

In [1]:
import glob
import calendar
import pandas as pd
import numpy as np
from matplotlib import cm
import matplotlib.pyplot as plt

In [2]:
import unicodedata
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [38]:
pd.options.display.max_colwidth = 100

## import data

In [3]:
df_all = pd.concat(map(pd.read_csv, glob.glob('data/*.csv')))

In [4]:
df_all.columns

Index(['backers_count', 'blurb', 'category', 'converted_pledged_amount',
       'country', 'country_displayable_name', 'created_at', 'creator',
       'currency', 'currency_symbol', 'currency_trailing_code',
       'current_currency', 'deadline', 'disable_communication', 'friends',
       'fx_rate', 'goal', 'id', 'is_backing', 'is_starrable', 'is_starred',
       'launched_at', 'location', 'name', 'permissions', 'photo', 'pledged',
       'profile', 'slug', 'source_url', 'spotlight', 'staff_pick', 'state',
       'state_changed_at', 'static_usd_rate', 'urls', 'usd_pledged',
       'usd_type'],
      dtype='object')


Description of each column:

- backers_count - number of people who contributed funds to the project
- blurb - short description of the project
- category - contains the category and sub-category of the project
- converted_pledged_amount - amount of money pledged, converted to the currency in the 'current_currency' column
- country - country the project creator is from
- created_at - date and time of when the project was initially created on Kickstarter
- creator - name of the project creator and other information about them, e.g. Kickstarter id number
- currency - original currency the project goal was denominated in
- currency_symbol - symbol of the original currency the project goal was denominated in
- currency_trailing_code - code of the original currency the project goal was denominated in
- current_currency - currency the project goal was converted to
- deadline - date and time of when the project will close for donations
- disable_communication - whether or not a project owner disabled communication with their backers
- friends - unclear (null or empty)
- fx_rate - foreign exchange rate between the original currency and the current_currency
- goal - funding goal
- id - id number of the project
- is_backing - unclear (null or false)
- is_starrable - whether or not a project can be starred (liked and saved) by users
- is_starred - whether or not a project has been starred (liked and saved) by users
- launched_at - date and time of when the project was launched for funding
- location - contains the town or city of the project creator
- name - name of the project
- permissions - unclear (null or empty)
- photo - contains a link and information to the project's photo/s
- pledged - amount pledged in the current_currency
- profile - details about the project's profile, including id number and various visual settings
- slug - name of the project with hyphens instead of spaces
- source_url - url for the project's category
- spotlight - after a project has been successful, it is spotlighted on the Kickstarter website
- staff_pick - whether a project was highlighted as a staff_pick when it was launched/live
- state - whether a project was successful, failed, canceled, suspending or still live
- state_changed_at - date and time of when a project's status was changed (same as the deadline for successful and failed projects)
- static_usd_rate - conversion rate between the original currency and USD
- urls - url to the project's page
- usd_pledged - amount pledged in USD
- usd_type - domestic or international

In [5]:
df_all.head()

Unnamed: 0,backers_count,blurb,category,converted_pledged_amount,country,country_displayable_name,created_at,creator,currency,currency_symbol,...,slug,source_url,spotlight,staff_pick,state,state_changed_at,static_usd_rate,urls,usd_pledged,usd_type
0,4,Looking for donations and feedback on my ideas...,"{""id"":356,""name"":""Woodworking"",""slug"":""crafts/...",200,US,the United States,1449200332,"{""id"":240817199,""name"":""Tim"",""is_registered"":n...",USD,$,...,wood-butcher-block-and-jewelry-boxes,https://www.kickstarter.com/discover/categorie...,False,False,failed,1451839263,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",200.0,international
1,545,Logan McBride was a magical girl with a destin...,"{""id"":252,""name"":""Graphic Novels"",""slug"":""comi...",26773,US,the United States,1494092401,"{""id"":248241887,""name"":""Pat Shand"",""slug"":""pat...",USD,$,...,destiny-ny-volume-two,https://www.kickstarter.com/discover/categorie...,True,True,successful,1506643200,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",26773.0,international
2,267,Wonderful 3D Popup Christmas Cards. Opening th...,"{""id"":351,""name"":""Printing"",""slug"":""crafts/pri...",10989,DE,Germany,1444289040,"{""id"":225932455,""name"":""nalacards"",""slug"":""nal...",EUR,€,...,wonderful-3d-popup-christmas-cards-small-yet-m...,https://www.kickstarter.com/discover/categorie...,True,True,successful,1447016412,1.139062,"{""web"":{""project"":""https://www.kickstarter.com...",11653.747824,international
3,96,Hi everyone. I love animals and I love to croc...,"{""id"":344,""name"":""Crochet"",""slug"":""crafts/croc...",1649,CA,Canada,1547488018,"{""id"":1240360759,""name"":""Tabetha Jollimore"",""i...",CAD,$,...,animal-scarves-to-help-the-animals,https://www.kickstarter.com/discover/categorie...,True,False,successful,1550678590,0.7541,"{""web"":{""project"":""https://www.kickstarter.com...",1641.894476,international
4,444,"Slim, Earth-friendly hangers with an innovativ...","{""id"":28,""name"":""Product Design"",""slug"":""desig...",26752,US,the United States,1527556474,"{""id"":1106375862,""name"":""Ensu Design"",""is_regi...",USD,$,...,mozu-hanger,https://www.kickstarter.com/discover/categorie...,True,False,successful,1598108164,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",26752.0,international


In [6]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 213583 entries, 0 to 3679
Data columns (total 38 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   backers_count             213583 non-null  int64  
 1   blurb                     213576 non-null  object 
 2   category                  213583 non-null  object 
 3   converted_pledged_amount  213583 non-null  int64  
 4   country                   213583 non-null  object 
 5   country_displayable_name  213583 non-null  object 
 6   created_at                213583 non-null  int64  
 7   creator                   213583 non-null  object 
 8   currency                  213583 non-null  object 
 9   currency_symbol           213583 non-null  object 
 10  currency_trailing_code    213583 non-null  bool   
 11  current_currency          213583 non-null  object 
 12  deadline                  213583 non-null  int64  
 13  disable_communication     213583 non-null  boo

In [7]:
df_all.shape

(213583, 38)

## data preprocessing

### for null and other inapplicable columns

In [8]:
# drop columns of platform interactions containing few non-null
df = df_all.drop(['friends', 'is_backing', 'is_starred', 'permissions'], axis=1, inplace=False)

In [9]:
# drop columns with data already included elsewhere in the dataset
df.drop(['converted_pledged_amount', 'country_displayable_name', 'pledged'], 
        axis=1, inplace=True)

In [10]:
# drop columns that contain identifiers project by project
df.drop(['creator', 'location', 'profile', 'slug', 'source_url', 'urls'], 
        axis=1, inplace=True)

In [11]:
# drop columns unnecessary for use in this analysis
df.drop(['photo', 'currency', 'currency_symbol', 'currency_trailing_code', 'current_currency', 
         'fx_rate', 'usd_type'], axis=1, inplace=True)

In [12]:
#drop columns which are tied to the target
df.drop(['spotlight', 'state_changed_at'], axis=1, inplace=True)

In [13]:
# Drop disable_communication - bools are all false
df.drop('disable_communication', axis=1, inplace=True)

### for datetime columns

In [14]:
# Converting dates from unix to datetime
date_cols = ['created_at', 'deadline', 'launched_at']
for col in date_cols:
    df[[col]] = pd.to_datetime(df[col], origin='unix', unit='s')

In [25]:
print(f"""The dataset contains projects added to Kickstarter between {min(df.created_at).strftime('%B %d %Y')} and {max(df.created_at).strftime('%B %d %Y')}.""")

The dataset contains projects added to Kickstarter between April 22 2009 and March 18 2021.


### for duplicate rows

In [42]:
#see sum of duplicate rows by 'id'
df_all.duplicated(subset='id').sum()

25769

In [43]:
#see sum of duplicate rows by 'urls'
df_all.duplicated(subset='urls').sum()

25679

In [66]:
df = df.reset_index(drop=True) #reset index to easily search by integer-based location

In [67]:
#see duplicate rows by 'id'
df[df["id"].isin(df["id"][df["id"].duplicated()])]

Unnamed: 0,backers_count,blurb,category,country,created_at,deadline,goal,id,is_starrable,launched_at,name,staff_pick,state,static_usd_rate,usd_pledged
2,267,Wonderful 3D Popup Christmas Cards. Opening this greeting card reveals a brilliantly crafted scu...,"{""id"":351,""name"":""Printing"",""slug"":""crafts/printing"",""position"":9,""parent_id"":26,""parent_name"":""...",DE,2015-10-08 07:24:00,2015-11-08 21:00:00,2500.0,1484057434,False,2015-10-16 01:09:33,"Wonderful 3D Popup Christmas Cards: Small, Yet Meaningful",True,successful,1.139062,11653.747824
3,96,Hi everyone. I love animals and I love to crochet. I want to combine the two to help my animal r...,"{""id"":344,""name"":""Crochet"",""slug"":""crafts/crochet"",""position"":2,""parent_id"":26,""parent_name"":""Cr...",CA,2019-01-14 17:46:58,2019-02-20 16:03:10,100.0,1955625514,False,2019-01-21 16:03:10,Animal Scarves to Help the Animals!,False,successful,0.754100,1641.894476
4,444,"Slim, Earth-friendly hangers with an innovative notch that eliminates the stretching of collars.","{""id"":28,""name"":""Product Design"",""slug"":""design/product design"",""position"":5,""parent_id"":7,""pare...",US,2018-05-29 01:14:34,2020-08-22 14:56:04,7500.0,814342621,False,2020-07-13 14:56:04,Mozu Hanger - The Friendliest Hanger on the Planet,False,successful,1.000000,26752.000000
7,3,"Introducing the Primal Conceps ""LUCILLE"" product line based from the popular television show The...","{""id"":356,""name"":""Woodworking"",""slug"":""crafts/woodworking"",""position"":14,""parent_id"":26,""parent_...",US,2017-03-06 22:30:36,2017-04-12 16:13:36,100.0,1403250866,False,2017-03-13 16:13:36,Lucy Keychain bats,False,successful,1.000000,171.000000
9,32,The Seven Handmade Mechanical Models with Steampunk Style.,"{""id"":345,""name"":""DIY"",""slug"":""crafts/diy"",""position"":3,""parent_id"":26,""parent_name"":""Crafts"",""c...",HK,2018-05-07 14:20:46,2018-06-14 15:31:47,15000.0,615857771,False,2018-05-15 15:31:47,The Battle Warriors of Steam Punks,False,successful,0.127389,3286.861792
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213577,26,Bronwyn: The Further Adventures- Issue 1,"{""id"":250,""name"":""Comic Books"",""slug"":""comics/comic books"",""position"":2,""parent_id"":3,""parent_na...",AU,2020-07-09 01:30:11,2020-08-16 03:20:06,1000.0,1251772211,False,2020-07-17 03:20:06,Bronwyn- The Further Adventures- Issue 1,False,successful,0.699198,825.276829
213578,27,I have received a scholarship to the famous Broadway Dance Center in New York City; But I need y...,"{""id"":255,""name"":""Residencies"",""slug"":""dance/residencies"",""position"":2,""parent_id"":6,""parent_nam...",GB,2016-06-28 21:11:16,2016-08-26 19:37:00,1000.0,434619322,False,2016-07-25 13:26:30,Help support me in NEW YORK!,False,successful,1.313831,1625.208737
213579,206,MONSTERS✔️ ALIENS✔️ DEMONS✔️ KILLER CLOWNS✔️ TERRORIST✔️ AND A VIETNAM VETERAN TURNED ULTRA VIOL...,"{""id"":250,""name"":""Comic Books"",""slug"":""comics/comic books"",""position"":2,""parent_id"":3,""parent_na...",US,2021-01-03 21:55:08,2021-03-06 05:00:00,2000.0,1262668783,False,2021-02-06 02:01:16,DEAD END #1,False,successful,1.000000,9816.000000
213580,31,Anna Reyes is a 2015 Resident Dance Artist for the Boston Center for the Arts. Her residency wil...,"{""id"":255,""name"":""Residencies"",""slug"":""dance/residencies"",""position"":2,""parent_id"":6,""parent_nam...",US,2015-03-09 22:27:52,2015-06-27 03:59:00,2500.0,2037776650,False,2015-05-28 02:02:43,The Good Parts of Being Alive - Boston Center for the Arts,True,successful,1.000000,2636.000000


In [68]:
#see a first example of duplicates by 'id'
df[df['id'] == 1484057434]

Unnamed: 0,backers_count,blurb,category,country,created_at,deadline,goal,id,is_starrable,launched_at,name,staff_pick,state,static_usd_rate,usd_pledged
2,267,Wonderful 3D Popup Christmas Cards. Opening this greeting card reveals a brilliantly crafted scu...,"{""id"":351,""name"":""Printing"",""slug"":""crafts/printing"",""position"":9,""parent_id"":26,""parent_name"":""...",DE,2015-10-08 07:24:00,2015-11-08 21:00:00,2500.0,1484057434,False,2015-10-16 01:09:33,"Wonderful 3D Popup Christmas Cards: Small, Yet Meaningful",True,successful,1.139062,11653.747824
178448,267,Wonderful 3D Popup Christmas Cards. Opening this greeting card reveals a brilliantly crafted scu...,"{""id"":351,""name"":""Printing"",""slug"":""crafts/printing"",""position"":9,""parent_id"":26,""parent_name"":""...",DE,2015-10-08 07:24:00,2015-11-08 21:00:00,2500.0,1484057434,False,2015-10-16 01:09:33,"Wonderful 3D Popup Christmas Cards: Small, Yet Meaningful",True,successful,1.139062,11653.747824


In [71]:
#see which cols are identical
df.iloc[2] == df.iloc[178448]

backers_count      True
blurb              True
category           True
country            True
created_at         True
deadline           True
goal               True
id                 True
is_starrable       True
launched_at        True
name               True
staff_pick         True
state              True
static_usd_rate    True
usd_pledged        True
dtype: bool

In [72]:
#drop duplicate rows by 'id'
df.drop_duplicates(subset='id', keep='first', inplace=True, ignore_index=True)

### extracting categories and subcategories

In [74]:
# Example category value
df.iloc[0]['category']

'{"id":356,"name":"Woodworking","slug":"crafts/woodworking","position":14,"parent_id":26,"parent_name":"Crafts","color":16744876,"urls":{"web":{"discover":"http://www.kickstarter.com/discover/categories/crafts/woodworking"}}}'

In [75]:
# Extracting the relevant sub-category section from the string
f = lambda x: x['category'].split('/')[1].split('","position')[0]
df['sub_category'] = df.apply(f, axis=1)

# Extracting the relevant category section from the string, and replacing the original category variable
f = lambda x: x['category'].split('"slug":"')[1].split('/')[0]
df['category'] = df.apply(f, axis=1)
f = lambda x: x['category'].split('","position"')[0] # Some categories do not have a sub-category, so do not have a '/' to split with
df['category'] = df.apply(f, axis=1)

### calculating additional features

In [None]:
# Mean pledge per backer
df['pledge_per_backer'] = round(df['usd_pledged']/df['backers_count'],2)

In [None]:
# Calculate new column 'usd_goal' as goal * static_usd_rate
df['usd_goal'] = round(df['goal'] * df['static_usd_rate'],2)

In [None]:
# Time between creating and launching a project
df['creation_to_launch_days'] = df['launched_at'] - df['created_at']
df['creation_to_launch_days'] = df['creation_to_launch_days'].dt.round('d').dt.days # Rounding to nearest days, then showing as number only
# Or could show as number of hours:
# df['creation_to_launch_hours'] = df['launched_at'] - df['created_at']
# df['creation_to_launch_hours'] = df['creation_to_launch_hours'].dt.round('h') / np.timedelta64(1, 'h') 

# Campaign length
df['campaign_days'] = df['deadline'] - df['launched_at']
df['campaign_days'] = df['campaign_days'].dt.round('d').dt.days # Rounding to nearest days, then showing as number only

# Launch day of week
df['launch_day'] = df['launched_at'].dt.day_name()

# Deadline day of week
df['deadline_day'] = df['deadline'].dt.day_name()

# Launch month
df['launch_month'] = df['launched_at'].dt.month_name()

# Deadline month
df['deadline_month'] = df['deadline'].dt.month_name()

In [None]:
# Launch time
df['launch_hour'] = df['launched_at'].dt.hour # Extracting hour from launched_at

def two_hour_launch(row):
    '''Creates two hour bins from the launch_hour column'''
    if row['launch_hour'] in (0,1):
        return '12am-2am'
    if row['launch_hour'] in (2,3):
        return '2am-4am'
    if row['launch_hour'] in (4,5):
        return '4am-6am'
    if row['launch_hour'] in (6,7):
        return '6am-8am'
    if row['launch_hour'] in (8,9):
        return '8am-10am'
    if row['launch_hour'] in (10,11):
        return '10am-12pm'
    if row['launch_hour'] in (12,13):
        return '12pm-2pm'
    if row['launch_hour'] in (14,15):
        return '2pm-4pm'
    if row['launch_hour'] in (16,17):
        return '4pm-6pm'
    if row['launch_hour'] in (18,19):
        return '6pm-8pm'
    if row['launch_hour'] in (20,21):
        return '8pm-10pm'
    if row['launch_hour'] in (22,23):
        return '10pm-12am'
    
df['launch_time'] = df.apply(two_hour_launch, axis=1) # Calculates bins from launch_time

df.drop('launch_hour', axis=1, inplace=True)

In [None]:
# Deadline time
df['deadline_hour'] = df['deadline'].dt.hour # Extracting hour from deadline

def two_hour_deadline(row):
    '''Creates two hour bins from the deadline_hour column'''
    if row['deadline_hour'] in (0,1):
        return '12am-2am'
    if row['deadline_hour'] in (2,3):
        return '2am-4am'
    if row['deadline_hour'] in (4,5):
        return '4am-6am'
    if row['deadline_hour'] in (6,7):
        return '6am-8am'
    if row['deadline_hour'] in (8,9):
        return '8am-10am'
    if row['deadline_hour'] in (10,11):
        return '10am-12pm'
    if row['deadline_hour'] in (12,13):
        return '12pm-2pm'
    if row['deadline_hour'] in (14,15):
        return '2pm-4pm'
    if row['deadline_hour'] in (16,17):
        return '4pm-6pm'
    if row['deadline_hour'] in (18,19):
        return '6pm-8pm'
    if row['deadline_hour'] in (20,21):
        return '8pm-10pm'
    if row['deadline_hour'] in (22,23):
        return '10pm-12am'
    
df['deadline_time'] = df.apply(two_hour_deadline, axis=1) # Calculates bins from launch_time

df.drop('deadline_hour', axis=1, inplace=True)

### setting the index

In [None]:
# Setting the id column as the index
df.set_index('id', inplace=True)

### nlp on blurb

In [None]:
df.info()

In [None]:
df.loc[df.index.isin([435934027,  157410178,  653744949,  744433500, 1184885399,
       1879626727, 2019587383,  536379227, 1117024940,  553865404,
        620799971,  287267149, 1740554502, 1451290368,  641283724,
       1052698623, 1331733830,  123163859,  154884726, 1451290368,
       1694130363, 1052698623, 1717313398,  727243742, 1711857183,
       1550219993, 1098927674, 1586181837, 1514280939, 1480492870,
       1680012250,  721106264, 1680012250, 1903053501, 1093194527,
       1586181837,  256267171,  706466606, 2056420743, 1058795815,
       1521459466,  123163859, 1619448283, 1823563991,  224219885,
        744433500, 1357350078, 1858475895,  856311803,    3836669,
       1824277059, 1550219993,  694587883,   23381768,  608070246,
       1077718072, 1703062074,  553865404,  716651778, 1740554502])]

In [None]:
df.drop([435934027,  157410178,  653744949,  744433500, 1184885399,
       1879626727, 2019587383,  536379227, 1117024940,  553865404,
        620799971,  287267149, 1740554502, 1451290368,  641283724,
       1052698623, 1331733830,  123163859,  154884726, 1451290368,
       1694130363, 1052698623, 1717313398,  727243742, 1711857183,
       1550219993, 1098927674, 1586181837, 1514280939, 1480492870,
       1680012250,  721106264, 1680012250, 1903053501, 1093194527,
       1586181837,  256267171,  706466606, 2056420743, 1058795815,
       1521459466,  123163859, 1619448283, 1823563991,  224219885,
        744433500, 1357350078, 1858475895,  856311803,    3836669,
       1824277059, 1550219993,  694587883,   23381768,  608070246,
       1077718072, 1703062074,  553865404,  716651778, 1740554502], inplace=True)

In [None]:
# special_characters removal
def remove_special_characters(text, remove_digits=True):
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text

def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words


def remove_punctuation_and_splchars(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_word = remove_special_characters(new_word, True)
            new_words.append(new_word)
    return new_words

# def replace_numbers(words):
#     """Replace all integer occurrences in list of tokenized words with textual representation"""
#     p = inflect.engine()
#     new_words = []
#     for word in words:
#         if word.isdigit():
#             new_word = p.number_to_words(word)
#             new_words.append(new_word)
#         else:
#             new_words.append(word)
#     return new_words

def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in words:
        if word not in stopword_list:
            new_words.append(word)
    return new_words

# def stem_words(words):
#     """Stem words in list of tokenized words"""
#     stemmer = LancasterStemmer()
#     stems = []
#     for word in words:
#         stem = stemmer.stem(word)
#         stems.append(stem)
#     return stems

def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas

def normalize(words):
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_punctuation_and_splchars(words)
    words = remove_stopwords(words)
    return words

def lemmatize(words):
    lemmas = lemmatize_verbs(words)
    return lemmas

In [None]:
stopword_list= stopwords.words('english')

In [None]:
def normalize_and_lemmatize(input):
    sample = remove_special_characters(input)
    words = nltk.word_tokenize(sample)
    words = normalize(words)
    lemmas = lemmatize(words)
    return ' '.join(lemmas)

In [None]:
df = df.dropna(axis=0, subset=['blurb'])

In [None]:
df['clean_text'] = df['blurb'].map(lambda text: normalize_and_lemmatize(text))

### target

In [None]:
df = df[df['state'].isin(['successful', 'failed'])]

In [None]:
# Dropping goal, static_usd_rate, blurb
df.drop(['goal', 'static_usd_rate', 'blurb'], axis=1, inplace=True)

In [None]:
df.info()

df.to_csv('processed_data.csv')

## EDA

In [None]:
# Plotting the average amount pledged to successful and unsuccesful projects
fig, ((ax1, ax2, ax3), (ax4, ax5, ax6)) = plt.subplots(2, 3, figsize=(12,12))

df['state'].value_counts(ascending=True).plot(kind='bar', ax=ax1, color=['firebrick', 'seagreen'], rot=0)
ax1.set_title('Number of projects')
ax1.set_xlabel('')

df.groupby('state').usd_goal.median().plot(kind='bar', ax=ax2, color=['firebrick', 'seagreen'], rot=0)
ax2.set_title('Median project goal ($)')
ax2.set_xlabel('')

df.groupby('state').usd_pledged.median().plot(kind='bar', ax=ax3, color=['firebrick', 'seagreen'], rot=0)
ax3.set_title('Median pledged per project ($)')
ax3.set_xlabel('')

df.groupby('state').backers_count.median().plot(kind='bar', ax=ax4, color=['firebrick', 'seagreen'], rot=0)
ax4.set_title('Median backers per project')
ax4.set_xlabel('')

df.groupby('state').campaign_days.mean().plot(kind='bar', ax=ax5, color=['firebrick', 'seagreen'], rot=0)
ax5.set_title('Mean campaign length (days)')
ax5.set_xlabel('')

df.groupby('state').creation_to_launch_days.mean().plot(kind='bar', ax=ax6, color=['firebrick', 'seagreen'], rot=0)
ax6.set_title('Mean creation to launch length (days)')
ax6.set_xlabel('')

# df.groupby('state').name_length.mean().plot(kind='bar', ax=ax7, color=['firebrick', 'seagreen'], rot=0)
# ax7.set_title('Mean name length (words)')
# ax7.set_xlabel('')

# df.groupby('state').blurb_length.mean().plot(kind='bar', ax=ax8, color=['firebrick', 'seagreen'], rot=0)
# ax8.set_title('Mean blurb length (words)')
# ax8.set_xlabel('')

# # Creating a dataframe grouped by staff_pick with columns for failed and successful
# pick_df = pd.get_dummies(df.set_index('staff_pick').state).groupby('staff_pick').sum()
# # Normalizes counts by column, and selects the 'True' category (iloc[1])
# (pick_df.div(pick_df.sum(axis=0), axis=1)).iloc[1].plot(kind='bar', ax=ax9, color=['firebrick', 'seagreen'], rot=0) 
# ax9.set_title('Proportion that are staff picks')
# ax9.set_xlabel('')

fig.subplots_adjust(hspace=0.3)
plt.show()

In [None]:
# Creating a dataframe grouped by category with columns for failed and successful
cat_df = pd.get_dummies(df.set_index('category').state).groupby('category').sum()

# Plotting
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3, 2, figsize=(12,12))

color = cm.CMRmap(np.linspace(0.1,0.8,df.category.nunique())) # Setting a colormap

df.groupby('category').category.count().plot(kind='bar', ax=ax1, color=color)
ax1.set_title('Number of projects')
ax1.set_xlabel('')

df.groupby('category').usd_goal.median().plot(kind='bar', ax=ax2, color=color)
ax2.set_title('Median project goal ($)')
ax2.set_xlabel('')

df.groupby('category').usd_pledged.median().plot(kind='bar', ax=ax3, color=color)
ax3.set_title('Median pledged per project ($)')
ax3.set_xlabel('')

cat_df.div(cat_df.sum(axis=1), axis=0).successful.plot(kind='bar', ax=ax4, color=color) # Normalizes counts across rows
ax4.set_title('Proportion of successful projects')
ax4.set_xlabel('')

df.groupby('category').backers_count.median().plot(kind='bar', ax=ax5, color=color)
ax5.set_title('Median backers per project')
ax5.set_xlabel('')

df.groupby('category').pledge_per_backer.median().plot(kind='bar', ax=ax6, color=color)
ax6.set_title('Median pledged per backer ($)')
ax6.set_xlabel('')

fig.subplots_adjust(hspace=0.6)
plt.show()

In [None]:
# Creating a dataframe grouped by the day on which they were launched, with columns for failed and successful
day_df = pd.get_dummies(df.set_index('launch_day').state).groupby('launch_day').sum()

# Plotting
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3, 2, figsize=(14,12))

color = cm.CMRmap(np.linspace(0.1,0.8,df.launch_day.nunique()))

weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

df.groupby('launch_day').launch_day.count().reindex(weekdays).plot(kind='bar', ax=ax1, color=color, rot=0)
ax1.set_title('Number of projects launched')
ax1.set_xlabel('')

df.groupby('launch_day').usd_goal.median().reindex(weekdays).plot(kind='bar', ax=ax2, color=color, rot=0)
ax2.set_title('Median project goal ($)')
ax2.set_xlabel('')

df.groupby('launch_day').usd_pledged.median().reindex(weekdays).plot(kind='bar', ax=ax3, color=color, rot=0)
ax3.set_title('Median pledged per project ($)')
ax3.set_xlabel('')

day_df.div(day_df.sum(axis=1), axis=0).successful.reindex(weekdays).plot(kind='bar', ax=ax4, color=color, rot=0) # Normalizes counts across rows
ax4.set_title('Proportion of successful projects')
ax4.set_xlabel('')

df.groupby('launch_day').backers_count.median().reindex(weekdays).plot(kind='bar', ax=ax5, color=color, rot=0)
ax5.set_title('Median backers per project')
ax5.set_xlabel('')

df.groupby('launch_day').pledge_per_backer.median().reindex(weekdays).plot(kind='bar', ax=ax6, color=color, rot=0)
ax6.set_title('Median pledged per backer ($)')
ax6.set_xlabel('')

fig.subplots_adjust(hspace=0.3)
plt.show()

In [None]:
# Creating a dataframe grouped by the month in which they were launched, with columns for failed and successful
month_df = pd.get_dummies(df.set_index('launch_month').state).groupby('launch_month').sum()

# Plotting
months = list(calendar.month_name)[1:]

fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3, 2, figsize=(14,12))

color = cm.CMRmap(np.linspace(0.1,0.8,df.launch_month.nunique()))

df.groupby('launch_month').launch_month.count().reindex(months).plot(kind='bar', ax=ax1, color=color, rot=45)
ax1.set_title('Number of projects launched')
ax1.set_xlabel('')
ax1.set_xticklabels(labels=ax1.get_xticklabels(), ha='right')

df.groupby('launch_month').usd_goal.median().reindex(months).plot(kind='bar', ax=ax2, color=color, rot=45)
ax2.set_title('Median project goal ($)')
ax2.set_xlabel('')
ax2.set_xticklabels(labels=ax2.get_xticklabels(), ha='right')

df.groupby('launch_month').usd_pledged.median().reindex(months).plot(kind='bar', ax=ax3, color=color, rot=45)
ax3.set_title('Median pledged per project ($)')
ax3.set_xlabel('')
ax3.set_xticklabels(labels=ax3.get_xticklabels(), ha='right')

month_df.div(month_df.sum(axis=1), axis=0).successful.reindex(months).plot(kind='bar', ax=ax4, color=color, rot=45) # Normalizes counts across rows
ax4.set_title('Proportion of successful projects')
ax4.set_xlabel('')
ax4.set_xticklabels(labels=ax4.get_xticklabels(), ha='right')

df.groupby('launch_month').backers_count.median().reindex(months).plot(kind='bar', ax=ax5, color=color, rot=45)
ax5.set_title('Median backers per project')
ax5.set_xlabel('')
ax5.set_xticklabels(labels=ax5.get_xticklabels(), ha='right')

df.groupby('launch_month').pledge_per_backer.median().reindex(months).plot(kind='bar', ax=ax6, color=color, rot=45)
ax6.set_title('Median pledged per backer ($)')
ax6.set_xlabel('')
ax6.set_xticklabels(labels=ax6.get_xticklabels(), ha='right')

fig.subplots_adjust(hspace=0.4)
plt.show()

In [None]:
# Creating a dataframe grouped by the time at which they were launched, with columns for failed and successful
time_df = pd.get_dummies(df.set_index('launch_time').state).groupby('launch_time').sum()

fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3, 2, figsize=(14,12))

color = cm.CMRmap(np.linspace(0.1,0.8,df.launch_time.nunique()))

times = ['12am-2am', '2am-4am', '4am-6am', '6am-8am', '8am-10am', '10am-12pm', '12pm-2pm', '2pm-4pm', '4pm-6pm', '6pm-8pm', '8pm-10pm', '10pm-12am']

df.groupby('launch_time').launch_time.count().reindex(times).plot(kind='bar', ax=ax1, color=color, rot=45)
ax1.set_title('Number of projects launched')
ax1.set_xlabel('')
ax1.set_xticklabels(labels=ax1.get_xticklabels(), ha='right')

df.groupby('launch_time').usd_goal.median().reindex(times).plot(kind='bar', ax=ax2, color=color, rot=45)
ax2.set_title('Median project goal ($)')
ax2.set_xlabel('')
ax2.set_xticklabels(labels=ax2.get_xticklabels(), ha='right')

df.groupby('launch_time').usd_pledged.median().reindex(times).plot(kind='bar', ax=ax3, color=color, rot=45)
ax3.set_title('Median pledged per project ($)')
ax3.set_xlabel('')
ax3.set_xticklabels(labels=ax3.get_xticklabels(), ha='right')

time_df.div(time_df.sum(axis=1), axis=0).successful.reindex(times).plot(kind='bar', ax=ax4, color=color, rot=45) # Normalizes counts across rows
ax4.set_title('Proportion of successful projects')
ax4.set_xlabel('')
ax4.set_xticklabels(labels=ax4.get_xticklabels(), ha='right')

df.groupby('launch_time').backers_count.median().reindex(times).plot(kind='bar', ax=ax5, color=color, rot=45)
ax5.set_title('Median backers per project')
ax5.set_xlabel('')
ax5.set_xticklabels(labels=ax5.get_xticklabels(), ha='right')

df.groupby('launch_time').pledge_per_backer.median().reindex(times).plot(kind='bar', ax=ax6, color=color, rot=45)
ax6.set_title('Median pledged per backer ($)')
ax6.set_xlabel('')
ax6.set_xticklabels(labels=ax6.get_xticklabels(), ha='right')

fig.subplots_adjust(hspace=0.45)
plt.show()

## class imbalance

In [None]:
df['state'].value_counts()

In [None]:
df_state_ = df[(df['state'] == 'successful') | (df['state'] == 'failed')]

In [None]:
fig,ax=plt.subplots()
ax.hist(df_state_['state']);