## Imports and reading the data

In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
# Columns kept for the analysis; every other column of the CSV is discarded.
cols = [
    'project_slug',
    'Category',
    'Location',
    'Goal_USD',
    'Pledge_USD',
    'Number_Backers',
    'Creator_nb_projects',
    'Project_Community_top_countries',
    'Project_description',
    'Deadline',
    'Launched_at',
]

# Load the export, keep the selected columns and drop rows with any missing value.
raw_df = pd.read_csv('../data/all_project_week_19_32.csv')[cols].dropna()

## Filtering out projects that were still running when the data was collected (11 Aug 2019)

In [3]:
# Deadline / Launched_at are parsed as seconds since the Unix epoch;
# store the parsed values in dedicated datetime columns.
for source_col, date_col in (('Deadline', 'end_date'), ('Launched_at', 'launch_date')):
    raw_df[date_col] = pd.to_datetime(raw_df[source_col], unit='s', origin='unix')

# Show the time span covered by launches and deadlines in the dataset
launches = raw_df['launch_date']
deadlines = raw_df['end_date']
print('earliest launch date: ', launches.min())
print('earliest deadline: ', deadlines.min())
print('latest launch date: ', launches.max())
print('latest deadline: ', deadlines.max())


earliest launch date:  2009-05-06 16:20:07
earliest deadline:  2009-06-24 22:26:00
latest launch date:  2019-07-30 10:03:18
latest deadline:  2019-09-28 08:11:41


In [4]:
# Keep only projects whose deadline fell before the collection date (2019-08-11 00:00),
# then drop the raw timestamps and the helper date columns.
cutoff_date = pd.Timestamp(year=2019, month=8, day=11, hour=0)
ended_before_cutoff = raw_df['end_date'] < cutoff_date
raw_df = raw_df.loc[ended_before_cutoff].drop(
    columns=['Deadline', 'Launched_at', 'launch_date', 'end_date']
)
raw_df.columns

Index(['project_slug', 'Category', 'Location', 'Goal_USD', 'Pledge_USD',
       'Number_Backers', 'Creator_nb_projects',
       'Project_Community_top_countries', 'Project_description'],
      dtype='object')

## Creating target variable and adjusting other attributes

### Getting the top country by backers

In [5]:
# Peek at the raw value of the first remaining row. Positional access is used
# because the earlier dropna / date filter may have removed the row labelled 0,
# in which case a label lookup with [0] would raise a KeyError.
raw_df['Project_Community_top_countries'].iloc[0]

'|United States:57 backers|Australia:1 backer|Singapore:1 backer|United Kingdom:1 backer'

In [6]:
def get_top_country(text):
    """Return the first country named in a '|Country:N backers|...' string.

    The result is the text found between a '|' and the next ':' at the
    leftmost position where such a span exists. An empty string is returned
    when the pattern does not occur in `text`.
    """
    match = re.search(r'(?<=\|)(.*?)(?=\:)', text)
    return '' if match is None else match.group()

get_top_country('|United States:57 backers|Australia:1 backer|Singapore:1 backer|United Kingdom:1 backer')

'United States'

In [9]:
# First-listed backer country per project, to be used as an exogenous variable for STM.
# Only one column is needed, so apply the helper to that Series directly instead
# of iterating over whole rows with DataFrame.apply(axis=1).
raw_df['top_country'] = raw_df['Project_Community_top_countries'].apply(get_top_country)
raw_df['top_country'].head()

0    United States
1    United States
2    United States
3    United States
4    United States
Name: top_country, dtype: object

### Calculating the percentage of amount actually pledged

## Slicing dataset up on meta-categories

In [5]:
# Defining meta-categories according to kickstarter standards.
# Each list starts with the meta-category's own name, followed by its sub-categories.
# Names are spelled exactly as compared against the Category column, including
# the HTML-escaped forms ('&amp;', '&#39;').
# NOTE(review): several sub-category names occur in more than one list
# (e.g. 'Comedy', 'Experimental', 'Festivals', 'Web'), so a row with such a
# Category is selected into every matching meta-category below.
art_meta = ['Art', 'Ceramics', 'Conceptual Art', 'Digital Art', 'Illustration', 'Installations', 'Mixed Media', 'Painting', 'Performance Art', 'Public Art', 'Sculpture', 'Social Practice', 'Textiles', 'Video Art']
comics_meta = ['Comics', 'Comic Books', 'Graphic Novels', 'Webcomics']
crafts_meta = ['Crafts', 'Candles', 'Crochet', 'DIY', 'Embroidery', 'Glass', 'Knitting', 'Pottery', 'Printing', 'Quilts', 'Stationery', 'Taxidermy', 'Weaving', 'Woodworking']
dance_meta = ['Dance', 'Performances', 'Residencies', 'Workshops']
design_meta = ['Design', 'Architecture', 'Civic Design', 'Graphic Design', 'Interactive Design', 'Product Design', 'Toys', 'Typography']
fashion_meta = ['Fashion', 'Accessories', 'Apparel', 'Childrenswear', 'Couture', 'Footwear', 'Jewelry', 'Pet Fashion', 'Ready-to-wear']
film_meta = ['Film', 'Film &amp; Video', 'Action', 'Animation', 'Comedy', 'Documentary', 'Drama', 'Experimental', 'Family', 'Fantasy', 'Festivals', 'Horror', 'Movie Theaters', 'Music Videos', 'Narrative Film', 'Romance', 'Science Fiction', 'Shorts', 'Television', 'Thrillers', 'Webseries']
# NOTE(review): a duplicate 'Food Trucks' entry was removed here; check whether
# a different sub-category was meant in its place.
food_meta = ['Food', 'Bacon', 'Community Gardens', 'Cookbooks', 'Drinks', 'Farms', 'Food Trucks', 'Restaurants', 'Small Batch', 'Spaces', 'Vegan']
games_meta = ['Games', 'Gaming Hardware', 'Live Games', 'Mobile Games', 'Playing Cards', 'Puzzles', 'Tabletop Games', 'Video Games']
journalism_meta = ['Journalism', 'Audio', 'Photo', 'Print', 'Video', 'Web']
music_meta = ['Music', 'Blues', 'Chiptune', 'Classical Music', 'Comedy', 'Country &amp; Folk', 'Electronic Music', 'Faith', 'Hip-Hop', 'Indie Rock', 'Jazz', 'Kids', 'Latin', 'Metal', 'Pop', 'Punk', 'R&amp;B', 'Rock', 'World Music']
photography_meta = ['Photography', 'Animals', 'Fine Art', 'Nature', 'People', 'Photobooks', 'Places']
publishing_meta = ['Publishing', 'Academic', 'Art Books', 'Calendars', 'Children&#39;s Books', 'Comedy', 'Fiction', 'Letterpress', 'Literary Journals', 'Literary Spaces', 'Nonfiction', 'Periodicals', 'Poetry', 'Radio &amp; Podcasts', 'Translations', 'Young Adult', 'Zines']
technology_meta = ['Technology', '3D Printing', 'Apps', 'Camera Equipment', 'DIY Electronics', 'Fabrication Tools', 'Flight', 'Gadgets', 'Hardware', 'Makerspaces', 'Robots', 'Software', 'Sound', 'Space Exploration', 'Wearables', 'Web']
theater_meta = ['Theater', 'Comedy', 'Experimental', 'Festivals', 'Immersive', 'Musical', 'Plays']

meta_categories = [art_meta, comics_meta, crafts_meta, dance_meta, design_meta, fashion_meta, film_meta, food_meta, games_meta, journalism_meta, music_meta, photography_meta, publishing_meta, technology_meta, theater_meta]

# One DataFrame per meta-category, keyed by the meta-category name (first list entry);
# each holds the rows whose Category appears anywhere in that list.
cat_df_dict = {
    meta[0]: raw_df[raw_df['Category'].isin(meta)]
    for meta in meta_categories
}

In [6]:
# Report how many rows each meta-category slice contains
for meta_name, meta_df in cat_df_dict.items():
    print('{} category has {} rows'.format(meta_name, len(meta_df)))

Art category has 8916 rows
Comics category has 1925 rows
Crafts category has 1728 rows
Dance category has 1682 rows
Design category has 3588 rows
Fashion category has 2112 rows
Film category has 8824 rows
Food category has 1761 rows
Games category has 5165 rows
Journalism category has 1011 rows
Music category has 9651 rows
Photography category has 1777 rows
Publishing category has 6677 rows
Technology category has 4089 rows
Theater category has 1707 rows


## Subsampling dataset
According to [statista.com](https://www.statista.com/statistics/222455/amount-of-dollars-pledged-per-category-on-kickstarter/), the three most funded categories were *Games*, *Design*, and *Technology*, so we will focus our efforts on these. We will test descriptive stats in R to figure out the best subsampling strategy.

In [7]:
# Pull out the three meta-categories this study focuses on ...
games_df = cat_df_dict['Games']
design_df = cat_df_dict['Design']
tech_df = cat_df_dict['Technology']

# ... and write each of them to its own CSV file.
for frame, csv_path in (
    (games_df, '../data/raw_games_data.csv'),
    (design_df, '../data/raw_design_data.csv'),
    (tech_df, '../data/raw_tech_data.csv'),
):
    frame.to_csv(csv_path)

## Text preprocessing

The preprocessing is done with the following function, but we run it on a VM to speed it up.

In [41]:
def preprocess(text):
    """Normalise a text into a space-separated string of lemmas.

    Every character outside ``a-z``, ``A-Z`` and ``0-9`` is replaced by a
    space, the text is lower-cased and split on whitespace, English stop words
    are removed and the remaining tokens are lemmatized with
    ``WordNetLemmatizer``.

    Parameters
    ----------
    text : object
        The raw text. Any value that is not a ``str`` is treated as missing.

    Returns
    -------
    str
        The kept lemmas, each preceded by a single space (so a non-empty
        result starts with a space). ``''`` is returned for non-string input
        or when no token survives the filtering.
    """
    if not isinstance(text, str):
        return ''

    # Build the stop-word set once per call; constructing it inside the
    # comprehension repeated the lookup and set construction for every token.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    tokens = re.sub('[^a-zA-Z0-9]', ' ', text).lower().split()
    lemmas = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

    # Each lemma keeps its leading space so the returned format is unchanged.
    return ''.join(' ' + lemma for lemma in lemmas)