# Manually selecting useful attributes

Not all of the data obtained through data integration can be used for our prediction task. We have to manually select the static and dynamic attributes and remove:

* **future data** = data that might have changed after the end of a project's campaign or that was not available at the time of the first scraping
* **redundant/duplicate data** = data contained in both datasets
* **useless data** = variables with a single value (e.g. 'spotlight' is False for all observations), ids, names, JSON data, etc.

In [1]:
import json
from pprint import pprint

import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# Load the integrated dataset; the first CSV column is used as the row index.
# low_memory=False makes pandas infer every column's dtype from the whole file
# in a single pass, which avoids the mixed-type DtypeWarning that chunked
# parsing can emit (the recorded output of this cell shows such a warning).
df = pd.read_csv('data/merged4.csv', encoding='latin1', index_col=0, low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
df.info(max_cols=120)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30702 entries, 0 to 30701
Data columns (total 113 columns):
average_pledge                           30702 non-null float64
average_pledge_end                       30702 non-null float64
backers_count                            30702 non-null int64
backers_count_end                        30702 non-null int64
blurb                                    30702 non-null object
blurb_end                                30702 non-null object
category                                 30702 non-null object
category_end                             30702 non-null object
category_name                            30702 non-null object
category_name_end                        30702 non-null object
category_parent                          30702 non-null object
category_parent_end                      30702 non-null object
category_position                        30702 non-null int64
category_position_end                    30702 non-null int64
country   

In [4]:
# Three independent buckets of column names; each attribute section below
# appends its column to one of them.
static_data, dynamic_data, useless_data = [], [], []

## Attributes (columns)

### 1. average_pledge

In [5]:
dynamic_data.append('average_pledge')

### 2. average_pledge_end

In [6]:
# future data
useless_data.append('average_pledge_end')

### 3. backers_count

In [7]:
dynamic_data.append('backers_count')

### 4. backers_count_end

In [8]:
# future data
useless_data.append('backers_count_end')

### 5. blurb

In [9]:
static_data.append('blurb')

### 6. blurb_end

In [10]:
# future data
useless_data.append('blurb_end')

### 7. category

In [11]:
df.loc[1,'category']

'{"urls":{"web":{"discover":"http://www.kickstarter.com/discover/categories/dance/performances"}},"color":10917369,"parent_id":6,"name":"Performances","id":254,"position":1,"slug":"dance/performances"}'

In [12]:
# wrong format, useful data already extracted
useless_data.append('category')

### 8. category_end

In [13]:
# future data
useless_data.append('category_end')

### 9. category_name

In [14]:
static_data.append('category_name')

### 10. category_name_end

In [15]:
# future data
useless_data.append('category_name_end')

### 11. category_parent

In [16]:
static_data.append('category_parent')

### 12. category_parent_end

In [17]:
# future data
useless_data.append('category_parent_end')

### 13. category_position (= position of category in parent category)

In [18]:
df.category_position.value_counts()

4     4103
3     3113
1     2833
2     2588
5     2559
7     2433
10    2024
6     1822
9     1693
14    1381
8     1360
11    1311
12     938
13     751
15     698
18     514
17     260
19     224
16      97
Name: category_position, dtype: int64

In [19]:
# Cross-check with the Kickstarter website: Cookbooks is the 3rd subcategory of Food.
df.loc[df['category_name'] == 'Cookbooks', 'category_position'].value_counts()

3    184
Name: category_position, dtype: int64

In [20]:
# Film & Video is the only parent category with 19 subcategories (the 19th is Webseries).
df.loc[df['category_position'] == 19, 'category_name'].value_counts()

Webseries    224
Name: category_name, dtype: int64

In [21]:
static_data.append('category_position')

### 14. category_position_end

In [22]:
# future data
useless_data.append('category_position_end')

### 15. country

In [23]:
static_data.append('country')

### 16. created_at

In [24]:
# timestamp, already used to extract days_preparation
# not commonly accessible to the public on the website, therefore should not affect behavior
useless_data.append('created_at')

### 17. creator

In [25]:
df.loc[1, 'creator']

'{"urls":{"web":{"user":"https://www.kickstarter.com/profile/1221271849"},"api":{"user":"https://api.kickstarter.com/v1/users/1221271849?signature=1446480213.130b2f9d9454277a259bf219e9b6dea39bf2ac2f"}},"name":"Cassandra Kemper","id":1221271849,"avatar":{"small":"https://ksr-ugc.imgix.net/avatars/18458965/10513971_562158447259130_1760610303_n.original.jpg?v=1445003326&w=80&h=80&fit=crop&auto=format&q=92&s=2ed4c6c88ff74c2750dbdeae1d34b953","thumb":"https://ksr-ugc.imgix.net/avatars/18458965/10513971_562158447259130_1760610303_n.original.jpg?v=1445003326&w=40&h=40&fit=crop&auto=format&q=92&s=fb2f0e15152b11832b8e80706ce01890","medium":"https://ksr-ugc.imgix.net/avatars/18458965/10513971_562158447259130_1760610303_n.original.jpg?v=1445003326&w=160&h=160&fit=crop&auto=format&q=92&s=043c24006839e18fadfbe74e7a5769a4"}}'

In [26]:
# nothing useful here, wrong format (raw JSON string)
useless_data.append('creator')

### 18. creator_end

In [27]:
# nothing useful here, wrong format
useless_data.append('creator_end')

### 19. creator_name

In [28]:
# useless
useless_data.append('creator_name')

### 20. creator_name_end

In [29]:
# useless
useless_data.append('creator_name_end')

### 21. currency

In [30]:
static_data.append('currency')

### 22. currency_symbol

In [31]:
df.currency_symbol.value_counts()

$      23695
?       6308
kr       544
Fr       155
Name: currency_symbol, dtype: int64

In [32]:
# damaged and redundant data
useless_data.append('currency_symbol')

### 23. currency_trailing_code

In [33]:
# useless
useless_data.append('currency_trailing_code')

### 24. days_duration

In [34]:
static_data.append('days_duration')

### 25. days_preparation

In [35]:
static_data.append('days_preparation')

### 26. days_remaining

In [36]:
dynamic_data.append('days_remaining')

### 27. days_running

In [37]:
dynamic_data.append('days_running')

### 28. db_backers_count

In [38]:
# future data
useless_data.append('db_backers_count')

### 29. db_category_id

In [39]:
# useless (numeric representation of parent category)
useless_data.append('db_category_id')

### 30. db_comments_count

In [40]:
# future data
useless_data.append('db_comments_count')

### 31. db_creators_url

In [41]:
useless_data.append('db_creators_url')

### 32. db_currency

In [42]:
# damaged redundant data
useless_data.append('db_currency')

### 33. db_description_full

In [43]:
# Full HTML description of the campaign; kept as a static attribute.
static_data.append('db_description_full')

# Length of every description in characters. Missing descriptions are counted
# as 0; without the fillna they would be measured as 3, the length of the
# string 'nan' produced by str(NaN).
desc_len = df['db_description_full'].fillna('').astype(str).str.len()

# Raw HTML of one sample project.
sample_description = df.loc[14461, 'db_description_full']
sample_description

# Parse the sample so that the markup can be stripped.
html = BeautifulSoup(sample_description, 'html5lib')

html.get_text()

# Whitespace-separated tokens of the plain text; show the last ten.
tokens = html.get_text().split()
tokens[-10:]

# Longest descriptions first; only the top of the ranking is displayed.
desc_len.sort_values(ascending=False).head(10)

df.loc[30699, 'db_description_full']

### 34. db_description_short

In [44]:
# Rows whose short description differs from the final blurb. The recorded
# sample suggests the differences are HTML-escaped characters (&quot;, &#39;).
# NOTE(review): the filter compares `blurb_end`, but `blurb` is displayed - confirm this is intended.
# A fixed random_state keeps the displayed sample identical across re-runs.
blurb_differs = df['blurb_end'] != df['db_description_short']
df.loc[blurb_differs, ['blurb', 'db_description_short']].sample(5, random_state=0)

Unnamed: 0,blurb,db_description_short
11439,I have big ideas for 2016! But making art is a...,I have big ideas for 2016! But making art is a...
4464,"""la herida invisible"" es una colección de piez...",&quot;la herida invisible&quot; es una colecci...
30601,World's first solar backpack with detachable 6...,World&#39;s first solar backpack with detachab...
10887,A brash thief roams the universe with her hype...,A brash thief roams the universe with her hype...
20128,"Support Shani in finishing and promoting ""Lift...",Support Shani in finishing and promoting &quot...


In [45]:
# redundant data
useless_data.append('db_description_short')

### 35. db_duration

In [46]:
# keep due to inconsistency with days_duration
static_data.append('db_duration')

### 36. db_end_time

In [47]:
# useless
useless_data.append('db_end_time')

### 37. db_faq_count

In [48]:
# future data
useless_data.append('db_faq_count')

### 38. db_fb_comments_count

In [49]:
df['db_fb_comments_count'].describe()

count    30702.0
mean         0.0
std          0.0
min          0.0
25%          0.0
50%          0.0
75%          0.0
max          0.0
Name: db_fb_comments_count, dtype: float64

In [50]:
# always 0
useless_data.append('db_fb_comments_count')

### 39. db_fb_shares_count

In [51]:
# future data
useless_data.append('db_fb_shares_count')

### 40. db_goal

In [52]:
# Number of rows where the two goal columns disagree.
int((df['goal'] != df['db_goal']).sum())

0

In [53]:
# redundant data
useless_data.append('db_goal')

### 41. db_hours_remaining

In [54]:
# useless data
useless_data.append('db_hours_remaining')

### 42. db_image_url TODO transform

In [55]:
static_data.append('db_image_url')

### 43. db_location_id

In [56]:
# Number of distinct location ids (missing values are not counted).
df['db_location_id'].nunique()

5139

In [57]:
# useless (ordinal representation of other columns)
useless_data.append('db_location_id')

### 44. db_name

In [58]:
# useless
useless_data.append('db_name')

### 45. db_pledged

In [59]:
# future data
useless_data.append('db_pledged')

### 46. db_project_data

In [60]:
# Decode the JSON payload of one sample project so it can be inspected by hand.
row = 1
raw_project_json = df.at[row, 'db_project_data']
dict_data = json.loads(raw_project_json)
# pprint(dict_data)  # uncomment to print the whole payload

In [61]:
# useful data already extracted
useless_data.append('db_project_data')

### 47. db_project_id

In [62]:
# useless
useless_data.append('db_project_id')

### 48. db_project_we_love

In [63]:
# future data
useless_data.append('db_project_we_love')

### 49. db_start_time

In [64]:
# useless
useless_data.append('db_start_time')

### 50. db_status

In [65]:
# useless, we use our own classes
useless_data.append('db_status')

### 51. db_subcategory_id

In [66]:
# ordinal representation of category_name
useless_data.append('db_subcategory_id')

### 52. db_updates_count

In [67]:
# future data
useless_data.append('db_updates_count')

### 53. db_url

In [68]:
# useless
useless_data.append('db_url')

### 54. db_video_url TODO transform

In [69]:
static_data.append('db_video_url')

### 55. deadline_end  TODO extract features (+ add analysis)

In [70]:
# already used to extract days_duration and days_remaining
# NOTE(review): still kept among the static columns, presumably for the feature
# extraction marked TODO in this section's heading - confirm before modelling.
static_data.append('deadline_end')

### 56. disable_communication

In [71]:
df['disable_communication'].describe()

count     30702
unique        1
top       False
freq      30702
Name: disable_communication, dtype: object

In [72]:
# always False
useless_data.append('disable_communication')

### 57. disable_communication_end

In [73]:
# future data
useless_data.append('disable_communication_end')

### 58. goal

In [74]:
static_data.append('goal')

### 59. id

In [75]:
# useless
useless_data.append('id')

### 60. launched_at

In [76]:
# already used to extract days_preparation and days_running
# NOTE(review): unlike created_at and scraped_at, this raw timestamp is kept as a
# static column even though features were already extracted from it - confirm this is intended.
static_data.append('launched_at')

### 61. location

In [77]:
# already used to extract data
useless_data.append('location')

### 62. location_end

In [78]:
# future data
useless_data.append('location_end')

### 63. location_is_root

In [79]:
df.location_is_root.describe()

count     30702
unique        1
top       False
freq      30702
Name: location_is_root, dtype: object

In [80]:
# always False
useless_data.append('location_is_root')

### 64. location_name

In [81]:
static_data.append('location_name')

### 65. location_name_end

In [82]:
# future data
useless_data.append('location_name_end')

### 66. location_state

In [83]:
static_data.append('location_state')

### 67. location_state_end

In [84]:
# future data
useless_data.append('location_state_end')

### 68. location_type

In [85]:
static_data.append('location_type')

### 69. location_type_end

In [86]:
# future data
useless_data.append('location_type_end')

### 70. name

In [87]:
# useless
useless_data.append('name')

### 71. name_end

In [88]:
# useless
useless_data.append('name_end')

### 72. photo

In [89]:
df.loc[1,'photo']

'{"small":"https://ksr-ugc.imgix.net/projects/2137299/photo-original.jpg?v=1444938200&w=160&h=120&fit=crop&auto=format&q=92&s=024364bb83f379e11d0ef29eafd2bb3a","1536x1152":"https://ksr-ugc.imgix.net/projects/2137299/photo-original.jpg?v=1444938200&w=1536&h=1152&fit=crop&auto=format&q=92&s=30664a3df037a1672f4cd849f298ed5a","thumb":"https://ksr-ugc.imgix.net/projects/2137299/photo-original.jpg?v=1444938200&w=40&h=30&fit=crop&auto=format&q=92&s=5556d33ffad29a3ff8f0e676a0ebeaeb","1024x768":"https://ksr-ugc.imgix.net/projects/2137299/photo-original.jpg?v=1444938200&w=1024&h=768&fit=crop&auto=format&q=92&s=0baef5145af0d44caaebdffcecad8709","med":"https://ksr-ugc.imgix.net/projects/2137299/photo-original.jpg?v=1444938200&w=266&h=200&fit=crop&auto=format&q=92&s=0e6f16d2bc45ce1925bc6661c340ced2","key":"projects/2137299/photo-original.jpg","ed":"https://ksr-ugc.imgix.net/projects/2137299/photo-original.jpg?v=1444938200&w=338&h=250&fit=crop&auto=format&q=92&s=f323d7eb379b5b563b9ef7e262fda6cd","fu

In [90]:
# wrong format, describes only 1 photo - redundant data
useless_data.append('photo')

### 73. photo_end

In [91]:
# useless, redundant
useless_data.append('photo_end')

### 74. pledged

In [92]:
dynamic_data.append('pledged')

### 75. pledged_end

In [93]:
# future data
useless_data.append('pledged_end')

### 76. profile

In [94]:
df.loc[1, 'profile']

'{"background_image_opacity":0.8,"link_text_color":null,"state_changed_at":1444938006,"should_show_feature_image":true,"blurb":null,"background_color":null,"project_id":2176873,"name":null,"feature_image_attributes":{"image_urls":{"default":"https://ksr-ugc.imgix.net/projects/2137299/photo-original.jpg?v=1444938200&w=1536&h=1152&fit=crop&auto=format&q=92&s=30664a3df037a1672f4cd849f298ed5a","baseball_card":"https://ksr-ugc.imgix.net/projects/2137299/photo-original.jpg?v=1444938200&w=1536&h=1152&fit=crop&auto=format&q=92&s=30664a3df037a1672f4cd849f298ed5a"}},"link_url":null,"show_feature_image":false,"id":2176873,"state":"inactive","text_color":null,"link_text":null,"link_background_color":null}'

In [95]:
# wrong format, useless
useless_data.append('profile')

### 77. profile_end

In [96]:
# future data
useless_data.append('profile_end')

### 78. ratio_pledged_end_goal

In [97]:
# already used to extract class
useless_data.append('ratio_pledged_end_goal')

### 79. ratio_pledged_goal

In [98]:
dynamic_data.append('ratio_pledged_goal')

### 80. ratio_running_duration

In [99]:
dynamic_data.append('ratio_running_duration')

### 81. scraped_at

In [100]:
# already used to extract many features
useless_data.append('scraped_at')

### 82. slug

In [101]:
# useless
useless_data.append('slug')

### 83. source_url

In [102]:
# useless
useless_data.append('source_url')

### 84. source_url_end

In [103]:
# useless
useless_data.append('source_url_end')

### 85. spotlight

In [104]:
df.spotlight.describe()

count     30702
unique        1
top       False
freq      30702
Name: spotlight, dtype: object

In [105]:
# always False
useless_data.append('spotlight')

### 86. spotlight_end

In [106]:
# future data
useless_data.append('spotlight_end')

### 87. staff_pick

In [107]:
dynamic_data.append('staff_pick')

### 88. staff_pick_end

In [108]:
# future data
useless_data.append('staff_pick_end')

### 89. state

In [109]:
df.state.describe()

count     30702
unique        1
top        live
freq      30702
Name: state, dtype: object

In [110]:
# always live
useless_data.append('state')

### 90. state_changed_at

In [111]:
# useless
useless_data.append('state_changed_at')

### 91. state_changed_at_end

In [112]:
# useless, future data
useless_data.append('state_changed_at_end')

### 92. state_end

In [113]:
# future data, we have our own prediction class
useless_data.append('state_end')

### 93. static_usd_rate

In [114]:
# already used for currency conversion to USD
useless_data.append('static_usd_rate')

### 94. static_usd_rate_end

In [115]:
# useless, future data
useless_data.append('static_usd_rate_end')

### 95. url_name

In [116]:
# useless for prediction but can serve for project identification so we will keep it
static_data.append('url_name')

### 96. urls

In [117]:
df.loc[1, 'urls']

'{"web":{"project":"https://www.kickstarter.com/projects/1221271849/all-style-dance-battle-the-streak?ref=category","rewards":"https://www.kickstarter.com/projects/1221271849/all-style-dance-battle-the-streak/rewards"}}'

In [118]:
# useless
useless_data.append('urls')

### 97. urls_end

In [119]:
# useless
useless_data.append('urls_end')

### 98. urls_url

In [120]:
# useless
useless_data.append('urls_url')

### 99. urls_url_end

In [121]:
# useless
useless_data.append('urls_url_end')

### 100. usd_goal

In [122]:
static_data.append('usd_goal')

### 101. usd_pledged

In [123]:
dynamic_data.append('usd_pledged')

### 102. usd_pledged_end

In [124]:
# future data
useless_data.append('usd_pledged_end')

### 103. class

In [125]:
# predicted class
static_data.append('class')

### 104. ID

In [126]:
# useless
useless_data.append('ID')

### 105. faq_count_while_scraping

In [127]:
dynamic_data.append('faq_count_while_scraping')

### 106. comments_count_creator_while_scraping

In [128]:
dynamic_data.append('comments_count_creator_while_scraping')

### 107. comments_count_public_while_scraping

In [129]:
dynamic_data.append('comments_count_public_while_scraping')

### 108. updates_count_while_scraping

In [130]:
dynamic_data.append('updates_count_while_scraping')

### 109. updates_likes_sum_while_scraping

In [131]:
dynamic_data.append('updates_likes_sum_while_scraping')

### 110. updates_likes_mean_while_scraping

In [132]:
dynamic_data.append('updates_likes_mean_while_scraping')

### 111. updates_likes_min_while_scraping

In [133]:
dynamic_data.append('updates_likes_min_while_scraping')

### 112. updates_likes_max_while_scraping

In [134]:
dynamic_data.append('updates_likes_max_while_scraping')

### 113. updates_likes_median_while_scraping

In [135]:
dynamic_data.append('updates_likes_median_while_scraping')

## Summary

In [136]:
len(static_data)

21

In [137]:
len(dynamic_data)

18

In [138]:
len(useless_data)

74

In [139]:
static_data

['blurb',
 'category_name',
 'category_parent',
 'category_position',
 'country',
 'currency',
 'days_duration',
 'days_preparation',
 'db_description_full',
 'db_duration',
 'db_image_url',
 'db_video_url',
 'deadline_end',
 'goal',
 'launched_at',
 'location_name',
 'location_state',
 'location_type',
 'url_name',
 'usd_goal',
 'class']

In [140]:
dynamic_data

['average_pledge',
 'backers_count',
 'days_remaining',
 'days_running',
 'pledged',
 'ratio_pledged_goal',
 'ratio_running_duration',
 'staff_pick',
 'usd_pledged',
 'faq_count_while_scraping',
 'comments_count_creator_while_scraping',
 'comments_count_public_while_scraping',
 'updates_count_while_scraping',
 'updates_likes_sum_while_scraping',
 'updates_likes_mean_while_scraping',
 'updates_likes_min_while_scraping',
 'updates_likes_max_while_scraping',
 'updates_likes_median_while_scraping']

In [141]:
# Validate the manual bookkeeping before dropping anything: every column of
# `df` has to appear in exactly one of the three lists. Without this check a
# column that was never categorized would silently stay in the output, and a
# cell executed twice would go unnoticed.
categorized = static_data + dynamic_data + useless_data
assert len(categorized) == len(set(categorized)), \
    'some column was categorized more than once'
assert set(categorized) == set(df.columns), \
    'the three lists do not match the columns of df'

# Keep only the static and dynamic attributes.
df_selected = df.drop(columns=useless_data)

In [142]:
df_selected.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30702 entries, 0 to 30701
Data columns (total 39 columns):
average_pledge                           30702 non-null float64
backers_count                            30702 non-null int64
blurb                                    30702 non-null object
category_name                            30702 non-null object
category_parent                          30702 non-null object
category_position                        30702 non-null int64
country                                  30702 non-null object
currency                                 30702 non-null object
days_duration                            30702 non-null int64
days_preparation                         30702 non-null int64
days_remaining                           30702 non-null int64
days_running                             30702 non-null int64
db_description_full                      30691 non-null object
db_duration                              30702 non-null int64
db_image_url   

In [143]:
# Write the reduced dataset to disk; with to_csv's defaults the row index is saved as the first column.
df_selected.to_csv('data/selected_data.csv')