In [1]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns
import os, sys
import datetime
from datetime import date
import time
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
#from ggplot import *
#%matplotlib inline


plt.rcParams.update({'font.size': 22})

def initial_setup():
    """
    Create the initial setup of directory variables and dataframe column names.

    All paths are derived from the current working directory: the quoted
    '__file__' below is a literal string (not the module attribute), so
    os.path.abspath resolves it against the working directory and its
    dirname is the working directory itself.

    Returns:
      A tuple containing, in this order:
          - dirname:          Absolute path of the current working directory.
          - datadir:          Absolute path of the `data` directory located two levels above dirname.
          - imagesdir:        Absolute path of the `images` directory located one level above dirname.
          - initial_colnames: Alphabetically sorted list with the initial colnames of the dataframe.
    """
    # Initial directories set up
    dirname = os.path.dirname(os.path.abspath('__file__'))
    two_levels_up = os.path.abspath(os.path.join(dirname, os.pardir, os.pardir))
    one_level_up = os.path.abspath(os.path.join(dirname, os.pardir))
    datadir = os.path.join(two_levels_up, 'data')
    imagesdir = os.path.join(one_level_up, 'images')
    # Columns kept for the model, sorted so the order is deterministic.
    initial_colnames = sorted([
        'backers_count', 'blurb', 'category', 'country', 'created_at',
        'state_changed_at', 'currency', 'deadline', 'goal', 'id',
        'launched_at', 'location', 'name', 'pledged', 'state',
        'static_usd_rate', 'usd_pledged',
    ])
    return dirname, datadir, imagesdir, initial_colnames

# 0 - Initial directories and colnames set up
print("Step 0: Initial directories and colnames set up")
dirname, datadir, imagesdir, initial_colnames = initial_setup()
# Report each resolved directory and then the column list, one message per line.
for setup_message, setup_value in (
    ("Directory of this file is %s", dirname),
    ("Data directory is %s", datadir),
    ("Images directory is %s", imagesdir),
    ("Initial columns for our model are: \n%s", initial_colnames),
):
    print(setup_message % setup_value)

Step 0: Initial directories and colnames set up
Directory of this file is /home/agericke/crowdfunding_ml/src/formatting
Data directory is /home/agericke/crowdfunding_ml/data
Images directory is /home/agericke/crowdfunding_ml/src/images
Initial columns for our model are: 
['backers_count', 'blurb', 'category', 'country', 'created_at', 'currency', 'deadline', 'goal', 'id', 'launched_at', 'location', 'name', 'pledged', 'state', 'state_changed_at', 'static_usd_rate', 'usd_pledged']


In [2]:
def read_from_disk(filename):
    """
    Read a pickled object (expected to be a dataframe) from a file in disk.
    Params:
        filename....Path to the pickle file.
    Returns:
        The unpickled object (a pandas dataframe when the file holds one).
    """
    # NOTE: pickle.load can execute arbitrary code embedded in the file;
    # only use this on files produced by this project.
    # The context manager guarantees the handle is closed even if unpickling fails.
    with open(filename, 'rb') as handle:
        return pickle.load(handle)


def store_dataframe(dataframe, filename):
    """
    Store the dataframe using pickle.
    Params:
        dataframe...pandas dataframe to store.
        filename....Path to the file to store the dataframe in (overwritten if it exists).
    Returns:
        Nothing.
    """
    # Closing the handle via the context manager ensures the pickle is fully
    # flushed to disk before the function returns.
    with open(filename, 'wb') as handle:
        pickle.dump(dataframe, handle)

# 1 - Load from disk the complete Merged Dataframe.
print("\n\n\nStep 1: Load from disk the complete Merged Dataframe.")
filename = os.path.join(datadir, 'dataframe_total.pkl')
data = read_from_disk(filename)
# Announce completion only once the read has actually succeeded.
print("Completed Dataframe read from file %s" % filename)
# Print summary of dataframe
print("Dataframe contains %d projects and %d columns for each project\n" % (data.shape[0], data.shape[1]))




Step 1: Load from disk the complete Merged Dataframe.
Completed Dataframe read from file /home/agericke/crowdfunding_ml/data/dataframe_total.pkl
Dataframe contains 344209 projects and 17 columns for each project



In [3]:
def read_dataframe(path):
    """
    Load a CSV file into a pandas dataframe.
    Params:
        path.....Path to the CSV file (e.g. one Kickstarter00N.csv of a snapshot folder).
    Returns:
        A pandas dataframe with the file contents, decoded as ISO-8859-1.
    """
    csv_encoding = 'ISO-8859-1'
    loaded_frame = pd.read_csv(path, encoding=csv_encoding)
    return loaded_frame


def sort_dataframe_by_columns(dataframe):
    """
    Return a copy of the dataframe whose columns are arranged in sorted name order.
    Params:
        dataframe.....The pandas Dataframe.
    Returns:
        A new pandas dataframe ordered by column name; the input is not modified.
    """
    ordered_names = sorted(dataframe.columns)
    return dataframe.reindex(columns=ordered_names)

In [4]:
# Compare the columns found in one sample file from each yearly snapshot (2016-2019).

filename2019 = os.path.join(datadir, '2019-05-16T032020/Kickstarter005.csv')
df_example_2019 = read_dataframe(filename2019)
columns_2019 = list(df_example_2019.columns)
print(sorted(columns_2019))
print("\n")

filename2018 = os.path.join(datadir, '2018-05-17T032008/Kickstarter005.csv')
df_example_2018 = read_dataframe(filename2018)
columns_2018 = list(df_example_2018.columns)
print(sorted(columns_2018))
print("\n")

filename2017 = os.path.join(datadir, '2017-05-15T222111/Kickstarter005.csv')
df_example_2017 = read_dataframe(filename2017)
columns_2017 = list(df_example_2017.columns)
print(sorted(columns_2017))
print("\n")

filename2016 = os.path.join(datadir, '2016-05-15T020446/Kickstarter005.csv')
df_example_2016 = read_dataframe(filename2016)
columns_2016 = list(df_example_2016.columns)
print(sorted(columns_2016))


# For each older snapshot: is the column order identical to 2019, and which
# 2019 columns are missing from it? (The diff is one-directional: columns that
# exist only in the older snapshot are not reported.)
for diff_message, other_columns in (
    ("Diff for 2019 and 2018 %s", columns_2018),
    ("Diff for 2019 and 2017 %s", columns_2017),
    ("Diff for 2019 and 2016 %s", columns_2016),
):
    print(columns_2019 == other_columns)
    list_diff = [column for column in columns_2019 if column not in other_columns]
    print(diff_message % list_diff)

# Check in which snapshots the source_url column is present.
for presence_message, snapshot_columns in (
    ("source_url in 2016 data: %s", columns_2016),
    ("source_url in 2017 data: %s", columns_2017),
    ("source_url in 2018 data: %s", columns_2018),
    ("source_url in 2019 data: %s", columns_2019),
):
    print(presence_message % ("source_url" in snapshot_columns))

['backers_count', 'blurb', 'category', 'converted_pledged_amount', 'country', 'created_at', 'creator', 'currency', 'currency_symbol', 'currency_trailing_code', 'current_currency', 'deadline', 'disable_communication', 'friends', 'fx_rate', 'goal', 'id', 'is_backing', 'is_starrable', 'is_starred', 'launched_at', 'location', 'name', 'permissions', 'photo', 'pledged', 'profile', 'slug', 'source_url', 'spotlight', 'staff_pick', 'state', 'state_changed_at', 'static_usd_rate', 'urls', 'usd_pledged', 'usd_type']


['backers_count', 'blurb', 'category', 'converted_pledged_amount', 'country', 'created_at', 'creator', 'currency', 'currency_symbol', 'currency_trailing_code', 'current_currency', 'deadline', 'disable_communication', 'friends', 'fx_rate', 'goal', 'id', 'is_backing', 'is_starrable', 'is_starred', 'launched_at', 'location', 'name', 'permissions', 'photo', 'pledged', 'profile', 'slug', 'spotlight', 'staff_pick', 'state', 'state_changed_at', 'static_usd_rate', 'urls', 'usd_pledged', 'usd

We drop the columns `['converted_pledged_amount', 'current_currency', 'fx_rate', 'is_starrable', 'source_url', 'usd_type']`.

We then have a total of 31 columns left:
`['backers_count', 'blurb', 'category', 'country', 'created_at', 'creator', 'currency', 'currency_symbol', 'currency_trailing_code', 'deadline', 'disable_communication', 'friends', 'goal', 'id', 'is_backing', 'is_starred', 'launched_at', 'location', 'name', 'permissions', 'photo', 'pledged', 'profile', 'slug', 'spotlight', 'staff_pick', 'state', 'state_changed_at', 'static_usd_rate', 'urls', 'usd_pledged']`

In [109]:
# We drop the creator column; display one raw value (row 600) for reference.
df_example_2019.loc[600, "creator"]

'{"id":201462872,"name":"Jessica","is_registered":null,"chosen_currency":null,"avatar":{"thumb":"https://ksr-ugc.imgix.net/assets/007/949/250/d038cc2cebf0a2e2b61e4828846625f0_original.jpg?ixlib=rb-2.0.0&w=40&h=40&fit=crop&v=1461493126&auto=format&frame=1&q=92&s=1286f1e298c4f1bf5a879012d17b1d4e","small":"https://ksr-ugc.imgix.net/assets/007/949/250/d038cc2cebf0a2e2b61e4828846625f0_original.jpg?ixlib=rb-2.0.0&w=160&h=160&fit=crop&v=1461493126&auto=format&frame=1&q=92&s=0653fa544d8bb487804fea0f9824b456","medium":"https://ksr-ugc.imgix.net/assets/007/949/250/d038cc2cebf0a2e2b61e4828846625f0_original.jpg?ixlib=rb-2.0.0&w=160&h=160&fit=crop&v=1461493126&auto=format&frame=1&q=92&s=0653fa544d8bb487804fea0f9824b456"},"urls":{"web":{"user":"https://www.kickstarter.com/profile/201462872"},"api":{"user":"https://api.kickstarter.com/v1/users/201462872?signature=1558067776.5d17f5578109207cc50fad148c887416aef943c3"}}}'

In [23]:
# We drop the currency_symbol column; preview its first 20 values.
df_example_2019["currency_symbol"].head(20)

0     EUR
1     USD
2     GBP
3     USD
4     USD
5     USD
6     EUR
7     USD
8     USD
9     CAD
10    USD
11    USD
12    USD
13    USD
14    USD
15    USD
16    EUR
17    USD
18    GBP
19    USD
Name: currency, dtype: object

In [106]:
# We drop the currency_trailing_code column (it appears to be True mainly for USD rows).
# NOTE(review): verify this against the currency column before relying on it.
df_example_2019["currency_trailing_code"].head(20)

0     False
1      True
2     False
3      True
4      True
5      True
6     False
7      True
8      True
9      True
10     True
11     True
12     True
13     True
14     True
15     True
16    False
17     True
18    False
19     True
Name: currency_trailing_code, dtype: bool

In [51]:
# We drop the disable_communication column; list the rows where it is True.
df_example_2019.loc[df_example_2019["disable_communication"] == True, "disable_communication"]

251     True
354     True
653     True
1807    True
2616    True
2869    True
3713    True
Name: disable_communication, dtype: bool

In [33]:
# We drop the friends column as it has no values for the projects; list any non-null entries.
df_example_2019["friends"].dropna()

460     []
1476    []
2789    []
Name: friends, dtype: object

In [41]:
# TODO: Check if goal variable is in USD or original currency
# We drop the is_backing column as it has null values for the projects; list any non-null entries.
df_example_2019["is_backing"].dropna()

460     False
1476    False
2789    False
Name: is_backing, dtype: object

In [47]:
# We drop the is_starred column as it has null values for the projects; list any non-null entries.
df_example_2019["is_starred"].dropna()

460     False
1476    False
2789    False
Name: is_starred, dtype: object

In [70]:
# We drop the permissions column as it has null values for the projects; list any non-null entries.
df_example_2019["permissions"].dropna()

460     []
1476    []
2789    []
Name: permissions, dtype: object

In [75]:
# We drop the photo column as it only contains links to the photos; display the first value.
df_example_2019.loc[0, "photo"]

Unnamed: 0,backers_count,blurb,category,converted_pledged_amount,country,created_at,creator,currency,currency_symbol,currency_trailing_code,...,slug,source_url,spotlight,staff_pick,state,state_changed_at,static_usd_rate,urls,usd_pledged,usd_type


In [80]:
# We drop the profile column as its information is already provided by other columns;
# display the first value.
df_example_2019.loc[0, "profile"]

'{"id":3438765,"project_id":3438765,"state":"inactive","state_changed_at":1533133044,"name":null,"blurb":null,"background_color":null,"text_color":null,"link_background_color":null,"link_text_color":null,"link_text":null,"link_url":null,"show_feature_image":false,"background_image_opacity":0.8,"should_show_feature_image_section":true,"feature_image_attributes":{"image_urls":{"default":"https://ksr-ugc.imgix.net/assets/022/131/877/df8a04e48d2a004cfb668b640ab3c87a_original.jpg?ixlib=rb-2.0.0&crop=faces&w=1552&h=873&fit=crop&v=1533478846&auto=format&frame=1&q=92&s=43a70d311e91d2eddbf441bba2d03729","baseball_card":"https://ksr-ugc.imgix.net/assets/022/131/877/df8a04e48d2a004cfb668b640ab3c87a_original.jpg?ixlib=rb-2.0.0&crop=faces&w=560&h=315&fit=crop&v=1533478846&auto=format&frame=1&q=92&s=ba0c49e92f70c35c0e63ec25c5af2409"}}}'

In [91]:
# We drop the name column: slug carries the same text in a cleaned form.
# Print both values of one row (1050) to compare them.
print(df_example_2019.loc[1050, "name"])
print(df_example_2019.loc[1050, "slug"])

Tuxpeedo
tuxpeedo


In [103]:
# We drop the state_changed_at column as it does not provide useful information.
df_example_2019["state_changed_at"]

0       1536438029
1       1358734294
2       1421366412
3       1467615541
4       1405873727
5       1497642609
6       1484149238
7       1384455965
8       1534008181
9       1492041000
10      1357959631
11      1404446341
12      1412010011
13      1533095942
14      1463357276
15      1475072770
16      1480456801
17      1545672795
18      1503259200
19      1554872400
20      1539738060
21      1495906755
22      1480180740
23      1494783996
24      1490961910
25      1425691377
26      1528801463
27      1388995218
28      1465207317
29      1435500299
           ...    
3756    1370919635
3757    1540500977
3758    1458394497
3759    1384113739
3760    1524316006
3761    1531765086
3762    1418230813
3763    1404254803
3764    1498074001
3765    1467010800
3766    1552093204
3767    1401656580
3768    1427700304
3769    1557911759
3770    1426280466
3771    1494788882
3772    1409542476
3773    1484429788
3774    1555547715
3775    1521038978
3776    1557294720
3777    1407

In [111]:
# We drop the urls column, although a follow-up project could take the project rewards into account.
#  As it only gives us the link to the rewards, a scraping script would be needed to obtain info about them.
df_example_2019.loc[0, "urls"]

'{"web":{"project":"https://www.kickstarter.com/projects/ferdinand-david/ferdinand-and-david-travelmaster-watches-made-in-g?ref=discovery_category_newest","rewards":"https://www.kickstarter.com/projects/ferdinand-david/ferdinand-and-david-travelmaster-watches-made-in-g/rewards"}}'

After dropping the columns reviewed above, the following 18 columns remain:
`['backers_count', 'blurb', 'category', 'country', 'created_at', 'currency', 'deadline', 'goal', 'id', 'launched_at', 'location', 'pledged', 'slug', 'spotlight', 'staff_pick', 'state', 'static_usd_rate', 'usd_pledged']`