In [15]:
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
# this is the file you should've gotten from your repo
from utils import extract_dates, get_val_scores
from category_encoders import OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, train_test_split

In [16]:
df = pd.read_csv('../data/ks2.csv', encoding='utf-8', parse_dates=['deadline' , 'launched'])

In [17]:
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,launched,state,country,goal
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,2015-08-11 12:12:28,failed,GB,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,2017-09-02 04:43:57,failed,US,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,2013-01-12 00:20:50,failed,US,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,2012-03-17 03:24:11,failed,US,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,2015-07-04 08:35:03,failed,US,19500.0


In [18]:
df = extract_dates(df)

In [19]:
df.columns

Index(['ID', 'name', 'category', 'main_category', 'currency', 'deadline',
       'launched', 'state', 'country', 'goal', 'deadline_dayofweek',
       'deadline_dayofyear', 'deadline_days_in_month', 'deadline_is_leap_year',
       'deadline_is_month_end', 'deadline_is_month_start',
       'deadline_is_quarter_end', 'deadline_is_quarter_start',
       'deadline_is_year_end', 'deadline_is_year_start', 'deadline_quarter',
       'deadline_week', 'deadline_weekofyear', 'deadline_day', 'deadline_hour',
       'deadline_minute', 'deadline_month', 'deadline_year',
       'launched_dayofweek', 'launched_dayofyear', 'launched_days_in_month',
       'launched_is_leap_year', 'launched_is_month_end',
       'launched_is_month_start', 'launched_is_quarter_end',
       'launched_is_quarter_start', 'launched_is_year_end',
       'launched_is_year_start', 'launched_quarter', 'launched_week',
       'launched_weekofyear', 'launched_day', 'launched_hour',
       'launched_minute', 'launched_month', 'laun

In [20]:
mod1 = xgb.XGBClassifier()

In [21]:
mod1.get_params()

{'objective': 'binary:logistic',
 'base_score': None,
 'booster': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'gamma': None,
 'gpu_id': None,
 'importance_type': 'gain',
 'interaction_constraints': None,
 'learning_rate': None,
 'max_delta_step': None,
 'max_depth': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [22]:
X = df.drop(['deadline', 'launched', 'state'], axis=1)
y = df['state']

In [23]:
pipe = make_pipeline(OrdinalEncoder(), mod1)

In [24]:
scores = get_val_scores(pipe, X, y, return_test_score=True, random_state=1985, stratify=True, use_kfold=False)

In [26]:
scores

{'validation_score': 0.7263509523729186, 'test_score': 0.7005304287970199}

In [27]:
feats = pd.DataFrame({ 'Importance': pipe.steps[1][1].feature_importances_, 'Column': X.columns})
feats.sort_values(by='Importance', ascending=False)

Unnamed: 0,Importance,Column
6,0.123426,goal
3,0.087642,main_category
2,0.082025,category
43,0.053122,deadline_history_days
41,0.050469,launched_month
30,0.041295,launched_is_month_start
39,0.041036,launched_hour
44,0.036439,launched_history_days
4,0.035812,currency
5,0.034382,country


In [28]:
df['duration'] = (df['deadline'] - df['launched']).dt.days

In [31]:
cat_avgs = df.groupby('category')[['goal']].mean().rename({'goal': 'category_goal_avg'}, axis=1)

In [35]:
df = df.merge(cat_avgs, left_on='category', right_index = True)

In [36]:
df['cat_goal_pct'] = df['goal'] / df['category_goal_avg']

In [37]:
cat_avgs = df.groupby('main_category')[['goal']].mean().rename({'goal': 'main_category_goal_avg'}, axis=1)

In [38]:
df = df.merge(cat_avgs, left_on='main_category', right_index = True)

In [40]:
df['main_cat_goal_pct'] = df['goal'] / df['main_category_goal_avg']

In [42]:
df[['goal', 'main_category_goal_avg', 'main_cat_goal_pct']].head()

Unnamed: 0,goal,main_category_goal_avg,main_cat_goal_pct
0,1533.95,22590.745149,0.067902
232,6060.97,22590.745149,0.268294
318,2000.0,22590.745149,0.088532
414,10000.0,22590.745149,0.442659
517,757.52,22590.745149,0.033532


In [43]:
X = df.drop(['deadline', 'launched', 'state'], axis=1)
y = df['state']

In [44]:
scores = get_val_scores(pipe, X, y, random_state=1985, stratify=True, use_kfold=False)

In [45]:
scores

{'validation_score': 0.7310411148414961}

In [54]:
weekly_totals = df.groupby(['launched_year','launched_weekofyear'])[['ID']].count().reset_index().rename({'ID': 'Daily_Campaigns'}, axis=1)

In [55]:
weekly_totals.head()

Unnamed: 0,launched_year,launched_weekofyear,Daily_Campaigns
0,1970,1,6
1,2009,17,4
2,2009,18,22
3,2009,19,17
4,2009,20,23


In [56]:
df.columns

Index(['ID', 'name', 'category', 'main_category', 'currency', 'deadline',
       'launched', 'state', 'country', 'goal', 'deadline_dayofweek',
       'deadline_dayofyear', 'deadline_days_in_month', 'deadline_is_leap_year',
       'deadline_is_month_end', 'deadline_is_month_start',
       'deadline_is_quarter_end', 'deadline_is_quarter_start',
       'deadline_is_year_end', 'deadline_is_year_start', 'deadline_quarter',
       'deadline_week', 'deadline_weekofyear', 'deadline_day', 'deadline_hour',
       'deadline_minute', 'deadline_month', 'deadline_year',
       'launched_dayofweek', 'launched_dayofyear', 'launched_days_in_month',
       'launched_is_leap_year', 'launched_is_month_end',
       'launched_is_month_start', 'launched_is_quarter_end',
       'launched_is_quarter_start', 'launched_is_year_end',
       'launched_is_year_start', 'launched_quarter', 'launched_week',
       'launched_weekofyear', 'launched_day', 'launched_hour',
       'launched_minute', 'launched_month', 'laun

In [58]:
df.merge(weekly_totals, on=['launched_year', 'launched_year'])

Unnamed: 0,ID,name,category,main_category,currency,deadline,launched,state,country,goal,...,launched_year,deadline_history_days,launched_history_days,duration,category_goal_avg,cat_goal_pct,main_category_goal_avg,main_cat_goal_pct,launched_weekofyear_y,Daily_Campaigns
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,2015-08-11 12:12:28,failed,GB,1533.95,...,2015,2350,16658,58,5213.996468,0.294199,22590.745149,0.067902,1,387
1,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,2015-08-11 12:12:28,failed,GB,1533.95,...,2015,2350,16658,58,5213.996468,0.294199,22590.745149,0.067902,2,1123
2,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,2015-08-11 12:12:28,failed,GB,1533.95,...,2015,2350,16658,58,5213.996468,0.294199,22590.745149,0.067902,3,1538
3,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,2015-08-11 12:12:28,failed,GB,1533.95,...,2015,2350,16658,58,5213.996468,0.294199,22590.745149,0.067902,4,1687
4,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,2015-08-11 12:12:28,failed,GB,1533.95,...,2015,2350,16658,58,5213.996468,0.294199,22590.745149,0.067902,5,1825
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19384373,462917959,Identity Communications Infographic (Canceled),Design,Design,USD,2010-04-10,1970-01-01 01:00:00,failed,US,500.00,...,1970,342,0,14708,40774.916305,0.012262,42199.323873,0.011849,1,6
19384374,1480763647,"Support Solo Theater! Help ""Ungrateful Daughte...",Theater,Theater,USD,2010-06-01,1970-01-01 01:00:00,failed,US,4000.00,...,1970,394,0,14760,12753.348309,0.313643,27147.451041,0.147343,1,6
19384375,1245461087,1st Super-Size Painting - Social Network Owned...,Art,Art,USD,2010-08-14,1970-01-01 01:00:00,failed,US,15000.00,...,1970,468,0,14834,47852.242038,0.313465,39467.623304,0.380058,1,6
19384376,1078942938,Resilience: 5-6 Player Expansion Board Game,Tabletop Games,Games,CAD,2018-02-01,2018-01-02 03:05:10,failed,CA,3993.93,...,2018,3196,17533,29,14743.957398,0.270886,45148.243871,0.088463,1,2
