In [1]:
# data avaialble on kaggle:
# https://www.kaggle.com/kemical/kickstarter-projects

# following a course on feature engineering by Mat Leonard on kaggle:
# https://www.kaggle.com/learn/feature-engineering


In [2]:
# import modules 
import pandas as pd
import numpy as np

In [3]:
# load part of the dataset
ks_data = pd.read_csv('../data/raw/ks-projects-201801.csv', parse_dates=['deadline','launched'])

In [4]:
# inspect the data 
ks_data.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


In [5]:
# check datatypes and missing values 
ks_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378661 entries, 0 to 378660
Data columns (total 15 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   ID                378661 non-null  int64         
 1   name              378657 non-null  object        
 2   category          378661 non-null  object        
 3   main_category     378661 non-null  object        
 4   currency          378661 non-null  object        
 5   deadline          378661 non-null  datetime64[ns]
 6   goal              378661 non-null  float64       
 7   launched          378661 non-null  datetime64[ns]
 8   pledged           378661 non-null  float64       
 9   state             378661 non-null  object        
 10  backers           378661 non-null  int64         
 11  country           378661 non-null  object        
 12  usd pledged       374864 non-null  float64       
 13  usd_pledged_real  378661 non-null  float64       
 14  usd_

In [6]:
# create a copy of the dataset so that we don't need to reload it every time
ks = ks_data.copy()

## I. Baseline Model

Having a baseline model is important to assess whether the features that we're creating or modifying in our data add value and improve the accuracy of our model over its baseline. 

#### Preparing target column

In [7]:
# prepare the classification variable (target column)
ks.state.value_counts()

failed        197719
successful    133956
canceled       38779
undefined       3562
live            2799
suspended       1846
Name: state, dtype: int64

In [8]:
# for the model we'll consider 1 -> successful and 0 -> otherwise
# ks['is_successful'] = (ks.state == 'successful').map({False:0, True:1})
# ks['is_successful'] = (ks['state'] == 'successful').astype(int)

# alternative approach using .assign()
ks = ks.assign(is_successful = (ks['state'] == 'successful').astype(int))

In [9]:
# drop live projects
ks = ks[~(ks['state'] == 'live')]

# alternative approach with .query()
# ks = ks.query('state != "live"')

#### Converting timestamps 

In [10]:
# creating new features from the 'launched' timestamp
ks = ks.assign(year = ks['launched'].dt.year,
              month = ks['launched'].dt.month,
              day = ks['launched'].dt.day,
              hour = ks['launched'].dt.hour)

# show top 5
ks.loc[:5, ['launched','year','month','day','hour']]

Unnamed: 0,launched,year,month,day,hour
0,2015-08-11 12:12:28,2015,8,11,12
1,2017-09-02 04:43:57,2017,9,2,4
2,2013-01-12 00:20:50,2013,1,12,0
3,2012-03-17 03:24:11,2012,3,17,3
4,2015-07-04 08:35:03,2015,7,4,8
5,2016-02-26 13:38:27,2016,2,26,13


#### Processing categorical variables

In [11]:
# specify which columns are categorical
categorical_cols = ['main_category', 'category', 'currency', 'country']

# import encoder
from sklearn.preprocessing import LabelEncoder

# init encoder 
le = LabelEncoder()

In [12]:
# encode (approach a) # does not work 
# encoded = le.fit_transform(ks[categorical_cols])

In [13]:
# encode (approach b) 
# using .apply())
encoded = ks[categorical_cols].apply(le.fit_transform)

# inspect
encoded.head()

Unnamed: 0,main_category,category,currency,country
0,12,108,5,9
1,6,93,13,22
2,6,93,13,22
3,10,90,13,22
4,6,55,13,22


In [14]:
# # encode (approach c)
# # using a for loop
# for col in categorical_cols:
#     ks[col + '_le'] = le.fit_transform(ks[col])
    
# # inspect
# ks.head()

#### Creating train and test sets

In [15]:
# select all features to be used to train the model
data = ks[['goal','year','month','day','hour','is_successful']].join(encoded)
data.head()

Unnamed: 0,goal,year,month,day,hour,is_successful,main_category,category,currency,country
0,1000.0,2015,8,11,12,0,12,108,5,9
1,30000.0,2017,9,2,4,0,6,93,13,22
2,45000.0,2013,1,12,0,0,6,93,13,22
3,5000.0,2012,3,17,3,0,10,90,13,22
4,19500.0,2015,7,4,8,0,6,55,13,22


In [16]:
# import train test split 
from sklearn.model_selection import train_test_split 

# create X and y 
X = data.drop('is_successful', axis=1)
y = data['is_successful']

# split the datasets setting 20% aside for test 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# double check that the target variable is equally represented in train and test datasets
print('% successful in train: {:.2%}'.format(y_train.mean()))
print('% successful in test: {:.2%}'.format(y_test.mean()))

% successful in train: 35.66%
% successful in test: 35.56%


Predictor variable is slightly imbalanced - not between train and test but it's that the % of projects that are successful is less than 50% of all projects. The most common ways to address class imbalance include:
* Up-sample minority class
* Down-sample majority class
* Change the performance metric
* Use a penalized-SVM algorithm
* Use a tree-based algorithm

For this project we'll use a tree-based algorithm and our choice of performance metric is going to be `ROC AUC`.
**Source**: https://elitedatascience.com/imbalanced-classes

#### Training a RandomForest Classifier (with default settings)

In [18]:
# import classifier
from sklearn.ensemble import RandomForestClassifier

# init 
rfc = RandomForestClassifier(random_state=34)

# fit
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=34, verbose=0,
                       warm_start=False)

#### Making predictions and scoring the model

In [19]:
# predictions
y_pred = rfc.predict(X_test)
y_pred_train = rfc.predict(X_train)

In [20]:
# import scoring metrics
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

# accuracy scores
print('accuracy (train): {:.2%}'.format(accuracy_score(y_train, y_pred_train)))
print('accuracy (test): {:.2%}'.format(accuracy_score(y_test, y_pred)))

# roc auc scores 
print('roc auc (train): {:.2%}'.format(roc_auc_score(y_train, y_pred_train)))
print('roc auc (test): {:.2%}'.format(roc_auc_score(y_test, y_pred)))

accuracy (train): 99.89%
accuracy (test): 68.64%
roc auc (train): 99.87%
roc auc (test): 62.76%


The default model is **overfitting** way too much but I'm not sure how to handle this other than manually changing the `max_depth` or `min_samples_leaf` parameters. `GridSearchCV` is not helpful here because it optimises parameters based on the accuracy score of the traning dataset which in our case is already very high at `>99%`. In other words, `GridSearchCV` would try to overfit the model even more. 

Having tested a couple of parameters, using `min_samples_leaf=10` increases the accuracy and roc auc of test data to `70%` and `63%`, respectively. All performance metrics for the tranining set are `<78%`.


#### Retrain the model to address the overfitting issue + predict + score

In [21]:
# init with a limit on minimum samples per leaf (chosen randomly)
rfc = RandomForestClassifier(random_state=34, min_samples_leaf=10)

# fit 
rfc.fit(X_train, y_train)

# predict
y_pred = rfc.predict(X_test)
y_pred_train = rfc.predict(X_train)

# score (roc auc)
print('roc auc (train): {:.2%}'.format(roc_auc_score(y_train, y_pred_train)))
print('roc auc (test): {:.2%}'.format(roc_auc_score(y_test, y_pred)))

# save baseline ROC AUC 
roc_auc_baseline = roc_auc_score(y_test, y_pred)

roc auc (train): 71.78%
roc auc (test): 63.61%


In [22]:
# # create confusion matrix 
# cm = confusion_matrix(y_test, y_pred)

# # turn it into a DataFrame to make plotting easier 
# cm_df = pd.DataFrame(cm, columns=np.unique(y_test), index = np.unique(y_test))
# cm_df.index.name = 'Actual'
# cm_df.columns.name = 'Predicted'

# # # this will be used as the max value for the heatmap legend 
# # calculated_vmax = (int(cm_df.max().max()/100)+1)*100

# # import modules 
# import seaborn as sns 
# import matplotlib.pyplot as plt

# # set figure size
# plt.figure(figsize=(4,4))

# # add title
# plt.title('Confusion Matrix (Baseline Model)')

# # plot heatmap
# sns.heatmap(
#     cm_df, 
#     cmap=sns.light_palette((250, 80, 60), input='husl', n_colors=4),
#     vmin=5000,
#     vmax=45000,
#     annot=True, fmt=',d')

## 2. Categorical Encodings

`LabelEncoder` is just one of several ways categorical variables can be encoded. Other encoders include (not necessarily from `sklearn`):

* **get_dummies** `pd.get_dummies()`: creates n new columns where n is the number of unique catagories per feature. can create very sparse matrices which could lead to increased computational time
* **CountEncoder** `category_encoders.CountEncoder()`: encodes categorical features based on the number of times that they appear in the dataset. useful to separate rare categories (which will be given a similar low encoded value) vs commmon features (with a similar high count value). 
* **TargetEncoder** `category_encoders.TargetEncoder()`: replaces a categorical value with the average value of the target for that value of the feature. 

In [23]:
# first create some helper functions 
# split the dataset into test and train
def create_train_test(df):
    
    # create X and y 
    X = df.drop('is_successful', axis=1)
    y = df['is_successful']

    # split the datasets setting 20% aside for test 
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    return X, y, X_train, X_test, y_train, y_test

# fit the model, predict and score 
def score_model(X_train, X_test, y_train, y_test):
    rfc = RandomForestClassifier(random_state=34, min_samples_leaf=10)

    # fit 
    print('Fitting model...')
    rfc.fit(X_train, y_train)

    # predict
    print('Predicting...')
    y_pred = rfc.predict(X_test)
    y_pred_train = rfc.predict(X_train)

    # score (roc auc)
    print('Done.')
    return roc_auc_score(y_test, y_pred)

#### Count Encoder

In [24]:
# starting with the these variables 
categorical_cols = ['main_category', 'category', 'currency', 'country']

In [25]:
# import module 
from category_encoders import CountEncoder

# init encoder 
ce = CountEncoder(cols=categorical_cols)

In [26]:
# encode (approach a)
encoded = ce.fit_transform(ks[categorical_cols])
encoded.head()

Unnamed: 0,main_category,category,currency,country
0,39575,1362,33853,33393
1,63253,5174,293624,290887
2,63253,5174,293624,290887
3,51637,15647,293624,290887
4,63253,10054,293624,290887


In [27]:
# encode (approach b) # does not work 
# encoded = ks[categorical_cols].apply(ce.fit_transform) 

In [28]:
# encode (approach c) # does not work 
# for col in categorical_cols:
#     ks[col + '_count'] = ce.fit_transform(ks[col])

In [29]:
# combine data with the newly encoded features 
data = ks[['year','month','day','hour','is_successful']].join(encoded)
data.head()

Unnamed: 0,year,month,day,hour,is_successful,main_category,category,currency,country
0,2015,8,11,12,0,39575,1362,33853,33393
1,2017,9,2,4,0,63253,5174,293624,290887
2,2013,1,12,0,0,63253,5174,293624,290887
3,2012,3,17,3,0,51637,15647,293624,290887
4,2015,7,4,8,0,63253,10054,293624,290887


In [30]:
# split the dataset 
X, y, X_train, X_test, y_train, y_test = create_train_test(data)

# fit, predict and calculate new score
roc_auc_new = score_model(X_train, X_test, y_train, y_test)

Fitting model...
Predicting...
Done.


In [31]:
diff = roc_auc_new - roc_auc_baseline
print('New model compared to baseline (ROC AUC): {:.2%}'.format(diff))

New model compared to baseline (ROC AUC): -3.59%


Okay.. this didn't go as expected. In the tutorial, `CountEncoder` had very little effect but there was slight improvement in the performance metric. In my case, I'm seeing a significant decrease in model's performance. Why?

#### Target Encoder

In [32]:
# import module
from category_encoders import TargetEncoder

# init
te = TargetEncoder(cols=categorical_cols)

In [33]:
# target encoding is fitted on just the training dataset 
# to prevent target leakage into the testing dataset 

# create a fresh copy of our data 
data = ks[['year','month','day','hour','is_successful']].join(ks[categorical_cols])

# split data
X, y, X_train, X_test, y_train, y_test = create_train_test(data)

# fit on train
te.fit(X_train[categorical_cols], y=y_train)

TargetEncoder(cols=['main_category', 'category', 'currency', 'country'],
              drop_invariant=False, handle_missing='value',
              handle_unknown='value', min_samples_leaf=1, return_df=True,
              smoothing=1.0, verbose=0)

In [34]:
# tranform categorical features of both train and test
X_train = X_train.join(te.transform(X_train[categorical_cols]).add_suffix('_target'))
X_test = X_test.join(te.transform(X_test[categorical_cols]).add_suffix('_target'))

# inspect
X_train.head()

Unnamed: 0,year,month,day,hour,main_category,category,currency,country,main_category_target,category_target,currency_target,country_target
70176,2015,5,26,23,Film & Video,Film & Video,USD,US,0.374333,0.310345,0.372498,0.375674
250375,2012,9,14,14,Games,Tabletop Games,USD,US,0.357379,0.558198,0.372498,0.375674
166656,2013,1,31,2,Crafts,Crafts,USD,US,0.242268,0.250543,0.372498,0.375674
355336,2016,11,2,0,Crafts,Crafts,USD,US,0.242268,0.250543,0.372498,0.375674
199269,2012,12,6,0,Music,Indie Rock,USD,US,0.469162,0.640872,0.372498,0.375674


In [35]:
# drop the original categorical features 
for df in [X_train, X_test]:
    df.drop(['main_category','category','currency','country'], axis=1, inplace=True)
    
# inspect
X_train.head()

Unnamed: 0,year,month,day,hour,main_category_target,category_target,currency_target,country_target
70176,2015,5,26,23,0.374333,0.310345,0.372498,0.375674
250375,2012,9,14,14,0.357379,0.558198,0.372498,0.375674
166656,2013,1,31,2,0.242268,0.250543,0.372498,0.375674
355336,2016,11,2,0,0.242268,0.250543,0.372498,0.375674
199269,2012,12,6,0,0.469162,0.640872,0.372498,0.375674


In [36]:
# fit, predict and calculate new score
roc_auc_new = score_model(X_train, X_test, y_train, y_test)

Fitting model...
Predicting...
Done.


In [37]:
diff = roc_auc_new - roc_auc_baseline
print('New model compared to baseline (ROC AUC): {:.2%}'.format(diff))

New model compared to baseline (ROC AUC): -2.68%


Again, worse performance compared to the baseline model. Again, not sure why. Since, for me the best encoder remains `LabelEncoder()` I'm sticking with it. In the tutorial, they appeneded the newly encoded categorical features using the best performing encoder (`CatBoost()`) to their full dataset.

In [38]:
# encoded = te.transform(ks[categorical_cols])
# for col in encoded:
#     ks.insert(len(ks.columns), col + '_target', encoded[col])

# ks.head()

## 3. Feature Generation

In [39]:
# inspect dataset
ks.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,is_successful,year,month,day,hour
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95,0,2015,8,11,12
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0,0,2017,9,2,4
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0,0,2013,1,12,0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0,0,2012,3,17,3
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0,0,2015,7,4,8


In [40]:
# create a clean DataFrame 'data'
data = ks[['year','month','day','hour','is_successful']]

#### Interactions

The easiest way to create new features is to combine existing categorical features into new ones.


In [41]:
# select categorical features
categorical_cols = ['main_category', 'category', 'currency', 'country']
interactions = pd.DataFrame(index=ks.index)

# import module to help us create combinations 
import itertools 

# create nww column names
# names = ['_'.join([col1, col2]) for col1, col2 in itertools.combinations(categorical_cols, 2)]    

for col1, col2 in itertools.combinations(categorical_cols, 2):
    # create a new label for each new interaction feature 
    col_name = '__'.join([col1, col2]) # 'main_category__country'
    
    # combine values 
    # values = ks[col1] + '_' + ks[col2] # works too
    values = ks[col1].map(str) + '_' + ks[col2].map(str)
    
    # save 
    interactions[col_name] = values
    
# inspect
interactions.head()

Unnamed: 0,main_category__category,main_category__currency,main_category__country,category__currency,category__country,currency__country
0,Publishing_Poetry,Publishing_GBP,Publishing_GB,Poetry_GBP,Poetry_GB,GBP_GB
1,Film & Video_Narrative Film,Film & Video_USD,Film & Video_US,Narrative Film_USD,Narrative Film_US,USD_US
2,Film & Video_Narrative Film,Film & Video_USD,Film & Video_US,Narrative Film_USD,Narrative Film_US,USD_US
3,Music_Music,Music_USD,Music_US,Music_USD,Music_US,USD_US
4,Film & Video_Film & Video,Film & Video_USD,Film & Video_US,Film & Video_USD,Film & Video_US,USD_US


In [42]:
# init encoder
le = LabelEncoder()

# encode 
encoded = interactions.apply(le.fit_transform)

# inspect
encoded.head()

Unnamed: 0,main_category__category,main_category__currency,main_category__country,category__currency,category__country,currency__country
0,140,168,275,1215,1900,18
1,68,94,152,1047,1630,31
2,68,94,152,1047,1630,31
3,115,148,242,1024,1595,31
4,64,94,152,630,979,31


#### Number of projects launched in the last week

In [43]:
# create a new Series with 'launched' as index , order it
launched = pd.Series(ks.index, index=ks.launched, name='n_projects_7_days').sort_index()
launched[:15]

launched
1970-01-01 01:00:00     94579
1970-01-01 01:00:00    319002
1970-01-01 01:00:00    247913
1970-01-01 01:00:00     48147
1970-01-01 01:00:00     75397
1970-01-01 01:00:00      2842
1970-01-01 01:00:00    273779
2009-04-21 21:02:48    169268
2009-04-23 00:07:53    322000
2009-04-24 21:52:03    138572
2009-04-25 17:36:21    325391
2009-04-27 14:10:39    122662
2009-04-28 13:55:41    213711
2009-04-29 02:04:21    345606
2009-04-29 02:58:50    235255
Name: n_projects_7_days, dtype: int64

In [44]:
# create a rolling 7-day count of projects, excluding the current project
rolling_7d = launched.rolling('7D').count() -1
rolling_7d[:15]

launched
1970-01-01 01:00:00    0.0
1970-01-01 01:00:00    1.0
1970-01-01 01:00:00    2.0
1970-01-01 01:00:00    3.0
1970-01-01 01:00:00    4.0
1970-01-01 01:00:00    5.0
1970-01-01 01:00:00    6.0
2009-04-21 21:02:48    0.0
2009-04-23 00:07:53    1.0
2009-04-24 21:52:03    2.0
2009-04-25 17:36:21    3.0
2009-04-27 14:10:39    4.0
2009-04-28 13:55:41    5.0
2009-04-29 02:04:21    5.0
2009-04-29 02:58:50    6.0
Name: n_projects_7_days, dtype: float64

In [45]:
# now, in order for me to be able to join onto the main dataset
# I need my 'rolling_7d' Series to have the same index as my 'ks' dataset
rolling_7d.index = launched.values
rolling_7d[:5]

94579     0.0
319002    1.0
247913    2.0
48147     3.0
75397     4.0
Name: n_projects_7_days, dtype: float64

In [46]:
# now, optionally, I can reindex to reorder it, but I could join this already if I wnated to 
rolling_7d = rolling_7d.reindex(ks.index)
# rolling_7d = pd.DataFrame(rolling_7d)

# inspect the reindexed Series 'rolling_7d'
rolling_7d[:5]

0    1409.0
1     957.0
2     739.0
3     907.0
4    1429.0
Name: n_projects_7_days, dtype: float64

#### Time since the last project in the same category 

In [47]:
# order the DataFrame by 'launched' from oldest to newest
df = ks[['category','launched']].sort_values('launched')

# limit to 10k to speed up calculations
# df = df.iloc[:1000, :]

# create a groupby object
grp = df.groupby('category')

# define a function to apply to our group object 
def calculate_td(series):
    """Calculate the difference in hours between the launch date
    of the current project versus the previous project in the same
    category.
    
    Series must be ordered by launch date."""
    return series.diff().dt.total_seconds() / 3600

timedeltas = grp.transform(calculate_td)

In [48]:
# inspect 
timedeltas[:15]

Unnamed: 0,launched
94579,
319002,
247913,
48147,
75397,
2842,0.0
273779,
169268,
322000,
138572,


Where we have `NaN` the project was the first project in its category. We need to fill these in. In the tutorial, they used `median` but I think that we can use `0` instead. 

In [49]:
# fill in missing values and reindex 
timedeltas = timedeltas.fillna(0).reindex(ks.index)

# change the column name  
timedeltas.columns = ['hrs_since_last_proj_in_cat']
timedeltas[:5]

Unnamed: 0,hrs_since_last_proj_in_cat
0,18.606111
1,5.592778
2,1.313611
3,0.635
4,16.661389


#### Number of projects from the same category launched in the last 30 days 

In [50]:
# select data, sort it by 'launched'
# needed to add another column 'is_successful' in order for the rolling().count() to work (?)
df = ks[['category','launched','is_successful']].set_index('launched').sort_index()

# limit to 10k while testing 
# df = df[:10000]

# # create a group by object
grp = df.groupby('category')

# test the rolling().count()
# grp.rolling('7D')['is_successful'].count()

# create a function to calculate the rolling count
def last_30_days(series):
    return series.rolling('30D').count() -1

projects_last_30 = grp.transform(last_30_days)
projects_last_30[:10]

Unnamed: 0_level_0,is_successful
launched,Unnamed: 1_level_1
1970-01-01 01:00:00,0.0
1970-01-01 01:00:00,0.0
1970-01-01 01:00:00,0.0
1970-01-01 01:00:00,0.0
1970-01-01 01:00:00,0.0
1970-01-01 01:00:00,1.0
1970-01-01 01:00:00,0.0
2009-04-21 21:02:48,0.0
2009-04-23 00:07:53,0.0
2009-04-24 21:52:03,0.0


In [51]:
# now, the index is messed up, so we need to reindex 

# approach a
# sort ks by 'launched' and get its index 
# this should correspond to the index in 'projects_last_30' because it had its index 'launched' sorted
ks_idx = ks.sort_values('launched').index

# start with a fresh variable 
# otherwise rerunning this cell will reorder the index randomly over and over again
projects_last_30 = grp.transform(last_30_days)

# replace idx
projects_last_30.index = ks_idx

# reindex, rename
projects_last_30 = projects_last_30.reindex(ks.index)
projects_last_30.columns = ['n_proj_in_cat_30_days']
projects_last_30.head()

Unnamed: 0,n_proj_in_cat_30_days
0,10.0
1,14.0
2,46.0
3,236.0
4,102.0


In [52]:
# # approach b
# # reuse approach used earlier
# launched = pd.Series(ks.index, index=ks.launched, name='n_projects_7_days').sort_index()

# start with a fresh variable 
# projects_last_30 = grp.transform(last_30_days)

# # replace index
# projects_last_30.index = launched.values

# # reindex, rename
# projects_last_30 = projects_last_30.reindex(ks.index)
# projects_last_30.columns = ['n_proj_in_cat_30_days']
# projects_last_30.head()

#### Transforming numerical features

In [53]:
# import modules 
# import matplotlib.pyplot as plt 

# plot goal as it is 
# plt.hist(ks.goal)