In [77]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import zero_one_loss

In [46]:
# Load the project table from the CSV, using its first column as the row index.
df = pd.read_csv('data.csv', index_col=0)
# Preview the first rows to check how the columns were parsed.
df.head()

Unnamed: 0_level_0,Name,Url,Goal,Pledged,Backers,Tiers,TierMin,TierMax,Status
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,MemoBox: actually organize all your messaging ...,http://www.kickstarter.com/projects/memobox/me...,700,132,6,2,10,100,failed
1,Pockit: The Portable Work Station,http://www.kickstarter.com/projects/1827963875...,50000,2,1,3,2,10,failed
2,Form-N-Fit NIOSH Approved N95,http://www.kickstarter.com/projects/n95-thermo...,500000,1654,22,8,62,5000,failed
3,DoggZam!,http://www.kickstarter.com/projects/doggzam/do...,2000,11,2,1,10,10,failed
4,Sustainable energy when there is wind and no sun,http://www.kickstarter.com/projects/ffabw/sust...,17000,722,11,1,20,20,failed


In [47]:
# Inspect the dtype and non-null count of every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 600 entries, 0 to 599
Data columns (total 9 columns):
Name       600 non-null object
Url        600 non-null object
Goal       600 non-null object
Pledged    600 non-null object
Backers    600 non-null int64
Tiers      600 non-null int64
TierMin    600 non-null int64
TierMax    600 non-null int64
Status     600 non-null object
dtypes: int64(4), object(5)
memory usage: 46.9+ KB


In [48]:
# Number of rows per value of the Status column.
df['Status'].value_counts()

failed     399
success    201
Name: Status, dtype: int64

In [49]:
# Goal and Pledged are loaded with object dtype. Convert both to numbers,
# turning any value that cannot be parsed into NaN so it can be counted.
for numeric_col in ('Goal', 'Pledged'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')
# Missing values per column after the conversion.
df.isna().sum()

Name        0
Url         0
Goal       64
Pledged    64
Backers     0
Tiers       0
TierMin     0
TierMax     0
Status      0
dtype: int64

In [50]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

In [51]:
# Discard every row that contains at least one missing value, then re-check
# the column summary to confirm nothing null is left.
df = df.dropna(axis=0, how='any')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 536 entries, 0 to 599
Data columns (total 9 columns):
Name       536 non-null object
Url        536 non-null object
Goal       536 non-null float64
Pledged    536 non-null float64
Backers    536 non-null int64
Tiers      536 non-null int64
TierMin    536 non-null int64
TierMax    536 non-null int64
Status     536 non-null object
dtypes: float64(2), int64(4), object(3)
memory usage: 41.9+ KB


In [52]:
# Features: every column except the label (Status) and Pledged.
# NOTE(review): Pledged is presumably excluded because it reveals the outcome
# directly -- confirm with the author.
X = df.drop(columns=['Status', 'Pledged'])
# Label: the Status column.
y = df['Status']
print(X.shape, y.shape, sep='\n')

(536, 7)
(536,)


In [67]:
# Hold out 30% of the rows for the final evaluation; the fixed seed keeps the
# split reproducible.
# NOTE(review): Status is imbalanced (roughly 2:1 failed/success), so
# stratify=y may be worth considering -- it would change the reported scores.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=29031996,
)
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, labels.shape)

(375, 7) (375,)
(161, 7) (161,)


In [68]:
class ColumnDropper(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes a configurable set of DataFrame columns.

    Parameters
    ----------
    num_top_titles : int, default=1
        Not used by this transformer; kept so existing constructor calls
        continue to work.
    columns : iterable of column labels, default=('Name', 'Url')
        Labels of the columns removed by ``transform``. A single string is
        treated as one column label.
    """

    def __init__(self, num_top_titles=1, columns=('Name', 'Url')):
        # Constructor arguments are stored unmodified under their own names,
        # which is what BaseEstimator's get_params/set_params rely on.
        self.num_top_titles = num_top_titles
        self.columns = columns

    def fit(self, X_df, y=None):
        """Learn nothing and return ``self`` so the step can sit in a pipeline."""
        return self

    def transform(self, X_df, y=None):
        """Return ``X_df`` without the configured columns.

        Raises
        ------
        KeyError
            If any configured column is absent from ``X_df`` (pandas default).
        """
        # Guard against a bare string, which list() would split into characters.
        if isinstance(self.columns, str):
            to_drop = [self.columns]
        else:
            to_drop = list(self.columns)
        return X_df.drop(to_drop, axis=1)
    
# Sanity check: shape of the training features after the dropper has run.
ColumnDropper().fit_transform(X_train).shape

(375, 5)

In [69]:
# Chain column removal, min-max scaling and the neural network into a single
# estimator so all three steps are fitted together on whatever data is passed
# to fit(). Step names match the ones make_pipeline would generate.
full_pipeline = Pipeline(steps=[
    ('columndropper', ColumnDropper()),
    ('minmaxscaler', MinMaxScaler()),
    ('mlpclassifier', MLPClassifier(
        hidden_layer_sizes=5,  # a scalar is treated as one hidden layer of 5 units
        activation='tanh',
        solver='lbfgs',
        random_state=0,
        max_iter=2500,
    )),
])

In [70]:
# 5-fold cross-validation of the whole pipeline, using the training split only.
res = cross_val_score(estimator=full_pipeline, X=X_train, y=y_train, cv=5)

In [72]:
# Average of the per-fold cross-validation scores.
res.mean()

0.9013333333333333

In [74]:
# Fit the pipeline on the complete training split.
full_pipeline.fit(X_train, y_train)

Pipeline(steps=[('columndropper', ColumnDropper()),
                ('minmaxscaler', MinMaxScaler()),
                ('mlpclassifier',
                 MLPClassifier(activation='tanh', hidden_layer_sizes=5,
                               max_iter=2500, random_state=0,
                               solver='lbfgs'))])

In [79]:
# Predict labels for the held-out test split.
prediction = full_pipeline.predict(X_test)
# zero_one_loss takes (y_true, y_pred) in that order and, with its default
# normalisation, returns the fraction of misclassified samples.
error = zero_one_loss(y_test, prediction)
# Accuracy is the complement of the misclassification rate.
score = 1 - error
score

0.9316770186335404