## Carregamento e Limpeza dos Dados

In [4]:
import pandas as pd
from datetime import datetime as dt

In [5]:
df = pd.read_csv('ks-projects-201801.csv')

def to_time(str, mask):
    return dt.strptime(str, mask)

def col_dict(dataframe, col):
    unique_values = dataframe[col].unique()
    dc = {}

    for idx,val in enumerate(unique_values):
        dc[val] = idx

    return dc

def to_numeric(dataframe, col):
    """ Transforms the values of column col to a numeric mapping.
        Returns the transformed dataframe and the dictionary with the mapping."""
    df = dataframe
    dc = col_dict(df, col)
    df[col] = df[col].apply(lambda x: dc[x])

    return df, dc


"""
State feature transformation:

1 - Removing projects with state equals to 'undefined' or 'live'
2 - Change the values to make a binary classification:

successful: 1
failed: 0
canceled: 0
suspended': 0
"""

df.drop(df[(df.state == 'live') | (df.state == 'undefined')].index, inplace=True)
df['state'] = (df['state'] == 'successful').astype(int)


"""Cleans and add columns from columns already in the data"""
# Convert string to datetime and get the 
# difference in days from beginning to end of the campaign

df['running_days'] = (
    df['deadline'].apply(to_time, args=('%Y-%m-%d',)) 
    - df['launched'].apply(to_time, args=('%Y-%m-%d %H:%M:%S',))
).apply(lambda x: x.days)
  
df, cat_dict = to_numeric(df, 'category')
df, main_cat_dict = to_numeric(df, 'main_category')
df, country_dict = to_numeric(df, 'country')

# Removing unused features
df.drop('ID', axis=1, inplace=True)
df.drop('name', axis=1, inplace=True)
df.drop('deadline', axis=1, inplace=True)
df.drop('launched', axis=1, inplace=True)
df.drop('pledged', axis=1, inplace=True)
df.drop('usd_pledged', axis=1, inplace=True)
df.drop('goal', axis=1, inplace=True)
df.drop('currency', axis=1, inplace=True)
df.drop('backers', axis=1, inplace=True)
df.drop('usd_pledged_real', axis=1, inplace=True)

## Criando conjuntos de treino e teste

In [6]:
from sklearn.model_selection import train_test_split
import numpy as np

In [7]:
y_column = 'state'
cols_labels = df.columns.values
sidx = np.argsort(cols_labels)
y_index_column = sidx[np.searchsorted(cols_labels,y_column,sorter=sidx)]

values = df.values
values_shape = values.shape

y = values[:,y_index_column]
X = np.concatenate((values[:, 0:y_index_column], values[:,y_index_column+1:values_shape[1]]), axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.3, shuffle=True)

df = pd.DataFrame(X_train)
df.to_csv("X_train.csv")

df = pd.DataFrame(X_test)
df.to_csv("X_test.csv")

df = pd.DataFrame(y_train)
df.to_csv("y_train.csv")

df = pd.DataFrame(y_test)
df.to_csv("y_train.csv")


## Normalização dos dados (média e feature scaling)

In [8]:
#import numpy as np
from sklearn.preprocessing import StandardScaler

In [10]:
# y_column = 'state'
# columns_to_normalize = ['running_days', 'backers', 'usd_pledged_real', 'usd_goal_real']
# cols_labels = df.columns.values
# sidx = np.argsort(cols_labels)
# index_columns = sidx[np.searchsorted(cols_labels,columns_to_normalize,sorter=sidx)]
# y_index_columns = sidx[np.searchsorted(cols_labels,y_column,sorter=sidx)]

# values = df.values
# values_shape = values.shape
    
# for j in index_columns:
#     max_value = np.max(values[:,j])
#     min_value = np.min(values[:,j])
#     mean = np.sum(values[:,j]) / values_shape[0]

#     i = 0
#     while i < values_shape[0]:
#         values[i, j] = (values[i, j] - mean) / (max_value - min_value)
#         i = i + 1

# y = values[:,y_index_columns]
# X = np.concatenate((values[:, 0:y_index_columns], values[:,y_index_columns+1:values_shape[1]]), axis=1)

scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Multilayer Perceptron

In [11]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn import neural_network

In [12]:
"""
The number of hidden neurons should be between the size of the input layer and the size of the output layer.
The number of hidden neurons should be 2/3 the size of the input layer, plus the size of the output layer.
The number of hidden neurons should be less than twice the size of the input layer.
"""



#for na in neural_arch:
#    classifiers.append(MLPClassifier(hidden_layer_sizes=na, max_iter=500, alpha=0.001, solver='adam', verbose=True, tol=0.000000001))

#for c in classifiers:
#    c.fit(X_train, y_train)
#    y_pred = c.predict(X_test)
#    reports.append(classification_report(y_test, y_pred))

#for r in reports:
#    print(r)

# c = MLPClassifier(hidden_layer_sizes=(5, 5))
layers = [(1), (2), (3), (4), (5), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)]
parameters = {'solver': ['adam'], 'max_iter': [500,1000], 'alpha': 10.0 ** -np.arange(1, 7), 'hidden_layer_sizes': layers}
clf = GridSearchCV(neural_network.MLPClassifier(), parameters, n_jobs=-1)
clf.fit(X_train, y_train)


print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
print()
print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
y_true, y_pred = y_test, clf.predict(X_test)
print(classification_report(y_true, y_pred))



Best parameters set found on development set:

{'alpha': 0.01, 'hidden_layer_sizes': (5, 5), 'max_iter': 500, 'solver': 'adam'}

Grid scores on development set:

0.642 (+/-0.001) for {'alpha': 0.1, 'hidden_layer_sizes': 1, 'max_iter': 500, 'solver': 'adam'}
0.642 (+/-0.001) for {'alpha': 0.1, 'hidden_layer_sizes': 1, 'max_iter': 1000, 'solver': 'adam'}
0.642 (+/-0.000) for {'alpha': 0.1, 'hidden_layer_sizes': 2, 'max_iter': 500, 'solver': 'adam'}
0.642 (+/-0.001) for {'alpha': 0.1, 'hidden_layer_sizes': 2, 'max_iter': 1000, 'solver': 'adam'}
0.645 (+/-0.004) for {'alpha': 0.1, 'hidden_layer_sizes': 3, 'max_iter': 500, 'solver': 'adam'}
0.644 (+/-0.003) for {'alpha': 0.1, 'hidden_layer_sizes': 3, 'max_iter': 1000, 'solver': 'adam'}
0.645 (+/-0.003) for {'alpha': 0.1, 'hidden_layer_sizes': 4, 'max_iter': 500, 'solver': 'adam'}
0.644 (+/-0.000) for {'alpha': 0.1, 'hidden_layer_sizes': 4, 'max_iter': 1000, 'solver': 'adam'}
0.646 (+/-0.002) for {'alpha': 0.1, 'hidden_layer_sizes': 5, 'max_

              precision    recall  f1-score   support

         0.0       0.67      0.94      0.78     71451
         1.0       0.60      0.16      0.26     40239

   micro avg       0.66      0.66      0.66    111690
   macro avg       0.63      0.55      0.52    111690
weighted avg       0.64      0.66      0.59    111690



### Apresentação de métricas do modelo

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [None]:
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))