## Carregamento e Limpeza dos Dados

In [20]:
import pandas as pd
from datetime import datetime as dt

In [31]:
# Read the raw project records from the CSV file into the working DataFrame
# that the rest of this notebook filters and transforms.
df = pd.read_csv('ks-projects-201801.csv')

def to_time(str, mask):
    """Parse a date/time string into a ``datetime`` object.

    Args:
        str: Text to parse. (The name shadows the builtin ``str`` inside
            this function; it is kept so existing callers keep working.)
        mask: ``strptime`` format string, e.g. ``'%Y-%m-%d'``.

    Returns:
        The ``datetime`` produced by ``datetime.strptime(str, mask)``.

    Raises:
        ValueError: If the text does not match the format.
    """
    parsed = dt.strptime(str, mask)
    return parsed

def col_dict(dataframe, col):
    """Build a value -> integer code mapping for one column.

    Codes start at 0 and follow the order in which
    ``dataframe[col].unique()`` returns the distinct values.

    Args:
        dataframe: DataFrame holding the column.
        col: Label of the column to inspect.

    Returns:
        dict: Distinct column value mapped to its integer code.
    """
    distinct_values = dataframe[col].unique()
    return {value: code for code, value in enumerate(distinct_values)}

def to_numeric(dataframe, col):
    """Replace the values of column ``col`` with integer codes.

    The value -> code mapping is built by ``col_dict``. The column is
    overwritten on ``dataframe`` itself (no copy is made), and that same
    object is handed back to the caller.

    Args:
        dataframe: DataFrame whose column is encoded in place.
        col: Label of the column to encode.

    Returns:
        tuple: ``(dataframe, mapping)`` where ``mapping`` is the dictionary
        from original value to integer code.
    """
    mapping = col_dict(dataframe, col)

    def encode(value):
        return mapping[value]

    dataframe[col] = dataframe[col].apply(encode)
    return dataframe, mapping


"""
State feature transformation:

1 - Removing projects with state equals to 'undefined' or 'live'
2 - Change the values to make a binary classification:

successful: 1
failed: 0
canceled: 0
suspended: 0
"""

# Discard projects whose outcome is not settled ('live' or 'undefined').
unresolved = df['state'].isin(['live', 'undefined'])
df.drop(index=df.index[unresolved], inplace=True)

# Binary target: 1 for 'successful', 0 for every other remaining state.
df['state'] = df['state'].eq('successful').astype(int)


"""Cleans and add columns from columns already in the data"""
# Campaign length in whole days, from the launch timestamp to the deadline.
# Whole columns are parsed at once with pd.to_datetime instead of calling
# strptime once per row; the formats are the same ones used before.
deadline_dates = pd.to_datetime(df['deadline'], format='%Y-%m-%d')
launch_times = pd.to_datetime(df['launched'], format='%Y-%m-%d %H:%M:%S')
df['running_days'] = (deadline_dates - launch_times).dt.days

# Replace each categorical column with integer codes, keeping the
# value -> code dictionaries so the encoding can be reversed later.
df, cat_dict = to_numeric(df, 'category')
df, main_cat_dict = to_numeric(df, 'main_category')
df, country_dict = to_numeric(df, 'country')

# Remove the features that are not used from here on. A single call drops
# them all; it raises KeyError if any of the labels is missing.
# NOTE(review): confirm the CSV header is spelled exactly 'usd_pledged'.
df.drop(
    columns=[
        'ID',
        'name',
        'deadline',
        'launched',
        'pledged',
        'usd_pledged',
        'goal',
        'currency',
    ],
    inplace=True,
)

[0 1]


## Normalização dos dados (média e feature scaling)

In [10]:
import numpy as np

In [11]:
# Mean normalization with range scaling of the selected columns:
#     x' = (x - mean) / (max - min)
# followed by splitting the matrix into the target vector y and features X.
y_column = 'state'
columns_to_normalize = ['running_days', 'backers', 'usd_pledged_real', 'usd_goal_real']
cols_labels = df.columns.values

# Positional index of each column. get_loc raises KeyError for a label that
# is not present, whereas the previous argsort/searchsorted lookup would
# silently resolve a missing name to some other column.
index_columns = np.array(
    [df.columns.get_loc(name) for name in columns_to_normalize], dtype=int
)
y_index_columns = df.columns.get_loc(y_column)

values = df.values
values_shape = values.shape

for j in index_columns:
    column = values[:, j]
    max_value = np.max(column)
    min_value = np.min(column)
    mean = np.sum(column) / values_shape[0]

    # Statistics are computed before the assignment, so the whole column can
    # be rescaled in one vectorized step. As before, a constant column
    # (max == min) is not special-cased and divides by zero.
    values[:, j] = (column - mean) / (max_value - min_value)

y = values[:, y_index_columns]
# Every column except the target, in the original order.
X = np.delete(values, y_index_columns, axis=1)

## Rede Neural Artificial

#### Testando convergência do Perceptron para verificar se os dados são linearmente separáveis

In [32]:
from sklearn.linear_model import Perceptron

In [34]:
# Fit a single-layer perceptron on the full data set and predict on that same
# data (no hold-out split): this only probes linear separability.
perceptron = Perceptron(random_state=0).fit(X, y)
predicted = perceptron.predict(X)



In [37]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

In [48]:
# Confusion matrix of the perceptron predictions against the true labels.
cm = confusion_matrix(y, predicted)

# Label for each cell of the 2x2 matrix (rows: true class, columns: predicted).
s = [['TN', 'FP'], ['FN', 'TP']]

for i, row_labels in enumerate(s):
    for j, label in enumerate(row_labels):
        print(f"{label} = {cm[i][j]}")

# TODO: render the confusion matrix as a plot. Sketch kept for reference:
# plt.clf()
# plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Wistia)
# classNames = ['Negative', 'Positive']
# plt.title('Perceptron Confusion Matrix - Entire Data')
# plt.ylabel('True label')
# plt.xlabel('Predicted label')
# tick_marks = np.arange(len(classNames))
# plt.xticks(tick_marks, classNames, rotation=45)
# plt.yticks(tick_marks, classNames)
# plt.text(j, i, str(s[i][j]) + " = " + str(cm[i][j]))  # once per cell
# plt.show()

TN = 195659
FP = 0
FN = 38411
TP = 0


#### Testando convergência do Multilayer Perceptron

In [12]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [14]:
# Small-sample experiment: only the first 379 rows are used, and that subset
# is then split 70/30 into training and evaluation parts.
test_x = X[0:379, :]
test_y = y[0:379]

x_train, x_test, y_train, y_test = train_test_split(
    test_x, test_y, test_size=0.3, random_state=27
)

# Alternative configuration tried previously:
# clf = MLPClassifier(hidden_layer_sizes=(7, 50, 1), max_iter=1000, alpha=1, solver='sgd', verbose=10, random_state=21,tol=0.000000001)
clf = MLPClassifier(
    hidden_layer_sizes=(100, 100, 100),
    max_iter=1000,
    alpha=0.001,
    solver='sgd',
    verbose=10,
    random_state=21,
    tol=0.000000001,
)
clf.fit(x_train, y_train)

y_pred = clf.predict(x_test)

# Number of evaluation samples predicted correctly, counted with a single
# element-wise comparison instead of a manual index loop.
c = int((y_test == y_pred).sum())
print(y_test.shape[0])
print(c)

Iteration 1, loss = 2.19129222
Iteration 2, loss = 1.14581330
Iteration 3, loss = 1.15306590
Iteration 4, loss = 1.17781989
Iteration 5, loss = 1.20699634
Iteration 6, loss = 1.19064743
Iteration 7, loss = 1.18469700
Iteration 8, loss = 1.17042699
Iteration 9, loss = 1.15402763
Iteration 10, loss = 1.14826339
Iteration 11, loss = 1.13899131
Iteration 12, loss = 1.08335030
Iteration 13, loss = 1.12904251
Iteration 14, loss = 1.03254984
Iteration 15, loss = 1.02353163
Iteration 16, loss = 1.01762309
Iteration 17, loss = 1.01760645
Iteration 18, loss = 1.01878239
Iteration 19, loss = 1.01654529
Iteration 20, loss = 1.00657859
Iteration 21, loss = 1.00522781
Iteration 22, loss = 1.01226827
Iteration 23, loss = 1.03912607
Iteration 24, loss = 0.99212537
Iteration 25, loss = 1.01019784
Iteration 26, loss = 0.98868965
Iteration 27, loss = 1.01085377
Iteration 28, loss = 1.00216404
Iteration 29, loss = 0.98443377
Iteration 30, loss = 0.98305795
Iteration 31, loss = 0.99104282
Iteration 32, los