## Carregamento e Limpeza dos Dados - Autor: Lucas Mapurunga

In [1]:
import pandas as pd
from datetime import datetime as dt

In [2]:
# Load the raw projects dataset into a DataFrame. The path is relative, so the
# CSV file must be in the current working directory.
df = pd.read_csv('ks-projects-201801.csv')

def to_time(str, mask):
    """Parse the text ``str`` into a ``datetime`` using the format ``mask``.

    ``mask`` is a ``strptime`` format string (for example ``'%Y-%m-%d'``).
    A ``ValueError`` from ``strptime`` propagates when the text does not
    match the format.
    """
    # NOTE: the first parameter shadows the builtin ``str``; the name is kept
    # because it is part of the function's signature.
    parsed = dt.strptime(str, mask)
    return parsed

def col_dict(dataframe, col):
    """Build a value -> integer code dictionary for column ``col``.

    Codes are 0, 1, 2, ... assigned in the order in which
    ``dataframe[col].unique()`` returns the distinct values.
    """
    distinct = dataframe[col].unique()
    return {value: code for code, value in enumerate(distinct)}

def to_numeric(dataframe, col):
    """Replace the values of column ``col`` with integer codes.

    The value -> code mapping is produced by ``col_dict``. The column is
    overwritten on the DataFrame that was passed in (no copy is made), and
    that same object is returned together with the mapping dictionary.
    """
    mapping = col_dict(dataframe, col)

    def encode(value):
        # Every value of the column is a key of ``mapping`` by construction.
        return mapping[value]

    dataframe[col] = dataframe[col].apply(encode)
    return dataframe, mapping


"""
State feature transformation:

1 - Remove projects whose state is 'undefined' or 'live'
2 - Change the values to make a binary classification:

successful: 1
failed: 0
canceled: 0
suspended: 0
"""

# Remove, in place, every row whose state is 'live' or 'undefined'.
unfinished = df['state'].isin(['live', 'undefined'])
df.drop(df[unfinished].index, inplace=True)

# Binary target: 1 for 'successful', 0 for every other remaining state.
df['state'] = df['state'].eq('successful').astype(int)


"""Cleans and add columns from columns already in the data"""
# Campaign length in whole days. Both timestamp columns are parsed in a single
# vectorised pass each (instead of calling strptime once per row) and the day
# component of the difference is kept. 'deadline' is parsed as a date only,
# 'launched' as a full timestamp, hence the two different formats.
deadline_ts = pd.to_datetime(df['deadline'], format='%Y-%m-%d')
launched_ts = pd.to_datetime(df['launched'], format='%Y-%m-%d %H:%M:%S')
df['running_days'] = (deadline_ts - launched_ts).dt.days

# Replace the categorical columns with integer codes, keeping the
# value -> code dictionary returned by to_numeric for each of them.
df, cat_dict = to_numeric(df, 'category')
df, main_cat_dict = to_numeric(df, 'main_category')
df, country_dict = to_numeric(df, 'country')

# Removing unused features in a single call: the frame is rebuilt once instead
# of ten times, and either all columns are dropped or (on a missing name) none.
# NOTE(review): some published copies of this dataset name the column
# 'usd pledged' (with a space) - confirm 'usd_pledged' matches the CSV header.
unused_features = [
    'ID',
    'name',
    'deadline',
    'launched',
    'pledged',
    'usd_pledged',
    'goal',
    'currency',
    'backers',
    'usd_pledged_real',
]
df.drop(columns=unused_features, inplace=True)

## Criando conjuntos de treino e teste - Autor: Erick Barros

In [3]:
from sklearn.model_selection import train_test_split
import numpy as np

In [4]:
# Fixed seed so the split, and therefore the CSV files written below, is the
# same on every run.
RANDOM_STATE = 42

y_column = 'state'
# Position of the target column among the DataFrame columns.
y_index_column = list(df.columns).index(y_column)

# One 2-D array holding every column (features and target together).
values = df.values

y = values[:, y_index_column]
# Every column except the target, in the original column order.
X = np.concatenate(
    (values[:, :y_index_column], values[:, y_index_column + 1:]), axis=1
)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=RANDOM_STATE
)

# Persist each split. `df` is deliberately not rebound here, so the cleaned
# DataFrame stays available and this cell can be executed again.
for file_name, split in (
    ("X_train.csv", X_train),
    ("X_test.csv", X_test),
    ("y_train.csv", y_train),
    ("y_test.csv", y_test),
):
    pd.DataFrame(split).to_csv(file_name)


## Normalização dos dados (média e feature scaling) - Autor: Erick Barros

In [5]:
from sklearn.preprocessing import StandardScaler

In [6]:
# Standardise the features: the scaler is fitted on the training split only
# and the fitted scaler is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

## Multilayer Perceptron - Autores: Todos os integrantes

In [9]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn import neural_network
from sklearn.metrics import confusion_matrix

In [None]:
%%timeit

layers = [(1), (2), (3), (4), (5), (6), (7), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (7, 7)]
parameters = {'solver': ['sgd', 'adam'], 'max_iter': [500,1000], 'alpha': 10.0 ** -np.arange(1, 7), 'hidden_layer_sizes': layers}
clf = GridSearchCV(neural_network.MLPClassifier(), parameters, n_jobs=-1)
clf.fit(X_train, y_train)


print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
print()
print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
y_true, y_pred = y_test, clf.predict(X_test)
print(classification_report(y_true, y_pred))
print(confusion_matrix(y_true, y_pred))

## Regressão logística - Autor: Erick Barros

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

In [None]:
%%timeit

parameters = [
  {'C': [1, 10, 100, 1000], 'penalty': ['l1'], 'solver': ['liblinear', 'saga'], 'max_iter': [100, 500]},
  {'C': [1, 10, 100, 1000], 'penalty': ['l2'], 'solver': ['lbfgs', 'sag'], 'max_iter': [100, 500]},
 ]

clf = GridSearchCV(LogisticRegression(), parameters, n_jobs=-1)

clf.fit(X_train, y_train)


print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
print()
print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
y_true, y_pred = y_test, clf.predict(X_test)
print(classification_report(y_true, y_pred))
print(confusion_matrix(y_true, y_pred))

## Perceptron - Autor: Erick Barros

In [None]:
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
import numpy as np
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score

In [None]:
%%timeit

parameters = {'penalty': ['l2', 'l1'], 'max_iter': [500,1000], 'alpha': 10.0 ** -np.arange(1, 7), 'fit_intercept': [True, False]}

clf = GridSearchCV(Perceptron(), parameters, n_jobs=-1)
clf.fit(X_train, y_train)


print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
y_true, y_pred = y_test, clf.predict(X_test)
print(classification_report(y_true, y_pred))
print(confusion_matrix(y_true, y_pred))
print("Accuracy")
print(accuracy_score(y_true, y_pred))


print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
print()


In [None]:
import seaborn as sn
import matplotlib.pyplot as plt
import pandas as pd

# Confusion-matrix counts entered by hand (they are not computed in this cell).
array = [[71448, 0], [40272, 0]]
plt.rcParams.update({'font.size': 22})
# NOTE(review): the captions mix Portuguese and English ('False') and may not
# match the row/column layout of the matrix these counts came from - confirm.
df_cm = pd.DataFrame(
    array,
    index=['Positivo', 'Negativo'],
    columns=['Verdadeiro', 'False'],
)
plt.figure(figsize=(10, 7))
# fmt='d' prints the integer counts in full; the default annotation format
# ('.2g') would render them in scientific notation (e.g. 7.1e+04).
sn.heatmap(df_cm, annot=True, fmt='d')