In [None]:
# Import the libraries required for this project
import numpy as np
import pandas as pd
from tqdm import tqdm, tqdm_pandas

print("Libraries loaded!")

In [2]:
# Load the raw train/test CSVs. The success message is printed only when both
# reads succeed; on failure the problem is reported and the original error is
# re-raised so that later cells do not run without `train` / `test`.
try:
    train = pd.read_csv('../../data/train.csv')
    test = pd.read_csv('../../data/test.csv')
except Exception:
    # `except e:` referenced an undefined name, so any read failure surfaced
    # as a NameError instead of the real cause.
    print("Error on trying read train dataset.")
    raise
else:
    print("dataset loaded")


dataset loaded


In [None]:
# Report the dimensions (row count, column count) of the training frame
rows, columns = train.shape[0], train.shape[1]
print("The train dataset contains {0} rows and {1} columns".format(rows, columns))

In [None]:
# Preview the first rows of the raw training frame
train.head()




In [None]:
# Summary statistics (as computed by DataFrame.describe) for the training frame
display(train.describe())


In [None]:
%matplotlib inline

In [None]:
# Class balance of the target, drawn as a pie chart with slices '1' and '0'.
# The '0' slice has to be the number of rows that are NOT positive; using the
# total row count there (as before) overstated the share of negatives.
# NOTE(review): assumes `target` only holds 0/1 values, as the slice labels imply.
positives = train['target'].sum()
negatives = len(train) - positives
series = pd.Series([positives, negatives], index=['1', '0'], name='train')
series.plot.pie(figsize=(7, 7), autopct='%.2f', fontsize=16)

In [3]:
# Assign every column of `train` to two independent groupings:
#   * family - the tag found in the column name (ind / reg / car / calc);
#              columns without any of these tags go to rest_vars;
#   * level  - binary ('bin'), categorical ('cat'), or ordinal/numerical otherwise.
ind_vars, reg_vars, car_vars, calc_vars, rest_vars = [], [], [], [], []
bin_vars, cat_vars, num_ord_vars = [], [], []

# The first matching tag wins, so the tuple order is the precedence of the checks.
family_buckets = (('ind', ind_vars), ('reg', reg_vars), ('car', car_vars), ('calc', calc_vars))
level_buckets = (('bin', bin_vars), ('cat', cat_vars))

for column in train.columns:
    family = next((bucket for tag, bucket in family_buckets if tag in column), rest_vars)
    family.append(column)

    level = next((bucket for tag, bucket in level_buckets if tag in column), num_ord_vars)
    level.append(column)

# Sizes of the name-based families
print('There are {} variables of grouping ind'.format(len(ind_vars)))
print('There are {} variables of grouping reg'.format(len(reg_vars)))
print('There are {} variables of grouping car'.format(len(car_vars)))
print('There are {} variables of grouping calc'.format(len(calc_vars)))
print('\n')

# Sizes of the measurement-level groups
print('There are {} binary variables'.format(len(bin_vars)))
print('There are {} categorical variables'.format(len(cat_vars)))
print('There are {} ordinal/numerical variables'.format(len(num_ord_vars)))
print("\n")

print("So later on we can create dummy variables for the 14 categorical variables.")
print("The ordinal/numerical variables we can use as such and the bin variables are already binary.")

There are 18 variables of grouping ind
There are 3 variables of grouping reg
There are 16 variables of grouping car
There are 20 variables of grouping calc


There are 17 binary variables
There are 14 categorical variables
There are 28 ordinal/numerical variables


So later on we can create dummy variables for the 14 categorical variables.
The ordinal/numerical variables we can use as such and the bin variables are already binary.


In [None]:
import missingno as msno

# Build a separate frame in which the -1 placeholder is turned into NaN.
# `replace` returns a new DataFrame, so `train` itself keeps its -1 values.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
train_copy = train.replace(-1, np.nan)

# any() is applied twice: once per column, then across the per-column results.
# The result is printed; as a bare mid-cell expression it was silently discarded.
print(train_copy.isnull().any().any())
msno.bar(train_copy)

In [None]:
# List every column of `train` that contains the -1 placeholder, together with
# the number and the share of affected rows.
vars_with_missing = []
total_rows = train.shape[0]

for name in train.columns:
    n_missing = (train[name] == -1).sum()
    if n_missing <= 0:
        continue

    vars_with_missing.append(name)
    print('Variable {} has {} records ({:.2%}) with missing values'.format(name, n_missing, n_missing / total_rows))

print('In total, there are {} variables with missing values'.format(len(vars_with_missing)))
    

In [4]:
# Prune columns from BOTH train and test, then persist the reduced frames:
#   1. binary columns in which some value covers at most `th` of the rows,
#   2. two hand-picked categorical columns,
#   3. five further hand-picked columns.
# NOTE(review): ps_reg_03 is removed here, yet the notes further down plan to
# impute it by its mean - confirm which of the two is intended.
th = 0.1

cols_to_delete = []
for col in bin_vars:
    if col not in train.columns:
        # Already removed by an earlier run of this cell.
        continue
    print(col)
    print(train[col].unique())
    # Shares are compared as a whole Series instead of being looked up with
    # pp[0], pp[1], ...: that lookup is label-based and raises KeyError whenever
    # the observed values are not exactly 0..n-1 (e.g. a constant column of ones).
    shares = train[col].value_counts() / float(len(train))
    if (shares <= th).any():
        # Appended at most once, so no column is ever scheduled for dropping twice.
        cols_to_delete.append(col)

print(cols_to_delete)

cat_cols_to_delete = ['ps_car_07_cat', 'ps_car_10_cat']
other_cols_to_delete = ['ps_ind_14', 'ps_calc_01', 'ps_calc_02', 'ps_calc_03', 'ps_reg_03']

# errors='ignore' keeps the cell re-runnable once the columns are gone.
for frame in (train, test):
    frame.drop(cols_to_delete + cat_cols_to_delete + other_cols_to_delete,
               axis=1, inplace=True, errors='ignore')

train.to_csv('../../data/train_prepared.csv',index=False)
test.to_csv('../../data/test_prepared.csv',index=False)

ps_ind_06_bin
[0 1]
ps_ind_07_bin
[1 0]
ps_ind_08_bin
[0 1]
ps_ind_09_bin
[0 1]
ps_ind_10_bin
[0 1]
ps_ind_11_bin
[0 1]
ps_ind_12_bin
[0 1]
ps_ind_13_bin
[0 1]
ps_ind_16_bin
[0 1]
ps_ind_17_bin
[1 0]
ps_ind_18_bin
[0 1]
ps_calc_15_bin
[0 1]
ps_calc_16_bin
[1 0]
ps_calc_17_bin
[1 0]
ps_calc_18_bin
[0 1]
ps_calc_19_bin
[0 1]
ps_calc_20_bin
[1 0]
['ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin', 'ps_ind_13_bin']


In [None]:


# Pairwise scatter plots of all columns with KDE curves on the diagonal.
# The top-level pd.scatter_matrix alias no longer exists; the function is
# provided by pandas.plotting.
pd.plotting.scatter_matrix(train, alpha = 0.3, figsize = (14,8), diagonal = 'kde');

In [None]:
# Preview the training frame again at this point of the notebook
train.head()

ps_car_03_cat and ps_car_05_cat have a large proportion of records with missing values. Remove these variables.

For the other categorical variables with missing values, we can leave the missing value -1 as such.

ps_reg_03 (continuous) has missing values for 18% of all records. Replace by the mean.

ps_car_11 (ordinal) has only 5 records with missing values. Replace by the mode.

ps_car_12 (continuous) has only 1 record with a missing value. Replace by the mean.

ps_car_14 (continuous) has missing values for 7% of all records. Replace by the mean.

In [None]:
# Remove the two categorical features that have a large proportion of missing
# values. Both frames are pruned so that train and test keep the same feature
# columns (previously only `train` was touched), and every step tolerates a
# re-run of the cell.
sparse_cat_cols = ['ps_car_03_cat', 'ps_car_05_cat']

for frame in (train, test):
    frame.drop(sparse_cat_cols, inplace=True, axis=1, errors='ignore')

for col in sparse_cat_cols:
    # list.remove raises ValueError for a name that is already gone
    if col in cat_vars:
        cat_vars.remove(col)

print("removing features done")

In [None]:
# Impute the -1 missing-value placeholder column by column: the mean for the
# continuous features and the most frequent value for the ordinal one.
# Plain pandas is used because sklearn.preprocessing.Imputer is not available
# in current scikit-learn releases, which made this cell fail on import.
mean_imputed_cols = ['ps_reg_03', 'ps_car_12', 'ps_car_14']
mode_imputed_cols = ['ps_car_11']

for col in mean_imputed_cols + mode_imputed_cols:
    if col not in train.columns:
        # ps_reg_03, for instance, is removed by the column-pruning cell.
        print('Skipping {}: column not present in train'.format(col))
        continue

    # Placeholder -> NaN, so the statistic is computed on observed values only
    observed = train[col].replace(-1, np.nan)
    if col in mode_imputed_cols:
        modes = observed.mode()
        # mode() is sorted, so ties resolve to the smallest value;
        # it is empty when the column has no observed value at all.
        fill_value = modes.iloc[0] if len(modes) else np.nan
    else:
        fill_value = observed.mean()
    train[col] = observed.fillna(fill_value)

print("Imputing done")

In [None]:
# Cardinality check for the categorical features.
# Only ps_car_11_cat has a somewhat larger number of distinct values, although
# it is still reasonable. To avoid creating many dummy variables later on, its
# values could be replaced by the supervised ratio, after which the variable
# can be used as a continuous one. Other strategies to transform this variable
# are explained in an article on KDNuggets.
for cat_col in cat_vars:
    print('Variable {} has {} distinct values'.format(cat_col, train[cat_col].nunique()))

In [None]:
# Persist the prepared frames, each to its own file. Writing `train` to
# test_prepared.csv (as before) replaced the test set with training rows.
train.to_csv('../../data/train_prepared.csv',index=False)
test.to_csv('../../data/test_prepared.csv',index=False)

** @@@@@@@@@@@@@@@@@@@@@@@@@@@ **

In [None]:
# Draw a scatter matrix for every pair of features in the data
#pd.scatter_matrix(data, alpha = 0.3, figsize = (18,12), diagonal = 'kde');

# Split the training frame into the feature matrix, the target and the row ids.
# `pop` removes the column from the frame it is called on, so the split is done
# on a fresh copy: `train` stays intact and the cell can be re-run.
# NOTE(review): no visible cell defines train_data; it is assumed to be the
# prepared `train` frame - confirm.
train_data = train.copy()
train_target = train_data.pop('target')
train_ids = train_data.pop('id')
train_data.head()


In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls

# Set up plotly.offline for this notebook before the py.iplot calls further down
py.init_notebook_mode(connected=True)
# Prettier formatting for notebooks

# Nullity or missing values by columns
#msno.matrix(df=train_copy.iloc[:,0:30], figsize= (20, 14), color=(0.42, 0.1, 0.05))

#msno.matrix(df=train_copy.iloc[:,30:57], figsize=(20, 14), color=(0.42, 0.1, 0.05))


In [None]:
# NaN count per column of train_copy; only columns with at least one NaN are reported
nan_counts = train_copy.isnull().sum()
colwithnan = nan_counts[nan_counts > 0].index.tolist()

print("Just a reminder this dataset has %s Rows. \n" % (train_copy.shape[0]))
for col in colwithnan:
    print("Column: %s has %s NaN" % (col, nan_counts[col]))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest fitted on the feature matrix; its feature_importances_ are
# read by the cells that follow. random_state is fixed for repeatable runs.
rf_params = dict(
    n_estimators=150,
    max_depth=8,
    min_samples_leaf=4,
    max_features=0.2,
    n_jobs=-1,
    random_state=0,
)
rf = RandomForestClassifier(**rf_params)
rf.fit(train_data, train_target)

# Column names, kept as an array for labelling the importance plots
features = train_data.columns.values

print("----- Training Done -----")

In [None]:
# Rank the features by importance, highest first. `indices` holds column
# positions, so the listing prints the position of each feature, not its name.
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]

for position in range(train_data.shape[1]):
    feature_idx = indices[position]
    print("%d. feature %d (%f)" % (position + 1, feature_idx, importances[feature_idx]))


In [None]:
# Scatter plot: one marker per feature, positioned and coloured by its
# random-forest importance.
importance_values = rf.feature_importances_

marker_style = dict(
    sizemode='diameter',
    sizeref=1,
    size=13,
    color=importance_values,
    colorscale='Portland',
    showscale=True,
)
trace = go.Scatter(
    x=features,
    y=importance_values,
    mode='markers',
    marker=marker_style,
    text=features,
)
data = [trace]

# Settings shared by both axes
axis_style = dict(ticklen=5, showgrid=False, zeroline=False)
layout = go.Layout(
    autosize=True,
    title='Random Forest Feature Importance',
    hovermode='closest',
    xaxis=dict(showline=False, **axis_style),
    yaxis=dict(title='Feature Importance', gridwidth=2, **axis_style),
    showlegend=False,
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='scatter2010')

In [None]:
# Horizontal bar chart of the feature importances, sorted in ascending order.
# Sorting (importance, name) pairs keeps every name attached to its score.
ranked_pairs = sorted(zip(rf.feature_importances_, features), reverse=False)
x = [score for score, feature_name in ranked_pairs]
y = [feature_name for score, feature_name in ranked_pairs]

bar_marker = dict(color=x, colorscale='Viridis', reversescale=True)
trace2 = go.Bar(
    x=x,
    y=y,
    marker=bar_marker,
    name='Random Forest Feature importance',
    orientation='h',
)

layout = dict(
    title='Barplot of Feature importances',
    width=900,
    height=2000,
    yaxis=dict(showgrid=False, showline=False, showticklabels=True),
)

fig1 = go.Figure(data=[trace2])
fig1['layout'].update(layout)
py.iplot(fig1, filename='plots')

In [None]:
# Collect the column positions of the 28 highest-ranked features, echoing each
# one with its importance, then print the collected positions.
important_feature = []
for rank in range(28):
    feature_idx = indices[rank]
    important_feature.append(feature_idx)
    print("%d. feature %d (%f)" % (rank + 1, feature_idx, importances[feature_idx]))

print()
print(important_feature)

https://www.kaggle.com/bertcarremans/data-preparation-exploration

https://www.kaggle.com/arthurtok/interactive-porto-insights-a-plot-ly-tutorial

https://www.kaggle.com/tezdhar/faster-gini-calculation

https://github.com/snovik75/porto_seguro/blob/master/notebooks/model.ipynb

https://github.com/maksimovkonstantin/KagglePortoSeguro/blob/master/Simple%20model%20creation.ipynb

https://github.com/search?p=3&q=porto-seguro&type=Repositories&utf8=%E2%9C%93

https://www.kaggle.com/batzner/gini-coefficient-an-intuitive-explanation


https://www.kaggle.com/jeru666/easy-to-fork-porto-seguro