# Exploratory data analysis

In [76]:
import os
import warnings
import pandas as pd
import numpy as np
import ydata_profiling as yp
import sweetviz as sv
from autoviz.AutoViz_Class import AutoViz_Class
import matplotlib.pyplot as plt

# Ignore every warning raised for the rest of the session (presumably to keep
# the profiling cells' output readable).
# NOTE(review): this is a blanket filter, so it also hides warnings that may
# matter (e.g. pandas dtype or deprecation notices) — confirm this is intended.
warnings.filterwarnings('ignore')
# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline

In [66]:
# Load the voluntary-turnover dataset and preview its first rows.
df = pd.read_csv(
    '../database/kaggle_voluntary_turnover_1.csv',
    sep=',',
    encoding='utf-8',
)
df.head()

Unnamed: 0,department,promoted,review,projects,salary,tenure,satisfaction,bonus,avg_hrs_month,left
0,operations,0,0.577569,3,low,5.0,0.626759,0,180.86607,no
1,operations,0,0.7519,3,medium,6.0,0.443679,0,182.708149,no
2,support,0,0.722548,3,medium,6.0,0.446823,0,184.416084,no
3,logistics,0,0.675158,4,high,8.0,0.440139,0,188.707545,no
4,sales,0,0.676203,3,high,5.0,0.577607,1,179.821083,no


In [89]:
# For each tenure value, count how many rows have each value of `left`.
left_by_tenure = df.groupby(['tenure'])['left']
left_by_tenure.value_counts()

tenure  left
2.0     yes        2
        no         1
3.0     yes       30
        no        27
4.0     no       332
        yes      186
5.0     no      1395
        yes      428
6.0     no      1835
        yes      478
7.0     no      1386
        yes      823
8.0     no      1065
        yes      823
9.0     no       564
        yes       14
10.0    no       127
11.0    no        23
12.0    no         1
Name: left, dtype: int64

In [62]:
# Create the folder that the profiling reports are written to, and say whether
# it had to be created or was already there.
try:
    os.makedirs('./data_profiling')
except FileExistsError:
    # Only "path already exists" is an expected outcome. Any other failure
    # (permissions, read-only filesystem, ...) now propagates instead of being
    # misreported as "Directory found." by a bare `except:`.
    print('Directory found.')
else:
    print('No directory found.')
    print('New directory created: data_profiling.')

Directory found.


### Ydata-profiling

In [63]:
# Build one ydata-profiling HTML report per CSV file found in the data folder.
folder_to_check = '../database'

# Make the cell self-contained: the report folder is created here if missing,
# so the cell no longer depends on the directory-creation cell having run.
os.makedirs('./data_profiling', exist_ok=True)

# sorted(): os.listdir returns entries in arbitrary order; sorting makes the
# processing order (and the `df` left behind after the loop) deterministic.
for file in sorted(os.listdir(folder_to_check)):
    if not file.endswith(".csv"):
        print(f'File {file} skipped!')
        continue

    path_to_file = os.path.join(folder_to_check, file)
    # NOTE(review): this rebinds the notebook-level `df` on every iteration;
    # kept for compatibility with cells that may read `df` afterwards.
    df = pd.read_csv(path_to_file, sep=',', encoding='utf-8')
    profile = yp.ProfileReport(
        df,
        title=f'Ydata Profiling Report for {file}',
        explorative=True
    )
    # Strip only the trailing extension. The previous `.replace('.csv', '')`
    # removed every ".csv" occurrence in the name and also overwrote the loop
    # variable.
    stem = os.path.splitext(file)[0]
    profile.to_file(f'./data_profiling/ydata_profiling_{stem}.html')

File dados_gridsearchcv.pkl skipped!
File dados_pre_processados.pkl skipped!


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

### Sweetviz profiling

In [80]:
# Build one Sweetviz HTML report per CSV file found in the data folder,
# analysed against the target column.
folder_to_check = '../database'
# Named instead of hard-coded in the call, matching the AutoViz cell.
target_variable = 'left'

# Make the cell self-contained: the report folder is created here if missing,
# so the cell no longer depends on the directory-creation cell having run.
os.makedirs('./data_profiling', exist_ok=True)

# sorted(): os.listdir returns entries in arbitrary order; sorting makes the
# processing order (and the `df` left behind after the loop) deterministic.
for file in sorted(os.listdir(folder_to_check)):
    if not file.endswith(".csv"):
        print(f'File {file} skipped!')
        continue

    path_to_file = os.path.join(folder_to_check, file)
    # NOTE(review): this rebinds the notebook-level `df` on every iteration;
    # kept for compatibility with cells that may read `df` afterwards.
    df = pd.read_csv(path_to_file, sep=',', encoding='utf-8')
    report = sv.analyze(df, target_variable)
    # Strip only the trailing extension. The previous `.replace('.csv', '')`
    # removed every ".csv" occurrence in the name and also overwrote the loop
    # variable.
    stem = os.path.splitext(file)[0]
    report.show_html(
        f'./data_profiling/sweetviz_report_{stem}.html',
        open_browser=False
    )

File dados_gridsearchcv.pkl skipped!
File dados_pre_processados.pkl skipped!


                                             |          | [  0%]   00:00 -> (? left)

Report ./data_profiling/sweetviz_report_kaggle_voluntary_turnover_1.html was generated.


### Autoviz profiling

In [73]:
# Run AutoViz on every CSV file in the data folder and save the generated
# charts as PNG files under ./data_profiling/<csv name>_plots.
folder_to_check = '../database'
target_variable = 'left'

# sorted(): os.listdir returns entries in arbitrary order; sorting makes the
# processing order (and the `df` left behind after the loop) deterministic.
for file in sorted(os.listdir(folder_to_check)):
    if not file.endswith('.csv'):
        print(f'File {file} skipped!')
        continue

    path_to_file = os.path.join(folder_to_check, file)
    # NOTE(review): this rebinds the notebook-level `df` on every iteration;
    # kept for compatibility with cells that may read `df` afterwards.
    df = pd.read_csv(path_to_file, sep=',', encoding='utf-8')
    # The unused `sv.analyze(df)` call that used to sit here was removed: its
    # result was never read, yet it ran a full Sweetviz analysis per file.

    # Strip only the trailing extension. The previous `.replace('.csv', '')`
    # removed every ".csv" occurrence in the name and also overwrote the loop
    # variable.
    stem = os.path.splitext(file)[0]
    plots_dir = f'./data_profiling/{stem}_plots'

    # Only the directory creation is guarded. Previously a bare `except:` also
    # wrapped the AutoViz call, so any AutoViz failure printed
    # "Directory found." and silently ran AutoViz a second time.
    try:
        os.makedirs(plots_dir)
    except FileExistsError:
        print('Directory found.')
    else:
        print('No directory found.')
        print(f'New directory created: {stem}_plots.')

    av = AutoViz_Class()
    # The filename argument is left empty and the frame is passed via `dfte`.
    viz = av.AutoViz(
        '',
        sep=',',
        depVar=target_variable,
        dfte=df,
        header=0,
        verbose=2,
        lowess=True,
        chart_format='png',
        max_rows_analyzed=200000,
        max_cols_analyzed=100,
        save_plot_dir=plots_dir
    )
    print(f'File {stem}_plots saved!')

File dados_gridsearchcv.pkl skipped!
File dados_pre_processados.pkl skipped!


                                             |          | [  0%]   00:00 -> (? left)

Directory found.
Shape of your Data Set loaded: (9540, 10)
############## C L A S S I F Y I N G  V A R I A B L E S  ####################
Classifying variables in data set...
Data Set Shape: 9540 rows, 10 cols
Data Set columns info:
* department: 0 nulls, 10 unique vals, most common: {'sales': 1883, 'retail': 1541}
* promoted: 0 nulls, 2 unique vals, most common: {0: 9251, 1: 289}
* review: 0 nulls, 9540 unique vals, most common: {0.5775686596355698: 1, 0.7193418075620294: 1}
* projects: 0 nulls, 4 unique vals, most common: {3: 5833, 4: 3058}
* salary: 0 nulls, 3 unique vals, most common: {'medium': 6611, 'high': 1548}
* tenure: 0 nulls, 11 unique vals, most common: {6.0: 2313, 7.0: 2209}
* satisfaction: 0 nulls, 9540 unique vals, most common: {0.6267589740293295: 1, 0.3428019240965125: 1}
* bonus: 0 nulls, 2 unique vals, most common: {0: 7517, 1: 2023}
* avg_hrs_month: 0 nulls, 9540 unique vals, most common: {180.8660696668475: 1, 187.1782305793793: 1}
* left: 0 nulls, 2 unique vals, m