In [1]:
import pandas as pd
import numpy as np
from IPython.display import display
from itertools import combinations
from scipy.stats import kstest, spearmanr, pearsonr


class AutoEDA:
    def __init__(self):
        pass

    def read_file(self, file_path):
        """
        Reads a file and returns a pandas DataFrame.
        
        Parameters:
        - file_path (str): The path to the file.
        
        Returns:
        - DataFrame: A pandas DataFrame.
        """
        # Determine the file extension
        file_extension = file_path.split('.')[-1].lower()
        
        # Read the file based on the file extension
        try:
            if file_extension == 'csv':
                return pd.read_csv(file_path, index_col=0)
            elif file_extension in ['xls', 'xlsx']:
                return pd.read_excel(file_path)
            elif file_extension == 'json':
                return pd.read_json(file_path)
            elif file_extension == 'pkl':
                return pd.read_pickle(file_path)
            else:
                raise ValueError("Reading this format is not yet implemented.")
        except Exception as e:
            return f"File reading failed, error: {e}."


    def explo_df(self, DataFrame, column=None):
        """
        Explores a DataFrame or a specific column and prints various statistics.
        
        Parameters:
        - DataFrame: The DataFrame to explore.
        - column (str or list, optional): The column or columns to explore. If None, explore the entire DataFrame.
        """
        if column is None:
            # General DataFrame exploration
            print("DataFrame Information:")
            display(DataFrame.info())
            print("\nFirst 10 rows of the DataFrame:")
            display(DataFrame.head(10))
            print("\nLast 10 rows of the DataFrame:")
            display(DataFrame.tail(10))
            print("\nStatistical description of the DataFrame (numeric):")
            display(DataFrame.describe().T)
            print("\nStatistical description of the DataFrame (categorical):")
            display(DataFrame.describe(include='object').T)
            print("\nCount of null values per column:")
            display(DataFrame.isnull().sum())
            print("\nPercentage of null values per column (only columns with nulls):")
            null_percentage = round(DataFrame.isnull().sum()/DataFrame.shape[0]*100, 2)
            display(null_percentage[null_percentage > 0])
            print("\nRows with all values as null:")
            all_null_rows = DataFrame[DataFrame.isnull().all(axis=1)]
            if not all_null_rows.empty:
                display(all_null_rows)
            else:
                print("There is no rows with all values as null.")
            print("\nCount of duplicate rows:")
            display(DataFrame.duplicated().sum())
        else:
            # Column(s) exploration
            if isinstance(column, str):
                column = [column]  # Convert to list if a single column is passed as a string
            for col in column:
                print(f"\nExploration of the column: {col}")
                if DataFrame[col].dtype in ['int64', 'float64']:
                    print("\nStatistical description (numeric):")
                    display(DataFrame[col].describe())
                else:
                    print("\nStatistical description (categorical):")
                    display(DataFrame[col].describe(include='object'))
                print("\nCount of null values:")
                display(DataFrame[col].isnull().sum())
                print("\nCount of unique values:")
                display(DataFrame[col].nunique())
                print("\nUnique values:")
                display(DataFrame[col].unique())
                print("\nValue Counts:")
                display(DataFrame[col].value_counts())
                print("\nMost frequent value (mode):")
                display(DataFrame[col].mode().iloc[0])
                print("\nCount of duplicates in the column:")
                display(DataFrame.duplicated(subset=[col]).sum())


In [3]:
autoEda = AutoEDA()
df = autoEda.read_file("HR RAW DATA.csv")
autoEda.explo_df(df, 'Age')#aqui cambiamos el nombre de la columna 


Exploration of the column: Age

Statistical description (categorical):


count     1614
unique      54
top         35
freq        84
Name: Age, dtype: object


Count of null values:


np.int64(0)


Count of unique values:


54


Unique values:


array(['51', '52', '42', '47', '46', '48', '59', '41', '56', '38', '55',
       '40', '58', '35', '45', '33', '36', '34', 'forty-seven', '53',
       '43', '60', '32', '37', '49', '39', '50', '44', '30',
       'fifty-eight', '29', '31', '54', '57', '27', 'thirty-six', '28',
       '26', 'fifty-five', '25', 'fifty-two', 'thirty-one', '24',
       'thirty', '23', '22', '21', '20', 'twenty-six', '19',
       'thirty-seven', '18', 'thirty-two', 'twenty-four'], dtype=object)


Value Counts:


Age
35              84
34              83
31              82
29              78
36              74
38              64
32              64
30              63
33              61
40              60
27              54
28              53
37              53
42              50
45              48
41              46
39              45
26              45
43              40
46              38
44              34
50              31
25              27
24              26
47              26
49              25
55              24
51              20
48              20
53              20
54              20
52              18
56              17
22              16
21              16
23              15
58              14
20              11
59              10
19               9
18               8
60               5
57               5
thirty-two       2
forty-seven      1
fifty-eight      1
thirty-six       1
fifty-five       1
fifty-two        1
thirty-one       1
thirty           1
twenty-six       1
thirty-s


Most frequent value (mode):


'35'


Count of duplicates in the column:


np.int64(1560)

In [4]:
autoEda = AutoEDA()
df = autoEda.read_file("HR RAW DATA.csv")
autoEda.explo_df(df, 'Attrition')#aqui cambiamos el nombre de la columna 


Exploration of the column: Attrition

Statistical description (categorical):


count     1614
unique       2
top         No
freq      1355
Name: Attrition, dtype: object


Count of null values:


np.int64(0)


Count of unique values:


2


Unique values:


array(['No', 'Yes'], dtype=object)


Value Counts:


Attrition
No     1355
Yes     259
Name: count, dtype: int64


Most frequent value (mode):


'No'


Count of duplicates in the column:


np.int64(1612)

In [5]:
autoEda = AutoEDA()
df = autoEda.read_file("HR RAW DATA.csv")
autoEda.explo_df(df, 'BusinessTravel')#aqui cambiamos el nombre de la columna 


Exploration of the column: BusinessTravel

Statistical description (categorical):


count               842
unique                3
top       travel_rarely
freq                586
Name: BusinessTravel, dtype: object


Count of null values:


np.int64(772)


Count of unique values:


3


Unique values:


array([nan, 'travel_rarely', 'travel_frequently', 'non-travel'],
      dtype=object)


Value Counts:


BusinessTravel
travel_rarely        586
travel_frequently    165
non-travel            91
Name: count, dtype: int64


Most frequent value (mode):


'travel_rarely'


Count of duplicates in the column:


np.int64(1610)

In [6]:
autoEda = AutoEDA()
df = autoEda.read_file("HR RAW DATA.csv")
autoEda.explo_df(df, 'DailyRate')#aqui cambiamos el nombre de la columna 


Exploration of the column: DailyRate

Statistical description (categorical):


count     1614
unique     849
top       nan$
freq       124
Name: DailyRate, dtype: object


Count of null values:


np.int64(0)


Count of unique values:


849


Unique values:


array(['684,0$', '699,0$', '532,0$', '359,0$', '1319,0$', '117,0$',
       '1435,0$', '635,0$', '1276,0$', '840,0$', '247,0$', '1369,0$',
       '201,0$', '1360,0$', '692,0$', '1398,0$', '286,0$', '1402,0$',
       '819,0$', '884,0$', '1238,0$', '515,0$', '1223,0$', '202,0$',
       '928,0$', '607,0$', '266,0$', '429,0$', '589,0$', 'nan$',
       '1180,0$', '1282,0$', '776,0$', '665,0$', '526,0$', '1034,0$',
       '1403,0$', '1499,0$', '580,0$', '859,0$', '263,0$', '1376,0$',
       '885,0$', '1003,0$', '1321,0$', '394,0$', '1372,0$', '1333,0$',
       '228,0$', '737,0$', '823,0$', '667,0$', '301,0$', '573,0$',
       '1329,0$', '630,0$', '1063,0$', '1017,0$', '1296,0$', '939,0$',
       '1355,0$', '1448,0$', '200,0$', '1202,0$', '404,0$', '208,0$',
       '813,0$', '465,0$', '1189,0$', '1001,0$', '1394,0$', '161,0$',
       '288,0$', '682,0$', '1354,0$', '147,0$', '119,0$', '1413,0$',
       '452,0$', '334,0$', '1132,0$', '982,0$', '480,0$', '1099,0$',
       '672,0$', '1379,0$', '58


Value Counts:


DailyRate
nan$       124
691,0$       7
329,0$       7
147,0$       6
530,0$       6
          ... 
1294,0$      1
478,0$       1
305,0$       1
590,0$       1
211,0$       1
Name: count, Length: 849, dtype: int64


Most frequent value (mode):


'nan$'


Count of duplicates in the column:


np.int64(765)

In [7]:
autoEda = AutoEDA()
df = autoEda.read_file("HR RAW DATA.csv")
autoEda.explo_df(df, 'Department')#aqui cambiamos el nombre de la columna 


Exploration of the column: Department

Statistical description (categorical):


count                          302
unique                           3
top        Research & Development 
freq                           196
Name: Department, dtype: object


Count of null values:


np.int64(1312)


Count of unique values:


3


Unique values:


array([nan, ' Research & Development ', ' Sales ', ' Human Resources '],
      dtype=object)


Value Counts:


Department
Research & Development     196
Sales                       91
Human Resources             15
Name: count, dtype: int64


Most frequent value (mode):


' Research & Development '


Count of duplicates in the column:


np.int64(1610)

In [8]:
autoEda = AutoEDA()
df = autoEda.read_file("HR RAW DATA.csv")
autoEda.explo_df(df, 'DistanceFromHome')#aqui cambiamos el nombre de la columna 


Exploration of the column: DistanceFromHome

Statistical description (numeric):


count    1614.000000
mean        4.527261
std        14.591913
min       -49.000000
25%         2.000000
50%         5.000000
75%        11.000000
max        29.000000
Name: DistanceFromHome, dtype: float64


Count of null values:


np.int64(0)


Count of unique values:


69


Unique values:


array([  6,   1,   4,   2,   3,  22,  25,   9,   7,  23,  10,  12,  14,
       -13,  15,   8, -42,  28, -37,   5,  16, -35,  26, -26,  24,  29,
       -25,  17,  21, -18, -10, -30, -27,  20, -31, -29, -39,  18, -21,
       -15,  11,  13, -14,  19, -33, -34, -46, -36, -19,  27, -12, -23,
       -45, -28, -47, -32, -24, -16, -22, -41, -49, -11, -48, -38, -20,
       -17, -43, -40, -44])


Value Counts:


DistanceFromHome
 2     217
 1     203
 10     86
 9      85
 8      81
      ... 
-21      2
-43      2
-28      2
-39      1
-40      1
Name: count, Length: 69, dtype: int64


Most frequent value (mode):


np.int64(2)


Count of duplicates in the column:


np.int64(1545)

In [9]:
autoEda = AutoEDA()
df = autoEda.read_file("HR RAW DATA.csv")
autoEda.explo_df(df, 'Education')#aqui cambiamos el nombre de la columna 


Exploration of the column: Education

Statistical description (numeric):


count    1614.000000
mean        2.925031
std         1.022357
min         1.000000
25%         2.000000
50%         3.000000
75%         4.000000
max         5.000000
Name: Education, dtype: float64


Count of null values:


np.int64(0)


Count of unique values:


5


Unique values:


array([3, 4, 2, 1, 5])


Value Counts:


Education
3    621
4    445
2    314
1    180
5     54
Name: count, dtype: int64


Most frequent value (mode):


np.int64(3)


Count of duplicates in the column:


np.int64(1609)

In [10]:
autoEda = AutoEDA()
df = autoEda.read_file("HR RAW DATA.csv")
autoEda.explo_df(df, 'EducationField')#aqui cambiamos el nombre de la columna 


Exploration of the column: EducationField

Statistical description (categorical):


count               869
unique                6
top       Life Sciences
freq                349
Name: EducationField, dtype: object


Count of null values:


np.int64(745)


Count of unique values:


6


Unique values:


array([nan, 'Life Sciences', 'Technical Degree', 'Medical', 'Other',
       'Marketing', 'Human Resources'], dtype=object)


Value Counts:


EducationField
Life Sciences       349
Medical             276
Marketing           104
Technical Degree     69
Other                59
Human Resources      12
Name: count, dtype: int64


Most frequent value (mode):


'Life Sciences'


Count of duplicates in the column:


np.int64(1607)

In [11]:
autoEda = AutoEDA()
df = autoEda.read_file("HR RAW DATA.csv")
autoEda.explo_df(df, 'employeecount')#aqui cambiamos el nombre de la columna 


Exploration of the column: employeecount

Statistical description (numeric):


count    1614.0
mean        1.0
std         0.0
min         1.0
25%         1.0
50%         1.0
75%         1.0
max         1.0
Name: employeecount, dtype: float64


Count of null values:


np.int64(0)


Count of unique values:


1


Unique values:


array([1])


Value Counts:


employeecount
1    1614
Name: count, dtype: int64


Most frequent value (mode):


np.int64(1)


Count of duplicates in the column:


np.int64(1613)

In [12]:
autoEda = AutoEDA()
df = autoEda.read_file("HR RAW DATA.csv")
autoEda.explo_df(df, 'employeenumber')#aqui cambiamos el nombre de la columna 


Exploration of the column: employeenumber

Statistical description (categorical):


count      1183
unique     1079
top       495,0
freq          2
Name: employeenumber, dtype: object


Count of null values:


np.int64(431)


Count of unique values:


1079


Unique values:


array(['162,0', '259,0', '319,0', ..., '2012,0', '2023,0', '2040,0'],
      dtype=object)


Value Counts:


employeenumber
495,0    2
501,0    2
502,0    2
507,0    2
517,0    2
        ..
164,0    1
190,0    1
194,0    1
226,0    1
998,0    1
Name: count, Length: 1079, dtype: int64


Most frequent value (mode):


'1044,0'


Count of duplicates in the column:


np.int64(534)

In [13]:
autoEda = AutoEDA()
df = autoEda.read_file("HR RAW DATA.csv")
autoEda.explo_df(df, 'EnvironmentSatisfaction')#aqui cambiamos el nombre de la columna 


Exploration of the column: EnvironmentSatisfaction

Statistical description (numeric):


count    1614.000000
mean        4.294919
std         6.993559
min         1.000000
25%         2.000000
50%         3.000000
75%         4.000000
max        49.000000
Name: EnvironmentSatisfaction, dtype: float64


Count of null values:


np.int64(0)


Count of unique values:


38


Unique values:


array([ 1,  3,  4,  2, 42, 37, 35, 25, 27, 31, 39, 21, 15, 14, 33, 19, 12,
       13, 28, 47, 36, 29, 24, 46, 16, 22, 41, 49, 11, 48, 18, 10, 45, 38,
       17, 20, 26, 43])


Value Counts:


EnvironmentSatisfaction
4     460
3     459
1     298
2     297
12      7
35      6
13      6
24      5
47      5
14      5
41      4
42      4
46      4
36      4
48      4
20      3
22      3
11      3
18      3
45      3
27      3
25      3
17      3
37      2
38      2
16      2
31      2
15      2
19      2
29      2
28      1
33      1
21      1
39      1
49      1
10      1
26      1
43      1
Name: count, dtype: int64


Most frequent value (mode):


np.int64(4)


Count of duplicates in the column:


np.int64(1576)

In [14]:
autoEda = AutoEDA()
df = autoEda.read_file("HR RAW DATA.csv")
autoEda.explo_df(df, 'Gender')#aqui cambiamos el nombre de la columna 


Exploration of the column: Gender

Statistical description (numeric):


count    1614.000000
mean        0.398389
std         0.489718
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.000000
Name: Gender, dtype: float64


Count of null values:


np.int64(0)


Count of unique values:


2


Unique values:


array([0, 1])


Value Counts:


Gender
0    971
1    643
Name: count, dtype: int64


Most frequent value (mode):


np.int64(0)


Count of duplicates in the column:


np.int64(1612)

In [15]:
autoEda = AutoEDA()
df = autoEda.read_file("HR RAW DATA.csv")
autoEda.explo_df(df, 'HourlyRate')#aqui cambiamos el nombre de la columna 


Exploration of the column: HourlyRate

Statistical description (categorical):


count              1614
unique               72
top       Not Available
freq                 84
Name: HourlyRate, dtype: object


Count of null values:


np.int64(0)


Count of unique values:


72


Unique values:


array(['51', '65', '58', '82', '45', '99', '91', '64', '55', '68', '49',
       '61', '79', '31', '69', '48', '80', '74', '98', '59', '33', '56',
       '66', '57', '53', '87', '81', '84', '32', '41', '92', '47',
       'Not Available', '43', '86', '30', '42', '88', '96', '67', '62',
       '72', '78', '89', '52', '50', '90', '37', '94', '76', '60', '46',
       '83', '100', '40', '97', '54', '75', '39', '85', '63', '44', '93',
       '36', '35', '73', '71', '70', '38', '77', '95', '34'], dtype=object)


Value Counts:


HourlyRate
Not Available    84
42               33
66               32
48               30
84               29
                 ..
50               15
53               13
68               13
38               12
34               11
Name: count, Length: 72, dtype: int64


Most frequent value (mode):


'Not Available'


Count of duplicates in the column:


np.int64(1542)

In [16]:
autoEda = AutoEDA()
df = autoEda.read_file("HR RAW DATA.csv")
autoEda.explo_df(df, 'JobInvolvement')#aqui cambiamos el nombre de la columna 


Exploration of the column: JobInvolvement

Statistical description (numeric):


count    1614.000000
mean        2.739777
std         0.711567
min         1.000000
25%         2.000000
50%         3.000000
75%         3.000000
max         4.000000
Name: JobInvolvement, dtype: float64


Count of null values:


np.int64(0)


Count of unique values:


4


Unique values:


array([3, 2, 4, 1])


Value Counts:


JobInvolvement
3    955
2    406
4    164
1     89
Name: count, dtype: int64


Most frequent value (mode):


np.int64(3)


Count of duplicates in the column:


np.int64(1610)

In [18]:
autoEda = AutoEDA()
df = autoEda.read_file("HR RAW DATA.csv")
autoEda.explo_df(df, "JobLevel")



Exploration of the column: JobLevel

Statistical description (numeric):


count    1614.000000
mean        2.068154
std         1.101344
min         1.000000
25%         1.000000
50%         2.000000
75%         3.000000
max         5.000000
Name: JobLevel, dtype: float64


Count of null values:


np.int64(0)


Count of unique values:


5


Unique values:


array([5, 4, 3, 2, 1])


Value Counts:


JobLevel
2    597
1    586
3    242
4    113
5     76
Name: count, dtype: int64


Most frequent value (mode):


np.int64(2)


Count of duplicates in the column:


np.int64(1609)

In [19]:
autoEda = AutoEDA()
df = autoEda.read_file("HR RAW DATA.csv")
autoEda.explo_df(df, "JobRole")


Exploration of the column: JobRole

Statistical description (categorical):


count          1614
unique         1579
top        mANager 
freq              4
Name: JobRole, dtype: object


Count of null values:


np.int64(0)


Count of unique values:


1579


Unique values:


array([' resEArch DIREcToR ', ' ManAGeR ', ' ManaGER ', ...,
       ' sAlES ExECUTivE ', ' SaLes ExecUtIVe ',
       ' mAnUfactURInG DiRECTOr '], dtype=object)


Value Counts:


JobRole
mANager                   4
MANAgER                   3
ManageR                   3
mAnaGeR                   3
ManagEr                   3
                         ..
LAbOrATOry techNicIan     1
sAlEs Executive           1
rEsEARch sCientIST        1
ResEaRcH scIEnTIsT        1
SalEs exECUTIvE           1
Name: count, Length: 1579, dtype: int64


Most frequent value (mode):


' mANager '


Count of duplicates in the column:


np.int64(35)

In [20]:
autoEda = AutoEDA()
df = autoEda.read_file("HR RAW DATA.csv")
autoEda.explo_df(df, "JobSatisfaction")


Exploration of the column: JobSatisfaction

Statistical description (numeric):


count    1614.000000
mean        2.738538
std         1.106163
min         1.000000
25%         2.000000
50%         3.000000
75%         4.000000
max         4.000000
Name: JobSatisfaction, dtype: float64


Count of null values:


np.int64(0)


Count of unique values:


4


Unique values:


array([3, 4, 1, 2])


Value Counts:


JobSatisfaction
4    514
3    481
1    317
2    302
Name: count, dtype: int64


Most frequent value (mode):


np.int64(4)


Count of duplicates in the column:


np.int64(1610)

In [21]:
autoEda = AutoEDA()
df = autoEda.read_file("HR RAW DATA.csv")
autoEda.explo_df(df, "MaritalStatus")


Exploration of the column: MaritalStatus

Statistical description (categorical):


count         963
unique          5
top       Married
freq          404
Name: MaritalStatus, dtype: object


Count of null values:


np.int64(651)


Count of unique values:


5


Unique values:


array([nan, 'Married', 'Divorced', 'Single', 'divorced', 'Marreid'],
      dtype=object)


Value Counts:


MaritalStatus
Married     404
Single      325
Divorced    188
Marreid      35
divorced     11
Name: count, dtype: int64


Most frequent value (mode):


'Married'


Count of duplicates in the column:


np.int64(1608)

In [22]:
autoEda = AutoEDA()
df = autoEda.read_file("HR RAW DATA.csv")
autoEda.explo_df(df, "MonthlyIncome")


Exploration of the column: MonthlyIncome

Statistical description (categorical):


count        771
unique       668
top       6347,0
freq           4
Name: MonthlyIncome, dtype: object


Count of null values:


np.int64(843)


Count of unique values:


668


Unique values:


array(['19537,0', '19999,0', '19232,0', '17169,0', nan, '17174,0',
       '16595,0', '19973,0', '13402,0', '13206,0', '19545,0', '18041,0',
       '19246,0', '10748,0', '16752,0', '6201,0', '19845,0', '4001,0',
       '10447,0', '16064,0', '3210,0', '10266,0', '10475,0', '6162,0',
       '4721,0', '4615,0', '16959,0', '10306,0', '5406,0', '5902,0',
       '10855,0', '5914,0', '6646,0', '13973,0', '13320,0', '6687,0',
       '4735,0', '13872,0', '19045,0', '16015,0', '9613,0', '11510,0',
       '4306,0', '17046,0', '5067,0', '3692,0', '19847,0', '2308,0',
       '5747,0', '10422,0', '6347,0', '2348,0', '3072,0', '12490,0',
       '8020,0', '17068,0', '8943,0', '19272,0', '5577,0', '2691,0',
       '7403,0', '8823,0', '3579,0', '17779,0', '18213,0', '13577,0',
       '19190,0', '17123,0', '19187,0', '10008,0', '7988,0', '7083,0',
       '4723,0', '3407,0', '2929,0', '12031,0', '15427,0', '5126,0',
       '9619,0', '5010,0', '19033,0', '10400,0', '2793,0', '5674,0',
       '19197,0', '841


Value Counts:


MonthlyIncome
6347,0     4
5304,0     4
2657,0     3
2258,0     3
2380,0     2
          ..
19845,0    1
6201,0     1
16752,0    1
10748,0    1
19246,0    1
Name: count, Length: 668, dtype: int64


Most frequent value (mode):


'5304,0'


Count of duplicates in the column:


np.int64(945)

In [23]:
autoEda = AutoEDA()
df = autoEda.read_file("HR RAW DATA.csv")
autoEda.explo_df(df, "MonthlyRate")


Exploration of the column: MonthlyRate

Statistical description (numeric):


count     1614.000000
mean     14284.495663
std       7110.414585
min       2094.000000
25%       8001.000000
50%      14248.500000
75%      20364.000000
max      26999.000000
Name: MonthlyRate, dtype: float64


Count of null values:


np.int64(0)


Count of unique values:


1427


Unique values:


array([ 6462,  5678,  4933, ..., 15302, 26956, 16642])


Value Counts:


MonthlyRate
9150     4
6069     3
4156     3
11737    3
25326    3
        ..
17089    1
19982    1
12449    1
5829     1
24164    1
Name: count, Length: 1427, dtype: int64


Most frequent value (mode):


np.int64(9150)


Count of duplicates in the column:


np.int64(187)

In [24]:
autoEda = AutoEDA()
df = autoEda.read_file("HR RAW DATA.csv")
autoEda.explo_df(df, "NUMCOMPANIESWORKED")


Exploration of the column: NUMCOMPANIESWORKED

Statistical description (numeric):


count    1614.000000
mean        2.673482
std         2.506152
min         0.000000
25%         1.000000
50%         2.000000
75%         4.000000
max         9.000000
Name: NUMCOMPANIESWORKED, dtype: float64


Count of null values:


np.int64(0)


Count of unique values:


10


Unique values:


array([7, 0, 1, 3, 2, 4, 8, 9, 5, 6])


Value Counts:


NUMCOMPANIESWORKED
1    573
0    226
3    169
4    157
2    156
7     84
6     73
5     66
9     59
8     51
Name: count, dtype: int64


Most frequent value (mode):


np.int64(1)


Count of duplicates in the column:


np.int64(1604)

In [25]:
autoEda = AutoEDA()
df = autoEda.read_file("HR RAW DATA.csv")
autoEda.explo_df(df, "Over18")



Exploration of the column: Over18

Statistical description (categorical):


count     713
unique      1
top         Y
freq      713
Name: Over18, dtype: object


Count of null values:


np.int64(901)


Count of unique values:


1


Unique values:


array(['Y', nan], dtype=object)


Value Counts:


Over18
Y    713
Name: count, dtype: int64


Most frequent value (mode):


'Y'


Count of duplicates in the column:


np.int64(1612)

In [26]:
autoEda = AutoEDA()
df = autoEda.read_file("HR RAW DATA.csv")
autoEda.explo_df(df, "OverTime")



Exploration of the column: OverTime

Statistical description (categorical):


count     938
unique      2
top        No
freq      682
Name: OverTime, dtype: object


Count of null values:


np.int64(676)


Count of unique values:


2


Unique values:


array(['No', nan, 'Yes'], dtype=object)


Value Counts:


OverTime
No     682
Yes    256
Name: count, dtype: int64


Most frequent value (mode):


'No'


Count of duplicates in the column:


np.int64(1611)

In [27]:
autoEda = AutoEDA()
df = autoEda.read_file("HR RAW DATA.csv")
autoEda.explo_df(df, "PercentSalaryHike")


Exploration of the column: PercentSalaryHike

Statistical description (numeric):


count    1614.000000
mean       15.165428
std         3.648610
min        11.000000
25%        12.000000
50%        14.000000
75%        18.000000
max        25.000000
Name: PercentSalaryHike, dtype: float64


Count of null values:


np.int64(0)


Count of unique values:


15


Unique values:


array([13, 14, 11, 19, 12, 25, 16, 17, 22, 23, 20, 15, 21, 24, 18])


Value Counts:


PercentSalaryHike
11    232
13    230
12    225
14    220
15    110
18     98
17     88
16     86
19     82
20     60
22     59
21     51
23     29
24     25
25     19
Name: count, dtype: int64


Most frequent value (mode):


np.int64(11)


Count of duplicates in the column:


np.int64(1599)

In [28]:
autoEda = AutoEDA()
df = autoEda.read_file("HR RAW DATA.csv")
autoEda.explo_df(df, "PerformanceRating")


Exploration of the column: PerformanceRating

Statistical description (categorical):


count     1419
unique       2
top        3,0
freq      1205
Name: PerformanceRating, dtype: object


Count of null values:


np.int64(195)


Count of unique values:


2


Unique values:


array(['3,0', '4,0', nan], dtype=object)


Value Counts:


PerformanceRating
3,0    1205
4,0     214
Name: count, dtype: int64


Most frequent value (mode):


'3,0'


Count of duplicates in the column:


np.int64(1611)

In [29]:
autoEda = AutoEDA()
df = autoEda.read_file("HR RAW DATA.csv")
autoEda.explo_df(df, "RelationshipSatisfaction")


Exploration of the column: RelationshipSatisfaction

Statistical description (numeric):


count    1614.000000
mean        2.704461
std         1.079031
min         1.000000
25%         2.000000
50%         3.000000
75%         4.000000
max         4.000000
Name: RelationshipSatisfaction, dtype: float64


Count of null values:


np.int64(0)


Count of unique values:


4


Unique values:


array([3, 1, 4, 2])


Value Counts:


RelationshipSatisfaction
3    504
4    468
2    339
1    303
Name: count, dtype: int64


Most frequent value (mode):


np.int64(3)


Count of duplicates in the column:


np.int64(1610)

In [30]:
autoEda = AutoEDA()
df = autoEda.read_file("HR RAW DATA.csv")
autoEda.explo_df(df, "StandardHours")


Exploration of the column: StandardHours

Statistical description (categorical):


count      419
unique       1
top       80,0
freq       419
Name: StandardHours, dtype: object


Count of null values:


np.int64(1195)


Count of unique values:


1


Unique values:


array([nan, '80,0'], dtype=object)


Value Counts:


StandardHours
80,0    419
Name: count, dtype: int64


Most frequent value (mode):


'80,0'


Count of duplicates in the column:


np.int64(1612)

In [31]:
autoEda = AutoEDA()
df = autoEda.read_file("HR RAW DATA.csv")
autoEda.explo_df(df, "StockOptionLevel")


Exploration of the column: StockOptionLevel

Statistical description (numeric):


count    1614.000000
mean        0.791202
std         0.842396
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         3.000000
Name: StockOptionLevel, dtype: float64


Count of null values:


np.int64(0)


Count of unique values:


4


Unique values:


array([0, 1, 2, 3])


Value Counts:


StockOptionLevel
0    687
1    666
2    172
3     89
Name: count, dtype: int64


Most frequent value (mode):


np.int64(0)


Count of duplicates in the column:


np.int64(1610)

In [32]:
autoEda = AutoEDA()
df = autoEda.read_file("HR RAW DATA.csv")
autoEda.explo_df(df, "TOTALWORKINGYEARS")


Exploration of the column: TOTALWORKINGYEARS

Statistical description (categorical):


count     1088
unique      40
top       10,0
freq       144
Name: TOTALWORKINGYEARS, dtype: object


Count of null values:


np.int64(526)


Count of unique values:


40


Unique values:


array([nan, '34,0', '22,0', '28,0', '20,0', '21,0', '33,0', '40,0',
       '18,0', '25,0', '15,0', '17,0', '26,0', '16,0', '24,0', '14,0',
       '23,0', '27,0', '19,0', '11,0', '38,0', '37,0', '13,0', '12,0',
       '29,0', '10,0', '36,0', '35,0', '9,0', '31,0', '32,0', '8,0',
       '7,0', '30,0', '6,0', '5,0', '4,0', '3,0', '2,0', '1,0', '0,0'],
      dtype=object)


Value Counts:


TOTALWORKINGYEARS
10,0    144
8,0      86
6,0      84
9,0      69
5,0      66
7,0      56
4,0      54
1,0      53
12,0     34
3,0      32
13,0     30
14,0     30
11,0     29
16,0     28
15,0     28
20,0     28
18,0     27
21,0     23
17,0     22
2,0      21
22,0     18
19,0     17
24,0     14
28,0     13
23,0     13
0,0       8
26,0      8
33,0      6
36,0      6
29,0      6
25,0      6
27,0      5
37,0      5
31,0      4
32,0      3
40,0      3
30,0      3
35,0      3
34,0      2
38,0      1
Name: count, dtype: int64


Most frequent value (mode):


'10,0'


Count of duplicates in the column:


np.int64(1573)

In [33]:
autoEda = AutoEDA()
df = autoEda.read_file("HR RAW DATA.csv")
autoEda.explo_df(df, "TrainingTimesLastYear")


Exploration of the column: TrainingTimesLastYear

Statistical description (numeric):


count    1614.000000
mean        2.809789
std         1.297765
min         0.000000
25%         2.000000
50%         3.000000
75%         3.000000
max         6.000000
Name: TrainingTimesLastYear, dtype: float64


Count of null values:


np.int64(0)


Count of unique values:


7


Unique values:


array([5, 3, 2, 0, 1, 4, 6])


Value Counts:


TrainingTimesLastYear
2    598
3    534
4    137
5    136
1     77
6     72
0     60
Name: count, dtype: int64


Most frequent value (mode):


np.int64(2)


Count of duplicates in the column:


np.int64(1607)

In [34]:
autoEda = AutoEDA()
df = autoEda.read_file("HR RAW DATA.csv")
autoEda.explo_df(df, "WORKLIFEBALANCE")


Exploration of the column: WORKLIFEBALANCE

Statistical description (categorical):


count     1506
unique       4
top        3,0
freq       913
Name: WORKLIFEBALANCE, dtype: object


Count of null values:


np.int64(108)


Count of unique values:


4


Unique values:


array(['3,0', nan, '2,0', '4,0', '1,0'], dtype=object)


Value Counts:


WORKLIFEBALANCE
3,0    913
2,0    359
4,0    155
1,0     79
Name: count, dtype: int64


Most frequent value (mode):


'3,0'


Count of duplicates in the column:


np.int64(1609)

In [35]:
autoEda = AutoEDA()
df = autoEda.read_file("HR RAW DATA.csv")
autoEda.explo_df(df, "YearsAtCompany")


Exploration of the column: YearsAtCompany

Statistical description (numeric):


count    1614.000000
mean        7.132590
std         6.124237
min         0.000000
25%         3.000000
50%         5.000000
75%         9.000000
max        40.000000
Name: YearsAtCompany, dtype: float64


Count of null values:


np.int64(0)


Count of unique values:


37


Unique values:


array([20, 33, 22, 19, 21, 18, 24, 31, 26, 16, 23, 15, 17, 32, 14, 13, 25,
       12, 11, 37, 40, 36, 27, 29, 10,  9, 30,  8,  7, 34,  6,  5,  4,  2,
        3,  1,  0])


Value Counts:


YearsAtCompany
5     208
1     171
2     141
3     141
10    133
7     115
4     114
8     106
9      94
6      78
0      44
11     36
20     29
13     26
15     21
14     19
22     17
18     15
12     15
21     14
16     14
19     12
17      9
24      7
33      5
25      5
26      4
31      3
32      3
27      3
36      3
23      2
40      2
29      2
37      1
30      1
34      1
Name: count, dtype: int64


Most frequent value (mode):


np.int64(5)


Count of duplicates in the column:


np.int64(1577)

In [36]:
autoEda = AutoEDA()
df = autoEda.read_file("HR RAW DATA.csv")
autoEda.explo_df(df, "YearsInCurrentRole")


Exploration of the column: YearsInCurrentRole

Statistical description (categorical):


count      34
unique     10
top       2,0
freq       11
Name: YearsInCurrentRole, dtype: object


Count of null values:


np.int64(1580)


Count of unique values:


10


Unique values:


array([nan, '13,0', '12,0', '11,0', '7,0', '6,0', '4,0', '3,0', '2,0',
       '1,0', '0,0'], dtype=object)


Value Counts:


YearsInCurrentRole
2,0     11
7,0      5
0,0      4
4,0      3
1,0      3
11,0     2
6,0      2
3,0      2
12,0     1
13,0     1
Name: count, dtype: int64


Most frequent value (mode):


'2,0'


Count of duplicates in the column:


np.int64(1603)

In [37]:
autoEda = AutoEDA()
df = autoEda.read_file("HR RAW DATA.csv")
autoEda.explo_df(df, "YearsSinceLastPromotion")


Exploration of the column: YearsSinceLastPromotion

Statistical description (numeric):


count    1614.000000
mean        2.245973
std         3.235665
min         0.000000
25%         0.000000
50%         1.000000
75%         3.000000
max        15.000000
Name: YearsSinceLastPromotion, dtype: float64


Count of null values:


np.int64(0)


Count of unique values:


16


Unique values:


array([15, 11,  5,  2,  4,  7,  0,  1, 13, 14,  8, 12,  3,  6, 10,  9])


Value Counts:


YearsSinceLastPromotion
0     625
1     384
2     177
7      93
4      67
3      62
5      53
6      37
11     26
8      20
9      18
15     15
12     11
14     10
13     10
10      6
Name: count, dtype: int64


Most frequent value (mode):


np.int64(0)


Count of duplicates in the column:


np.int64(1598)

In [38]:
autoEda = AutoEDA()
df = autoEda.read_file("HR RAW DATA.csv")
autoEda.explo_df(df, "YEARSWITHCURRMANAGER")


Exploration of the column: YEARSWITHCURRMANAGER

Statistical description (numeric):


count    1614.000000
mean        4.220570
std         3.562695
min         0.000000
25%         2.000000
50%         3.000000
75%         7.000000
max        17.000000
Name: YEARSWITHCURRMANAGER, dtype: float64


Count of null values:


np.int64(0)


Count of unique values:


18


Unique values:


array([15,  9,  6,  8,  7, 11, 10, 12,  4,  0,  5, 17,  2, 14,  1, 13,  3,
       16])


Value Counts:


YEARSWITHCURRMANAGER
2     380
0     270
7     267
3     148
8     115
4     104
1      84
9      70
5      36
10     31
6      30
11     22
12     20
13     16
17      8
15      5
14      5
16      3
Name: count, dtype: int64


Most frequent value (mode):


np.int64(2)


Count of duplicates in the column:


np.int64(1596)

In [39]:
autoEda = AutoEDA()
df = autoEda.read_file("HR RAW DATA.csv")
autoEda.explo_df(df, "SameAsMonthlyIncome")


Exploration of the column: SameAsMonthlyIncome

Statistical description (categorical):


count        771
unique       668
top       6347,0
freq           4
Name: SameAsMonthlyIncome, dtype: object


Count of null values:


np.int64(843)


Count of unique values:


668


Unique values:


array(['19537,0', '19999,0', '19232,0', '17169,0', nan, '17174,0',
       '16595,0', '19973,0', '13402,0', '13206,0', '19545,0', '18041,0',
       '19246,0', '10748,0', '16752,0', '6201,0', '19845,0', '4001,0',
       '10447,0', '16064,0', '3210,0', '10266,0', '10475,0', '6162,0',
       '4721,0', '4615,0', '16959,0', '10306,0', '5406,0', '5902,0',
       '10855,0', '5914,0', '6646,0', '13973,0', '13320,0', '6687,0',
       '4735,0', '13872,0', '19045,0', '16015,0', '9613,0', '11510,0',
       '4306,0', '17046,0', '5067,0', '3692,0', '19847,0', '2308,0',
       '5747,0', '10422,0', '6347,0', '2348,0', '3072,0', '12490,0',
       '8020,0', '17068,0', '8943,0', '19272,0', '5577,0', '2691,0',
       '7403,0', '8823,0', '3579,0', '17779,0', '18213,0', '13577,0',
       '19190,0', '17123,0', '19187,0', '10008,0', '7988,0', '7083,0',
       '4723,0', '3407,0', '2929,0', '12031,0', '15427,0', '5126,0',
       '9619,0', '5010,0', '19033,0', '10400,0', '2793,0', '5674,0',
       '19197,0', '841


Value Counts:


SameAsMonthlyIncome
6347,0     4
5304,0     4
2657,0     3
2258,0     3
2380,0     2
          ..
19845,0    1
6201,0     1
16752,0    1
10748,0    1
19246,0    1
Name: count, Length: 668, dtype: int64


Most frequent value (mode):


'5304,0'


Count of duplicates in the column:


np.int64(945)

In [40]:
autoEda = AutoEDA()
df = autoEda.read_file("HR RAW DATA.csv")
autoEda.explo_df(df, "DateBirth")


Exploration of the column: DateBirth

Statistical description (numeric):


count    1614.000000
mean     1986.076208
std         9.101332
min      1963.000000
25%      1980.000000
50%      1987.000000
75%      1993.000000
max      2005.000000
Name: DateBirth, dtype: float64


Count of null values:


np.int64(0)


Count of unique values:


43


Unique values:


array([1972, 1971, 1981, 1976, 1977, 1975, 1964, 1982, 1967, 1985, 1968,
       1983, 1965, 1988, 1978, 1990, 1987, 1989, 1970, 1980, 1963, 1991,
       1986, 1974, 1984, 1973, 1979, 1993, 1994, 1992, 1969, 1966, 1996,
       1995, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005])


Value Counts:


DateBirth
1988    84
1989    83
1992    83
1994    78
1987    75
1991    65
1993    64
1985    64
1990    61
1983    60
1986    55
1996    54
1995    53
1981    50
1978    48
1982    46
1997    46
1984    45
1980    40
1977    38
1979    34
1973    31
1999    27
1976    27
1998    27
1974    25
1968    25
1972    20
1969    20
1970    20
1975    20
1971    19
1967    17
2002    16
2001    16
2000    15
1965    15
2003    11
1964    10
2004     9
2005     8
1963     5
1966     5
Name: count, dtype: int64


Most frequent value (mode):


np.int64(1988)


Count of duplicates in the column:


np.int64(1571)

In [41]:
autoEda = AutoEDA()
df = autoEda.read_file("HR RAW DATA.csv")
autoEda.explo_df(df, "Salary")


Exploration of the column: Salary

Statistical description (categorical):


count            1614
unique              1
top       1000000000$
freq             1614
Name: Salary, dtype: object


Count of null values:


np.int64(0)


Count of unique values:


1


Unique values:


array(['1000000000$'], dtype=object)


Value Counts:


Salary
1000000000$    1614
Name: count, dtype: int64


Most frequent value (mode):


'1000000000$'


Count of duplicates in the column:


np.int64(1613)

In [42]:
autoEda = AutoEDA()
df = autoEda.read_file("HR RAW DATA.csv")
autoEda.explo_df(df, "RoleDepartament")


Exploration of the column: RoleDepartament

Statistical description (categorical):


count                     302
unique                    301
top        MaNAgeR  -  Sales 
freq                        2
Name: RoleDepartament, dtype: object


Count of null values:


np.int64(1312)


Count of unique values:


301


Unique values:


array([nan, ' ManaGER  -  Research & Development ',
       ' MANAger  -  Research & Development ',
       ' heaLtHcArE repResENtATiVe  -  Research & Development ',
       ' saLEs exEcutIVe  -  Sales ', ' Sales ExECuTIVe  -  Sales ',
       ' heALthCArE RePResEntaTIVe  -  Research & Development ',
       ' lABORAtOry tECHnIcIan  -  Research & Development ',
       ' ManufactUrINg DiReCTOr  -  Research & Development ',
       ' SaleS eXEcUtIVE  -  Sales ',
       ' HEaLthcaRe rEprEsEnTAtiVe  -  Research & Development ',
       ' rESeArCH sCientIST  -  Research & Development ',
       ' healTHCAre rePREseNtATiVE  -  Research & Development ',
       ' mANaGer  -  Research & Development ',
       ' HEAltHCaRe REPreseNTatIve  -  Research & Development ',
       ' SaLeS eXECUTivE  -  Sales ',
       ' reSeARCH SCIEnTiST  -  Research & Development ',
       ' ReSearch DiRecTOR  -  Research & Development ',
       ' resEaRCh SciEnTist  -  Research & Development ',
       ' hUmaN REsoUrCes  -  H


Value Counts:


RoleDepartament
MaNAgeR  -  Sales                                        2
hEalthCaRe reprEseNTaTiVe  -  Research & Development     1
SAlES exeCutive  -  Sales                                1
labORAtoRy tEcHNICIAN  -  Research & Development         1
rEsEaRCh diReCtOR  -  Research & Development             1
                                                        ..
HUMAN ResoURCeS  -  Human Resources                      1
HeAlThCarE rEpreSentatiVe  -  Research & Development     1
MaNageR  -  Research & Development                       1
mAnAgEr  -  Research & Development                       1
Sales exECutIVE  -  Sales                                1
Name: count, Length: 301, dtype: int64


Most frequent value (mode):


' MaNAgeR  -  Sales '


Count of duplicates in the column:


np.int64(1312)

In [43]:
autoEda = AutoEDA()
df = autoEda.read_file("HR RAW DATA.csv")
autoEda.explo_df(df, "NUMBERCHILDREN")


Exploration of the column: NUMBERCHILDREN

Statistical description (numeric):


count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: NUMBERCHILDREN, dtype: float64


Count of null values:


np.int64(1614)


Count of unique values:


0


Unique values:


array([nan])


Value Counts:


Series([], Name: count, dtype: int64)


Most frequent value (mode):


IndexError: single positional indexer is out-of-bounds

In [44]:
autoEda = AutoEDA()
df = autoEda.read_file("HR RAW DATA.csv")
autoEda.explo_df(df, "RemoteWork")


Exploration of the column: RemoteWork

Statistical description (categorical):


count     1614
unique       5
top          1
freq       360
Name: RemoteWork, dtype: object


Count of null values:


np.int64(0)


Count of unique values:


5


Unique values:


array(['Yes', '1', 'False', '0', 'True'], dtype=object)


Value Counts:


RemoteWork
1        360
True     345
0        309
False    305
Yes      295
Name: count, dtype: int64


Most frequent value (mode):


'1'


Count of duplicates in the column:


np.int64(1609)

In [45]:
autoEda = AutoEDA()
df = autoEda.read_file("hr_raw_data_limpio_.csv")
autoEda.explo_df(df, "DailyRate")


Exploration of the column: DailyRate

Statistical description (numeric):


count    1490.000000
mean      802.085235
std       403.246954
min       103.000000
25%       468.250000
50%       798.000000
75%      1157.000000
max      1499.000000
Name: DailyRate, dtype: float64


Count of null values:


np.int64(124)


Count of unique values:


848


Unique values:


array([ 684.,  699.,  532.,  359., 1319.,  117., 1435.,  635., 1276.,
        840.,  247., 1369.,  201., 1360.,  692., 1398.,  286., 1402.,
        819.,  884., 1238.,  515., 1223.,  202.,  928.,  607.,  266.,
        429.,  589.,   nan, 1180., 1282.,  776.,  665.,  526., 1034.,
       1403., 1499.,  580.,  859.,  263., 1376.,  885., 1003., 1321.,
        394., 1372., 1333.,  228.,  737.,  823.,  667.,  301.,  573.,
       1329.,  630., 1063., 1017., 1296.,  939., 1355., 1448.,  200.,
       1202.,  404.,  208.,  813.,  465., 1189., 1001., 1394.,  161.,
        288.,  682., 1354.,  147.,  119., 1413.,  452.,  334., 1132.,
        982.,  480., 1099.,  672., 1379.,  583., 1492., 1050.,  469.,
        237., 1440., 1291., 1157., 1336., 1224.,  735., 1389.,  638.,
       1240.,  194., 1339.,  111., 1469.,  470., 1232., 1249.,  798.,
        549.,  570.,  541.,  164., 1117.,  619.,  319.,  956., 1245.,
       1397.,  527.,  213.,  882.,  330.,  406.,  217.,  481.,  669.,
       1465.,  685.,


Value Counts:


DailyRate
329.0     7
691.0     7
147.0     6
530.0     6
408.0     6
         ..
181.0     1
176.0     1
964.0     1
1360.0    1
1369.0    1
Name: count, Length: 848, dtype: int64


Most frequent value (mode):


np.float64(329.0)


Count of duplicates in the column:


np.int64(765)

In [46]:
autoEda = AutoEDA()
df = autoEda.read_file("hr_raw_data_limpio_.csv")
autoEda.explo_df(df, "HourlyRate")


Exploration of the column: HourlyRate

Statistical description (numeric):


count    1530.000000
mean       66.028105
std        20.185954
min        30.000000
25%        48.000000
50%        66.000000
75%        84.000000
max       100.000000
Name: HourlyRate, dtype: float64


Count of null values:


np.int64(84)


Count of unique values:


71


Unique values:


array([ 51.,  65.,  58.,  82.,  45.,  99.,  91.,  64.,  55.,  68.,  49.,
        61.,  79.,  31.,  69.,  48.,  80.,  74.,  98.,  59.,  33.,  56.,
        66.,  57.,  53.,  87.,  81.,  84.,  32.,  41.,  92.,  47.,  nan,
        43.,  86.,  30.,  42.,  88.,  96.,  67.,  62.,  72.,  78.,  89.,
        52.,  50.,  90.,  37.,  94.,  76.,  60.,  46.,  83., 100.,  40.,
        97.,  54.,  75.,  39.,  85.,  63.,  44.,  93.,  36.,  35.,  73.,
        71.,  70.,  38.,  77.,  95.,  34.])


Value Counts:


HourlyRate
42.0    33
66.0    32
48.0    30
57.0    29
84.0    29
        ..
50.0    15
53.0    13
68.0    13
38.0    12
34.0    11
Name: count, Length: 71, dtype: int64


Most frequent value (mode):


np.float64(42.0)


Count of duplicates in the column:


np.int64(1542)