# Data Exploration
We'll visualise the data to explore the relationships between the variables.
## Setup

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Project root: every relative path used below ("data/processed/...", "plots/...")
# is resolved against this directory.
# TODO: replace os.getcwd() with the path to your project root if the notebook
# is not started from there.
default_path = os.getcwd()
os.chdir(default_path)

Read the dataframes that we saved in the `data/processed` folder.

In [None]:
# Load the two processed tables (paths are relative to the working directory).
df = pd.read_csv("data/processed/single_df_filled.csv")
man = pd.read_csv('data/processed/manAgers_man.csv')

# Print the column names, dtypes and non-null counts of the main table.
df.info()

## Frequency table

In [None]:
# Columns treated as categorical in the exploration below.
cat_cols = [
    'host_count', 'network_count', 'Band', 'is_manager', 'Gender', 'promotion',
    'expat', 'org_level', 'network', 'is_norsk', 'in_norge',
]
# Columns treated as numerical in the exploration below.
num_cols = [
    'Age', 'service_length', 'absence_avg', 'attend_avg', 'travel_avg',
    'certif', 'team_size',
]

# Print how often each level occurs, one categorical column at a time.
for col in cat_cols:
    print(f'\nFor column {col}')
    level_counts = df[col].value_counts()
    print(level_counts)


# Frequency graphs
## Categorical variables

In [None]:
# Draw, save and display one bar chart of level counts per categorical column.
for col in cat_cols:
    fig = plt.figure(figsize=(6,6)) # define plot area
    ax = fig.gca() # define axis
    counts = df[col].value_counts().sort_index() # counts per level, ordered by level
    # rot=90 draws the level names vertically so that long names do not overlap.
    counts.plot.bar(ax = ax, color = 'blue', rot = 90)
    ax.set_title('Number of employees by ' + col) # Give the plot a main title
    ax.set_xlabel(col) # Set text for the x axis
    ax.set_ylabel('Number of employees') # Set text for y axis
    # Save before showing so the file always contains the finished chart;
    # bbox_inches='tight' keeps the rotated tick labels inside the saved image.
    fig.savefig(('plots/'+'freq_'+col), bbox_inches = "tight")
    plt.show()

## Histograms for numerical variables

In [None]:
# Display a 10-bin histogram for each numerical column.
for col in num_cols:
    fig, ax = plt.subplots(figsize=(6, 6))
    df[col].plot.hist(ax=ax, bins=10)
    ax.set_title('Histogram of ' + col)
    ax.set_xlabel(col)
    ax.set_ylabel('Number of employees')
    plt.show()

# Exploration related to the Leavers
## Mean differences and frequency 

In [None]:
# Mean of every numeric column for each value of Leavers.
# numeric_only=True leaves out any non-numeric columns, which pandas >= 2.0
# refuses to average. Both results are printed explicitly because a notebook
# cell only displays the value of its last expression.
print(df.groupby('Leavers').mean(numeric_only=True))

# Number of rows for each value of Leavers.
print(df['Leavers'].value_counts())

## Leaver percentage per category level

In [None]:
# For each categorical column, chart the mean of Leavers within every level,
# save the chart under plots/ and display it.
for col in cat_cols:
    means = df.groupby(col)['Leavers'].mean()
    # Open the figure BEFORE plotting: each chart gets its own canvas and no
    # empty figure is left over to be shown afterwards.
    plt.figure()
    means.plot.bar(color = 'green')
    plt.xlabel(col)
    plt.ylabel('Leaver PercentAge')
    # Save while the chart is still the current figure, then display it.
    plt.savefig(('plots/'+'leaver_perc_'+col), bbox_inches = "tight")
    plt.show()

%matplotlib inline  
cols=['is_manager', 'expat', 'is_norsk', 'in_norge']
for col in cols:    
    means=df.groupby(col)['Leavers'].mean()
    means.plot.bar(color = 'green')
    plt.xlabel(col)
    plt.ylabel('Leaver PercentAge')  
    plt.set_xticklabels(['No', 'Yes'], rotation=90) #set the labels and rotate them 90 deg.
#    plt.savefig(('plots/'+'leaver_perc_'+col))
    plt.figure()
    plt.show()    
    

## Kernel density graph for numerical variables

In [None]:
# Bivariate kernel-density plot of each numerical column against service_length.
sns.set_style("whitegrid") # style is loop-invariant, so set it once
for col in num_cols:
    if col == 'service_length':
        # A variable against itself is perfectly correlated, so there is no
        # meaningful 2-D density to estimate.
        continue
    # x and y are passed by keyword: recent seaborn releases no longer accept
    # them positionally.
    g = sns.jointplot(x=col, y='service_length', data=df, kind='kde')
    # Label the joint (central) axes through the grid object; plt.xlabel/ylabel
    # would act on whichever marginal axes happens to be current.
    g.set_axis_labels(col, 'service_length')
    g.savefig(('plots/'+'kde_service_len'+col))

## Stacked bar chart colour-coded by Leavers

In [None]:
# For each categorical column, draw a stacked bar chart in which every bar is
# one level and the segments are the within-level shares of each Leavers value.
for col in cat_cols:
    table = pd.crosstab(df[col], df['Leavers'])
    # Divide each row by its own total so that every bar sums to 1.
    row_totals = table.sum(axis=1).astype(float)
    shares = table.div(row_totals, axis=0)
    shares.plot(kind='bar', stacked=True)
    plt.title('PercentAge of Leavers by ' + col)
    plt.xlabel(col)
    plt.ylabel('Proportion of Employees')
    plt.savefig('plots/' + 'leaver_prop_by_' + col)

##  Boxplot and violin plot

In [None]:
# Distribution of each numerical column split by Leavers: box plots (saved
# under plots/ and shown) followed by violin plots (shown only).
sns.set_style("whitegrid") # style is loop-invariant, so set it once

for col in num_cols:
    plt.figure() # fresh figure before plotting, so no empty figure is shown
    # x and y are passed by keyword: recent seaborn releases no longer accept
    # them positionally.
    sns.boxplot(x='Leavers', y=col, data=df)
    plt.xlabel('Leavers') # Set text for the x axis
    plt.ylabel(col) # Set text for y axis
    plt.savefig(('plots/'+'leaver_box_by_'+col))
    plt.show()

for col in num_cols:
    plt.figure() # one figure per violin plot
    sns.violinplot(x='Leavers', y=col, data=df)
    plt.xlabel('Leavers') # Set text for the x axis
    plt.ylabel(col) # Set text for y axis
    plt.show()
    

## Scatter plot
Service length plotted against each of the other numerical variables in `num_cols`.
  
  Facets: split by Band (columns) and by Leavers (rows)
  
  Colour: `is_manager`

In [None]:
# Numerical columns to plot against service_length (service_length itself and
# team_size are left out of this list).
num_cols = ['Age', 'absence_avg', 'attend_avg', 'travel_avg', 'certif']  
# Rank of each band, used only to order the facet columns from lowest to highest.
band_dict={'PROF':3,'PRIN PROF':4,'LEAD PROF':5,'ASSOCIATE':2,'OPER & SUPP':1,
           'MANAgeR':6, 'EXECUTIVE':7, 'SR EXEC':8}

# Facet columns: the bands actually present in the data, ordered by rank.
# A band that is missing from band_dict is kept and placed last instead of
# being dropped.
band_order = sorted(df['Band'].dropna().unique(),
                    key=lambda band: band_dict.get(band, len(band_dict) + 1))

# One grid per numerical column: columns = Band, rows = Leavers,
# point colour = is_manager.
for col in num_cols:
    g = sns.FacetGrid(df, col="Band", row = 'Leavers', col_order=band_order,
                  hue="is_manager", palette="Set2", margin_titles=True)
    g.map(sns.regplot, col, "service_length", fit_reg = False) # scatter only, no fitted line
    # Let the grid title each column with the band it really contains, rather
    # than assigning a fixed list of names to the axes by position.
    g.set_titles(col_template="{col_name}")
    g.add_legend() # key for the is_manager colours
    g.savefig(('plots/'+'scatter_by_'+col), dpi=900)
