# Functions that are used in the rest of the repository

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import random
# Seed Python's built-in RNG so that later random.sample calls are reproducible.
random.seed(42)

# NOTE(review): placeholder — no value is assigned on the next line, so it must
# be completed with the project's root directory before this file can run.
default_path= #the path
# Make default_path the working directory; the relative 'data/processed/...'
# paths used further down resolve against it.
os.chdir(default_path)
# Base name (without the '.csv' extension) of the processed dataset to load.
df_address="single_df_filled"

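The `fill_month` function adds a row for every month in which an employee has no record. These new rows are then meant to be populated using ffill and bfill; note that the function shown below only inserts the empty rows, so the ffill/bfill step has to be applied to its result.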

In [None]:
def fill_month(df1, start_date, end_date, freq):
    """Return a frame with one row per (id, date) for every date in a range.

    The 'date' column of ``df1`` is parsed, then the frame is reindexed on the
    full cartesian product of the unique ids and the dates generated by
    ``pd.date_range(start_date, end_date, freq=freq)``.

    Notes:
        * ``df1`` itself is not modified; parsing happens on a copy, so the
          function can safely be called more than once on the same frame.
        * Rows created by the reindex contain NaN in every non-index column.
          No forward/backward filling is done here; that is left to the caller.
        * Rows of ``df1`` whose parsed date is not among the generated dates
          are dropped by the reindex.
        * Parsed dates fall on the first day of the month (only month and year
          are parsed), so ``freq`` should generate month starts (e.g. ``'MS'``)
          for existing rows to line up with the generated dates.

    Args:
        df1: DataFrame with at least an 'id' and a 'date' column. Assumes
            'date' holds dot-separated strings whose second-to-last part is a
            lower-case month abbreviation from ``month_dict`` (Norwegian-style:
            'mai', 'okt', 'des') and whose last part ends in a two-digit year,
            e.g. 'jan.19' — TODO confirm against the raw data.
        start_date: First date of the range, passed to ``pd.date_range``.
        end_date: Last date of the range, passed to ``pd.date_range``.
        freq: Frequency string passed to ``pd.date_range``.

    Returns:
        A new DataFrame with 'id' and 'date' as the first two columns, followed
        by the remaining columns of ``df1``.

    Raises:
        ValueError: If a date cannot be parsed as month/two-digit-year, or if
            ``df1`` contains duplicate (id, date) pairs (reindex refuses
            duplicate labels).
    """
    month_dict = {
        '': 0,
        'jan': 1,
        'feb': 2,
        'mar': 3,
        'apr': 4,
        'mai': 5,
        'jun': 6,
        'jul': 7,
        'aug': 8,
        'sep': 9,
        'okt': 10,
        'nov': 11,
        'des': 12,
    }
    # Map abbreviation -> month number *as text* so the column stays a string
    # column throughout (no mixing of ints into a string column).
    month_as_text = {abbr: str(number) for abbr, number in month_dict.items()}

    # Work on a copy: overwriting the caller's 'date' column would make a
    # second call on the same frame fail, because the column would no longer
    # hold dot-separated strings.
    df_work = df1.copy()

    parts = df_work['date'].astype(str).str.split('.', expand=True)
    month = parts.iloc[:, -2].replace(month_as_text).astype(str)
    # Only the last two characters of the year part are used (matches '%y').
    year = parts.iloc[:, -1].astype(str).map(lambda x: str(x)[-2:])
    df_work['date'] = pd.to_datetime(month + '/' + year, format='%m/%y')

    # Every date in the requested range, for every id seen in the data.
    all_months = pd.date_range(start=start_date, end=end_date, freq=freq)
    id_list = list(df_work['id'].unique())
    full_index = pd.MultiIndex.from_product([id_list, all_months], names=['id', 'date'])

    # Important: set_index and full_index must list the levels in the same
    # order ('id' first, then 'date'), otherwise nothing lines up.
    df_t1 = df_work.set_index(['id', 'date']).reindex(full_index).reset_index()
    return df_t1





## Re-coding the categorical variables
I'm using one-hot encoding to recode the categorical variables.
These functions will be used in the RFC (random forest classifier) notebook.

In [None]:
def data_coding(df_address, drop_cols, other_threshold=0.55):
    """Load a processed CSV and one-hot encode its categorical columns.

    Reads ``'data/processed/<df_address>.csv'`` (relative to the current
    working directory), drops ``drop_cols``, lumps infrequent levels of the
    country/nation columns into a single 'Other' level, one-hot encodes all
    categorical columns and casts the result to float64.

    Args:
        df_address: Base file name of the CSV, without the '.csv' extension.
        drop_cols: Column labels to remove before encoding.
        other_threshold: Levels of 'host_country', 'nation',
            'host_country_man' and 'nation_man' whose share of the non-missing
            values is strictly below this many percent are replaced by
            'Other'. Defaults to 0.55, the value previously hard-coded.

    Returns:
        DataFrame in which every column is float64 and the categorical columns
        are replaced by '<column>_<level>' indicator columns.

    Raises:
        FileNotFoundError: If the CSV does not exist.
        KeyError: If a column in ``drop_cols`` or an expected categorical
            column is missing.
        ValueError: If a remaining column cannot be cast to float64.
    """
    df = pd.read_csv('data/processed/' + df_address + '.csv')
    df_ready = df.drop(columns=drop_cols)

    # Only these columns get their rare levels collapsed into 'Other'.
    rare_level_vars = ['host_country', 'nation', 'host_country_man', 'nation_man']
    for col in rare_level_vars:
        # Series.value_counts replaces the deprecated top-level pd.value_counts.
        counts = df_ready[col].value_counts()
        share = counts / counts.sum() * 100
        rare_levels = counts[share.lt(other_threshold)].index
        df_ready[col] = np.where(df_ready[col].isin(rare_levels), 'Other', df_ready[col])

    # ONE-HOT ENCODING of all categorical columns (the four above plus these).
    cat_vars = rare_level_vars + ['BA', 'network', 'BA_man', 'network_man']
    df_coded = pd.get_dummies(df_ready, prefix_sep="_", columns=cat_vars)

    return df_coded.astype('float64')


# Columns removed from the dataset before the categorical variables are encoded.
drop_cols = [
    'date', 'name', 'home_country', 'home_country_man',
    'is_norsk', 'in_norge', 'is_norsk_man', 'in_norge_man',
    'name_man', 'is_manager_man', 'manager_id', 'manager_id_man',
]

# Encode the dataset named by df_address and store the result next to it
# with a '_coded' suffix.
df_coded = data_coding(df_address, drop_cols)
df_coded.to_csv('data/processed/' + df_address + '_coded.csv', index=False)

## CREATE TRAIN TEST SPLITS

In [None]:
#y is the column name #identify is the id column name
def test_train_manual(main_df, y, identif):

    #divide the dataset into leavers and non-leavers
    y_1=main_df[main_df[y]==1]
    y_0=main_df[main_df[y]==0]
    #create list of unique ids to make sure data for the same id is not found in the both dataset
    y_1_id= list(y_1[identif].unique()) #list of ids for nonleavers
    y_0_id= list(y_0[identif].unique())
    
    #split the leaver and nonleaver ids into 25% and put them together for the test set
    test_id= random.sample(y_1_id, (len(y_1_id)//4)) + random.sample(y_0_id, (len(y_0_id)//4))
    train_id= [x for x in list(main_df[identif].unique()) if x not in test_id] #remaining ids is the train set
    
    #test and train dfs based on the ids selected
    test_df=main_df[main_df[identif].isin(test_id)]
    train_df=main_df[main_df[identif].isin(train_id)]
    
    #split X and y
    test_df.drop(columns=identif, inplace=True)
    train_df.drop(columns=identif, inplace=True)
    
    features=test_df.columns.values.tolist() #features are the same in both
    y=[y]
    X=[i for i in features if i not in y]
    
    X_test=test_df[X]
    X_train=train_df[X]
    y_test=test_df[y].values.ravel()
    y_train=train_df[y].values.ravel()
    return X_test, X_train, y_test, y_train


# Split the encoded data by the 'id' column with 'Leavers' as the target.
# Note the return order: the test set comes before the train set.
X_test, X_train, y_test, y_train = test_train_manual(df_coded, 'Leavers', 'id')

## Select features
We'll run a random forest classifier (RFC) to choose the most important features.

In [None]:
def select_features(X_test, X_train, y_test, y_train):
    """Rank features with a random forest and keep the ones that matter.

    Fits a 400-tree RandomForestClassifier (random_state=42) on the training
    data, prints a classification report for the test data, prints the
    features sorted by importance (rounded to two decimals) and keeps every
    feature whose rounded importance is greater than zero.

    Args:
        X_test: Test feature DataFrame.
        X_train: Training feature DataFrame.
        y_test: Test target values.
        y_train: Training target values.

    Returns:
        Tuple ``(imp_X_train, imp_X_test, feature_importances)`` where the
        first two are the input frames restricted to the kept features and
        the last is the list of ``(feature, rounded importance)`` tuples,
        most important first.
    """
    forest = RandomForestClassifier(n_estimators=400, random_state=42)
    forest.fit(X_train, y_train)

    predictions = forest.predict(X_test)
    print(classification_report(y_test, predictions))

    # Pair every column name with its importance, rounded to two decimals.
    column_names = X_test.columns.tolist()
    feature_importances = []
    for name, score in zip(column_names, forest.feature_importances_):
        feature_importances.append((name, round(score, 2)))

    # Most important first (stable sort, ties keep column order).
    feature_importances.sort(key=lambda pair: pair[1], reverse=True)
    print(feature_importances)

    # Anything that rounds to 0.00 is treated as unimportant and left out.
    important_feature_names = [name for name, score in feature_importances if score > 0]

    # Restrict both sets to the retained features.
    imp_X_train = X_train[important_feature_names]
    imp_X_test = X_test[important_feature_names]

    # Sanity check on the resulting shapes.
    print('Important train features shape:', imp_X_train.shape)
    print('Important test features shape:', imp_X_test.shape)

    return imp_X_train, imp_X_test, feature_importances

# Fit the forest on the split above and keep only the feature columns whose
# rounded importance is greater than zero.
imp_X_train, imp_X_test, feature_importances = select_features(X_test, X_train, y_test, y_train)
