In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold 
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import RandomizedSearchCV


import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the training data from train.csv (path relative to the notebook's working directory).
dataset = pd.read_csv(filepath_or_buffer='train.csv')

In [None]:
# Preview the first five rows of the raw frame.
dataset.head(n=5)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
dataset.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Number of missing values per column.
dataset.isna().sum(axis=0)

In [None]:
# Bar chart of column completeness drawn by missingno.
msno.bar(dataset)

In [None]:
# Dropping the employee ID column.
# errors='ignore' keeps the cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
dataset = dataset.drop(columns='employee_id', errors='ignore')

In [None]:
# Univariate analysis: one histogram per numeric column.
# The trailing semicolon suppresses the array-of-Axes repr in the cell output.
dataset.hist(figsize=(20, 20), edgecolor='black', linewidth=1.2);

In [None]:
# Correlation heatmap of the numeric columns.
# The frame still holds string columns at this point (they are encoded later), and
# DataFrame.corr() raises on them in pandas >= 2.0, so restrict to numeric/bool
# columns explicitly. Older pandas dropped the non-numeric columns silently, so the
# plotted matrix is unchanged there.
plt.figure(figsize=(10, 10))
numeric_part = dataset.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_part.corr(), annot=True)

In [None]:
# Missing education entries become their own 'Blank' category.
dataset['education'] = dataset['education'].fillna(value='Blank')

In [None]:
# Count plot for every object-typed column, one figure per column.
object_columns = [name for name in dataset.columns if dataset[name].dtype == 'O']

for col in object_columns:
    plt.subplots(figsize=(10, 5))
    sns.countplot(x=dataset[col])
    plt.title(col, fontsize=20)
    plt.xticks(rotation=60)
    plt.xlabel(col)
    plt.ylabel('count')
    plt.show()

In [None]:
# Bivariate analysis: for each feature, a stacked bar chart of the share of
# promoted vs. not-promoted employees within every level of that feature.
for col in dataset.columns:
    # Cross-tabulating the target against itself carries no information.
    if col == 'is_promoted':
        continue

    data = pd.crosstab(dataset[col], dataset['is_promoted'])

    # Divide each row by its total so the bars show proportions, not raw counts.
    row_totals = data.sum(axis=1).astype(float)
    data.div(row_totals, axis=0).plot(kind='bar', stacked=True, figsize=(20, 5))

    plt.title('Looking at the Dependency of ' + col + ' in promotion', fontsize=30)
    # The x axis lists the levels of the feature itself, so label it with the column name.
    plt.xlabel(col, fontsize=15)
    plt.legend()
    plt.show()

In [None]:
# Handling missing previous_year_rating: fill 1 for promoted rows and 3 for
# non-promoted rows.
# Vectorised .loc assignment replaces the row-by-row loop with chained
# `dataset[col].iloc[i] = ...`, which is slow and silently does nothing under
# pandas copy-on-write.
# NOTE(review): the fill value is derived from the label `is_promoted`, which is
# not available at prediction time and leaks the target into a feature - confirm
# this is intended.
rating_missing = dataset['previous_year_rating'].isna()
dataset.loc[rating_missing & (dataset['is_promoted'] == 1), 'previous_year_rating'] = 1
dataset.loc[rating_missing & (dataset['is_promoted'] == 0), 'previous_year_rating'] = 3

# Plain assignment instead of chained fillna(inplace=True), for the same reason.
# (An earlier cell already replaces missing education with 'Blank', so this is
# only a safety net.)
dataset['education'] = dataset['education'].fillna(dataset['education'].mode()[0])

dataset.isna().sum()

In [None]:
# Separating label & features: the last column is the label, everything before it
# is a feature.
# .copy() makes `x` an independent frame, so the column assignments in the
# encoding cell do not write into a slice of `dataset` (SettingWithCopyWarning).
x = dataset.iloc[:, :-1].copy()
y = dataset.iloc[:, -1]

print("Shape of x:", x.shape)
print("Shape of y:", y.shape)

In [None]:
# Frequency encoding: replace each category by its relative frequency in the column.
for freq_col in ['department', 'region', 'education', 'recruitment_channel']:
    level_counts = x[freq_col].value_counts()
    x[freq_col] = x[freq_col].map(level_counts / level_counts.sum())

# Categorical encoding: binary map for gender ('m' -> 1, 'f' -> 0).
gender = {'m': 1, 'f': 0}
x['gender'] = x['gender'].map(gender)

In [None]:
# SMOTE oversampling of the minority class.
# `fit_sample` was removed from imbalanced-learn; `fit_resample` is the supported
# name. A fixed random_state makes the synthetic samples reproducible.
# NOTE(review): oversampling happens before the cross-validation split further
# down, so synthetic points derived from a row can land in a different fold than
# that row - the reported scores are likely optimistic.
smote = SMOTE(random_state=42)
x_sample, y_sample = smote.fit_resample(x, y.values.ravel())

x_sample = pd.DataFrame(x_sample)
y_sample = pd.DataFrame(y_sample)

# checking the sizes of the sample data
print("Size of x-sample :", x_sample.shape)
print("Size of y-sample :", y_sample.shape)

In [None]:
# Feature engineering: standardise every feature column, then wrap the scaled
# array back into a DataFrame (columns become 0..n-1).
sc = StandardScaler()
x_sample = pd.DataFrame(sc.fit_transform(x_sample))
y_sample = pd.DataFrame(y_sample)

In [None]:
## Initializing arguments for hyperparameter tuning

# Number of trees in random forest
n_estimators = [int(n_trees) for n_trees in np.linspace(start=200, stop=2000, num=10)]

# Number of features to consider at every split.
# 'auto' was removed in scikit-learn 1.3 and was only an alias of 'sqrt' for
# classifiers, so it is replaced by 'log2' to keep two distinct candidates.
max_features = ['sqrt', 'log2']

# Maximum number of levels in tree (None = grow until the leaves are pure)
max_depth = [int(depth) for depth in np.linspace(10, 110, num=11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)
{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

In [None]:
# Randomised search over `random_grid`: 100 sampled settings, each scored with
# 3-fold cross-validation on all available cores.
# The base forest is seeded so that a re-run reproduces the same ranking.
rf = RandomForestClassifier(random_state=42)

rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)

# y_sample is a single-column DataFrame; flatten it because scikit-learn expects
# a 1-D target (a column vector triggers a DataConversionWarning).
rf_random.fit(x_sample, np.ravel(y_sample))

In [None]:
# Best hyper-parameter combination found by the randomised search.
best_params = rf_random.best_params_
best_params

In [None]:
# Model creation using cross validation: fit a random forest on each of the
# K folds and collect accuracy / precision / recall / F1 (as percentages).

rf_acc=[]
rf_pre=[]
rf_rec=[]
rf_f1=[]

# Seeded so the fold assignment, and therefore the scores, are reproducible.
kf=KFold(n_splits=5,shuffle=True,random_state=42)

for i,(train_index,test_index) in enumerate(kf.split(x_sample)):

    f_train, l_train = x_sample.iloc[train_index,:], y_sample.iloc[train_index]
    f_test, l_test   = x_sample.iloc[test_index,:], y_sample.iloc[test_index]

    print('\nFold : {}\n'.format(i+1))

    # max_features='sqrt' replaces 'auto', which was an alias of 'sqrt' for
    # classifiers and is rejected by scikit-learn >= 1.3.
    rf = RandomForestClassifier(bootstrap=False,max_depth= None,max_features= 'sqrt',min_samples_leaf= 2,min_samples_split= 10,n_estimators= 800,random_state=42)
    # The label slice is a single-column frame; flatten it to the 1-D target
    # scikit-learn expects.
    rf.fit(f_train, np.ravel(l_train))

    predicted=rf.predict(f_test)

    accuracy = accuracy_score(l_test,predicted)*100
    precision=precision_score(l_test,predicted)*100
    recall=recall_score(l_test,predicted)*100
    F1_Score=f1_score(l_test,predicted)*100

    rf_acc.append(accuracy)
    rf_pre.append(precision)
    rf_rec.append(recall)
    rf_f1.append(F1_Score)

    print("accuracy : ",accuracy)
    print("Precision : ",precision)
    print("Recall : ",recall)
    print("F1_Score : ",F1_Score)


# Average over however many folds actually ran instead of a hard-coded 5.
print('\nRandomForest mean accuracy score: {}'.format((sum(rf_acc)/len(rf_acc))))
print('\nRandomForest mean precision score: {}'.format((sum(rf_pre)/len(rf_pre))))
print('\nRandomForest mean recall score: {}'.format((sum(rf_rec)/len(rf_rec))))
print('\nRandomForest mean f1_score score: {}'.format((sum(rf_f1)/len(rf_f1))))

## In real time, the whole prediction process will be automated: the user only needs to give the input in the UI; everything else is done internally, and the prediction is returned.

In [None]:
# One hand-written observation with 12 feature values, scored by the forest
# fitted in the last cross-validation fold.
# NOTE(review): the values look already standardised - confirm they were produced
# with the same scaler as the training data.
val = [[
    -0.598339, -0.426224, -2.069922, -1.279600,
    -0.497856, -0.406918, -1.553476, 1.186247,
    -0.920257, 1.008055, 4.439484, 1.602237,
]]
rf.predict(val)