# Let's predict which data scientists are going to change jobs in the future

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _subfolders, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

 ## 1. Let's read the data first 

In [None]:
# Load the training split from the Kaggle input directory (relative path).
train= pd.read_csv('../input/hr-analytics-job-change-of-data-scientists/aug_train.csv')
# Last expression of the cell: display the first rows as a quick sanity check.
train.head()

In [None]:
# Load the test split from the same Kaggle dataset directory as the training file.
test = pd.read_csv('../input/hr-analytics-job-change-of-data-scientists/aug_test.csv')
# Last expression of the cell: display the first rows as a quick sanity check.
test.head()

## So these are the features--->

### Features
enrollee_id : Unique ID for enrollee

city: City code

city_development_index: Development index of the city (scaled)

gender: Gender of enrollee

relevent_experience: Relevant experience of enrollee

enrolled_university: Type of university course enrolled in, if any

education_level: Education level of enrollee

major_discipline: Major discipline of the enrollee's education

experience: Enrollee's total experience in years

company_size: Number of employees in current employer's company

company_type: Type of current employer

last_new_job: Difference in years between previous job and current job

training_hours: training hours completed

target: 0 – Not looking for job change, 1 – Looking for a job change

## Let's gather some more knowledge about our dataset

In [None]:
# (number of rows, number of columns) of the training frame.
train.shape

In [None]:
# (number of rows, number of columns) of the test frame.
test.shape

In [None]:
# Print each training column's dtype and non-null count.
train.info()

In [None]:
# Print each test column's dtype and non-null count.
test.info()

In [None]:
# Column labels of the training frame.
train.columns

In [None]:
# Summary statistics; include='all' also summarises the non-numeric columns.
train.describe(include='all')

## 2. Checking for missing values

In [None]:
# Number of missing values in each training column.
train.isnull().sum()

## Let's fill the categorical values with mode and numerical values with median

In [None]:
def fill_with_mode(dataframe, colname):
    """Replace missing values in ``dataframe[colname]`` with the column's mode.

    The frame is modified in place and nothing is returned. When several
    values tie for most frequent, the first one returned by ``Series.mode``
    is used. A column with no non-missing values is left untouched because
    it has no mode to fill with.

    Args:
        dataframe: pandas DataFrame to modify.
        colname: label of the column to impute.
    """
    modes = dataframe[colname].mode()
    if modes.empty:
        # Entirely-missing column: nothing sensible to impute.
        return
    # Assign the filled column back rather than calling
    # fillna(inplace=True) on the selected column: that is a chained
    # assignment and does not reach the parent frame under Copy-on-Write.
    dataframe[colname] = dataframe[colname].fillna(modes.iloc[0])
# Impute every object-dtype column of both splits (train first, then test)
# with that column's own mode.
for frame in (train, test):
    for column in frame.select_dtypes(include='object').columns:
        fill_with_mode(frame, column)

In [None]:
def fill_with_median(dataframe, colname):
    """Replace missing values in ``dataframe[colname]`` with the column's median.

    The frame is modified in place and nothing is returned. If the column
    has no non-missing values its median is NaN, so the column is left as is.

    Args:
        dataframe: pandas DataFrame to modify.
        colname: label of a numeric column to impute.
    """
    # Use the median, as the function name promises (the mode was used before).
    median = dataframe[colname].median()
    # Assign back instead of a chained fillna(inplace=True), which does not
    # reach the parent frame under Copy-on-Write.
    dataframe[colname] = dataframe[colname].fillna(median)
# Run fill_with_median over every float64/int64 column of both splits
# (train first, then test).
for frame in (train, test):
    for column in frame.select_dtypes(include=['float64', 'int64']).columns:
        fill_with_median(frame, column)

In [None]:
# Re-count missing values per training column after imputation.
train.isnull().sum()

In [None]:
# Re-count missing values per test column after imputation.
test.isnull().sum()

## Now we are good to go with no more missing values :)

## 3. Do some EDA now

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

### Distribution of unique values of all the features on the basis of target --->

In [None]:
# 3x3 grid of count plots: one panel per categorical feature, bars split by
# target, each bar labelled with its share of all training rows.

# Set the seaborn style once, before anything is drawn, so every panel
# (including the first) uses it.
sns.set(style="whitegrid")
plt.figure(figsize=[18,20])
variables= ['gender',
       'relevent_experience', 'enrolled_university', 'education_level',
       'major_discipline', 'experience', 'company_size', 'company_type',
       'last_new_job']
total = float(len(train))
for n, v in enumerate(variables, start=1):
    plt.subplot(3,3,n)
    ax=sns.countplot(x = v, data = train,hue='target', alpha=0.7, edgecolor='black', palette='pastel')
    plt.title('distribution of unique values of {} with target'.format(v))
    plt.xticks(fontsize=8)
    for p in ax.patches:
        height = p.get_height()
        # Skip empty or NaN-height patches; they would only add stray "0.0%" labels.
        if not height > 0:
            continue
        percentage = '{:.1f}%'.format(100 * height/total)
        # Anchor at the horizontal middle of the bar so ha='center' really
        # centres the label over it.
        x = p.get_x() + p.get_width() / 2
        ax.annotate(percentage, (x, height), ha='center', va='bottom')
    sns.despine()

# Lay the grid out once, after all panels exist.
plt.tight_layout()
plt.show()


In [None]:
# Density histogram with a KDE overlay of the city development index.
# sns.distplot is deprecated; histplot with kde=True and stat='density'
# draws the same kind of figure.
plt.figure(figsize=[12,4])
sns.histplot(train['city_development_index'], kde=True, stat='density', color='red')
plt.title('Distribution of City Development Index')
plt.show()

In [None]:
# Number of training rows per city code; the wide figure and vertical tick
# labels keep the many city names readable.
plt.figure(figsize=[30,10])
sns.countplot(data=train, x='city')
plt.xticks(rotation=90)
plt.title('Countplot of unique values of city')
plt.show()

In [None]:
# Bar plot of city_development_index for each city code, with vertical tick labels.
plt.figure(figsize=[30,10])
sns.barplot(x='city', y='city_development_index', data=train)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Density histogram with a KDE overlay of training hours.
# sns.distplot is deprecated; histplot with kde=True and stat='density'
# draws the same kind of figure.
plt.figure(figsize=[30,10])
sns.histplot(train['training_hours'], kde=True, stat='density')
# The figure is a distribution plot, not a count plot, so title it accordingly.
plt.title('Distribution of training hours', fontsize=30)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.show()

### Heatmap of numerical features with target --->

In [None]:
# One-row heatmap: correlation of every numeric column with `target`.
# Keep only numeric columns before correlating; the frame still contains
# text columns at this point, and recent pandas raises when corr() meets them.
numeric_corr = train.select_dtypes(include='number').corr()
# Sorting by the target column in descending order puts the `target` row first.
target_row = numeric_corr.sort_values(by = ['target'], ascending = False).head(1)

plt.figure(figsize=(20,1))
sns.heatmap(target_row, cmap='coolwarm', annot=True, annot_kws={'size': 8}, fmt = '.2f')

plt.title('Correlation of Numerical Features with the Target', weight = 'bold', fontsize = 18)
plt.xticks(weight='bold')
plt.yticks(weight='bold', color='dodgerblue', rotation=0)

plt.show()

In [None]:
# Box plot of the city development index for each target class.
sns.boxplot(data=train, y='city_development_index', x='target')
plt.show()

#### So Data Scientists living in the cities with higher CDI are less likely to change their jobs.

In [None]:
# Box plot of training hours for each target class.
sns.boxplot(data=train, y='training_hours', x='target')
plt.show()

#### Training hours do not appear to affect the probability of changing jobs.

## 4. Let's encode our categorical features using mapping dictionaries (and a Label Encoder for `city`)

In [None]:
# Lookup tables that turn category labels into integer codes.
# Some of these features are ordered (education level); others are plain
# labels (gender, major) whose codes carry no ranking.

gender_map = {'Female': 2, 'Male': 1, 'Other': 0}

relevent_experience_map = {
    'Has relevent experience': 1,
    'No relevent experience': 0,
}

# Codes follow the order of the list: 0, 1, 2.
enrolled_university_map = {
    label: code
    for code, label in enumerate(
        ['no_enrollment', 'Full time course', 'Part time course']
    )
}

# Codes rank the levels from Primary School (0) up to Phd (4).
education_level_map = dict(zip(
    ['Primary School', 'Graduate', 'Masters', 'High School', 'Phd'],
    [0, 2, 3, 1, 4],
))

# Codes follow the order of the list: 0 .. 5.
major_map = {
    label: code
    for code, label in enumerate(
        ['STEM', 'Business Degree', 'Arts', 'Humanities', 'No Major', 'Other']
    )
}
    
# Years of experience as an integer: '<1' -> 0, '1'..'20' -> 1..20, '>20' -> 21.
experience_map = {'<1': 0}
experience_map.update({str(years): years for years in range(1, 21)})
experience_map['>20'] = 21
    
# Integer code per employer type; codes follow the order of the list (0 .. 5).
company_type_map = {
    label: code
    for code, label in enumerate([
        'Pvt Ltd',
        'Funded Startup',
        'Early Stage Startup',
        'Other',
        'Public Sector',
        'NGO',
    ])
}

# Company size is an ordered feature, so the codes increase with headcount.
# (The previous codes followed the alphabetical order of the labels, which
# put e.g. '50-99' above '10000+'.) The keys are the dataset's own labels,
# including the '10/49' spelling.
company_size_map = {
    '<10'          :    0,
    '10/49'        :    1,
    '50-99'        :    2,
    '100-500'      :    3,
    '500-999'      :    4,
    '1000-4999'    :    5,
    '5000-9999'    :    6,
    '10000+'       :    7
}
    
# Years since the previous job as an integer: 'never' -> 0, '1'..'4' -> 1..4, '>4' -> 5.
last_new_job_map = dict(zip(['never', '1', '2', '3', '4', '>4'], range(6)))

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [None]:
# Transform the categorical training columns into numerical codes.
column_maps = {
    'education_level': education_level_map,
    'company_size': company_size_map,
    'company_type': company_type_map,
    'last_new_job': last_new_job_map,
    'major_discipline': major_map,
    'enrolled_university': enrolled_university_map,
    'relevent_experience': relevent_experience_map,
    'gender': gender_map,
    'experience': experience_map,
}
for column, mapping in column_maps.items():
    # Plain column assignment replaces the column, so it takes the numeric
    # dtype of the mapped values. `train.loc[:, column] = ...` writes into
    # the existing object column on recent pandas and can leave it as object.
    # NOTE: labels missing from a mapping become NaN.
    train[column] = train[column].map(mapping)

# Encode the city feature with a label encoder. Keep `lb_en`: it holds the
# training cities and their codes.
lb_en = LabelEncoder()

train['city'] = lb_en.fit_transform(train['city'])
train.drop(['enrollee_id'], axis=1, inplace=True)

### Some more EDA --->

In [None]:
# Two side-by-side bar plots of relevent_experience by gender:
# left overall, right split by target.
plt.figure(figsize=(14,6))
panels = (
    (1, None, 'Barplots of gender vs relevent experience'),
    (2, 'target', 'Distribution on the basis of target'),
)
for position, hue_column, heading in panels:
    plt.subplot(1, 2, position)
    sns.barplot(data=train, x='gender', y='relevent_experience', hue=hue_column)
    plt.title(heading)

plt.show()

#### Here 0: others, 1: males, 2: females.

In [None]:
# Two side-by-side bar plots of enrolled_university by gender:
# left overall, right split by target.
total = float(len(train))
plt.figure(figsize=(14,6))
panels = (
    (1, None, 'Barplot of gender vs enrolled in university'),
    (2, 'target', 'distribution on the basis of target'),
)
for position, hue_column, heading in panels:
    plt.subplot(1, 2, position)
    sns.barplot(data=train, x='gender', y='enrolled_university', hue=hue_column)
    plt.title(heading)

plt.show()

In [None]:
# Two side-by-side bar plots of relevent_experience by education level:
# left overall, right split by target.
total = float(len(train))
plt.figure(figsize=(14,6))
panels = (
    (1, None, 'barplot of education level vs relevent experience'),
    (2, 'target', 'distribution on the basis of target'),
)
for position, hue_column, heading in panels:
    plt.subplot(1, 2, position)
    sns.barplot(data=train, x='education_level', y='relevent_experience', hue=hue_column)
    plt.title(heading)

plt.show()

#### here 0:primary school, 1: high school, 2: graduate, 3: masters, 4: phd

In [None]:
# Two side-by-side bar plots of training_hours by gender:
# left overall, right split by target.
total = float(len(train))
plt.figure(figsize=(14,6))
for position, hue_column in enumerate((None, 'target'), start=1):
    plt.subplot(1, 2, position)
    sns.barplot(data=train, x='gender', y='training_hours', hue=hue_column)

plt.show()

## 5. Modelling

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Number of training rows in each target class.
sns.countplot(data=train, x='target')
plt.show()

## Since the data is imbalanced, we are going to use SMOTE--->

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Separate the label column from the feature columns.
y = train["target"]
X = train.drop(columns="target")

In [None]:
# Oversample the minority class with SMOTE so both target classes end up
# with the same number of rows. random_state makes the synthetic rows
# reproducible. (The extra, unseeded `oversample = SMOTE()` instance was
# never used and has been dropped.)
smote = SMOTE(random_state = 0)
X_smote, y_smote = smote.fit_resample(X,y)

In [None]:
# Bar plot of the class counts after resampling.
# Count once and reuse the result for both axes.
class_counts = y_smote.value_counts()
plt.figure(figsize=(6, 4))
# x and y must be passed by keyword: current seaborn rejects positional data.
sns.barplot(x=class_counts.index.astype(int),
            y=class_counts.values, palette='bwr')
plt.ylabel('Number of rows', fontsize=12)
plt.xlabel('Target', fontsize=12)
plt.title('After sampling')
plt.show()

### Now the data is balanced. 

In [None]:
# Hold out 20% of the original rows first, stratified on the target so both
# parts keep the original class ratio.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify=y)
# Balance only the training part. Splitting after SMOTE would put synthetic
# points interpolated from training rows into the hold-out set and make the
# evaluation look better than it is.
X_train, y_train = SMOTE(random_state=0).fit_resample(X_train, y_train)

In [None]:
# Gaussian Naive Bayes baseline: fit on the training split and report
# precision/recall/F1 on the test split, not on the rows used for fitting.
gb = GaussianNB()
gb.fit(X_train, y_train)
print(classification_report(y_test, gb.predict(X_test)))

In [None]:
# Mean accuracy of the Naive Bayes model on the X_test / y_test split.
print(gb.score(X_test, y_test))

In [None]:
# Logistic regression: fit on the training split and report on the test split.
# The features are unscaled, so allow more iterations than the default to
# give the solver room to converge.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
print(classification_report(y_test, lr.predict(X_test)))

In [None]:
# Mean accuracy of the logistic regression model on the X_test / y_test split.
print(lr.score(X_test, y_test))

In [None]:
# Random forest: fit on the training split and report on the test split.
# random_state makes the forest, and so the submission, reproducible.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)
print(classification_report(y_test, rfc.predict(X_test)))

In [None]:
# Mean accuracy of the random forest on the X_test / y_test split.
print(rfc.score(X_test, y_test))

In [None]:
# Transform the categorical test columns with the same mappings used for
# the training data.
test_column_maps = {
    'education_level': education_level_map,
    'company_size': company_size_map,
    'company_type': company_type_map,
    'last_new_job': last_new_job_map,
    'major_discipline': major_map,
    'enrolled_university': enrolled_university_map,
    'relevent_experience': relevent_experience_map,
    'gender': gender_map,
    'experience': experience_map,
}
for column, mapping in test_column_maps.items():
    # Plain column assignment replaces the column, so it takes the numeric
    # dtype of the mapped values instead of staying an object column.
    test[column] = test[column].map(mapping)

# Encode the city feature with the codes learned from the training data.
# `lb_en` is the encoder fitted on the training cities earlier; fitting a
# fresh encoder here would give the same city a different code than the
# model saw during training. Cities absent from the training data get -1.
city_codes = {city: code for code, city in enumerate(lb_en.classes_)}
test['city'] = test['city'].map(city_codes).fillna(-1).astype(int)

In [None]:
# Display the first rows of the encoded test frame.
test.head()

In [None]:
# Feature frame for prediction: the test data without the id column
# (`test` itself keeps it for the submission file).
df_test = test.drop(columns=["enrollee_id"])

In [None]:
# Predict with the random forest. Pass a DataFrame (not `.values`) so the
# column names reach the model, and select the columns in the order of the
# training feature frame `X` so each feature lands in the right position.
predictions = rfc.predict(df_test[X.columns])

In [None]:
# Build the submission table: each enrollee id next to its predicted target.
submission = test[['enrollee_id']].assign(target=predictions)

# Preview the first 10 rows
submission.head(10)

In [None]:
# Write the submission table to the working directory without the index
# column, then confirm the file name.
filename = 'submission.csv'
submission.to_csv(filename, index=False)
print(f'Saved file: {filename}')