In [None]:
import numpy as np
import pylab as pl
import pandas as pd
import matplotlib.pyplot as plt 
%matplotlib inline
import seaborn as sns
from sklearn.utils import shuffle
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.model_selection import cross_val_score, GridSearchCV
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# List what is inside the ../input directory so the dataset folder name can be checked.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

## Training Data

In [None]:
# Read the training CSV from the ../input dataset folder into a DataFrame.
train= pd.read_csv('../input/hr-analytics-job-change-of-data-scientists/aug_train.csv')
# Preview the first rows as the cell output.
train.head()

## Features

* enrollee_id: Unique ID for enrollee
* city: City code
* `city_development_index`: Development index of the city (scaled)
* gender: Gender of enrollee
* relevent_experience: Relevant experience of enrollee
* enrolled_university: Type of university course enrolled in, if any
* education_level: Education level of enrollee
* major_discipline: Education major discipline of enrollee
* experience: Enrollee's total experience in years
* company_size: Number of employees in current employer's company
* company_type: Type of current employer
* `last_new_job`: Difference in years between previous job and current job
* training_hours: Training hours completed
* target: 0 – Not looking for job change, 1 – Looking for a job change

# Visualization

Here I compare the features that might influence a move to a new job and compute the max, mean, and min of the features for each group.


In [None]:
# Max / mean / min of the numeric columns for every
# (gender, education_level, experience, company_size) group.
# Only numeric value columns are aggregated: 'mean' is undefined for the string
# columns (city, relevent_experience, major_discipline, company_type). Older pandas
# silently dropped those columns from the result, while pandas >= 2.0 raises instead.
group_keys = ['gender', 'education_level', 'experience', 'company_size']
value_cols = ['city_development_index', 'target']
group_summary = (
    train[group_keys + value_cols]
    .groupby(group_keys)
    .agg(["max", 'mean', "min"])
)
display(group_summary.style.background_gradient(cmap="Oranges"))


# Bar plots 

showing the frequency of each category separated by label

In [None]:
# Grid of count plots: one panel per column in `fft`, with the bars split by education level.
fft = [
    "relevent_experience", "education_level", "major_discipline", "experience",
    "company_size", "company_type", "training_hours", "target",
]
plt.figure(figsize=[15, 17])
for n, f in enumerate(fft, start=1):
    # Panels fill a 4x2 grid in the order the columns are listed.
    plt.subplot(4, 2, n)
    sns.countplot(data=train, x=f, hue='education_level', edgecolor="black", alpha=0.7)
    sns.despine()
    plt.title(f"Countplot of {f}  by education_level")
plt.tight_layout()
plt.show()


# The experience panel again, on its own wider figure.
plt.figure(figsize=[15, 4])
sns.countplot(data=train, x='experience', hue='education_level', edgecolor="black", alpha=0.7)
sns.despine()
plt.title("Countplot of experience by education_level")
plt.show()

# Target

* 0 – Not looking for job change, 
* 1 – Looking for a job change

As you can see, the data is imbalanced: there are fewer 1s (looking for a job change) than 0s (not looking for a job change).

In [None]:
# Number of rows per target class (0 = not looking for a job change, 1 = looking).
mnj = train['target'].value_counts()
plt.figure(figsize=(6,4))
# x and y are passed by keyword: seaborn >= 0.12 made the data vectors keyword-only,
# so the positional form sns.barplot(index, values) raises a TypeError there.
sns.barplot(x=mnj.index, y=mnj.values, alpha=0.8)
plt.ylabel('Number of Data', fontsize=12)
plt.xlabel('target', fontsize=9)
plt.xticks(rotation=90)
plt.show();

# Education Level

This dataset contains 5 education levels:
* Graduate
* Masters
* High School
* PhD
* Primary School


In [None]:
# Number of rows per education level.
EL = train['education_level'].value_counts()
plt.figure(figsize=(6,4))
# x and y are passed by keyword: seaborn >= 0.12 made the data vectors keyword-only,
# so the positional form sns.barplot(index, values) raises a TypeError there.
sns.barplot(x=EL.index, y=EL.values, alpha=0.8)
plt.ylabel('Number of Data', fontsize=12)
plt.xlabel('education_level', fontsize=9)
plt.xticks(rotation=90)
plt.show();

In [None]:
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objs as go

# Count of each education level, expressed as a fraction of len(train).
el = (
    train['education_level']
    .value_counts()
    .rename_axis('education_level')
    .reset_index(name='percent')
)
el['percent'] = el['percent'] / len(train)

fig = px.pie(el, names='education_level', values='percent',
             title='Education_level', width=800, height=500)

fig.show()

# education_level:training_hours

In [None]:
# Bar plot of training_hours per education level, using only the 7000 rows
# with the smallest training_hours values.
et = train.sort_values(by='training_hours', ascending=True).iloc[:7000]
figure = plt.figure(figsize=(10, 6))
sns.barplot(data=et, x='training_hours', y='education_level')
plt.xticks()
plt.xlabel('training_hours')
plt.ylabel('education_level')
plt.title('education_level:training_hours ')
plt.show()

# City development index

The City Development Index was developed for the Second United Nations Conference on Human Settlements (Habitat II) in 1996 and measures the level of development in cities. The Urban Indicators Programme of the United Nations Human Settlements Programme (UN-Habitat) developed the indicator so that they could rank cities of the world according to their level of development and as a display of indicators depicting development. The CDI cuts across the different clusters identified in the Urban Indicator Framework as it is based on five sub indices namely, infrastructure, waste, health, education and city product. It is useful as it provides a snap-shot view of how cities are doing with respect to the different indices. It was invented by Dr Joe Flood, first Coordinator of the Urban Indicators Program, following a statistical analysis of city indicators data. Reference :https://en.wikipedia.org/wiki/City_development_index


## Calculating the CDI



<img src="https://www.researchgate.net/profile/Lubna_Hasan/publication/24115086/figure/tbl4/AS:668624478019607@1536423906970/Calculation-of-CDI-by-UN-HABITAT-GUIP-Index-Formula.png" width="600">

## Count: City development index

In [None]:
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objs as go

# How many rows share each city_development_index value.
cd = (
    train['city_development_index']
    .value_counts()
    .rename_axis('city_development_index')
    .reset_index(name='count')
    # Turn the index value into text (with a trailing '-') before plotting it on the y axis.
    .assign(city_development_index=lambda counts: counts['city_development_index'].astype(str) + '-')
    # Keep the 50 most frequent values, ordered by ascending count.
    .sort_values(['count'])
    .tail(50)
)

fig = px.bar(cd, x='count', y='city_development_index', orientation='h',
             title='Count: City development index', width=1000, height=900)

fig.show()

# City by city development index

In [None]:
# Bar plot of city_development_index per city, using only the 2000 rows
# with the lowest city_development_index.
cdi = train.sort_values(by='city_development_index', ascending=True).iloc[:2000]
figure = plt.figure(figsize=(10, 6))
sns.barplot(data=cdi, x='city_development_index', y='city')
plt.xticks()
plt.xlabel('city_development_index')
plt.ylabel('city')
plt.title('City by city development index')
plt.show()

In [None]:
# Distribution plot of city_development_index on a single wide axes.
f, axes = plt.subplots(figsize=(16, 5))
g1 = sns.distplot(train["city_development_index"], ax=axes, color="red")
axes.set_title("Distributional of city_development_index")

# Experience

Enrolee total experience in years

In [None]:
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objs as go

# Count of each experience value, expressed as a fraction of len(train).
ep = (
    train['experience']
    .value_counts()
    .rename_axis('experience')
    .reset_index(name='percent')
)
ep['percent'] = ep['percent'] / len(train)

fig = px.pie(ep, names='experience', values='percent',
             title='Experience', width=800, height=500)

fig.show()

# Training_hours

In [None]:
# Distribution plot of training_hours on a single wide axes.
f, axes = plt.subplots(figsize=(16, 5))
g1 = sns.distplot(train["training_hours"], ax=axes, color="blue")
axes.set_title("Distributional of training_hours")

## Okay, let's check features that might influence

In [None]:
def wmnj(x):
    """Return up to 15 rows of `train` whose education_level equals `x`.

    The rows are ordered by enrollee_id, highest first, and carry the fourteen
    columns listed below. Reads the notebook-level `train` DataFrame.
    """
    shown_columns = [
        "enrollee_id", "city", "city_development_index", "gender",
        "relevent_experience", "enrolled_university", "education_level",
        "major_discipline", "experience", "company_size", "company_type",
        "last_new_job", "training_hours", "target",
    ]
    matching = train.loc[train["education_level"] == x, shown_columns]
    return matching.sort_values(by="enrollee_id", ascending=False).head(15)

# Graduate

Start from graduate, here I found :
* If **city_103** --> city_development_index: **0.920** --> the gender is male --> has relevent experience and experience **> 20** -->  move to new job (1)
* If **city_21** --> city_development_index: **0.624** --> the gender is male --> has relevent experience and experience **4** -->  move to new job (1)
* If **city_19** --> city_development_index: **0.682** --> the gender is male --> has relevent experience and  experience **6** -->  move to new job (1)

In [None]:
# Show the 15 highest-enrollee_id rows whose education_level is "Graduate".
wmnj("Graduate")

# Masters

Here I found :
* If **city_136** --> city_development_index: **0.897** --> the gender is male --> has relevent experience and experience **6** -->  move to new job (1)

* If **city_103** --> city_development_index: **0.920** --> the gender is male --> has relevent experience and experience **17** --> move to new job (1)

* If **city_159**	 --> city_development_index: **0.843** --> the gender is male --> has relevent experience and experience **7** --> move to new job (1)

* If **city_50**	 --> city_development_index: **0.896** --> the gender is male --> has relevent experience and experience **10** --> move to new job (1)

* If **city_106** --> city_development_index: **0.698** --> the gender is male --> has relevent experience and experience **8** --> move to new job (1)

* If **city_21**	 --> city_development_index: **0.624** --> the gender is male --> has relevent experience and experience **14** --> move to new job (1)

The last one is different from other
* **city_21** --> city_development_index: **0.624** --> the gender is male --> **No relevent experience** and experience **2** --> move to new job (1)

In [None]:
# Show the 15 highest-enrollee_id rows whose education_level is "Masters".
wmnj("Masters")

# High School

Here I found :
* If **city_99** --> city_development_index: **0.915** --> the gender is male --> has relevent experience and experience **14** -->  move to new job (1)

* If **city_100** --> city_development_index: **0.887** --> the gender is male --> has relevent experience and experience **>20** --> move to new job (1)

* If **city_160**	 --> city_development_index: **0.920**	 --> the gender is male --> has relevent experience and experience **9** --> move to new job (1)

* If **city_73**	 --> city_development_index: **0.754** --> the gender is male --> has relevent experience and experience **3** --> move to new job (1)

* If **city_21**	 --> city_development_index: **0.624** --> the gender is male --> has relevent experience and experience **4** --> move to new job (1)



In [None]:
# Show the 15 highest-enrollee_id rows whose education_level is "High School".
wmnj("High School")

# PhD

Here I found :

* If **city_42** --> city_development_index: **0.563** --> the gender is male --> has relevent experience and experience **< 1** --> move to new job (1)

* If **city_103** --> city_development_index: **0.920** --> the gender is male --> **No relevent experience** and experience **> 20** --> move to new job (1)

* If **city_16**	 --> city_development_index: **0.910** --> the gender is female --> has relevent experience and experience **9** --> move to new job (1)

In [None]:
# Show the 15 highest-enrollee_id rows for the PhD group; the value is spelled "Phd",
# the same string edu_level() compares against.
wmnj("Phd")

# Primary School


Here I found :

* If **city_70** --> city_development_index: **0.698** --> the gender is male --> **No relevent experience** and experience **< 1** --> move to new job (1)

* If **city_103**	 --> city_development_index: **0.920** --> the gender is other --> **No relevent experience** and experience **3** --> move to new job (1)

* If **city_103**	 --> city_development_index: **0.920** --> the gender is female --> **No relevent experience** and experience **2** --> move to new job (1)

* If **city_126**	 --> city_development_index: **0.479** --> the gender is female --> Has relevent experience and experience **19** --> move to new job (1)

In [None]:
# Show the 15 highest-enrollee_id rows whose education_level is "Primary School".
wmnj("Primary School")

# Correlation in Data



Here I try to measure correlation in data using Correlation coefficients.

Correlation coefficients are used to measure how strong a relationship is between two variables. Correlation coefficient formulas are used to find how strong a relationship is between data. The formulas return a value between -1 and 1, where:
 
* 1 indicates a strong positive relationship.
* -1 indicates a strong negative relationship.
* A result of zero indicates no relationship at all.

![https://www.statisticshowto.com/wp-content/uploads/2012/10/pearson-2-small.png](https://www.statisticshowto.com/wp-content/uploads/2012/10/pearson-2-small.png)

reference : https://www.statisticshowto.com/probability-and-statistics/correlation-coefficient-formula/#Pearson

In [None]:
# Pearson correlation of every numeric column with the target.
# The frame still holds string columns at this point; selecting the numeric ones first
# keeps this working on pandas >= 2.0, where DataFrame.corr() raises on non-numeric
# columns instead of silently skipping them.
corr = train.select_dtypes(include="number").corr()["target"]
# Drop the target's trivial self-correlation and show the rest in ascending order.
# Label-based drop/sort replaces integer-position indexing of a label-indexed Series,
# which newer pandas deprecates.
corr.drop("target").sort_values()


# Plotting correlations



In [None]:
#plotting correlations
# Numeric feature columns, with the target removed by name instead of by position.
num_feat = train.select_dtypes(include="number").columns.drop("target")
labels = list(num_feat)
# Pearson correlation of each numeric feature with the target.
values = [
    np.corrcoef(train[col].values, train.target.values)[0, 1]
    for col in num_feat
]

ind = np.arange(len(labels))
fig, ax = plt.subplots(figsize=(8,15))
rects = ax.barh(ind, np.array(values), color='skyblue')
# barh centres each bar on its y position, so the ticks go at `ind` itself;
# the previous half-width offset pushed every label away from its bar.
ax.set_yticks(ind)
ax.set_yticklabels(labels, rotation='horizontal')
ax.set_xlabel("Correlation coefficient")
ax.set_title("Correlation Coefficients each feature with target");

# Preprocessing


Here I create functions that convert all values of gender and the other columns below into numbers (category codes):
* ['gender'], ['relevent_experience'] , ['enrolled_university'],
* ['education_level'], ['major_discipline'], ['experience'], ['company_type'], ['company_size']
* ['last_new_job'], ['city'] 

In [None]:
# Encoders that turn the string values of gender, relevent_experience,
# enrolled_university, education_level, major_discipline, experience, company_type,
# company_size, last_new_job and city into integer codes.

def gender_to_numeric(x):
    """Encode gender: 'Other' -> 0, 'Male' -> 1, 'Female' -> 2.

    Any other value returns None.
    """
    codes = {'Female': 2, 'Male': 1, 'Other': 0}
    return codes.get(x)
    
def rel_experience(x):
    """Encode relevent_experience: 'Has relevent experience' -> 1, 'No relevent experience' -> 0.

    Any other value returns None.
    """
    codes = {'Has relevent experience': 1, 'No relevent experience': 0}
    return codes.get(x)
    
def enrollment(x):
    """Encode enrolled_university: 'no_enrollment' -> 0, 'Full time course' -> 1, 'Part time course' -> 2.

    Any other value returns None.
    """
    codes = {'no_enrollment': 0, 'Full time course': 1, 'Part time course': 2}
    return codes.get(x)
    
def edu_level(x):
    """Encode education_level as an integer code.

    'Graduate' -> 0, 'Masters' -> 1, 'High School' -> 2, 'Phd' -> 3,
    'Primary School' -> 4. Any other value returns None.
    """
    codes = {
        'Graduate': 0,
        'Masters': 1,
        'High School': 2,
        'Phd': 3,
        'Primary School': 4,
    }
    return codes.get(x)
    
def major(x):
    """Encode major_discipline as an integer code.

    'STEM' -> 0, 'Business Degree' -> 1, 'Arts' -> 2, 'Humanities' -> 3,
    'No Major' -> 4, 'Other' -> 5. Any other value returns None.
    """
    codes = {
        'STEM': 0,
        'Business Degree': 1,
        'Arts': 2,
        'Humanities': 3,
        'No Major': 4,
        'Other': 5,
    }
    return codes.get(x)
    
def experience(x):
    """Encode the experience string as an integer number of years.

    '<1' -> 0, '1'..'20' -> 1..20, '>20' -> 21. Any other value returns None.
    """
    # The plain year strings map to their own integer value.
    codes = {str(years): years for years in range(1, 21)}
    # The two open-ended buckets sit just outside that range.
    codes['<1'] = 0
    codes['>20'] = 21
    return codes.get(x)
    
def company_t(x):
    """Encode company_type as an integer code.

    'Pvt Ltd' -> 0, 'Funded Startup' -> 1, 'Early Stage Startup' -> 2,
    'Other' -> 3, 'Public Sector' -> 4, 'NGO' -> 5. Any other value returns None.
    """
    codes = {
        'Pvt Ltd': 0,
        'Funded Startup': 1,
        'Early Stage Startup': 2,
        'Other': 3,
        'Public Sector': 4,
        'NGO': 5,
    }
    return codes.get(x)
    
def company_s(x):
    """Encode company_size as an integer code.

    '<10' -> 0, '10/49' -> 1, '100-500' -> 2, '1000-4999' -> 3, '10000+' -> 4,
    '50-99' -> 5, '500-999' -> 6, '5000-9999' -> 7. Any other value returns None.
    Note the codes follow this listing order, not the size order.
    """
    codes = {
        '<10': 0,
        '10/49': 1,
        '100-500': 2,
        '1000-4999': 3,
        '10000+': 4,
        '50-99': 5,
        '500-999': 6,
        '5000-9999': 7,
    }
    return codes.get(x)
    
def last_job(x):
    """Encode last_new_job: 'never' -> 0, '1'..'4' -> 1..4, '>4' -> 5.

    Any other value returns None.
    """
    codes = {'never': 0, '1': 1, '2': 2, '3': 3, '4': 4, '>4': 5}
    return codes.get(x)
    
# Numeric suffixes of the known city codes; a city's position in this tuple is its integer code.
_CITY_NUMBERS_IN_CODE_ORDER = (
    103, 40, 21, 115, 162, 176, 160, 46, 61, 114,
    13, 159, 102, 67, 100, 16, 71, 104, 64, 101,
    83, 105, 73, 75, 41, 11, 93, 90, 36, 20,
    57, 152, 19, 65, 74, 173, 136, 98, 97, 50,
    138, 82, 157, 89, 150, 70, 175, 94, 28, 59,
    165, 145, 142, 26, 12, 37, 43, 116, 23, 99,
    149, 10, 45, 80, 128, 158, 123, 7, 72, 106,
    143, 78, 109, 24, 134, 48, 144, 91, 146, 133,
    126, 118, 9, 167, 27, 84, 54, 39, 79, 76,
    77, 81, 131, 44, 117, 155, 33, 141, 127, 62,
    53, 25, 2, 69, 120, 111, 30, 1, 140, 179,
    55, 14, 42, 107, 18, 139, 180, 166, 121, 129,
    8, 31, 171,
)
# Lookup table from the full 'city_<n>' string to its integer code (0..122).
_CITY_CODES = {
    'city_{}'.format(number): code
    for code, number in enumerate(_CITY_NUMBERS_IN_CODE_ORDER)
}


def city(x):
    """Encode a 'city_<n>' string as its integer code (0..122).

    Codes follow the fixed listing order above ('city_103' -> 0, ..., 'city_171' -> 122).
    Any value that is not one of the listed city strings returns None.
    """
    return _CITY_CODES.get(x)

In [None]:
# Replace every categorical column of `train` with the integer code from its encoder.
# Values an encoder does not recognise come back as None and show up as missing afterwards.
train_encoders = {
    'gender': gender_to_numeric,
    'relevent_experience': rel_experience,
    'enrolled_university': enrollment,
    'education_level': edu_level,
    'major_discipline': major,
    'experience': experience,
    'company_type': company_t,
    'company_size': company_s,
    'last_new_job': last_job,
    'city': city,
}
for column_name, encoder in train_encoders.items():
    train[column_name] = train[column_name].apply(encoder)

train

## Check Missing Value

In [None]:
# True if at least one cell anywhere in the training frame is null.
print("Any missing sample in training set:",train.isnull().values.any())


## Train : Replace nan values with average of columns


In [None]:
# Fill the missing codes of each encoded column with that column's mean.
# The previous version filled company_type twice and never touched education_level,
# experience or last_new_job, whose encoders also yield missing values for
# unrecognised input. Filling a column that has no gaps is a no-op.
columns_to_impute = [
    'gender',
    'enrolled_university',
    'education_level',
    'major_discipline',
    'experience',
    'company_size',
    'company_type',
    'last_new_job',
]
for column_name in columns_to_impute:
    train[column_name] = train[column_name].fillna(train[column_name].mean())

train

# Testing Data

In [None]:
# Read the test CSV from the same dataset folder into a DataFrame.
test= pd.read_csv('../input/hr-analytics-job-change-of-data-scientists/aug_test.csv')
# Preview the first rows as the cell output.
test.head()


In [None]:
# Apply to `test` the same encoders that were used on the training columns.
# Values an encoder does not recognise come back as None and show up as missing afterwards.
test_encoders = {
    'gender': gender_to_numeric,
    'relevent_experience': rel_experience,
    'enrolled_university': enrollment,
    'education_level': edu_level,
    'major_discipline': major,
    'experience': experience,
    'company_type': company_t,
    'company_size': company_s,
    'last_new_job': last_job,
    'city': city,
}
for column_name, encoder in test_encoders.items():
    test[column_name] = test[column_name].apply(encoder)

test

In [None]:
# True if at least one cell anywhere in the test frame is null.
print("Any missing sample in test set:",test.isnull().values.any(), "\n")

## Test : Replace nan values with average of columns

In [None]:

# Fill the missing codes of each encoded column with that column's mean.
# The previous version filled company_type twice and never touched education_level,
# experience or last_new_job, whose encoders also yield missing values for
# unrecognised input. Filling a column that has no gaps is a no-op.
# NOTE(review): the means come from the test frame itself; consider reusing the
# training-set means so both frames are imputed with the same values.
columns_to_impute = [
    'gender',
    'enrolled_university',
    'education_level',
    'major_discipline',
    'experience',
    'company_size',
    'company_type',
    'last_new_job',
]
for column_name in columns_to_impute:
    test[column_name] = test[column_name].fillna(test[column_name].mean())

test

In [None]:
#Select feature column names and target variable we are going to use for training
# NOTE(review): the list includes enrollee_id, which the feature description calls a
# unique ID — confirm it is meant to be a model input.

features =["enrollee_id","city","city_development_index","gender","relevent_experience","enrolled_university","education_level","major_discipline","experience","company_size","company_type","last_new_job","training_hours"]
target = 'target'

In [None]:
#This is the input our classifier will use: the first 10 rows of the feature columns.
train[features].head(10)

In [None]:
#Display the first 100 target values as an array
train[target].head(100).values

# Model

CatBoost is an algorithm for gradient boosting on decision trees. It is developed by Yandex researchers and engineers, and is used for search, recommendation systems, personal assistant, self-driving cars, weather prediction and many other tasks at Yandex and in other companies, including CERN, Cloudflare, Careem taxi. It is in open-source and can be used by anyone.
Reference : https://catboost.ai/

In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix is every column except the target; hold out 15% of the rows,
# with a fixed random_state so the split is repeatable.
X = train.drop(columns=['target'])
Y = train['target']
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=9,
    test_size=0.15,
)


In [None]:
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import cross_val_score


# We define the model
# NOTE(review): this is a regressor with an RMSE objective fit on the 0/1 target, so
# predict() returns continuous scores rather than class labels — confirm this is intended.
model = CatBoostRegressor(objective='RMSE')

# We train model
# Fit on the training part of the split (the part left after holding out test_size=0.15).
model.fit(X_train, Y_train)

# Applying the K-Fold Cross Validation for CatBoostRegressor

In [None]:
#Applying the K-Fold Cross Validation for CatBoostRegressor
from sklearn.model_selection import cross_val_score
# The estimator is a regressor, so the cross-validated score is R^2, not accuracy.
# The scoring is requested explicitly and the printed labels say what is measured;
# R^2 is shown as a plain number because it is not a percentage.
r2_scores = cross_val_score(estimator = model, X = X_train, y = Y_train, cv = 5, scoring = "r2")
print("Mean R^2: {:.4f}".format(r2_scores.mean()))
print("Standard Deviation of R^2: {:.4f}".format(r2_scores.std()))


# Prediction

In [None]:
# We predict target values (Split 15% from training data)
# Y_predict holds the model's raw predictions for the held-out rows in X_test.
Y_predict = model.predict(X_test)
Y_predict

In [None]:
#Make predictions using the features from the test data set
# Only the columns named in `features` are passed to the model.
predictions = model.predict(test[features])

predictions

# Measure AUC


The AUC is an estimate of the probability that a classifier will rank a randomly chosen positive instance higher than a randomly chosen negative instance. Reference : https://www.kdnuggets.com/2010/09/pub-is-auc-the-best-measure.html#:~:text=www.riceanalytics.com-,The%20area%20under%20the%20curve%20(AUC)%20that%20relates%20the%20hit,a%20randomly%20chosen%20negative%20instance.

In [None]:
#Test (Split 15% from training data)
from sklearn import metrics
# ROC curve from the held-out labels and the model's predictions, then the area under it.
fpr, tpr, thresholds = metrics.roc_curve(Y_test, Y_predict)
metrics.auc(fpr, tpr)

In [None]:
from sklearn import metrics
# AUC over the whole training frame.
# NOTE(review): the model was fit on X_train, which is a split of this same frame, so
# most of these rows were seen during fitting and this figure is largely in-sample.
fpr, tpr, thresholds = metrics.roc_curve(train[target],  model.predict(train[features]))
metrics.auc(fpr, tpr)

In [None]:
#Create a DataFrame pairing each test enrollee_id with its predicted target value
submission = pd.DataFrame({'enrollee_id':test['enrollee_id'],'target':predictions})
                        

#Visualize the first 10 rows
submission.head(10)

In [None]:
# Write the submission frame to a CSV file in the notebook's working directory,
# without the index column, then report the file name.
filename = 'submission.csv'
submission.to_csv(filename, index=False)
print(f'Saved file: {filename}')