[View in Colaboratory](https://colab.research.google.com/github/avs20/GNoidaMLBatch/blob/master/AnalyticsVidhyaWNSclass.ipynb)

## How to start with Machine Learning Competitions

This notebook shows how to start working with competition data and how to get a solid baseline quickly. 

This notebook was prepared live for the class at Sio Labs; further optimization and tuning can be done from this starting point. 

## Load Data


In [1]:
import pandas as pd
import numpy as np


# Read the competition training file into the working frame.
df_raw = pd.read_csv(filepath_or_buffer='train_LZdllcl.csv')

# Preview the first five rows.
df_raw.head(n=5)

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


In [2]:
# Print dtypes and non-null counts per column to spot columns with missing values.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54808 entries, 0 to 54807
Data columns (total 14 columns):
employee_id             54808 non-null int64
department              54808 non-null object
region                  54808 non-null object
education               52399 non-null object
gender                  54808 non-null object
recruitment_channel     54808 non-null object
no_of_trainings         54808 non-null int64
age                     54808 non-null int64
previous_year_rating    50684 non-null float64
length_of_service       54808 non-null int64
KPIs_met >80%           54808 non-null int64
awards_won?             54808 non-null int64
avg_training_score      54808 non-null int64
is_promoted             54808 non-null int64
dtypes: float64(1), int64(8), object(5)
memory usage: 5.9+ MB


`education` and `previous_year_rating` have missing values (52,399 and 50,684 non-null entries out of 54,808 rows, respectively).

In [4]:
# Summary statistics of the target; the mean is the share of promoted employees.
df_raw.is_promoted.describe()

count    54808.000000
mean         0.085170
std          0.279137
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: is_promoted, dtype: float64

In [5]:
# Summary statistics of the age column.
df_raw.age.describe()

count    54808.000000
mean        34.803915
std          7.660169
min         20.000000
25%         29.000000
50%         33.000000
75%         39.000000
max         60.000000
Name: age, dtype: float64

In [6]:
# Absolute class counts of the target (not promoted vs. promoted).
df_raw.is_promoted.value_counts()

0    50140
1     4668
Name: is_promoted, dtype: int64

## Split the data

In [9]:
# Target is `is_promoted`; every other column is a feature.
y = df_raw['is_promoted']
X = df_raw.drop(columns='is_promoted')

# Positional hold-out split: first 45000 rows train, the remainder test.
# Features are handed over as a plain numpy array, labels stay a Series.
features = X.values
train_x, test_x = features[:45000], features[45000:]
train_y, test_y = y.iloc[:45000], y.iloc[45000:]

# Shapes of the four pieces, one per line.
print(train_x.shape, train_y.shape, test_x.shape, test_y.shape, sep='\n')

(45000, 13)
(45000,)
(9808, 13)
(9808,)


In [10]:
# Class counts of the hold-out labels, to check the split kept both classes.
print(test_y.value_counts())

0    8959
1     849
Name: is_promoted, dtype: int64


In [11]:
# Class counts of the training labels, for comparison with the hold-out counts.
print(train_y.value_counts())

0    41181
1     3819
Name: is_promoted, dtype: int64


## Train the model

In [12]:
from sklearn.ensemble import RandomForestClassifier
# f1_score lives in `sklearn.metrics` (plural); `sklearn.metric` is not a module.
from sklearn.metrics import f1_score

model = RandomForestClassifier()

# NOTE(review): train_x is built from the raw frame, which still holds string
# columns (department, region, education, ...). The recorded output of this
# cell is a ValueError, presumably raised here because those strings cannot be
# converted to numbers -- the cells that follow impute and encode the data.
model.fit(train_x, train_y)

predictions = model.predict(test_x)

# F1 on the hold-out rows.
f1_score(test_y, predictions)

ValueError: ignored

In [13]:
# Look at the first five rows again to see which columns are non-numeric.
df_raw.head(n=5)

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


In [15]:
# Rows whose education value is missing.
df_raw.loc[df_raw['education'].isna()]

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
10,29934,Technology,region_23,,m,sourcing,1,30,,1,0,0,77,0
21,33332,Operations,region_15,,m,sourcing,1,41,4.0,11,0,0,57,0
32,35465,Sales & Marketing,region_7,,f,sourcing,1,24,1.0,2,0,0,48,0
43,17423,Sales & Marketing,region_2,,m,other,3,24,2.0,2,0,0,48,0
82,66013,Sales & Marketing,region_2,,m,sourcing,2,25,3.0,2,0,0,53,0
87,69094,Sales & Marketing,region_2,,m,sourcing,1,39,1.0,9,0,0,49,0
90,62658,Sales & Marketing,region_2,,f,sourcing,1,20,,1,0,0,55,0
189,6254,Operations,region_2,,f,other,1,33,4.0,9,0,0,64,0
204,60761,Operations,region_16,,f,other,1,31,4.0,2,0,0,62,0
231,57235,Sales & Marketing,region_26,,m,other,1,22,4.0,2,0,0,51,0


In [16]:
# Frequency of each education level (missing values are not counted).
df_raw.education.value_counts()

Bachelor's          36669
Master's & above    14925
Below Secondary       805
Name: education, dtype: int64

In [17]:
# Impute missing education with the most frequent level ("Bachelor's").
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the selected Series: with pandas Copy-on-Write the
# selected column is a separate object, so the in-place form would leave
# df_raw unchanged.
df_raw['education'] = df_raw['education'].fillna("Bachelor's")

# Sanity check: this should now display an empty frame.
df_raw[df_raw['education'].isnull()]

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted


In [18]:
# Rows whose previous-year rating is missing.
df_raw.loc[df_raw['previous_year_rating'].isna()]

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
10,29934,Technology,region_23,Bachelor's,m,sourcing,1,30,,1,0,0,77,0
23,71177,Procurement,region_5,Bachelor's,m,other,1,27,,1,0,0,70,0
29,74759,Sales & Marketing,region_4,Bachelor's,m,sourcing,1,26,,1,0,0,44,0
56,45709,Sales & Marketing,region_31,Bachelor's,f,other,1,29,,1,0,0,49,0
58,26599,Sales & Marketing,region_16,Bachelor's,m,other,2,27,,1,1,0,47,0
62,9150,Analytics,region_22,Bachelor's,f,other,1,28,,1,1,0,80,0
66,77981,Finance,region_22,Bachelor's,m,other,1,27,,1,1,1,58,1
67,16502,Sales & Marketing,region_22,Bachelor's,m,sourcing,1,27,,1,0,0,61,1
84,44575,Legal,region_7,Bachelor's,m,other,1,29,,1,0,0,65,1
89,9589,Sales & Marketing,region_31,Bachelor's,f,other,1,31,,1,0,0,51,0


In [20]:
# Frequency of each previous-year rating (missing values are not counted).
df_raw['previous_year_rating'].value_counts()

3.0    18618
5.0    11741
4.0     9877
1.0     6223
2.0     4225
Name: previous_year_rating, dtype: int64

In [21]:
# Impute missing ratings with 3.0, the most frequent rating.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the selected Series: with pandas Copy-on-Write the
# selected column is a separate object, so the in-place form would leave
# df_raw unchanged.
df_raw['previous_year_rating'] = df_raw['previous_year_rating'].fillna(3.0)

# Sanity check: this should now display an empty frame.
df_raw[df_raw['previous_year_rating'].isnull()]

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted


In [22]:
# Remaining missing values per column; every entry should be zero by now.
df_raw.isna().sum()

employee_id             0
department              0
region                  0
education               0
gender                  0
recruitment_channel     0
no_of_trainings         0
age                     0
previous_year_rating    0
length_of_service       0
KPIs_met >80%           0
awards_won?             0
avg_training_score      0
is_promoted             0
dtype: int64

*Missing values are handled; next, the categorical variables.*

In [23]:
# Education levels after imputation; the filled rows are now counted as "Bachelor's".
df_raw['education'].value_counts()

Bachelor's          39078
Master's & above    14925
Below Secondary       805
Name: education, dtype: int64

In [25]:
# Preview one-hot encoding on a single column; drop_first removes the first
# level, so an all-zero row stands for that dropped level.
pd.get_dummies(df_raw['education'], drop_first=True)

Unnamed: 0,Below Secondary,Master's & above
0,0,1
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,1
8,0,0
9,0,1


In [26]:
# Number of columns before encoding.
print(df_raw.shape[1])

# One-hot encode every categorical column of the frame, dropping the first
# level of each, and keep the result under the same name.
df_raw = pd.get_dummies(data=df_raw, drop_first=True)

# Number of columns after encoding.
print(df_raw.shape[1])


14
55


In [27]:
# First five rows of the encoded frame.
df_raw.head(n=5)

Unnamed: 0,employee_id,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted,department_Finance,...,region_region_5,region_region_6,region_region_7,region_region_8,region_region_9,education_Below Secondary,education_Master's & above,gender_m,recruitment_channel_referred,recruitment_channel_sourcing
0,65438,1,35,5.0,8,1,0,49,0,0,...,0,0,1,0,0,0,1,0,0,1
1,65141,1,30,5.0,4,0,0,60,0,0,...,0,0,0,0,0,0,0,1,0,0
2,7513,1,34,3.0,7,0,0,50,0,0,...,0,0,0,0,0,0,0,1,0,1
3,2542,2,39,1.0,10,0,0,50,0,0,...,0,0,0,0,0,0,0,1,0,0
4,48945,1,45,3.0,2,0,0,73,0,0,...,0,0,0,0,0,0,0,1,0,0


In [0]:
# Persist the imputed, one-hot-encoded frame (without the index column).
df_raw.to_csv(path_or_buf='processed_data.csv', index=False)

In [29]:
# Rebuild features/target from the encoded frame.
y = df_raw['is_promoted']
X = df_raw.drop(columns='is_promoted')

# Same positional hold-out split as before: first 45000 rows train, rest test.
# Features are handed over as a plain numpy array, labels stay a Series.
features = X.values
train_x, test_x = features[:45000], features[45000:]
train_y, test_y = y.iloc[:45000], y.iloc[45000:]

# Shapes of the four pieces, one per line.
print(train_x.shape, train_y.shape, test_x.shape, test_y.shape, sep='\n')

(45000, 54)
(45000,)
(9808, 54)
(9808,)


In [31]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# Baseline: default hyper-parameters. The seed is fixed (42, the same value the
# tuning cells use) so the baseline score is reproducible across runs.
# NOTE(review): the score recorded under this cell came from an unseeded run,
# so a re-run will not reproduce it digit for digit.
model = RandomForestClassifier(random_state=42)

model.fit(train_x, train_y)

predictions = model.predict(test_x)

# F1 on the hold-out rows.
f1_score(test_y, predictions)

0.37556154537286607

### Tuning the model

In [39]:
# 20 trees, fixed seed, all CPU cores.
model = RandomForestClassifier(
    n_estimators=20,
    random_state=42,
    n_jobs=-1,
)
model.fit(train_x, train_y)

# F1 on the hold-out rows.
predictions = model.predict(test_x)
f1_score(y_true=test_y, y_pred=predictions)

0.4224598930481283

In [40]:
# Double the forest size to 40 trees; everything else unchanged.
model = RandomForestClassifier(
    n_estimators=40,
    random_state=42,
    n_jobs=-1,
)
model.fit(train_x, train_y)

# F1 on the hold-out rows.
predictions = model.predict(test_x)
f1_score(y_true=test_y, y_pred=predictions)

0.42424242424242425

In [44]:
# Back to the 20-tree configuration (same settings as the first tuning run).
model = RandomForestClassifier(
    n_estimators=20,
    random_state=42,
    n_jobs=-1,
)
model.fit(train_x, train_y)

# F1 on the hold-out rows.
predictions = model.predict(test_x)
f1_score(y_true=test_y, y_pred=predictions)

0.4224598930481283

In [45]:
# Very shallow trees: cap the depth at 3.
model = RandomForestClassifier(
    max_depth=3,
    n_estimators=20,
    random_state=42,
    n_jobs=-1,
)
model.fit(train_x, train_y)

# F1 on the hold-out rows.
predictions = model.predict(test_x)
f1_score(y_true=test_y, y_pred=predictions)

0.007042253521126761

In [50]:
# Much deeper trees: cap the depth at 25.
model = RandomForestClassifier(
    max_depth=25,
    n_estimators=20,
    random_state=42,
    n_jobs=-1,
)
model.fit(train_x, train_y)

# F1 on the hold-out rows.
predictions = model.predict(test_x)
f1_score(y_true=test_y, y_pred=predictions)

0.432286995515695

In [51]:
# Consider half of the features at each split, depth still capped at 25.
model = RandomForestClassifier(
    max_features=0.5,
    max_depth=25,
    n_estimators=20,
    random_state=42,
    n_jobs=-1,
)
model.fit(train_x, train_y)

# F1 on the hold-out rows.
predictions = model.predict(test_x)
f1_score(y_true=test_y, y_pred=predictions)

0.4805414551607445

In [55]:
# Raise the per-split feature fraction to 0.7.
model = RandomForestClassifier(
    max_features=0.7,
    max_depth=25,
    n_estimators=20,
    random_state=42,
    n_jobs=-1,
)
model.fit(train_x, train_y)

# F1 on the hold-out rows.
predictions = model.predict(test_x)
f1_score(y_true=test_y, y_pred=predictions)

0.4847986852917009

In [60]:
# Add balanced class weights and require at least 10 samples to split a node.
# NOTE(review): the score recorded for this cell is identical to the one
# recorded for the later run without class_weight -- re-run to confirm which
# configuration actually produced it.
model = RandomForestClassifier(
    class_weight='balanced',
    min_samples_split=10,
    max_features=0.7,
    max_depth=25,
    n_estimators=20,
    random_state=42,
    n_jobs=-1,
)
model.fit(train_x, train_y)

# F1 on the hold-out rows.
predictions = model.predict(test_x)
f1_score(y_true=test_y, y_pred=predictions)

0.4878457669740151

In [65]:
# Final configuration: min_samples_split=10 without class weights.
# This is the model that the submission section below predicts with.
# NOTE(review): the recorded score equals the one recorded for the
# class_weight='balanced' run above -- re-run both to confirm.
model = RandomForestClassifier(
    min_samples_split=10,
    max_features=0.7,
    max_depth=25,
    n_estimators=20,
    random_state=42,
    n_jobs=-1,
)
model.fit(train_x, train_y)

# F1 on the hold-out rows.
predictions = model.predict(test_x)
f1_score(y_true=test_y, y_pred=predictions)

0.4878457669740151

## Generate submission file for website



In [66]:
# Read the competition test file (no `is_promoted` column).
test_df = pd.read_csv(filepath_or_buffer='test_2umaH9m.csv')

# Preview the first five rows.
test_df.head(n=5)

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,8724,Technology,region_26,Bachelor's,m,sourcing,1,24,,1,1,0,77
1,74430,HR,region_4,Bachelor's,f,other,1,31,3.0,5,0,0,51
2,72255,Sales & Marketing,region_13,Bachelor's,m,other,1,31,1.0,4,0,0,47
3,38562,Procurement,region_2,Bachelor's,f,other,3,31,2.0,9,0,0,65
4,64486,Finance,region_29,Bachelor's,m,sourcing,1,30,4.0,7,0,0,61


In [67]:
# Print dtypes and non-null counts per column to spot missing values in the test file.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23490 entries, 0 to 23489
Data columns (total 13 columns):
employee_id             23490 non-null int64
department              23490 non-null object
region                  23490 non-null object
education               22456 non-null object
gender                  23490 non-null object
recruitment_channel     23490 non-null object
no_of_trainings         23490 non-null int64
age                     23490 non-null int64
previous_year_rating    21678 non-null float64
length_of_service       23490 non-null int64
KPIs_met >80%           23490 non-null int64
awards_won?             23490 non-null int64
avg_training_score      23490 non-null int64
dtypes: float64(1), int64(7), object(5)
memory usage: 2.3+ MB


In [68]:
# Apply the same imputation used on the training frame: "Bachelor's" for
# education and 3.0 for previous_year_rating.
# Results are assigned back to the columns rather than calling
# fillna(inplace=True) on the selected Series: with pandas Copy-on-Write the
# selected column is a separate object, so the in-place form would leave
# test_df unchanged.
test_df['education'] = test_df['education'].fillna("Bachelor's")
test_df['previous_year_rating'] = test_df['previous_year_rating'].fillna(3.0)

# Remaining missing values per column; every entry should be zero.
test_df.isnull().sum()

employee_id             0
department              0
region                  0
education               0
gender                  0
recruitment_channel     0
no_of_trainings         0
age                     0
previous_year_rating    0
length_of_service       0
KPIs_met >80%           0
awards_won?             0
avg_training_score      0
dtype: int64

In [69]:
# One-hot encode the test frame the same way as the training frame.
test_df = pd.get_dummies(test_df, drop_first=True)

# Align to the training feature columns (X is the encoded training feature
# frame from the split cell): same columns, same order. A level that is absent
# from the test file becomes an all-zero column and an unseen level is dropped,
# instead of silently shifting which column the model reads as which feature.
test_df = test_df.reindex(columns=X.columns, fill_value=0)

# Must equal the number of features the model was trained on.
len(test_df.columns)

54

In [71]:
# The model was fitted on a plain numpy array (X.values), so pass an array
# here as well; handing a DataFrame to an estimator fitted without feature
# names triggers a feature-name mismatch warning in recent scikit-learn.
predictions = model.predict(test_df.values)

# Submission format: one row per employee with the predicted label.
employee_id = test_df['employee_id'].tolist()

submit_dict = {
    'employee_id': employee_id,
    'is_promoted': predictions,
}

submit_df = pd.DataFrame(submit_dict)

# Preview the first five rows.
submit_df.head()

  


Unnamed: 0,employee_id,is_promoted
0,8724,0
1,74430,0
2,72255,0
3,38562,0
4,64486,0


In [0]:
# Write the submission file for upload (index column omitted).
submit_df.to_csv(path_or_buf='submission10.csv', index=False)

## Assignment


Participate in as many competitions as you can over the next 4 days and see how well you can do in them. A good website for competitions is [Kaggle](https://www.kaggle.com).

## References

1. [AnalyticsVidhya Competition](https://datahack.analyticsvidhya.com/contest/wns-analytics-hackathon-2018/)
2. [F1-Score](https://machinelearningmastery.com/classification-accuracy-is-not-enough-more-performance-measures-you-can-use/)
3. [RandomForestClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)
