In [66]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn import metrics
from sklearn.metrics import  mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.preprocessing import PolynomialFeatures

In [3]:
%matplotlib inline

In [4]:
dft= pd.read_csv ('train.csv', delimiter='|')
dfp= pd.read_csv('predict-case.csv', delimiter='|')

In [5]:
# Load Train Set
print(f"Train set shape:\n{dft.shape}\n")

#Load Predict Set
print(f"Predict set shape :\n{dfp.shape}\n")

Train set shape:
(31746, 15)

Predict set shape :
(3000, 14)



In [6]:
dft.isnull().sum()

id                          0
job_title                   0
location                    0
salary_currency             4
career_level                0
experience_level         4292
education_level             0
employment_type          1344
job_function                0
job_benefits             6667
company_process_time     9144
company_size             5163
company_industry         1514
job_description             1
salary                  25394
dtype: int64

In [7]:
dfp['salary']= ""


In [8]:
dft1 = dft[~(dft.salary.isnull())].reset_index(drop = True)
del dft1['id']

dft1.shape

(6352, 14)

In [9]:
dft1[dft1.salary_currency!='IDR']

Unnamed: 0,job_title,location,salary_currency,career_level,experience_level,education_level,employment_type,job_function,job_benefits,company_process_time,company_size,company_industry,job_description,salary
31,Digital and Social Media Executive (Travel & L...,Jakarta Pusat,USD,Pegawai (non-manajemen & non-supervisor),2 tahun,Sarjana (S1),Penuh Waktu,"Penjualan / Pemasaran,Digital Marketing","Waktu regular, Senin - Jumat",29 days,1- 50 pekerja,Periklanan/Marketing/Promosi/Hubungan Masyarakat,DIGITAL / SOCIAL MEDIA EXECUTIVEWe are recruit...,665.0
6174,Registered Nurse Aesthetic and Wellness,Bali,USD,Pegawai (non-manajemen & non-supervisor),2 tahun,Sarjana (S1),Penuh Waktu,"Layanan Kesehatan,Praktisi/Asisten Medis",Tip;Bisnis (contoh: Kemeja);Mon - Sat,,1- 50 pekerja,Perawatan/Kecantikan/Fitnes,Registered Nurse Aesthetic and WellnessMaldive...,1005.0


In [10]:
dft1.salary = np.where(dft1.salary_currency!='IDR', dft1.salary*15000, dft1.salary)
dft2 = dft1[(dft1.salary>=200000)].reset_index(drop=True)
dft2.sample(1)


Unnamed: 0,job_title,location,salary_currency,career_level,experience_level,education_level,employment_type,job_function,job_benefits,company_process_time,company_size,company_industry,job_description,salary
1126,Network & System Supervisor - IT Consulting (A...,Jakarta Selatan,IDR,Supervisor/Koordinator,3 tahun,"Sertifikat Professional, D3 (Diploma), D4 (Dip...",Penuh Waktu,"Komputer/Teknologi Informasi,IT-Admin Jaringan...",,19 days,1- 50 pekerja,Manajemen/Konsulting HR,About The Company:The working venue is in Jaka...,10000000.0


In [11]:
dft2.shape

(6348, 14)

## Feature Engineering
* career_level
* experience_level
* education_level
* employment_type
* job_function
* job_benefits
* company_size
* company_industry
* job_description

In [12]:
dfp['employment_type'].replace('Penuh Waktu, Kontrak', 'Penuh Waktu',inplace=True)
dfp['employment_type'].replace('Penuh Waktu, Paruh Waktu', 'Penuh Waktu',inplace=True)
dfp['employment_type'].replace('Kontrak, Temporer', 'Kontrak',inplace=True)

In [13]:
dfp.groupby(['employment_type']).size()

employment_type
Kontrak         398
Magang            9
Paruh Waktu      26
Penuh Waktu    2555
Temporer         12
dtype: int64

In [14]:
dft2['employment_type'].replace('Penuh Waktu, Kontrak', 'Penuh Waktu',inplace=True)
dft2['employment_type'].replace('Penuh Waktu, Magang', 'Penuh Waktu',inplace=True)
dft2['employment_type'].replace('Penuh Waktu, Paruh Waktu', 'Penuh Waktu',inplace=True)

In [15]:
dft2.groupby(['employment_type']).size()

employment_type
Kontrak         817
Magang           26
Paruh Waktu      56
Penuh Waktu    5431
Temporer         18
dtype: int64

In [16]:
dft2.groupby(['experience_level']).size()

experience_level
1 tahun     2093
10 tahun      62
11 tahun       1
12 tahun       7
15 tahun       8
17 tahun       1
2 tahun     1741
20 tahun       3
3 tahun     1098
4 tahun      123
5 tahun      599
6 tahun       15
7 tahun       35
8 tahun       29
dtype: int64

In [17]:
dfp['experience_level'].replace('Lebih dari 20 Tahun', '21 tahun',inplace=True)
dfp['company_size'].replace('Lebihdari5000', '5000',inplace=True)
dft2['company_size'].replace('Lebihdari5000', '5000', inplace=True)

In [18]:
dfp.groupby(['experience_level']).size()

experience_level
1 tahun     988
10 tahun     28
12 tahun      1
15 tahun      3
2 tahun     796
20 tahun      2
21 tahun      1
3 tahun     540
4 tahun      55
5 tahun     299
6 tahun       9
7 tahun      17
8 tahun      12
dtype: int64

In [19]:
dfp.groupby(['company_size']).size()

company_size
1- 50 pekerja              925
1001 - 2000 pekerja        172
2001 - 5000 pekerja        101
201 - 500 pekerja          304
501 - 1000 pekerja         201
51 - 200 pekerja           720
Lebih dari 5000 pekerja     97
dtype: int64

In [20]:
dft2.groupby(['company_size']).size()

company_size
1- 50 pekerja              1889
1001 - 2000 pekerja         337
2001 - 5000 pekerja         198
201 - 500 pekerja           706
501 - 1000 pekerja          430
51 - 200 pekerja           1607
Lebih dari 5000 pekerja     237
dtype: int64

In [21]:
dft2.groupby(['experience_level']).size()

experience_level
1 tahun     2093
10 tahun      62
11 tahun       1
12 tahun       7
15 tahun       8
17 tahun       1
2 tahun     1741
20 tahun       3
3 tahun     1098
4 tahun      123
5 tahun      599
6 tahun       15
7 tahun       35
8 tahun       29
dtype: int64

In [22]:
#train
dft2['experience_level2'] = dft2['experience_level'].fillna('1 tahun')
dft2['f1'] = dft2['experience_level2'].apply(lambda x : x.split(' ')[0]).astype(int)

# Prediction Case
dfp['experience_level2'] = dfp['experience_level'].fillna('1 tahun')
dfp['f1'] = dfp['experience_level2'].apply(lambda x : x.split(' ')[0])

In [23]:
le = LabelEncoder()
le.fit(dft2.career_level)

# Prediction Case
le = LabelEncoder()
le.fit(dfp.career_level)

LabelEncoder()

In [24]:
dft2['f2'] = le.transform(dft2.career_level)
# Prediction Case
dfp['f2'] = le.transform(dfp.career_level)

In [25]:
dft2[['f1','f2','salary']]

Unnamed: 0,f1,f2,salary
0,1,4,10500000.0
1,3,4,8000000.0
2,1,3,4750000.0
3,2,3,5250000.0
4,2,2,15000000.0
...,...,...,...
6343,1,3,5100000.0
6344,1,3,2400000.0
6345,5,3,7700000.0
6346,1,4,8000000.0


In [26]:
enc = OneHotEncoder()
enc.fit(dft2[['f2']])

# Prediction Case
enc = OneHotEncoder()
enc.fit(dfp[['f2']])

OneHotEncoder()

In [27]:
#predict
new_cols = ['f2'+ str(i) for i in range(5)]
df_onehot = pd.DataFrame(enc.transform(dft2[['f2']]).toarray(), columns=new_cols)
df1 = pd.concat([dft2[['f1','salary']], df_onehot], axis = 1)

# Prediction Case
new_cols_pred = ['f2'+ str(i) for i in range(5)]
df_onehot_pred = pd.DataFrame(enc.transform(dfp[['f2']]).toarray(), columns=new_cols_pred)
dfp1 = pd.concat([dfp[['f1','salary']], df_onehot_pred], axis = 1)

In [28]:
df1.sample(1)

Unnamed: 0,f1,salary,f20,f21,f22,f23,f24
2680,3,6000000.0,0.0,0.0,0.0,0.0,1.0


In [29]:
dft2['company_size2'] = dft2['company_size'].fillna('1-50')

# Prediction Case
dfp['company_size2'] = dfp['company_size'].fillna('1-50')

In [30]:
dft2['company_size2'].replace(' pekerja','', regex=True, inplace=True)
dft2['company_size2'].replace(' ','', regex=True, inplace=True)

In [31]:
dfp['company_size2'].replace(' pekerja','', regex=True, inplace=True)
dfp['company_size2'].replace(' ','', regex=True, inplace=True)

In [32]:
le = LabelEncoder()
le.fit(dft2.company_size2)
dft2['cs'] = le.transform(dft2.company_size2)

le = LabelEncoder()
le.fit(dfp.company_size2)
dfp['cs'] = le.transform(dfp.company_size2)

In [33]:
enc = OneHotEncoder()
enc.fit(dft2[['cs']])

# Prediction Case
enc = OneHotEncoder()
enc.fit(dfp[['cs']])

OneHotEncoder()

In [34]:
new_cols1 = ['cs'+ str(i) for i in range(7)]
df_onehot1 = pd.DataFrame(enc.transform(dft2[['cs']]).toarray(), columns=new_cols1)
df2 = pd.concat([df1[['f1','f20', 'f21', 'f22', 'f23', 'f24','salary']], df_onehot1], axis = 1)
df2

Unnamed: 0,f1,f20,f21,f22,f23,f24,salary,cs0,cs1,cs2,cs3,cs4,cs5,cs6
0,1,0.0,0.0,0.0,0.0,1.0,10500000.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,3,0.0,0.0,0.0,0.0,1.0,8000000.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1,0.0,0.0,0.0,1.0,0.0,4750000.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,0.0,0.0,0.0,1.0,0.0,5250000.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2,0.0,0.0,1.0,0.0,0.0,15000000.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6343,1,0.0,0.0,0.0,1.0,0.0,5100000.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
6344,1,0.0,0.0,0.0,1.0,0.0,2400000.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
6345,5,0.0,0.0,0.0,1.0,0.0,7700000.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
6346,1,0.0,0.0,0.0,0.0,1.0,8000000.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [35]:
new_cols1_p = ['cs'+ str(i) for i in range(7)]
df_onehot1_p = pd.DataFrame(enc.transform(dfp[['cs']]).toarray(), columns=new_cols1_p)
dfp2 = pd.concat([dfp1[['f1','f20', 'f21', 'f22', 'f23', 'f24','salary']], df_onehot1_p], axis = 1)
dfp2.columns

Index(['f1', 'f20', 'f21', 'f22', 'f23', 'f24', 'salary', 'cs0', 'cs1', 'cs2',
       'cs3', 'cs4', 'cs5', 'cs6'],
      dtype='object')

In [40]:
dft2.head()

Unnamed: 0,job_title,location,salary_currency,career_level,experience_level,education_level,employment_type,job_function,job_benefits,company_process_time,company_size,company_industry,job_description,salary,experience_level2,f1,f2,company_size2,cs
0,KEPALA PABRIK,Balikpapan,IDR,Supervisor/Koordinator,,"Sertifikat Professional, D3 (Diploma), D4 (Dip...",Penuh Waktu,"Manufaktur,Manufaktur",Asuransi kesehatan;Bisnis (contoh: Kemeja);Pro...,18 days,51 - 200 pekerja,Manufaktur/Produksi,"Tedmond Groups membuka Lowongan Kepala Pabrik,...",10500000.0,1 tahun,1,4,51-200,5
1,Tax Supervisor,Banten,IDR,Supervisor/Koordinator,3 tahun,"Sertifikat Professional, D3 (Diploma), D4 (Dip...",Penuh Waktu,"Akuntansi / Keuangan,Audit & Pajak","Tip;Asuransi kesehatan;Waktu regular, Senin - ...",22 days,51 - 200 pekerja,Konstruksi/Bangunan/Teknik,ResponsibilitiesPrepare corporate income tax (...,8000000.0,3 tahun,3,4,51-200,5
2,Accounting Staff,Jakarta Pusat,IDR,Pegawai (non-manajemen & non-supervisor),1 tahun,"SMA, SMU/SMK/STM, Sertifikat Professional, D3 ...",Penuh Waktu,"Akuntansi / Keuangan,Akuntansi Umum / Pembiayaan",Tip;Asuransi kesehatan;Bisnis (contoh: Kemeja)...,,1- 50 pekerja,Manufaktur/Produksi,DESKRIPSI PEKERJAAN:Memeriksa dan/atau membuat...,4750000.0,1 tahun,1,3,1-50,0
3,Senior Staff Purchasing,Surabaya,IDR,Pegawai (non-manajemen & non-supervisor),2 tahun,"Sertifikat Professional, D3 (Diploma), D4 (Dip...",Penuh Waktu,"Manufaktur,Pembelian/Manajemen Material",Asuransi kesehatan;Jam Bekerja yang Panjang;Bi...,29 days,1- 50 pekerja,Elektrikal & Elektronik,Deskripsi Pekerjaan:Mampu melakukan rencana pe...,5250000.0,2 tahun,2,3,1-50,0
4,MARKETING COMMUNICATION & CHANNEL MANAGER,Jakarta Pusat,IDR,Manajer/Asisten Manajer,2 tahun,Tidak terspesifikasi,Penuh Waktu,"Penjualan / Pemasaran,Pemasaran/Pengembangan B...",Asuransi Gigi;Asuransi kesehatan;Parkir;Waktu ...,29 days,51 - 200 pekerja,"Konsultasi (IT, Ilmu Pengetahuan, Teknis & Tek...","SCOPE OF ROLE :To plan, develop, implement and...",15000000.0,2 tahun,2,2,51-200,5


### Encode Employment Type

In [42]:
le = LabelEncoder()
le.fit(dft2.employment_type)
dft2['et'] = le.transform(dft2.employment_type)

le = LabelEncoder()
le.fit(dfp.employment_type)
dfp['et'] = le.transform(dfp.employment_type)

In [43]:
enc = OneHotEncoder()
enc.fit(dft2[['et']])

# Prediction Case
enc = OneHotEncoder()
enc.fit(dfp[['et']])

OneHotEncoder()

In [44]:
new_cols2 = ['et'+ str(i) for i in range(5)]
df_onehot2 = pd.DataFrame(enc.transform(dft2[['et']]).toarray(), columns=new_cols2)
df3 = pd.concat([df2[['f1','f20', 'f21', 'f22', 'f23', 'f24','salary','cs0', 'cs1', 'cs2',
       'cs3', 'cs4', 'cs5', 'cs6']], df_onehot2], axis = 1)
df3.columns

Index(['f1', 'f20', 'f21', 'f22', 'f23', 'f24', 'salary', 'cs0', 'cs1', 'cs2',
       'cs3', 'cs4', 'cs5', 'cs6', 'et0', 'et1', 'et2', 'et3', 'et4'],
      dtype='object')

In [45]:
new_cols2_p = ['et'+ str(i) for i in range(5)]
df_onehot2_p = pd.DataFrame(enc.transform(dfp[['et']]).toarray(), columns=new_cols2_p)
dfp3 = pd.concat([dfp2[['f1','f20', 'f21', 'f22', 'f23', 'f24','cs0', 'cs1', 'cs2', 'cs3',
       'cs4', 'cs5', 'cs6','salary']], df_onehot2_p], axis = 1)
dfp3.columns

Index(['f1', 'f20', 'f21', 'f22', 'f23', 'f24', 'cs0', 'cs1', 'cs2', 'cs3',
       'cs4', 'cs5', 'cs6', 'salary', 'et0', 'et1', 'et2', 'et3', 'et4'],
      dtype='object')

In [46]:
X = df3[['f1']+new_cols+new_cols1+new_cols2]
y = df3['salary']

px = dfp3[['f1']+new_cols_pred+new_cols1_p+new_cols2_p]
py = dfp3['salary']

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Predict Case
px_train, px_pred, py_train, py_pred = train_test_split(px, py, test_size=0.25, random_state=42)

In [48]:
model1 = LinearRegression()
model2 = PolynomialFeatures(degree=1)
model3 = PolynomialFeatures(degree=2)
model4 = PolynomialFeatures(degree=3)
model5 = PolynomialFeatures(degree=4)


In [49]:
X_train

Unnamed: 0,f1,f20,f21,f22,f23,f24,cs0,cs1,cs2,cs3,cs4,cs5,cs6,et0,et1,et2,et3,et4
4072,2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3426,2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1173,1,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4669,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1561,2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3772,3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5191,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
5226,1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5390,1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [50]:
y_train

4072     2875000.0
3426     7500000.0
1173     7000000.0
4669     2600000.0
1561    11250000.0
           ...    
3772     6500000.0
5191     1500000.0
5226     7000000.0
5390     5640000.0
860      5750000.0
Name: salary, Length: 4761, dtype: float64

In [51]:
model1.fit(X_train, y_train)

LinearRegression()

In [52]:
print("Intercept: ", model1.intercept_)
print("coefficients: ")
list(zip(X, model1.coef_))

Intercept:  -7.324321964924532e+18
coefficients: 


[('f1', 1213599.59553213),
 ('f20', -2.0356385276727875e+17),
 ('f21', -2.0356385278136438e+17),
 ('f22', -2.0356385277673152e+17),
 ('f23', -2.0356385278123603e+17),
 ('f24', -2.035638527805292e+17),
 ('cs0', 7.887460461035694e+18),
 ('cs1', 7.887460461035794e+18),
 ('cs2', 7.887460461036169e+18),
 ('cs3', 7.887460461035817e+18),
 ('cs4', 7.887460461035604e+18),
 ('cs5', 7.887460461035796e+18),
 ('cs6', 7.8874604610365e+18),
 ('et0', -3.5957464332734214e+17),
 ('et1', -3.595746433288242e+17),
 ('et2', -3.595746433278889e+17),
 ('et3', -3.595746433264375e+17),
 ('et4', -3.595746433268475e+17)]

In [53]:
predictions = model1.predict(X_test)

In [54]:
print("Prediction for test set: {}".format(predictions))

Prediction for test set: [ 7130112.  6723584. 26043392. ... 14163968. 20348928.  4804608.]


In [55]:
model1_diff = pd.DataFrame({'Actual value': y_test, 'Predicted value': predictions})
model1_diff

Unnamed: 0,Actual value,Predicted value
730,7500000.0,7130112.0
2131,7500000.0,6723584.0
3070,18000000.0,26043392.0
6101,5500000.0,5409792.0
217,7200000.0,6623232.0
...,...,...
4329,10000000.0,10831872.0
925,3600000.0,5787648.0
5552,30000000.0,14163968.0
295,12000000.0,20348928.0


In [56]:
# Predict Case
submission = model1.predict(px)
submission.shape

(3000,)

In [57]:
py = dfp3.assign(salary = submission)

In [58]:
dfp['salary'] = py['salary']
dfp.head()

Unnamed: 0,id,job_title,location,salary_currency,career_level,experience_level,education_level,employment_type,job_function,job_benefits,...,company_size,company_industry,"job_description,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,",salary,experience_level2,f1,f2,company_size2,cs,et
0,31747,Sous Chef,Lombok,IDR,Pegawai (non-manajemen & non-supervisor),8 tahun,"Sertifikat Professional, D3 (Diploma), D4 (Dip...",Penuh Waktu,"Hotel/Restoran,Makanan/Minuman/Pelayanan Restoran",,...,,Makanan & Minuman/Katering/Restoran,Candidate must posses at least bachelor degree...,13197312.0,8 tahun,8,3,1-50,0,3
1,31748,"Bancassurance Officer (Area: Bali, Sulawesi Ut...",Palu,IDR,Pegawai (non-manajemen & non-supervisor),1 tahun,"Sertifikat Professional, D3 (Diploma), D4 (Dip...",Penuh Waktu,"Penjualan / Pemasaran,Penjualan - Jasa Keuangan","Asuransi kesehatan;Waktu regular, Senin - Juma...",...,201 - 500 pekerja,Asuransi,Bancassurance Officer adalah Representative da...,4825088.0,1 tahun,1,3,201-500,3,3
2,31749,Marketing Staff,Jakarta Utara,IDR,Pegawai (non-manajemen & non-supervisor),1 tahun,"SMA, SMU/SMK/STM, Sertifikat Professional, D3 ...",Penuh Waktu,"Penjualan / Pemasaran,Pemasaran/Pengembangan B...",,...,,Manufaktur/Produksi,Bertanggung jawab atas penjualan sesuai dengan...,4702208.0,1 tahun,1,3,1-50,0,3
3,31750,Section Head Commercials,Jakarta Raya,IDR,Manajer/Asisten Manajer,3 tahun,"SMA, SMU/SMK/STM, Sertifikat Professional, D3 ...",Penuh Waktu,"Penjualan / Pemasaran,Penjualan Ritel",Uniform,...,501 - 1000 pekerja,Retail/Merchandise,Tanggung Jawab :Bertanggung jawab atas keselur...,11544576.0,3 tahun,3,2,501-1000,4,3
4,31751,Social Media HEAD,Aceh,IDR,Supervisor/Koordinator,3 tahun,"SMA, SMU/SMK/STM, Sertifikat Professional, D3 ...",Penuh Waktu,"Penjualan / Pemasaran,Digital Marketing",,...,1- 50 pekerja,Makanan & Minuman/Katering/Restoran,# Memiliki pengalaman di atas# Harus memiliki ...,7836672.0,3 tahun,3,4,1-50,0,3


In [59]:
submissions = dfp[['id','salary']]
submissions

Unnamed: 0,id,salary
0,31747,13197312.0
1,31748,4825088.0
2,31749,4702208.0
3,31750,11544576.0
4,31751,7836672.0
...,...,...
2995,34742,5010432.0
2996,34743,5825536.0
2997,34744,5915648.0
2998,34745,4676608.0


In [60]:
meanAbErr = metrics.mean_absolute_error(y_test, predictions)
meanSqErr = metrics.mean_squared_error(y_test, predictions)
rootMeanSqErr = np.sqrt(metrics.mean_squared_error(y_test, predictions))
print('R squared: {:.2f}'.format(model1.score(X,y)*100))
print('Mean Absolute Error:', meanAbErr)
print('Mean Square Error:', meanSqErr)
print('Root Mean Square Error:', rootMeanSqErr)

R squared: 45.44
Mean Absolute Error: 2462610.829237555
Mean Square Error: 25392523288954.477
Root Mean Square Error: 5039099.452179375


In [61]:
np.sqrt(mean_squared_error(y_train,model1.predict(X_train)))

NameError: name 'mean_squared_error' is not defined

In [62]:
submissions.to_csv(r'datar_submission4.csv', index=False)

## XGBoost Regression

In [75]:
from sklearn.model_selection import GridSearchCV

# Define a function for each metrics
# R2
def rsqr_score(test, pred):
    r2_ = r2_score(test, pred)
    return r2_

# RSME
def rmse_score(test, pred):
    rmse_ = np.sqrt(mean_squared_error(test, pred))
    return rmse_

# Print the scores
def print_score(test, pred):
    print(f"- Regressor: {regr.__class__.__name__}")
    print(f"R2: {rsqr_score(test, pred)}")
    print(f"RMSE: {rmse_score(test, pred)}\n")

In [70]:
# Define regression models
xgboost = XGBRegressor()

# Train models on X_train and y_train
for regr in [xgboost]:
    # fit the corresponding model
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)
    # Print the defined metrics above for each classifier
    print_score(y_test, y_pred)

- Regressor: XGBRegressor
RMSE: 4933050.243211907



In [76]:
# Define hyperparameters
tuned_parameters = {"max_depth": [3],
                    "colsample_bytree": [0.3, 0.7],
                    "learning_rate": [0.01, 0.05, 0.1],
                    "n_estimators": [100, 500]}

# GridSearch
xgbr_cv = GridSearchCV(estimator=XGBRegressor(),
                       param_grid=tuned_parameters,
                       cv=5,
                       n_jobs=1,
                       verbose=1)

# fit the GridSearch on train set
xgbr_cv.fit(X_train, y_train)

# print best params and the corresponding 
print(f"Best hyperparameters: {xgbr_cv.best_params_}\n")
print(f"Best R2: {xgbr_cv.best_score_}")

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best hyperparameters: {'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100}

Best R2: 0.49590305279421554


In [82]:
# XGB Regressor  with the best hyperparameters
xgbr_mod = XGBRegressor(seed=20,
                        colsample_bytree = xgbr_cv.best_params_["colsample_bytree"],
                        learning_rate = xgbr_cv.best_params_["learning_rate"],
                        max_depth = xgbr_cv.best_params_["max_depth"],
                        n_estimators = xgbr_cv.best_params_["n_estimators"])

# fit the model on train set
xgbr_mod.fit(X_train, y_train)

# Predict on test set
y_pred = xgbr_mod.predict(X_test)

print(f"- {xgbr_mod.__class__.__name__}")
print(f"R2: {rsqr_score(y_test, y_pred)}")
print(f"RMSE: {rmse_score(y_test, y_pred)}")

- XGBRegressor
R2: 0.5591034575251803
RMSE: 4479082.725901641


In [86]:
# Save the model results into lists
model_list = []
r2_list = []
rmse_list = []

model_list.append(xgbr_mod.__class__.__name__)
r2_list.append(round(rsqr_score(y_test, y_pred), 4))
rmse_list.append(round(rmse_score(y_test, y_pred), 4))

In [103]:
y_pred

array([ 6995699.,  6546750., 26194484., ..., 13503789., 16522033.,
        4927323.], dtype=float32)

In [104]:
xgb_model = pd.DataFrame({'Actual value': y_test, 'Predicted value': y_pred})
xgb_model

Unnamed: 0,Actual value,Predicted value
730,7500000.0,6995699.0
2131,7500000.0,6546750.0
3070,18000000.0,26194484.0
6101,5500000.0,5924128.0
217,7200000.0,6546750.0
...,...,...
4329,10000000.0,11128147.0
925,3600000.0,5681698.0
5552,30000000.0,13503789.0
295,12000000.0,16522033.0


In [105]:
model1_diff

Unnamed: 0,Actual value,Predicted value
730,7500000.0,7130112.0
2131,7500000.0,6723584.0
3070,18000000.0,26043392.0
6101,5500000.0,5409792.0
217,7200000.0,6623232.0
...,...,...
4329,10000000.0,10831872.0
925,3600000.0,5787648.0
5552,30000000.0,14163968.0
295,12000000.0,20348928.0


In [102]:
# Plot Actual vs Predicted salary
actual_salary = np.exp(y_test["salary"])
predicted_salary = np.exp(y_pred)

plt.figure()
plt.title("Actual vs Predicted salary", fontsize=20)
plt.scatter(actual_salary, predicted_salary,
            color="deepskyblue", marker="o", facecolors="none")
plt.plot([0, 100000000], [0, 100000000], "darkorange", lw=2)
plt.xlim(0, 100000000)
plt.ylim(0, 100000000)
plt.xlabel("\nActual Salary", fontsize=16)
plt.ylabel("Predicted salary\n", fontsize=16)
plt.show()


KeyError: 'salary'