In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import graphviz

In [2]:
import os
from xgboost import XGBClassifier
%matplotlib inline

In [3]:
# Make the data folder the working directory so the relative CSV path used
# in the next cell resolves.
# NOTE(review): this is an absolute, machine-specific Windows path; the
# notebook will not run elsewhere unless this folder exists.
os.chdir('C:\\PGA16')

In [4]:
# Load the admissions data; the relative path is resolved against the
# current working directory.
df=pd.read_csv('admit.csv')

In [5]:
# List the column labels. Note that the 'LOR ' label carries a trailing
# space, so it has to be spelled exactly that way when selecting columns.
df.columns

Index(['GRE', 'TOEFL', 'Univ_Rating', 'SOP', 'LOR ', 'CGPA', 'Research',
       'Admit'],
      dtype='object')

In [6]:
# Predictors: every column except the target ('LOR ' keeps its trailing space).
feature_columns = ['GRE', 'TOEFL', 'Univ_Rating', 'SOP', 'LOR ', 'CGPA', 'Research']
X = df.loc[:, feature_columns]

# Target: the last column of the frame, kept as a one-column DataFrame.
y = df.iloc[:, -1].to_frame()

In [7]:
# Display the target frame to eyeball the 0/1 Admit labels.
y

Unnamed: 0,Admit
0,1
1,1
2,1
3,1
4,0
...,...
395,1
396,1
397,1
398,0


In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [10]:
# Silence every warning from here on, then fit an XGBoost classifier with
# its default hyper-parameters on the training split.
from warnings import filterwarnings
filterwarnings(action='ignore')

model = XGBClassifier()
model.fit(X_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [11]:
from xgboost import plot_tree

In [12]:
# Switch matplotlib to the Qt backend (an earlier cell selected the inline
# backend). NOTE(review): presumably done so the tree opens in a zoomable
# window; this needs a Qt binding and a display - confirm.
%matplotlib qt
# Draw one tree of the fitted model; no tree index is passed, so xgboost's
# default applies (presumably the first tree - confirm).
plot_tree(model)

<AxesSubplot:>

In [13]:
# Draw the first four boosted trees, one per pass, laid out left-to-right.
# The loop index selects which tree is rendered, so each pass shows a
# different tree rather than the same one repeatedly.
for i in range(4):
    plot_tree(model, num_trees=i, rankdir='LR')

In [14]:
# Print the fitted model's per-feature importance scores as a raw array.
print(model.feature_importances_)

[0.27947658 0.07633766 0.06690399 0.06933893 0.13369416 0.21310624
 0.16114244]


In [15]:
# Pair each predictor name with the importance score the fitted model
# assigned to it, as a two-column table.
importance = pd.DataFrame(
    {
        'features': X.columns,
        'importance_values': model.feature_importances_,
    }
)
importance

Unnamed: 0,features,importance_values
0,GRE,0.279477
1,TOEFL,0.076338
2,Univ_Rating,0.066904
3,SOP,0.069339
4,LOR,0.133694
5,CGPA,0.213106
6,Research,0.161142


In [16]:
# Bar chart of the importance scores; bars are positioned by feature index.
importance_scores = model.feature_importances_
plt.bar(range(len(importance_scores)), importance_scores)

<BarContainer object of 7 artists>

In [17]:
from xgboost import plot_importance

In [18]:
# Draw xgboost's built-in importance chart for the fitted model.
# NOTE(review): the axis is labelled 'F score', which presumably is a
# different importance measure than the model's 'gain'-based
# feature_importances_ shown earlier - confirm before comparing the two.
plot_importance(model)

<AxesSubplot:title={'center':'Feature importance'}, xlabel='F score', ylabel='Features'>

In [19]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

In [20]:
# Two 10-fold splitters: a plain one and a class-stratified one.
kfold = KFold(n_splits=10)
skfold = StratifiedKFold(n_splits=10)

# Score the model on the training split using the plain splitter.
results = cross_val_score(
    model,
    X_train,
    y_train,
    cv=kfold,
)



In [21]:
# Show the per-fold scores returned by cross_val_score.
results

array([0.67857143, 0.85714286, 0.82142857, 0.78571429, 0.82142857,
       0.75      , 0.78571429, 0.75      , 0.78571429, 0.82142857])

In [22]:
# Summarise the cross-validation run: mean score as a percentage and the
# spread across folds. '%.2f' fixes the number of decimals; the earlier
# '%2f' only set a minimum field width and therefore printed six decimals.
print("Mean accuracy :%.2f%% ::: standard Deviation of Score(%.2f)" % (results.mean()*100, results.std()))

Mean accuray :78.571429% ::: standard Deviation of Score(0.047916)


In [23]:
# Repeat the cross-validation with the stratified splitter and summarise it.
results = cross_val_score(model, X_train, y_train, cv=skfold)
# '%.2f' fixes the number of decimals; '%2f' would only set a minimum field
# width and print six decimals.
print("Mean accuracy :%.2f%% ::: standard Deviation of Score(%.2f)" % (results.mean()*100, results.std()))

Mean accuray :78.928571% ::: standard Deviation of Score(0.062780)


In [24]:
# Refit while reporting the classification error on the held-out split
# after every boosting round.
eval_set_data = [(X_test, y_test)]
model.fit(
    X_train,
    y_train,
    eval_set=eval_set_data,
    eval_metric='error',
    verbose=True,
)

# Keep the recorded per-round metric history.
results = model.evals_result()

[0]	validation_0-error:0.20000
[1]	validation_0-error:0.17500
[2]	validation_0-error:0.20000
[3]	validation_0-error:0.18333
[4]	validation_0-error:0.19167
[5]	validation_0-error:0.18333
[6]	validation_0-error:0.17500
[7]	validation_0-error:0.18333
[8]	validation_0-error:0.19167
[9]	validation_0-error:0.19167
[10]	validation_0-error:0.18333
[11]	validation_0-error:0.18333
[12]	validation_0-error:0.20000
[13]	validation_0-error:0.17500
[14]	validation_0-error:0.19167
[15]	validation_0-error:0.20000
[16]	validation_0-error:0.20000
[17]	validation_0-error:0.20000
[18]	validation_0-error:0.20000
[19]	validation_0-error:0.20000
[20]	validation_0-error:0.20000
[21]	validation_0-error:0.20000
[22]	validation_0-error:0.20000
[23]	validation_0-error:0.20000
[24]	validation_0-error:0.20000
[25]	validation_0-error:0.19167
[26]	validation_0-error:0.20000
[27]	validation_0-error:0.20000
[28]	validation_0-error:0.20000
[29]	validation_0-error:0.20000
[30]	validation_0-error:0.19167
[31]	validation_0-

In [25]:
# Refit quietly, recording error and log loss on both the training split
# (first entry) and the held-out split (second entry).
eval_set_data = [(X_train, y_train), (X_test, y_test)]
model.fit(
    X_train,
    y_train,
    eval_set=eval_set_data,
    eval_metric=['error', 'logloss'],
    verbose=False,
)

# Per-round metric history, keyed by evaluation set and then by metric.
results = model.evals_result()

In [26]:
# Plot log loss per boosting round for both evaluation sets of the last fit.
train_history = results['validation_0']
test_history = results['validation_1']

epochs = len(train_history['error'])
x_axis = range(epochs)

fig, ax = plt.subplots()
for curve_label, history in (('Train', train_history), ('Test', test_history)):
    ax.plot(x_axis, history['logloss'], label=curve_label)
ax.legend()
ax.set_ylabel('Logloss')
ax.set_title('XGBoost Log Loss')
plt.show()

In [27]:
# Small, heavily regularised model: tiny learning rate, ten rounds, row and
# per-level column subsampling.
model = XGBClassifier(
    learning_rate=0.001,
    n_estimators=10,
    subsample=0.80,
    max_delta_step=3,
    colsample_bylevel=0.7,
)

# Track error and log loss on the training and held-out splits and stop
# after a single round without improvement.
# NOTE(review): the test split is part of eval_set while early stopping is
# enabled, so the stopping point may be chosen using test data - confirm
# which eval_set entry xgboost monitors and consider a separate validation split.
eval_set_data = [(X_train, y_train), (X_test, y_test)]
model.fit(
    X_train,
    y_train,
    eval_set=eval_set_data,
    eval_metric=['error', 'logloss'],
    early_stopping_rounds=1,
    verbose=True,
)
results = model.evals_result()

[0]	validation_0-error:0.13214	validation_0-logloss:0.69256	validation_1-error:0.22500	validation_1-logloss:0.69272
[1]	validation_0-error:0.11429	validation_0-logloss:0.69198	validation_1-error:0.20000	validation_1-logloss:0.69224
[2]	validation_0-error:0.10714	validation_0-logloss:0.69138	validation_1-error:0.15833	validation_1-logloss:0.69169
[3]	validation_0-error:0.10714	validation_0-logloss:0.69080	validation_1-error:0.18333	validation_1-logloss:0.69118
[4]	validation_0-error:0.11071	validation_0-logloss:0.69018	validation_1-error:0.15833	validation_1-logloss:0.69071
[5]	validation_0-error:0.10000	validation_0-logloss:0.68963	validation_1-error:0.16667	validation_1-logloss:0.69020
[6]	validation_0-error:0.10000	validation_0-logloss:0.68905	validation_1-error:0.15000	validation_1-logloss:0.68972
[7]	validation_0-error:0.09643	validation_0-logloss:0.68843	validation_1-error:0.15000	validation_1-logloss:0.68922
[8]	validation_0-error:0.10000	validation_0-logloss:0.68782	validation_1

In [28]:
# Same train/test log-loss chart as before, drawn from the history of the
# most recent fit.
epochs = len(results['validation_0']['error'])
x_axis = range(epochs)

train_logloss = results['validation_0']['logloss']
test_logloss = results['validation_1']['logloss']

fig, ax = plt.subplots()
ax.plot(x_axis, train_logloss, label='Train')
ax.plot(x_axis, test_logloss, label='Test')
ax.legend()
ax.set_ylabel('Logloss')
ax.set_title('XGBoost Log Loss')
plt.show()

In [29]:
# Tune the number of boosting rounds with 10-fold cross-validation, scored
# by negative log loss (closer to zero is better).
model = XGBClassifier()
n_estimators = range(50, 400, 50)
param_grid = dict(n_estimators=n_estimators)

# The folds are shuffled, so seed the splitter; without a seed every run
# draws different folds and the reported scores cannot be reproduced.
kfold = KFold(n_splits=10, shuffle=True, random_state=0)

from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(model, param_grid, scoring='neg_log_loss', n_jobs=-1, cv=kfold)
grid_result = grid_search.fit(X_train, y_train)



In [30]:
# Report mean and standard deviation of the CV score for every candidate
# setting tried by the grid search.
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']

for mean, std, param in zip(means, stds, params):
    summary_line = '%f (%f) with: %r' % (mean, std, param)
    print(summary_line)

-0.590740 (0.241532) with: {'n_estimators': 50}
-0.686298 (0.293130) with: {'n_estimators': 100}
-0.741440 (0.326429) with: {'n_estimators': 150}
-0.774584 (0.346046) with: {'n_estimators': 200}
-0.801782 (0.361104) with: {'n_estimators': 250}
-0.826574 (0.369537) with: {'n_estimators': 300}
-0.847512 (0.382781) with: {'n_estimators': 350}


In [31]:
# Tune the tree depth (odd values 1..9) with 10-fold cross-validation,
# scored by negative log loss (closer to zero is better).
max_depth = range(1, 11, 2)
param_grid = dict(max_depth=max_depth)

# The folds are shuffled, so seed the splitter; without a seed every run
# draws different folds and the reported scores cannot be reproduced.
kfold = KFold(n_splits=10, shuffle=True, random_state=0)

from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(model, param_grid, scoring='neg_log_loss', n_jobs=-1, cv=kfold)
grid_result = grid_search.fit(X_train, y_train)

# Report mean and standard deviation of the CV score per candidate depth.
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, std, param in zip(means, stds, params):
    print('%f (%f) with: %r' % (mean, std, param))


-0.401481 (0.137436) with: {'max_depth': 1}
-0.618417 (0.234244) with: {'max_depth': 3}
-0.676898 (0.253055) with: {'max_depth': 5}
-0.673363 (0.249134) with: {'max_depth': 7}
-0.666094 (0.246328) with: {'max_depth': 9}


In [32]:
# Mean CV score per tree depth, with the fold-to-fold standard deviation as
# error bars; the figure is also written to disk.
# NOTE(review): the plotted means come from a 'neg_log_loss' search, so the
# values are negative even though the axis is labelled 'logloss' - confirm
# the intended sign.
plt.errorbar(max_depth, means, yerr=stds)

current_axes = plt.gca()
current_axes.set_title('depth vs log loss')
current_axes.set_xlabel('depth')
current_axes.set_ylabel('logloss')

plt.savefig('estimator.png')
plt.show()

In [33]:
# Predict labels for both splits with the fitted grid-search object.
pred_train, pred_test = (
    grid_result.predict(features) for features in (X_train, X_test)
)

In [34]:
from sklearn.metrics import accuracy_score

In [35]:
# Share of training rows whose predicted label matches the true label.
accuracy_score(y_train,pred_train)

0.875

In [36]:
# Share of held-out rows whose predicted label matches the true label.
accuracy_score(y_test,pred_test)

0.85

In [37]:
# Evenly spaced candidate values between 1e-4 and 1e-3 (inclusive), shown
# together with the spacing between neighbouring values.
np.linspace(start=1e-4, stop=1e-3, retstep=True)

(array([0.0001    , 0.00011837, 0.00013673, 0.0001551 , 0.00017347,
        0.00019184, 0.0002102 , 0.00022857, 0.00024694, 0.00026531,
        0.00028367, 0.00030204, 0.00032041, 0.00033878, 0.00035714,
        0.00037551, 0.00039388, 0.00041224, 0.00043061, 0.00044898,
        0.00046735, 0.00048571, 0.00050408, 0.00052245, 0.00054082,
        0.00055918, 0.00057755, 0.00059592, 0.00061429, 0.00063265,
        0.00065102, 0.00066939, 0.00068776, 0.00070612, 0.00072449,
        0.00074286, 0.00076122, 0.00077959, 0.00079796, 0.00081633,
        0.00083469, 0.00085306, 0.00087143, 0.0008898 , 0.00090816,
        0.00092653, 0.0009449 , 0.00096327, 0.00098163, 0.001     ]),
 1.836734693877551e-05)