In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn import svm
from bayes_opt import BayesianOptimization
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score

In [2]:
#!pip install xgboost
#!pip install bayesian-optimization
#!pip install -U scikit-learn
#!pip install GPyOpt

In [3]:
features = pd.read_csv('features.csv')

In [4]:
# Remove the course_id column before modelling.
# errors='ignore' keeps this cell re-runnable: `features` is reassigned to its
# own output, so a second execution would otherwise raise KeyError because the
# column is already gone.
features = features.drop(columns='course_id', errors='ignore')
features.head()

Unnamed: 0,thread_id,no_of_posts,no_of_comments,no_uni_users,no_of_anonymous_msg,staff_replied,no of msgs,avg_num_words,max_words,avg_resp_time,first_post_day,msg_rate,u_chain,index_longest_post,num_views,votes,votes_Square,index_max_votes,forum_id
0,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,True,4,277,2,4,0,10
1,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,True,4,277,0,0,0,10
2,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,True,4,277,1,1,0,10
3,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,True,4,277,0,0,0,10
4,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,True,4,277,0,0,0,10


In [5]:
# Multiplying by 1 turns the True/False values of u_chain into 1/0.
features[["u_chain"]] = features[["u_chain"]] * 1

In [6]:
features[:5]

Unnamed: 0,thread_id,no_of_posts,no_of_comments,no_uni_users,no_of_anonymous_msg,staff_replied,no of msgs,avg_num_words,max_words,avg_resp_time,first_post_day,msg_rate,u_chain,index_longest_post,num_views,votes,votes_Square,index_max_votes,forum_id
0,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,1,4,277,2,4,0,10
1,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,1,4,277,0,0,0,10
2,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,1,4,277,1,1,0,10
3,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,1,4,277,0,0,0,10
4,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,1,4,277,0,0,0,10


In [7]:
features.shape

(739074, 19)

## Randomly creating a new dataset with 100,000 rows

Fixing `random_state` makes the sample reproducible: the same rows are selected on every run.

In [74]:
# Reproducible random subset of the full table (seeded draw).
# NOTE(review): despite its name, `features_scaled` is not scaled at this point;
# scaling happens in a later cell.
features_scaled = features.sample(
    n=100_000,
    random_state=3,
)
features_scaled.shape

(100000, 19)

## Preparing the Data

### forum_id 
    2: General (Miscellaneous) Discussion
    3: Assignments
    4: Study Groups / Meetups
    7: Course Feedback / Suggestions
    8: Lectures
    9: Platform Issues
    100: Signature Track
    otherwise: not remapped

In [75]:
# forum_id is the label; every other column is used as a predictor.
y = features_scaled['forum_id']
X = features_scaled.drop(columns='forum_id')

In [76]:
# Rescale every predictor column to the range [-1, 1].
# NOTE(review): the scaler is fitted on all sampled rows, i.e. before the
# train/test split that follows, so held-out rows contribute to the min/max.
# Consider fitting on the training portion only.
scaling = MinMaxScaler(feature_range=(-1, 1))
X_train_scaled = scaling.fit_transform(X)
print(X_train_scaled.shape)

(100000, 18)


In [77]:
# Hold out 20% of the rows for testing; the remaining 80% are used for training.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_scaled,
    y,
    test_size=0.2,
    random_state=109,
)

In [78]:
# Sanity-check the sizes of the four pieces produced by the split.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(80000, 18)
(20000, 18)
(80000,)
(20000,)


## SVM Model

In [13]:
# Linear-kernel SVM baseline. probability=True is required because
# predict_proba is called on this model further down for the ROC-AUC score.
svm_model = svm.SVC(
    kernel='linear',
    C=1.0,
    probability=True,
    cache_size=7000,
).fit(X_train, y_train)
svm_model

SVC(cache_size=7000, kernel='linear', probability=True)

In [14]:
y_pred_svm = svm_model.predict(X_test)

###  Creating the Classification report

In [15]:
print(classification_report(y_test,y_pred_svm))

              precision    recall  f1-score   support

           2       0.35      0.28      0.31      4657
           3       0.38      0.90      0.54      5218
           4       0.39      0.03      0.06      1690
           7       0.00      0.00      0.00       525
           8       0.49      0.08      0.14      3011
           9       0.00      0.00      0.00       504
          10       0.61      0.47      0.53      4395

    accuracy                           0.42     20000
   macro avg       0.32      0.25      0.23     20000
weighted avg       0.42      0.42      0.36     20000



  _warn_prf(average, modifier, msg_start, len(result))


###  Creating the confusion Matrix and checking the accuracy

In [16]:
# Rows are true classes, columns are predicted classes.
cm_svm = confusion_matrix(y_true=y_test, y_pred=y_pred_svm)
print(cm_svm)

[[1322 2720   55    0   53    0  507]
 [ 234 4674    0    0   63    0  247]
 [1056  457   59    0   54    0   64]
 [  60  384    0    0    6    0   75]
 [ 508 1885   11    0  245    0  362]
 [  36  407    0    0    4    0   57]
 [ 590 1644   27    0   73    0 2061]]


In [17]:
pred_prob = svm_model.predict_proba(X_test)

In [18]:
# Macro-averaged one-vs-one ROC-AUC computed from the SVM class probabilities.
auc_score = roc_auc_score(
    y_true=y_test,
    y_score=pred_prob,
    multi_class="ovo",
    average="macro",
)
print('ROC-AUC = %.2f'% auc_score)

ROC-AUC = 0.71


In [19]:
# Accuracy = correctly classified rows (matrix diagonal) / all rows.
n_correct = np.trace(cm_svm)
accuracy = n_correct / cm_svm.sum()
print('Accuracy = %.2f'% accuracy)

Accuracy = 0.42


## XG Boost Model

In [20]:
# Baseline XGBoost classifier with library-default hyper-parameters.
model = xgb.XGBClassifier().fit(X_train, y_train)
model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [21]:
y_pred = model.predict(X_test)

###  Creating the Classification report

In [22]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           2       0.78      0.78      0.78      4657
           3       0.78      0.90      0.83      5218
           4       0.93      0.90      0.92      1690
           7       0.81      0.37      0.51       525
           8       0.76      0.70      0.73      3011
           9       0.82      0.40      0.54       504
          10       0.89      0.90      0.89      4395

    accuracy                           0.81     20000
   macro avg       0.82      0.71      0.74     20000
weighted avg       0.82      0.81      0.81     20000



###  Creating the confusion Matrix and checking the accuracy

In [23]:
# Confusion matrix of the baseline XGBoost predictions on the held-out rows.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[3628  557   52    8  268   16  128]
 [ 241 4686    7   11  166    5  102]
 [  67   14 1525    2   32    5   45]
 [  96  104    6  196   60    3   60]
 [ 350  410   14   10 2104   11  112]
 [  93   90   11    7   51  204   48]
 [ 156  158   23    9   89    5 3955]]


In [24]:
# Per-class probabilities feed the macro-averaged one-vs-one ROC-AUC.
pred_prob_xg = model.predict_proba(X_test)
auc_score = roc_auc_score(
    y_true=y_test,
    y_score=pred_prob_xg,
    multi_class="ovo",
    average="macro",
)
print('ROC-AUC = %.2f'% auc_score)

ROC-AUC = 0.96


In [25]:
# Share of test rows on the diagonal of the confusion matrix.
n_correct = np.trace(cm)
accuracy = n_correct / cm.sum()
print('Accuracy = %.2f'% accuracy)

Accuracy = 0.81


## Hyper Parameter Tuning on XGBoost

Parameters:

* learning_rate : Makes the model more robust by shrinking the weights on each step. Used to prevent overfitting.
* max_depth  : Determines how deeply each tree is allowed to grow during any boosting round.Range should be between 3-10. For starting, start from lower values.
* min_child_weight : Smaller values are chosen because leaf nodes could have small size groups.
* subsample, colsample_bytree  : ideal range is between 0.5 to 0.9. Low value can lead to underfitting
* n_estimators: number of trees you want to build.
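* objective: determines the loss function to be used, e.g. reg:squarederror (formerly reg:linear) for regression problems, reg:logistic for logistic regression, binary:logistic for binary classification problems with probability output, and multi:softprob for multi-class classification with per-class probabilities (the objective XGBClassifier reports for this problem).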

Regularization Parameters:
* gamma: controls whether a given node will split based on the expected reduction in loss after the split. A higher value leads to fewer splits.
* alpha: L1 regularization on leaf weights. A large value leads to more regularization.
* lambda: L2 regularization on leaf weights and is smoother than L1 regularization.

### Bayesian Optimization function for xgboost

In [79]:
def tuning_function(n_estimators ,learning_rate):
     params = {'n_estimators': int(n_estimators),
              'learning_rate':learning_rate,
              'subsample': 0.8,
              'eta': 0.1,
              'eval_metric': 'rmse',
             'max_depth':int(4),
             'min_child_weight':6,
             'gamma':0,
             'subsample':0.8,
             'colsample_bytree':0.8,
             'reg_alpha':0.005,
             'objective': 'logistic',
             'nthread':4,
             'seed':27,
             'eta':0.1}

In [80]:
bayes = BayesianOptimization(tuning_function, {'n_estimators':(500,1000),
                                            'learning_rate':(0,0.5)
                                            })


In [89]:
#bayes.maximize(n_iter=5)

### Extracting the best parameters

In [82]:
# Best point found by the optimizer; n_estimators must be an integer
# before it can be passed to XGBClassifier.
best_point = bayes.max
params = best_point['params']
params['n_estimators'] = int(params['n_estimators'])

In [83]:
hyper_model = xgb.XGBClassifier(**params).fit(X_train, y_train)

In [84]:
y_pred_HT = hyper_model.predict(X_test)

###  Creating the Classification Report

In [85]:
print(classification_report(y_test,y_pred_HT))

              precision    recall  f1-score   support

           2       0.86      0.89      0.87      4657
           3       0.89      0.93      0.91      5218
           4       0.96      0.94      0.95      1690
           7       0.86      0.59      0.70       525
           8       0.85      0.84      0.85      3011
           9       0.87      0.60      0.71       504
          10       0.93      0.94      0.93      4395

    accuracy                           0.89     20000
   macro avg       0.89      0.82      0.85     20000
weighted avg       0.89      0.89      0.89     20000



###  Creating the confusion Matrix and checking the accuracy

In [86]:
# Confusion matrix of the tuned XGBoost predictions on the held-out rows.
cm_hp = confusion_matrix(y_true=y_test, y_pred=y_pred_HT)
print(cm_hp)

[[4156  212   27    7  167   11   77]
 [ 178 4848    3    8  119   11   51]
 [  39    5 1591    2   22    7   24]
 [  75   53    1  310   46    5   35]
 [ 232  163    6   11 2522    7   70]
 [  74   50    4    9   30  303   34]
 [ 105   87   24   14   48    4 4113]]


In [87]:
# Macro-averaged one-vs-one ROC-AUC from the tuned model's class probabilities.
pred_prob_hyper = hyper_model.predict_proba(X_test)
auc_score = roc_auc_score(
    y_true=y_test,
    y_score=pred_prob_hyper,
    multi_class="ovo",
    average="macro",
)
print('ROC-AUC = %.2f'% auc_score)

ROC-AUC = 0.98


In [88]:
# Diagonal of the confusion matrix = correctly classified test rows.
n_correct = np.trace(cm_hp)
accuracy = n_correct / cm_hp.sum()
print('Accuracy = %.2f'% accuracy)

Accuracy = 0.89


#### The models above are fitted on the 100,000-row sample only (80,000 training rows, 20,000 test rows)
#### Comparing the default XGBoost (Acc = 0.81) with the hyper-tuned XGBoost (Acc = 0.89), we can see that tuning increased the accuracy of the model.

#### The basic SVM model has the lowest accuracy, 0.42, and there is a significant increase in accuracy once we use XGBoost.

#### Accuracy rises by about 39 percentage points from the SVM model (42%) to the XGBoost model (81%), and by about 47 percentage points to 89% for the hyper-tuned XGBoost model

## Fitting the XGBOOST and XGBOOST fine tuned model with original dataset.

In [37]:
# Same 80/20 split as before, this time on the complete feature table
# (no sub-sampling, and no MinMax rescaling is applied here).
y = features['forum_id']
X = features.drop(columns='forum_id')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=109,
)
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(591259, 18)
(147815, 18)
(591259,)
(147815,)


### XGBOOST

In [38]:
# Default-parameter XGBoost trained on the full training split.
model = xgb.XGBClassifier().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [39]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           2       0.81      0.79      0.80     34537
           3       0.79      0.91      0.85     38655
           4       0.96      0.92      0.94     12593
           7       0.91      0.44      0.60      3557
           8       0.78      0.73      0.75     22237
           9       0.87      0.48      0.62      3670
          10       0.90      0.91      0.91     32566

    accuracy                           0.83    147815
   macro avg       0.86      0.74      0.78    147815
weighted avg       0.84      0.83      0.83    147815



In [40]:
# Confusion matrix of the baseline XGBoost model on the full-data test split.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[27261  4139   237    27  1776    89  1008]
 [ 1544 35186    41    32  1112    35   705]
 [  386   111 11644     2   160     3   287]
 [  579   581    18  1574   445    42   318]
 [ 2376  2836    50    48 16166    73   688]
 [  604   572    36    19   317  1758   364]
 [  996  1026   153    22   627    14 29728]]


In [41]:
# Class probabilities on the full-data test split, scored with macro one-vs-one ROC-AUC.
pred_prob_xg = model.predict_proba(X_test)
auc_score = roc_auc_score(
    y_true=y_test,
    y_score=pred_prob_xg,
    multi_class="ovo",
    average="macro",
)
print('ROC-AUC = %.2f'% auc_score)

ROC-AUC = 0.97


In [42]:
# Correct predictions (diagonal) over all test rows.
n_correct = np.trace(cm)
accuracy = n_correct / cm.sum()
print('Accuracy = %.2f'% accuracy)

Accuracy = 0.83


### Bayesian Hypertuned XGBOOST model

In [69]:
# Refit XGBoost on the full training split using the `params` dict produced
# by the Bayesian search earlier in the notebook.
hyper_model = xgb.XGBClassifier(**params)
hyper_model.fit(X_train, y_train)
y_pred_HT = hyper_model.predict(X_test)

In [70]:
print(classification_report(y_test,y_pred_HT))

              precision    recall  f1-score   support

           2       0.96      0.96      0.96     34537
           3       0.96      0.98      0.97     38655
           4       0.99      0.98      0.99     12593
           7       0.98      0.91      0.94      3557
           8       0.96      0.95      0.96     22237
           9       0.97      0.90      0.94      3670
          10       0.98      0.98      0.98     32566

    accuracy                           0.97    147815
   macro avg       0.97      0.95      0.96    147815
weighted avg       0.97      0.97      0.97    147815



In [71]:
# Confusion matrix of the tuned model on the full-data test split.
cm_hp = confusion_matrix(y_true=y_test, y_pred=y_pred_HT)
print(cm_hp)

[[33198   700    20    18   378    44   179]
 [  372 37930    13    15   204    14   107]
 [   82    14 12395     2    28     3    69]
 [  117    71     4  3250    48     8    59]
 [  521   450     6    17 21094     9   140]
 [  133    80     6    17    53  3315    66]
 [  226   151    27     7   109     9 32037]]


In [72]:
# Macro one-vs-one ROC-AUC for the tuned model on the full-data test split.
pred_prob_hyper = hyper_model.predict_proba(X_test)
auc_score = roc_auc_score(
    y_true=y_test,
    y_score=pred_prob_hyper,
    multi_class="ovo",
    average="macro",
)
print('ROC-AUC = %.2f'% auc_score)

ROC-AUC = 1.00


In [73]:
# Fraction of test rows the tuned model classified correctly.
n_correct = np.trace(cm_hp)
accuracy = n_correct / cm_hp.sum()
print('Accuracy = %.2f'% accuracy)

Accuracy = 0.97


## Implemented XGBoost and the hyper-tuned model on the full dataset.

##### We can see that there is considerable increase in the Accuracy as compared to the fitted models with only 100k rows in the dataset

##### Also, the normal XGBoost model has accuracy 0.83 while the hypertuned model has accuracy 0.97, which is an increase of 14 percentage points in accuracy just by hypertuning

# Conclusion

From the table below, we can see that the hypertuned model trained on the full dataset has the highest accuracy, 97%, and a ROC-AUC score that rounds to 1.00, which is very good. Hence, it is the best model.

<table>
  <thead>
    <tr>
      <th>Model</th>
      <th>Accuracy</th>
      <th>AUC-ROC</th>
      <th>Macro-avg Precision</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>SVM Model</td>
      <td>42%</td>
      <td>0.71</td>
      <td>0.32</td>
    </tr>
    <tr>
      <td>XGBOOST Model(small Dataset)</td>
      <td>81%</td>
      <td>0.96</td>
      <td>0.82</td>
    </tr><tr>
      <td>XGBOOST Hyper tuned Model(small Dataset)</td>
      <td>89%</td>
      <td>0.98</td>
      <td>0.89</td>
    </tr><tr>
      <td>XGBOOST Model</td>
      <td>83%</td>
      <td>0.97</td>
      <td>0.86</td>
    </tr><tr>
      <td>XGBOOST Hyper tuned Model</td>
      <td>97%</td>
      <td>1.00</td>
      <td>0.97</td>
    </tr>
  </tbody>
</table>