In [277]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn import svm
from bayes_opt import BayesianOptimization
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score

In [225]:
#!pip install xgboost
#!pip install bayesian-optimization
#!pip install -U scikit-learn
#!pip install GPyOpt

In [226]:
# Load the feature table from features.csv (path relative to the working directory).
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks like a
# saved row index being carried along as a feature -- confirm this is intended.
features = pd.read_csv(filepath_or_buffer='features.csv')

In [227]:
# Discard the course_id column, then preview the first five rows.
features = features.drop(columns=['course_id'])
features.head()

Unnamed: 0,thread_id,no_of_posts,no_of_comments,no_uni_users,no_of_anonymous_msg,staff_replied,no of msgs,avg_num_words,max_words,avg_resp_time,first_post_day,msg_rate,u_chain,index_longest_post,num_views,votes,votes_Square,index_max_votes,forum_id
0,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,True,4,277,2,4,0,10
1,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,True,4,277,0,0,0,10
2,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,True,4,277,1,1,0,10
3,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,True,4,277,0,0,0,10
4,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,True,4,277,0,0,0,10


In [228]:
# Multiplying by 1 turns the True/False u_chain flags into 1/0 integers.
features["u_chain"] = features["u_chain"] * 1

In [229]:
# Preview the first five rows to confirm that u_chain is now numeric.
features.head()

Unnamed: 0,thread_id,no_of_posts,no_of_comments,no_uni_users,no_of_anonymous_msg,staff_replied,no of msgs,avg_num_words,max_words,avg_resp_time,first_post_day,msg_rate,u_chain,index_longest_post,num_views,votes,votes_Square,index_max_votes,forum_id
0,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,1,4,277,2,4,0,10
1,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,1,4,277,0,0,0,10
2,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,1,4,277,1,1,0,10
3,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,1,4,277,0,0,0,10
4,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,1,4,277,0,0,0,10


In [230]:
# (rows, columns) of the full feature table.
features.shape

(739074, 19)

## Randomly creating a new dataset with 100,000 rows

The same rows are selected on every run as long as the chosen random_state is unchanged.

In [253]:
# Draw a reproducible random subset of 100,000 rows (fixed random_state).
# NOTE: despite its name, `features_scaled` still holds unscaled values here.
features_scaled = features.sample(random_state=2, n=100_000)
features_scaled.shape

(100000, 19)

## Preparing the Data

### forum_id 
    2: General (Miscellaneous) Discussion
    3: Assignments
    4: Study Groups / Meetups
    7: Course Feedback / Suggestions
    8: Lectures
    9: Platform Issues
    100: Signature Track
    otherwise: not remapped

In [254]:
# Target: the forum_id column; predictors: every remaining column.
y = features_scaled['forum_id']
X = features_scaled.drop(columns=['forum_id'])

In [255]:
# Rescale every predictor into the range [-1, 1].
# NOTE(review): the scaler is fitted on all sampled rows before the train/test
# split, so the held-out rows influence the scaling; despite its name,
# X_train_scaled contains every row, not only the training part.
scaling = MinMaxScaler(feature_range=(-1, 1))
X_train_scaled = scaling.fit_transform(X)
print(X_train_scaled.shape)

(100000, 18)


In [256]:
# Hold out 20% of the rows for testing; the remaining 80% are used for training.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_scaled, y, random_state=109, test_size=0.2
)

In [257]:
# Sanity-check the sizes of the four parts produced by the split.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(80000, 18)
(20000, 18)
(80000,)
(20000,)


## SVM Model

In [258]:
# Baseline model: support vector classifier with a linear kernel.
# fit() returns the estimator itself, so the trailing expression shows its settings.
svm_model = svm.SVC(
    C=1.0,
    kernel='linear',
    cache_size=7000,
).fit(X_train, y_train)
svm_model

SVC(C=1.0, cache_size=7000, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [259]:
# Predict forum labels for the held-out rows with the fitted SVM.
y_pred_svm = svm_model.predict(X_test)

###  Creating the Classification report

In [260]:
# Per-class precision / recall / F1 of the SVM on the held-out rows.
print(classification_report(y_true=y_test, y_pred=y_pred_svm))

              precision    recall  f1-score   support

           2       0.34      0.28      0.30      4751
           3       0.38      0.90      0.53      5227
           4       0.30      0.03      0.05      1689
           7       0.00      0.00      0.00       466
           8       0.53      0.05      0.09      2994
           9       0.00      0.00      0.00       505
          10       0.63      0.46      0.53      4368

    accuracy                           0.41     20000
   macro avg       0.31      0.25      0.22     20000
weighted avg       0.42      0.41      0.35     20000



###  Creating the confusion Matrix and checking the accuracy

In [261]:
# Confusion matrix of the SVM: rows are true labels, columns are predicted labels.
cm_svm = confusion_matrix(y_true=y_test, y_pred=y_pred_svm)
print(cm_svm)

[[1312 2848   61    0   31    0  499]
 [ 245 4721    3    0   36    0  222]
 [1098  456   49    0   18    0   68]
 [  58  352    1    0    1    0   54]
 [ 611 1913   14    0  154    0  302]
 [  32  414    2    0    1    0   56]
 [ 555 1721   34    0   50    0 2008]]


In [262]:
# Accuracy = correctly classified rows (the matrix trace) / all rows.
accuracy = np.trace(cm_svm) / cm_svm.sum()
print(accuracy)

0.4122


## XG Boost Model

In [263]:
# XGBoost classifier with the library's default hyperparameters.
# fit() returns the estimator itself, so the trailing expression shows its settings.
model = xgb.XGBClassifier().fit(X_train, y_train)
model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [264]:
# Predict forum labels for the held-out rows with the default XGBoost model.
y_pred = model.predict(X_test)

###  Creating the Classification report

In [265]:
# Per-class precision / recall / F1 of the default XGBoost model.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           2       0.79      0.78      0.78      4751
           3       0.77      0.89      0.83      5227
           4       0.93      0.90      0.92      1689
           7       0.86      0.39      0.53       466
           8       0.77      0.69      0.73      2994
           9       0.85      0.43      0.57       505
          10       0.88      0.91      0.89      4368

    accuracy                           0.82     20000
   macro avg       0.84      0.71      0.75     20000
weighted avg       0.82      0.82      0.81     20000



###  Creating the confusion Matrix and checking the accuracy

In [266]:
# Confusion matrix of the default XGBoost model (rows: true, columns: predicted).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[3688  603   51    2  248    9  150]
 [ 258 4665    8    8  172    5  111]
 [  63   17 1523    3   26    2   55]
 [  81   91    7  181   48    7   51]
 [ 355  438    9    7 2057   13  115]
 [  93   74    8    5   56  219   50]
 [ 134  165   29    5   65    2 3968]]


In [267]:
# Accuracy = correctly classified rows (the matrix trace) / all rows.
accuracy = np.trace(cm) / cm.sum()
print(accuracy)

0.81505


## Hyper Parameter Tuning on XGBoost

Parameters:

* learning_rate : Makes the model more robust by shrinking the weights on each step. Used to prevent overfitting.
* max_depth  : Determines how deeply each tree is allowed to grow during any boosting round.Range should be between 3-10. For starting, start from lower values.
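* min_child_weight : Minimum sum of instance weights required in a child node; larger values make the model more conservative. Smaller values are chosen because leaf nodes could have small size groups.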
* subsample, colsample_bytree  : ideal range is between 0.5 to 0.9. Low value can lead to underfitting
* n_estimators: number of trees you want to build.
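* objective: determines the loss function to be used, e.g. reg:squarederror (formerly reg:linear) for regression problems, reg:logistic for classification problems with only decision, binary:logistic for binary classification problems with probability, and multi:softprob for multi-class classification with per-class probabilities (the case in this notebook).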

Regularization Parameters:
* gamma: controls whether a given node will split based on the expected reduction in loss after the split. A higher value leads to fewer splits.
* alpha: L1 regularization on leaf weights. A large value leads to more regularization.
* lambda: L2 regularization on leaf weights and is smoother than L1 regularization.

### Bayesian Optimization function for xgboost

In [310]:
def tuning_function(n_estimators ,learning_rate):
     params = {'n_estimators': int(n_estimators),
              'learning_rate':learning_rate,
              'subsample': 0.8,
              'eta': 0.1,
              'eval_metric': 'rmse',
             'max_depth':4,
             'min_child_weight':6,
             'gamma':0,
             'subsample':0.8,
             'colsample_bytree':0.8,
             'reg_alpha':0.005,
             'objective': 'logistic',
             'nthread':4,
             'seed':27,
             'eta':0.1}

In [311]:
bayes = BayesianOptimization(tuning_function, {'learning_rate':(0,0.5),
                                             'n_estimators':(500,1000)
                                            })


In [328]:
#bayes.maximize(n_iter=5, init_points=8, acq='ei')

### Extracting the best parameters

In [313]:
# Copy the best point found by the optimizer and cast n_estimators to an integer
# (the search bounds are continuous). Only the searched keys are carried over;
# the fixed settings inside tuning_function are not part of this dict.
params = dict(bayes.max['params'])
params['n_estimators'] = int(params['n_estimators'])

In [314]:
# Train an XGBoost classifier with the best point found by the search.
hyper_model = xgb.XGBClassifier(**params)
hyper_model = hyper_model.fit(X_train, y_train)

In [315]:
# Predict forum labels for the held-out rows with the tuned XGBoost model.
y_pred_HT = hyper_model.predict(X_test)

###  Creating the Classification Report

In [316]:
# Per-class precision / recall / F1 of the tuned XGBoost model.
print(classification_report(y_true=y_test, y_pred=y_pred_HT))

              precision    recall  f1-score   support

           2       0.87      0.88      0.87      4751
           3       0.88      0.93      0.91      5227
           4       0.96      0.93      0.94      1689
           7       0.87      0.61      0.72       466
           8       0.85      0.82      0.84      2994
           9       0.89      0.62      0.73       505
          10       0.93      0.95      0.94      4368

    accuracy                           0.89     20000
   macro avg       0.89      0.82      0.85     20000
weighted avg       0.89      0.89      0.89     20000



###  Creating the confusion Matrix and checking the accuracy

In [317]:
# Confusion matrix of the tuned XGBoost model (rows: true, columns: predicted).
cm_hp = confusion_matrix(y_true=y_test, y_pred=y_pred_HT)
print(cm_hp)

[[4198  240   23    5  174    9  102]
 [ 177 4859   10   10  110    9   52]
 [  53    8 1563    3   23    1   38]
 [  54   47    5  283   39    3   35]
 [ 217  230    9   14 2459    8   57]
 [  66   38    7    3   43  312   36]
 [  82   73   19    7   44    7 4136]]


In [318]:
# Accuracy = correctly classified rows (diagonal) / all rows.
# Normalise by cm_hp's own total; the previous code divided by the baseline
# model's matrix `cm`, which only gave the right number because both matrices
# were built from the same test set.
accuracy = cm_hp.diagonal().sum() / cm_hp.sum()
print(accuracy)

0.8905


#### The models above are fitted on a random sample of 100,000 rows only
#### From the XGBoost (Acc = 0.81) and the hyper-tuned XGBoost (Acc = 0.89) accuracies, we can see that the accuracy of the model has increased.

#### The Basic SVM Model has lowest accuracy which is 0.41 and we can see that there is a significant increase in accuracy once we use XG BOOST.

#### We can see that accuracy increases by roughly 40 percentage points from the SVM model to the XGBoost model, and by around 48 percentage points for the hyper-tuned XGBoost model, which reaches 89%

## Fitting the XGBOOST and XGBOOST fine tuned model with original dataset.

In [319]:
# Repeat the 80/20 split on the complete feature table (no scaling is applied here).
y = features['forum_id']
X = features.drop(columns=['forum_id'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=109, test_size=0.2
)
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(591259, 18)
(147815, 18)
(591259,)
(147815,)


### XGBOOST

In [320]:
# Default XGBoost classifier refitted on the full training split.
# This replaces the `model` / `y_pred` objects of the 100,000-row experiment.
model = xgb.XGBClassifier().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [321]:
# Per-class precision / recall / F1 of the default model on the full test split.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           2       0.81      0.79      0.80     34537
           3       0.79      0.91      0.85     38655
           4       0.96      0.92      0.94     12593
           7       0.91      0.44      0.60      3557
           8       0.78      0.73      0.75     22237
           9       0.87      0.48      0.62      3670
          10       0.90      0.91      0.91     32566

    accuracy                           0.83    147815
   macro avg       0.86      0.74      0.78    147815
weighted avg       0.84      0.83      0.83    147815



In [322]:
# Confusion matrix of the default model on the full test split.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[27261  4139   237    27  1776    89  1008]
 [ 1544 35186    41    32  1112    35   705]
 [  386   111 11644     2   160     3   287]
 [  579   581    18  1574   445    42   318]
 [ 2376  2836    50    48 16166    73   688]
 [  604   572    36    19   317  1758   364]
 [  996  1026   153    22   627    14 29728]]


In [323]:
# Accuracy = correctly classified rows (the matrix trace) / all rows.
accuracy = np.trace(cm) / cm.sum()
print(accuracy)

0.8342658052295099


### Bayesian Hypertuned XGBOOST model

In [324]:
# Refit with the hyperparameters found on the 100,000-row sample, now on the
# full training split, and predict the full test split.
hyper_model = xgb.XGBClassifier(**params)
hyper_model.fit(X_train, y_train)
y_pred_HT = hyper_model.predict(X_test)

In [325]:
# Per-class precision / recall / F1 of the tuned model on the full test split.
print(classification_report(y_true=y_test, y_pred=y_pred_HT))

              precision    recall  f1-score   support

           2       0.95      0.96      0.96     34537
           3       0.96      0.98      0.97     38655
           4       0.99      0.98      0.99     12593
           7       0.97      0.90      0.93      3557
           8       0.96      0.94      0.95     22237
           9       0.97      0.89      0.93      3670
          10       0.98      0.98      0.98     32566

    accuracy                           0.96    147815
   macro avg       0.97      0.95      0.96    147815
weighted avg       0.96      0.96      0.96    147815



In [330]:
# Confusion matrix of the tuned model on the full test split.
cm_hp = confusion_matrix(y_true=y_test, y_pred=y_pred_HT)
print(cm_hp)

[[33052   826    21    29   392    36   181]
 [  401 37761    11    22   309    17   134]
 [   87    15 12390     1    30     5    65]
 [  145    81     6  3192    61    10    62]
 [  602   535     7    22 20882    13   176]
 [  136    89     2    22    57  3281    83]
 [  246   201    30    13   102    13 31961]]


In [331]:
# Accuracy = correctly classified rows (the matrix trace) / all rows.
accuracy = np.trace(cm_hp) / cm_hp.sum()
print(accuracy)

0.9641714305043466


## Implemented XGBoost and the hyper-tuned model on the full dataset.

##### We can see that there is a considerable increase in accuracy compared to the models fitted on only the 100,000-row sample of the dataset

##### Also, the normal XGBoost model has accuracy 0.83 while the hypertuned model has accuracy 0.96, which is an increase of 13 percentage points in accuracy just by hypertuning