In [143]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn import svm
from bayes_opt import BayesianOptimization
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.preprocessing import MinMaxScaler

In [130]:
#!pip install xgboost
#!pip install bayesian-optimization
#!pip install -U scikit-learn
#!pip install GPyOpt

In [103]:
features = pd.read_csv('features.csv')

In [104]:
features = features.drop('course_id',axis=1)
features[:5]

Unnamed: 0,thread_id,no_of_posts,no_of_comments,no_uni_users,no_of_anonymous_msg,staff_replied,no of msgs,avg_num_words,max_words,avg_resp_time,first_post_day,msg_rate,u_chain,index_longest_post,num_views,votes,votes_Square,index_max_votes,forum_id
0,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,True,4,277,2,4,0,10
1,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,True,4,277,0,0,0,10
2,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,True,4,277,1,1,0,10
3,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,True,4,277,0,0,0,10
4,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,True,4,277,0,0,0,10


In [105]:
features[["u_chain"]] *= 1

In [106]:
features[:5]

Unnamed: 0,thread_id,no_of_posts,no_of_comments,no_uni_users,no_of_anonymous_msg,staff_replied,no of msgs,avg_num_words,max_words,avg_resp_time,first_post_day,msg_rate,u_chain,index_longest_post,num_views,votes,votes_Square,index_max_votes,forum_id
0,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,1,4,277,2,4,0,10
1,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,1,4,277,0,0,0,10
2,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,1,4,277,1,1,0,10
3,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,1,4,277,0,0,0,10
4,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,1,4,277,0,0,0,10


In [107]:
features.shape

(739074, 19)

## Randomly creating a new dataset with 5000 rows

The value will be same for the selected random_state

In [108]:
features_scaled = features.sample(n=5000,random_state = 2)
features_scaled.shape

(5000, 19)

## Preparing the Data

### forum_id 
    2: General (Miscellaneous) Discussion
    3: Assignments
    4: Study Groups / Meetups
    7: Course Feedback / Suggestions
    8: Lectures
    9: Platform Issues
    100: Signature Track
    otherwise: not remapped

In [109]:
X = features_scaled.drop('forum_id', axis=1)
y = features_scaled['forum_id']

In [110]:
scaling = MinMaxScaler(feature_range=(-1,1)).fit(X)
X_train_scaled = scaling.transform(X)
print(X_train_scaled.shape)

(5000, 18)


In [111]:
# 80% training and 20% test
X_train, X_test, y_train, y_test = train_test_split(X_train_scaled,y, test_size=0.2,random_state=109) 

In [112]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(4000, 18)
(1000, 18)
(4000,)
(1000,)


## SVM Model

In [113]:
svm_model = svm.SVC(kernel='linear', C = 1.0,cache_size=7000)
svm_model.fit(X_train, y_train)

SVC(C=1.0, cache_size=7000, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [114]:
y_pred_svm = svm_model.predict(X_test)

###  Creating the Classification report

In [115]:
print(classification_report(y_test,y_pred_svm))

              precision    recall  f1-score   support

           2       0.31      0.24      0.27       221
           3       0.38      0.91      0.53       268
           4       1.00      0.01      0.02        89
           7       0.00      0.00      0.00        17
           8       0.00      0.00      0.00       157
           9       0.00      0.00      0.00        32
          10       0.58      0.50      0.54       216

    accuracy                           0.41      1000
   macro avg       0.32      0.24      0.20      1000
weighted avg       0.39      0.41      0.32      1000



  'precision', 'predicted', average, warn_for)


###  Creating the confusion Matrix and checking the accuracy

In [116]:
cm_svm = confusion_matrix(y_test, y_pred_svm)
print(cm_svm)

[[ 52 141   0   0   0   0  28]
 [ 10 244   0   0   0   0  14]
 [ 45  38   1   0   0   0   5]
 [  1  14   0   0   0   0   2]
 [ 33  96   0   0   0   0  28]
 [  0  31   0   0   0   0   1]
 [ 26  81   0   0   0   0 109]]


In [117]:
accuracy = cm_svm.diagonal().sum()/cm_svm.sum()
print(accuracy)

0.406


## XG Boost Model

In [118]:
model = xgb.XGBClassifier()
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [119]:
y_pred = model.predict(X_test)

###  Creating the Classification report

In [120]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           2       0.62      0.67      0.65       221
           3       0.73      0.79      0.76       268
           4       0.77      0.76      0.77        89
           7       0.33      0.06      0.10        17
           8       0.63      0.55      0.59       157
           9       0.50      0.16      0.24        32
          10       0.78      0.82      0.80       216

    accuracy                           0.70      1000
   macro avg       0.62      0.55      0.56      1000
weighted avg       0.69      0.70      0.69      1000



###  Creating the confusion Matrix and checking the accuracy

In [121]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[149  33   6   2  14   0  17]
 [ 27 211   1   0  16   0  13]
 [ 13   1  68   0   3   0   4]
 [  4   3   1   1   4   2   2]
 [ 26  26   3   0  87   3  12]
 [ 12   6   1   0   5   5   3]
 [ 10  10   8   0  10   0 178]]


In [122]:
accuracy = cm.diagonal().sum()/cm.sum()
print(accuracy)

0.699


## Hyper Parameter Tuning on XGBoost

Parameters:

* learning_rate : Makes the model more robust by shrinking the weights on each step. Used to prevent overfitting.
* max_depth  : Determines how deeply each tree is allowed to grow during any boosting round.Range should be between 3-10. For starting, start from lower values.
* min_child_weight : Smaller values are chosen because leaf nodes could have small size groups.
* subsample, colsample_bytree  : ideal range is between 0.5 to 0.9. Low value can lead to underfitting
* n_estimators: number of trees you want to build.
* objective: determines the loss function to be used like reg:linear for regression problems, reg:logistic for classification problems with only decision, binary:logistic for classification problems with probability.

Regularization Parameters:
* gamma: controls whether a given node will split based on the expected reduction in loss after the split. A higher value leads to fewer splits.
* alpha: L1 regularization on leaf weights. A large value leads to more regularization.
* lambda: L2 regularization on leaf weights and is smoother than L1 regularization.

### Bayesian Optimization function for xgboost

In [185]:
def tuning_function(n_estimators ,learning_rate):
     params = {'n_estimators': int(n_estimators),
              'learning_rate':learning_rate,
              'subsample': 0.8,
              'eta': 0.1,
              'eval_metric': 'rmse',
             'max_depth':4,
             'min_child_weight':6,
             'gamma':0,
             'subsample':0.8,
             'colsample_bytree':0.8,
             'reg_alpha':0.005,
             'objective': 'logistic',
             'nthread':4,
             'seed':27,
             'eta':0.1}

In [186]:
bayes = BayesianOptimization(tuning_function, {'learning_rate':(0,1),
                                             'n_estimators':(100,120)
                                            })


In [199]:
#bayes.maximize(n_iter=5, init_points=8, acq='ei')

### Extracting the best parameters

In [200]:
params = bayes.max['params']
params['n_estimators']= int(params['n_estimators'])

In [201]:
hyper_model = xgb.XGBClassifier(**params).fit(X_train, y_train)

In [202]:
y_pred_HT = hyper_model.predict(X_test)

###  Creating the Classification Report

In [203]:
print(classification_report(y_test,y_pred_HT))

              precision    recall  f1-score   support

           2       0.62      0.64      0.63       221
           3       0.72      0.80      0.76       268
           4       0.79      0.75      0.77        89
           7       0.33      0.06      0.10        17
           8       0.63      0.59      0.61       157
           9       0.44      0.12      0.20        32
          10       0.78      0.83      0.81       216

    accuracy                           0.70      1000
   macro avg       0.62      0.54      0.55      1000
weighted avg       0.69      0.70      0.69      1000



###  Creating the confusion Matrix and checking the accuracy

In [204]:
cm_hp = confusion_matrix(y_test, y_pred_HT)
print(cm_hp)

[[141  39   7   1  16   1  16]
 [ 19 215   0   0  17   2  15]
 [ 12   2  67   0   4   0   4]
 [  6   2   0   1   4   1   3]
 [ 29  22   2   1  92   1  10]
 [ 12   5   1   0   7   4   3]
 [  8  13   8   0   7   0 180]]


In [205]:
accuracy = cm_hp.diagonal().sum()/cm.sum()
print(accuracy)

0.7


#### Model is fitted for 5000 rows only
#### From the XG Boost(Acc = 0.699) and the hyper tunes XGBoost(Acc = 0.7) accuracies, we can see that the accuracy of the model has increased.

#### The Basic SVM Model has lowest accuracy which is 0.406 and we can see that there is a significant increase in accuracy once we use XG BOOST

## Fitting the XGBOOST and XGBOOST fine tuned model with original dataset.

In [215]:
X = features.drop('forum_id', axis=1)
y = features['forum_id']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2,random_state=109)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(591259, 18)
(147815, 18)
(591259,)
(147815,)


### XGBOOST

In [216]:
model = xgb.XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [217]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           2       0.81      0.79      0.80     34537
           3       0.79      0.91      0.85     38655
           4       0.96      0.92      0.94     12593
           7       0.91      0.44      0.60      3557
           8       0.78      0.73      0.75     22237
           9       0.87      0.48      0.62      3670
          10       0.90      0.91      0.91     32566

    accuracy                           0.83    147815
   macro avg       0.86      0.74      0.78    147815
weighted avg       0.84      0.83      0.83    147815



In [218]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[27261  4139   237    27  1776    89  1008]
 [ 1544 35186    41    32  1112    35   705]
 [  386   111 11644     2   160     3   287]
 [  579   581    18  1574   445    42   318]
 [ 2376  2836    50    48 16166    73   688]
 [  604   572    36    19   317  1758   364]
 [  996  1026   153    22   627    14 29728]]


In [219]:
accuracy = cm.diagonal().sum()/cm.sum()
print(accuracy)

0.8342658052295099


### Bayesian Hypertuned XGBOOST model

In [220]:
hyper_model = xgb.XGBClassifier(**params).fit(X_train, y_train)
y_pred_HT = hyper_model.predict(X_test)

In [221]:
print(classification_report(y_test,y_pred_HT))

              precision    recall  f1-score   support

           2       0.84      0.82      0.83     34537
           3       0.82      0.92      0.87     38655
           4       0.97      0.94      0.95     12593
           7       0.94      0.55      0.70      3557
           8       0.81      0.76      0.79     22237
           9       0.90      0.56      0.69      3670
          10       0.91      0.93      0.92     32566

    accuracy                           0.86    147815
   macro avg       0.89      0.78      0.82    147815
weighted avg       0.86      0.86      0.86    147815



In [222]:
cm_hp = confusion_matrix(y_test, y_pred_HT)
print(cm_hp)

[[28471  3444   160    19  1593    80   770]
 [ 1311 35620    39    16  1009    44   616]
 [  302    95 11812     6   127     6   245]
 [  466   452     8  1969   368    31   263]
 [ 2071  2440    46    44 16976    44   616]
 [  501   473    39    17   290  2045   305]
 [  814   830   118    27   489    15 30273]]


In [223]:
accuracy = cm_hp.diagonal().sum()/cm.sum()
print(accuracy)

0.8603051111186281


## Implemented XGBoost on and Hyper tuned model on the full dataset.

##### We can see that there is considerable increase in the Accuracy as compared to the fitted models with only 5000 rows in the dataset

##### Also, the hypertuned model has accuracy 0.86 while the normal XGBoost model has accuracy 0.83 and this shows a 3% increase in accuracy just by hypertuning