In [18]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [2]:
#!pip install xgboost



In [19]:
# Load the feature table; the path is relative to the notebook's working directory.
features = pd.read_csv('features.csv')

In [20]:
# Remove the course identifier column, then preview the first five rows.
features = features.drop(columns=['course_id'])
features.head(5)

Unnamed: 0,thread_id,no_of_posts,no_of_comments,no_uni_users,no_of_anonymous_msg,staff_replied,no of msgs,avg_num_words,max_words,avg_resp_time,first_post_day,msg_rate,u_chain,index_longest_post,num_views,votes,votes_Square,index_max_votes,forum_id
0,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,True,4,277,2,4,0,10
1,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,True,4,277,0,0,0,10
2,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,True,4,277,1,1,0,10
3,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,True,4,277,0,0,0,10
4,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,True,4,277,0,0,0,10


In [21]:
# Turn the boolean u_chain flag into 0/1 integers (True -> 1, False -> 0).
features["u_chain"] = features["u_chain"] * 1

In [22]:
features[:5]

Unnamed: 0,thread_id,no_of_posts,no_of_comments,no_uni_users,no_of_anonymous_msg,staff_replied,no of msgs,avg_num_words,max_words,avg_resp_time,first_post_day,msg_rate,u_chain,index_longest_post,num_views,votes,votes_Square,index_max_votes,forum_id
0,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,1,4,277,2,4,0,10
1,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,1,4,277,0,0,0,10
2,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,1,4,277,1,1,0,10
3,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,1,4,277,0,0,0,10
4,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,1,4,277,0,0,0,10


## Preparing the Data

In [23]:
# Target: the forum each row belongs to; predictors: every remaining column.
y = features['forum_id']
X = features.drop(columns=['forum_id'])

In [24]:
# 80% training and 20% test (test_size=0.2); random_state is fixed so the split is reproducible.
# NOTE(review): the preview rows share a thread_id and identical thread-level features, so a
# row-wise random split may put rows of the same thread in both train and test -- confirm
# whether a group-aware split on thread_id is needed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=109)

In [25]:
# Report the shape of each split, in the order X_train, X_test, y_train, y_test.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(591259, 18)
(147815, 18)
(591259,)
(147815,)


## SVM Model

In [26]:
# Disabled experiment: the linear-kernel SVC code is wrapped in a string literal, so this
# cell only evaluates to that string and fits nothing.
# NOTE(review): presumably disabled because of training cost on the full training set -- confirm.
'''from sklearn import svm
clf = svm.SVC(kernel='linear', C = 1.0)
clf.fit(X_train, y_train)'''

"from sklearn import svm\nclf = svm.SVC(kernel='linear', C = 1.0)\nclf.fit(X_train, y_train)"

In [27]:
# Disabled experiment (same linear SVC idea, default C): the code is a string literal,
# so running this cell fits nothing and only echoes the string.
'''from sklearn.svm import SVC
svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train, y_train)
'''

"from sklearn.svm import SVC\nsvclassifier = SVC(kernel='linear')\nsvclassifier.fit(X_train, y_train)\n"

## XG Boost Model

In [28]:
# Baseline: XGBoost classifier with library-default hyperparameters.
model = xgb.XGBClassifier()
# fit() is the last expression, so the cell displays the fitted estimator's parameters.
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [29]:
# Predicted forum_id labels for the held-out test rows (baseline model).
y_pred = model.predict(X_test)

In [30]:
# Per-class precision / recall / F1 of the baseline model on the test split.
# classification_report is imported once, in the notebook's top import cell.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           2       0.81      0.79      0.80     34537
           3       0.79      0.91      0.85     38655
           4       0.96      0.92      0.94     12593
           7       0.91      0.44      0.60      3557
           8       0.78      0.73      0.75     22237
           9       0.87      0.48      0.62      3670
          10       0.90      0.91      0.91     32566

    accuracy                           0.83    147815
   macro avg       0.86      0.74      0.78    147815
weighted avg       0.84      0.83      0.83    147815



## Hyper Parameter Tuning on XGBoost

Parameters:

* learning_rate : Makes the model more robust by shrinking the weights on each step. Used to prevent overfitting.
* max_depth  : Determines how deeply each tree is allowed to grow during any boosting round. Typical values are between 3 and 10; start with lower values.
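* min_child_weight : Minimum sum of instance weights (Hessian) required in a child node for a split to be kept. Smaller values are chosen when leaf nodes may legitimately contain small groups; larger values make the model more conservative.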
* subsample, colsample_bytree  : Fraction of training rows and of columns, respectively, sampled for each tree. A typical range is 0.5 to 0.9; values that are too low can lead to underfitting.
* n_estimators: number of trees you want to build.
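* objective: Determines the loss function to be used, e.g. reg:squarederror (formerly reg:linear) for regression problems, reg:logistic for logistic regression, binary:logistic for binary classification with probability output, and multi:softprob for multi-class classification with per-class probabilities (the objective the fitted models in this notebook report).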

Regularization Parameters:
* gamma: controls whether a given node will split based on the expected reduction in loss after the split. A higher value leads to fewer splits.
* alpha: L1 regularization on leaf weights. A large value leads to more regularization.
* lambda: L2 regularization on leaf weights and is smoother than L1 regularization.

In [31]:
# Hand-picked (not searched) hyperparameters: compared with the default baseline this uses
# more, shallower trees with a lower learning rate, row/column subsampling and a small
# L1 penalty on leaf weights.
hyper_model = xgb.XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=4,
    min_child_weight=6,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.005,
    # 'logistic' is not a valid XGBoost objective name. The target is multi-class and the
    # fitted estimator reports 'multi:softprob', so that objective is stated explicitly.
    objective='multi:softprob',
    n_jobs=4,  # documented replacement for the deprecated `nthread` alias
    random_state=27,  # documented replacement for the deprecated `seed` alias
)
# fit() is the last expression, so the cell displays the fitted estimator's parameters.
hyper_model.fit(X_train, y_train)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=4,
              min_child_weight=6, missing=nan, monotone_constraints='()',
              n_estimators=1000, n_jobs=4, nthread=4, num_parallel_tree=1,
              objective='multi:softprob', random_state=27, reg_alpha=0.005,
              reg_lambda=1, scale_pos_weight=None, seed=27, subsample=0.8,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [32]:
# Predicted forum_id labels for the held-out test rows (model with hand-set hyperparameters).
y_pred_HT = hyper_model.predict(X_test)

In [33]:
# Per-class precision / recall / F1 of the hand-tuned model on the test split.
# classification_report is imported once, in the notebook's top import cell.
print(classification_report(y_test, y_pred_HT))

              precision    recall  f1-score   support

           2       0.80      0.79      0.80     34537
           3       0.79      0.91      0.84     38655
           4       0.95      0.92      0.94     12593
           7       0.92      0.43      0.59      3557
           8       0.78      0.72      0.75     22237
           9       0.84      0.45      0.59      3670
          10       0.90      0.91      0.91     32566

    accuracy                           0.83    147815
   macro avg       0.86      0.73      0.77    147815
weighted avg       0.83      0.83      0.83    147815

