In [4]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from transformers import BertTokenizer

In [2]:
### Data Pre-processing ###
data = pd.read_csv('random_f_data.csv').dropna().reindex() # there are five rows with na value on the text column, drop it out.
text = data['expDescription'] # get the text data
data.drop(['Unnamed: 0', 'ownerID','school', 'expDescription'],axis=1,inplace=True) # remove unrelated columns
skills = ['4', '8', '10', '11', '19'] # create a list of all skills we want to predict for later use

In [24]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tagged = []
X = []
max_len = 0

for item in text:
    tagged.append('[CLS] ' + item.strip() + ' [SEP]')

for item in tagged:
    tokenized_text = tokenizer.tokenize(item) 
    if len(tokenized_text) > max_len:
        max_len = len(tokenized_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    X.append(indexed_tokens)

for j in range(len(X)):
    padding = [0] * (max_len - len(X[j]))
    X[j] += padding
    
X = pd.DataFrame(X).iloc[:,:130]

In [25]:
### Grid Search ###
def gridSearch(skillID, model, param):
    y = data[str(skillID)]  # get the target column
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # make train test dataset
    gscv = GridSearchCV(model, param, scoring='f1', n_jobs=-1) # use cross validation to search the best parameters
    gscv.fit(X_train, y_train)
    y_predict = gscv.predict(X_test) # predict the result
    cm = confusion_matrix(y_test, y_predict) # get the confusion matrics
    print('For skill', str(skillID), '\n', 'Confusion Matrix:\n', cm, '\n', 'best parameters:', gscv.best_params_)

In [26]:
rf = RandomForestClassifier(class_weight={0:0.1, 1:0.9})    # create random forest model
rf_param = {'min_samples_split': range(2,7,2),       # parameters for grid search
              'n_estimators': range(50,151,20)}

for skill in skills:
    gridSearch(skill, rf, rf_param)

For skill 4 
 Confusion Matrix:
 [[590  72]
 [283  43]] 
 best parameters: {'min_samples_split': 6, 'n_estimators': 50}
For skill 8 
 Confusion Matrix:
 [[628  51]
 [275  34]] 
 best parameters: {'min_samples_split': 6, 'n_estimators': 50}
For skill 10 
 Confusion Matrix:
 [[758  11]
 [209  10]] 
 best parameters: {'min_samples_split': 4, 'n_estimators': 50}
For skill 11 
 Confusion Matrix:
 [[488 111]
 [298  91]] 
 best parameters: {'min_samples_split': 6, 'n_estimators': 70}
For skill 19 
 Confusion Matrix:
 [[526  93]
 [311  58]] 
 best parameters: {'min_samples_split': 6, 'n_estimators': 90}


In [27]:
gb = GradientBoostingClassifier()    # create gradient boosting model
gb_param = {'min_samples_split': range(2,7,2),   # parameters for grid search
              'n_estimators': range(50,151,20)}

for skill in skills:
    gridSearch(skill, gb, gb_param)

For skill 4 
 Confusion Matrix:
 [[627  35]
 [301  25]] 
 best parameters: {'min_samples_split': 4, 'n_estimators': 150}
For skill 8 
 Confusion Matrix:
 [[660  19]
 [301   8]] 
 best parameters: {'min_samples_split': 2, 'n_estimators': 150}
For skill 10 
 Confusion Matrix:
 [[752  17]
 [212   7]] 
 best parameters: {'min_samples_split': 6, 'n_estimators': 150}
For skill 11 
 Confusion Matrix:
 [[529  70]
 [332  57]] 
 best parameters: {'min_samples_split': 2, 'n_estimators': 150}
For skill 19 
 Confusion Matrix:
 [[578  41]
 [341  28]] 
 best parameters: {'min_samples_split': 2, 'n_estimators': 150}


In [30]:
### Use best parameters found on the grid search to build models ###
svc = SVC(gamma='auto')
rf = RandomForestClassifier(class_weight={0:0.1, 1:0.9}, min_samples_split=6, n_estimators=50)
gb = GradientBoostingClassifier(min_samples_split=4, n_estimators=150)
voting = VotingClassifier(estimators=[('rf', rf), ('gb', gb), ('svc', svc)], voting='hard')

In [None]:
### train and run the voting classifier, get the confusion matrics ###
for skill in skills:
    y = data[skill]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    voting = VotingClassifier(estimators=[('rf', rf), ('gb', gb), ('svc', svc)], voting='hard')
    voting.fit(X_train, y_train)
    y_pred = voting.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    print('For skill', skill, '\n', 'Confusion Matrix:\n', cm)