## Load the packages

In [1]:
import numpy as np
import pandas as pd
from collections import Counter
import scipy.stats
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score

# Load classifiers
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC

## Load the data (2gram model) and labels

In [2]:
X_df = pd.read_csv("./mat_2gram.csv",index_col=0)

In [3]:
X_df

Unnamed: 0,08,100,19,19 vaccine,19 vaccines,2020,ab,able,able wini,absolutely,...,years,years biden,years develop,years lockdown,yes,yes gates,york,young,youre,zero
0,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1694,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1695,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1696,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1697,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# get the 2 grams
features = X_df.columns.values

In [5]:
# Check the 2 grams
features

array(['08', '100', '19', ..., 'young', 'youre', 'zero'], dtype=object)

In [6]:
# Get the data matrix
X_data = X_df.values

In [7]:
print("The number of tweets is :{}, the number of 2 grams is :{}".format(X_data.shape[0],X_data.shape[1]))

The number of tweets is :1699, the number of 2 grams is :2500


In [8]:
# Sanity check

## Sparsity
print("The sparsity of the data matrix: {:.4f}".format(np.sum(X_data==0)/(X_data.shape[0]*X_data.shape[1])))
## Percetage of columns that are all 0s
print("Percetage of columns that are all 0s: {:.4f}".format(np.mean(np.sum(X_data,axis=0) == 0)))
## Percetage of columns that only have 1 entry
print("Percetage of columns that only have 1 entry: {:.4f}".format(np.mean(np.sum(X_data,axis=0) == 1)))

The sparsity of the data matrix: 0.9962
Percetage of columns that are all 0s: 0.3572
Percetage of columns that only have 1 entry: 0.1528


In [9]:
# Load the labels: 0-neutral 1-negative 2-positive
y = pd.read_csv("./labels.csv",index_col=0)

In [10]:
y = y['0'].values

In [11]:
# Sanity check

## Percentage of each label
freq = dict(Counter(y))
print("Percentage of Neutral: {:.4f}".format(freq[0]/len(y)))
print("Percentage of Negative: {:.4f}".format(freq[1]/len(y)))
print("Percentage of Positive: {:.4f}".format(freq[2]/len(y)))


Percentage of Neutral: 0.6221
Percentage of Negative: 0.2301
Percentage of Positive: 0.1477


## Preprocessing

In [12]:
# Remove columns that are empty
non0idx = np.sum(X_data,axis=0) != 0
X_data = X_data[:,non0idx]

In [13]:
print("The number of tweets is :{}, the number of 2 grams is :{}".format(X_data.shape[0],X_data.shape[1]))

The number of tweets is :1699, the number of 2 grams is :1607


In [14]:
# Get the new 2gram names
new_feature_names = features[non0idx]

In [15]:
# scale the data: z-score
X_z = scipy.stats.zscore(X_data,axis=0)

## Models training

### Training Strategy:

1. Data is split into 80% Training, 20% Test. 
2. For each model (Logistic Regression, Random Forest, Adaboost, Support Vector Machine):  
    
    (1). Within the Training Set, run 5-fold cross-validation where the 4 fold is used for real training based on a random parameter set (Randomly selected based on the grid range of the parameters), the remaining 1 fold as the validation set to see the model performance. This is process is repeated 50 times. So in total, 250 models will be trained with different parameter settings. At the end, the best parameter settings will be selected and the best model will be trained with all training data. (This whole step is automated by RandomizedSearchCV)
    
    (2). Run the best model on the test set to get the test ROC

##### 1. Data is split into 80% Training, 20% Test

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X_z, y, train_size=0.8, random_state=34)

In [17]:
print("Number of Training samples: {}, number of test samples: {}".format(X_train.shape[0],X_test.shape[0]))

Number of Training samples: 1359, number of test samples: 340


##### 2.Run on Each model

##### Random Forest

In [18]:
# Set up the grid range of each parameter in the model

# n_estimators: the number of trees
# max_depth: tree max depth
# min_samples_split: minimum number of samples to split at each node

rf_grid = {'n_estimators': np.arange(10,510,10),
            'max_depth':np.arange(1,31,1) ,
            'min_samples_split': np.arange(2,11,1),
            }

In [20]:
# Within the Training Set, run 5-fold cross-validation where the 4 fold is 
# used for real training based on a random parameter set (Randomly selected based on the grid range of the parameters),
# the remaining 1 fold as the validation set to see the model performance. 
# This is process is repeated 50 times. So in total, 250 models will be trained with different parameter settings. 
# At the end, the best parameter settings will be selected and the best model will be trained with all training data. 
# This whole step is automated by RandomizedSearchCV function

# Specify the random forest model
rf_base = RF(random_state=34)
# Run RandomizedSearchCV within the training set
rf_random = RandomizedSearchCV(estimator = rf_base, param_distributions = rf_grid, 
                               n_iter = 50, cv = 5, random_state = 34, 
                               n_jobs = 6)
rf_random.fit(X_train, y_train)

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(random_state=34),
                   n_iter=50, n_jobs=6,
                   param_distributions={'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]),
                                        'min_samples_split': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10]),
                                        'n_estimators': array([ 10,  20,  30,  40,  50,  60,  70,  80,  90, 100, 110, 120, 130,
       140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, 260,
       270, 280, 290, 300, 310, 320, 330, 340, 350, 360, 370, 380, 390,
       400, 410, 420, 430, 440, 450, 460, 470, 480, 490, 500])},
                   random_state=34)

In [21]:
# Check what is the best parameter setting
rf_random.best_params_

{'n_estimators': 490, 'min_samples_split': 4, 'max_depth': 27}

In [22]:
# Use the best model to predict on the test set
y_pred_rf = rf_random.predict_proba(X_test)

In [23]:
# Check the test ROC_AUC
roc_auc_score(y_test, y_pred_rf,multi_class="ovo") 

0.7530083762476952

##### Logitstic Regression

In [27]:
# Similar steps

In [24]:
lr_grid = {'C':np.logspace(-4,-1,50)}

In [25]:
lr_base = LR(random_state=34)
lr_random = RandomizedSearchCV(estimator = lr_base, param_distributions = lr_grid, 
                               n_iter = 50, cv = 5, random_state = 34, 
                               n_jobs = 6)
lr_random.fit(X_train, y_train)

RandomizedSearchCV(cv=5, estimator=LogisticRegression(random_state=34),
                   n_iter=50, n_jobs=6,
                   param_distributions={'C': array([0.0001    , 0.00011514, 0.00013257, 0.00015264, 0.00017575,
       0.00020236, 0.000233  , 0.00026827, 0.00030888, 0.00035565,
       0.00040949, 0.00047149, 0.00054287, 0.00062506, 0.00071969,
       0.00082864, 0.0009541 , 0.00109854, 0.00126486, 0.00145635,
       0.00167683, 0.0019307 , 0.002223  , 0.00255955, 0.00294705,
       0.00339322, 0.00390694, 0.00449843, 0.00517947, 0.00596362,
       0.00686649, 0.00790604, 0.00910298, 0.01048113, 0.01206793,
       0.01389495, 0.01599859, 0.0184207 , 0.02120951, 0.02442053,
       0.02811769, 0.03237458, 0.03727594, 0.04291934, 0.04941713,
       0.05689866, 0.06551286, 0.0754312 , 0.08685114, 0.1       ])},
                   random_state=34)

In [26]:
lr_random.best_params_

{'C': 0.005179474679231213}

In [27]:
y_pred_lr = lr_random.predict_proba(X_test)

In [28]:
roc_auc_score(y_test, y_pred_lr,multi_class="ovo") 

0.7602656716892365

##### Adaboost

In [None]:
# Similar steps

In [29]:
ada_grids = {
    'n_estimators': np.arange(10,510,10),
    'learning_rate':np.logspace(-4,0,20),
}

In [30]:
ada_base = AdaBoostClassifier(random_state=34)
ada_random = RandomizedSearchCV(estimator = ada_base, param_distributions = ada_grids, 
                               n_iter = 50, cv = 5, random_state = 34, 
                               n_jobs = 6)
ada_random.fit(X_train, y_train)

RandomizedSearchCV(cv=5, estimator=AdaBoostClassifier(random_state=34),
                   n_iter=50, n_jobs=6,
                   param_distributions={'learning_rate': array([1.00000000e-04, 1.62377674e-04, 2.63665090e-04, 4.28133240e-04,
       6.95192796e-04, 1.12883789e-03, 1.83298071e-03, 2.97635144e-03,
       4.83293024e-03, 7.84759970e-03, 1.27427499e-02, 2.06913808e-02,
       3.35981829e-02, 5.45559478e-02, 8.85866790e-02, 1.43844989e-01,
       2.33572147e-01, 3.79269019e-01, 6.15848211e-01, 1.00000000e+00]),
                                        'n_estimators': array([ 10,  20,  30,  40,  50,  60,  70,  80,  90, 100, 110, 120, 130,
       140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, 260,
       270, 280, 290, 300, 310, 320, 330, 340, 350, 360, 370, 380, 390,
       400, 410, 420, 430, 440, 450, 460, 470, 480, 490, 500])},
                   random_state=34)

In [31]:
ada_random.best_params_

{'n_estimators': 360, 'learning_rate': 0.05455594781168514}

In [32]:
y_pred_ada = ada_random.predict_proba(X_test)

In [33]:
roc_auc_score(y_test, y_pred_ada,multi_class="ovo") 

0.6976508360353487

##### Support Vector Machine

In [None]:
# Similar steps

In [42]:
SVM_grids = {
    'C':np.logspace(-3,-1,20),
    'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma':np.logspace(-3,1,10),
    'max_iter':[10000,15000,20000],
}

In [43]:
SVM_base = SVC(random_state=34,probability=True)
SVM_random = RandomizedSearchCV(estimator = SVM_base, param_distributions = SVM_grids, 
                               n_iter = 50, cv = 5, random_state = 34, 
                               n_jobs = 6)
SVM_random.fit(X_train, y_train)

RandomizedSearchCV(cv=5, estimator=SVC(probability=True, random_state=34),
                   n_iter=50, n_jobs=6,
                   param_distributions={'C': array([0.001     , 0.00127427, 0.00162378, 0.00206914, 0.00263665,
       0.00335982, 0.00428133, 0.00545559, 0.00695193, 0.00885867,
       0.01128838, 0.0143845 , 0.01832981, 0.02335721, 0.02976351,
       0.0379269 , 0.0483293 , 0.06158482, 0.078476  , 0.1       ]),
                                        'gamma': array([1.00000000e-03, 2.78255940e-03, 7.74263683e-03, 2.15443469e-02,
       5.99484250e-02, 1.66810054e-01, 4.64158883e-01, 1.29154967e+00,
       3.59381366e+00, 1.00000000e+01]),
                                        'kernel': ['linear', 'poly', 'rbf',
                                                   'sigmoid'],
                                        'max_iter': [10000, 15000, 20000]},
                   random_state=34)

In [44]:
SVM_random.best_params_

{'max_iter': 20000,
 'kernel': 'linear',
 'gamma': 0.001,
 'C': 0.0026366508987303583}

In [45]:
y_pred_svm = SVM_random.predict_proba(X_test)

In [46]:
roc_auc_score(y_test, y_pred_svm,multi_class="ovo") 

0.7530683768834638