### Classification

Loading files and preparing the tweets (pre-processing is applied)

In [103]:
%run scripts/pre_processing.py
df = pd.read_csv('data/tweet_df_class.csv', index_col='Datetime').drop('Unnamed: 0', axis=1)
tweets_text = df.original_text
prep_tweets = tweets_text.apply(pre_processing)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rober\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rober\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\rober\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Creating two datasets: one of "malaria tweets" and one of "not malaria tweets"

In [104]:
df['preproc'] = prep_tweets
nodup_df = df.drop_duplicates(subset='original_text')
tmp_tweets = nodup_df[['preproc', 'class']].reset_index(drop=True)
cases_tweets = tmp_tweets[tmp_tweets['class']==1]
not_cases_tweets = tmp_tweets[tmp_tweets['class']==0]

Creating a series with the most frequently used tokens in "malaria tweets" and their frequencies
(bag of words is used)

In [105]:
vocab, bow_matrix = bag_of_words(cases_tweets.preproc)
bow_df = pd.DataFrame(bow_matrix, columns = vocab)
common_words = bow_df.sum(axis=0).sort_values()[-286:]

Creating the dataframe with oversampled tweets

In [106]:
#reading the labeled common words dataframe
words_df = pd.read_csv('data/common_words.csv')

#creating a dictionary with key = part of sentence and value = dataframe of tokens related to that part of sentence
#a is adjective, n is noun, v is verb and r is "other"
words_split = {}
for i, words in words_df.groupby('part'):
    words.freq = words.freq/sum(words.freq) #you may change words.freq with the frequency of only test data tokens
    words_split.update({str(i) : words.drop('part', axis=1)})

#absolute frequencies of each category
ss = words_df.groupby('part').sum()

#creating a series of uncommon words (index) and their relative frequencies 
noise_words = bow_df.sum(axis=0).sort_values()[:-286] #here too you can use noises from only test data tokens
noise_words /= sum(noise_words)

#generating fake tweets from the oversampling function

def fake_tweetter(n):

    fake_tweets = pd.DataFrame(np.array([oversampler(words_split, noise_words) for i in range(n)]).T,
            columns = ['preproc'])
    fake_tweets.preproc = fake_tweets.preproc.str.split()
    fake_tweets.insert(1, 'class', 1)

    return fake_tweets

#tweets with oversampling
z = 1000
oversampled_tweets = pd.concat([tmp_tweets, fake_tweetter(z)]).reset_index(drop=True)

In [107]:
#metrics function

def metrics(test,pred):
    
    precision = precision_score(test[:], pred)
    recall = recall_score(test[:], pred)
    accuracy = accuracy_score(test[:], pred)
    F1= ((2*recall*precision)/(recall+precision))
    return precision, recall, accuracy, F1

Splitting the dataset

In [108]:
# slitting variables with target and dependent variables
X = oversampled_tweets['preproc']
y = oversampled_tweets['class']

#bag of words
vocab, X = bag_of_words(X)

# Splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X[:-z], y[:-z], test_size=0.3)

#oversampling
fakes = np.c_[X[-z:], y[-z:]]
i_fakes = np.random.choice(range(z), len(y_train[y_train==0])-len(y_train[y_train==1]), replace=False)
r_fakes = fakes[i_fakes]
X_train = np.r_[X_train, r_fakes[:, :-1]]
y_train = np.r_[y_train, r_fakes[:, -1]]

Naive Bayes

In [109]:
# Creating the Naive Bayes classifier
nb = GaussianNB()

# Fit the classifier to the data
nb.fit(X_train, y_train)
nb_pred = nb.predict(X_test)
NB_score = metrics(y_test,nb_pred)

# score of the NB test
print("""Accuracy : {:.2f} 
Precision: {:.2f}
Recall: {:.2f}
F1: {:.2f}""".format(*NB_score))

Accuracy : 0.75 
Precision: 0.80
Recall: 0.82
F1: 0.77


Random Forest

In [110]:
# training random forest classifier 
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
rf_score  = metrics(y_test,rf_pred)

# score of the rf test
print("""Accuracy : {:.2f} 
Precision: {:.2f}
Recall: {:.2f}
F1: {:.2f}""".format(*rf_score))

Accuracy : 0.66 
Precision: 0.88
Recall: 0.78
F1: 0.75


SVM classifier

In [111]:
# training svm classifier SVM
svm = svm.SVC(kernel='linear')
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_test)
svm_score  = metrics(y_test,svm_pred)

# score of the svm test
print("""Accuracy : {:.2f} 
Precision: {:.2f}
Recall: {:.2f}
F1: {:.2f}""".format(*svm_score))

Accuracy : 0.71 
Precision: 0.84
Recall: 0.81
F1: 0.77


XGB classifier

In [112]:
# convert format to DMatrix for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Definig parameters for XGBoost
params = {
    "objective": "binary:logistic",
    "booster": "gbtree",
    "eval_metric": "error",
    "eta": 0.1,
    "max_depth": 10,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "seed": 123
}

# training XGBoost model
xgb = xgb.train(params, dtrain, num_boost_round=100)

# predicting on testset
xgb_pred1 = xgb.predict(dtest)
xgb_pred = [round(value) for value in xgb_pred1]

xgb_score = metrics(y_test,xgb_pred)
print("""Accuracy : {:.2f} 
Precision: {:.2f}
Recall: {:.2f}
F1: {:.2f}""".format(*xgb_score))

Accuracy : 0.68 
Precision: 0.81
Recall: 0.78
F1: 0.74


Function to do k-folds crossvalidation with oversampling and four metrics

In [113]:
def cross_score(model, X, y, k):

    #create k-folds (test)
    folds = []
    real = np.c_[X[:-z], y[:-z]]
    for kf in range(k):
        i_real = np.random.choice(range(real.shape[0]),int(X[:-z].shape[0]/k),replace=False)
        r_real = real[i_real]
        folds.append(r_real)
        
        #removing selected tweets
        real = np.delete(real, i_real, axis=0)

    #running models
    accuracies = []
    precisions = []
    recalls = []
    F1s = []
    for i in range(k):
        test = folds[i]
        train = np.vstack([folds[j] for j in range(k) if j!=i])
        
        #oversampling
        fakes = np.c_[X[-z:], y[-z:]]
        i_fakes = np.random.choice(range(z), len(train[:, -1][train[:, -1]==0])-len(train[:, -1][train[:, -1]==1]), replace=False)
        r_fakes = fakes[i_fakes]
        train = np.r_[train, r_fakes]
        
        #models
        model.fit(train[:, :-1], train[:, -1])
        pred = model.predict(test[:, :-1])

        #metrics
        precision = precision_score(test[:, -1], pred)
        recall = recall_score(test[:, -1], pred)
        accuracy = accuracy_score(test[:, -1], pred)
        precisions.append(precision)
        recalls.append(recall)
        accuracies.append(accuracy)
        F1s.append((2*recall*precision)/(recall+precision))


    return np.array(accuracies), np.array(precisions), np.array(recalls), np.array(F1s)

Naive Bayes with cross-validation

In [114]:
# training naive bayes classifier
scores = cross_score(nb, X, y, 8)
scores_means_nb = np.array([score.mean() for score in scores])

print("""Accuracy : {:.2f} 
Precision: {:.2f}
Recall: {:.2f}
F1: {:.2f}""".format(*scores_means_nb))

Accuracy : 0.80 
Precision: 0.68
Recall: 0.82
F1: 0.74


Random Forest with cross-validation

In [115]:
# training random forest classifier 
rf = RandomForestClassifier(n_estimators=100)
scores = cross_score(rf, X, y, 8)
scores_means_rf = np.array([score.mean() for score in scores])

print("""Accuracy : {:.2f} 
Precision: {:.2f}
Recall: {:.2f}
F1: {:.2f}""".format(*scores_means_rf))

Accuracy : 0.76 
Precision: 0.61
Recall: 0.85
F1: 0.71


SVM classifier with cross-validation

In [116]:
# training SVM with cv
scores = cross_score(svm, X, y, 8)
scores_means_svm = np.array([score.mean() for score in scores])

print("""Accuracy :  {:.2f} 
Precision: {:.2f}
Recall: {:.2f}
F1: {:.2f}""".format(*scores_means_svm))

Accuracy :  0.75 
Precision: 0.61
Recall: 0.81
F1: 0.69


In [121]:
#displaying all results
classifiers_frame = pd.DataFrame([NB_score, rf_score, svm_score, xgb_score, scores_means_nb, scores_means_rf, scores_means_svm],
                                index = ['NB', 'RF', 'SVM', 'XGB', 'nbcross', 'rfcross', 'svmcross'],
                                columns = ['Accuracy', 'Precision', 'Recall', 'F1'])

pd.options.display.float_format = '{:,.2f}'.format
print(classifiers_frame)

          Accuracy  Precision  Recall   F1
NB            0.75       0.80    0.82 0.77
RF            0.66       0.88    0.78 0.75
SVM           0.71       0.84    0.81 0.77
XGB           0.68       0.81    0.78 0.74
nbcross       0.80       0.68    0.82 0.74
rfcross       0.76       0.61    0.85 0.71
svmcross      0.75       0.61    0.81 0.69
