### Classification

Loading files and preparing the tweets (pre-processing is applied)

In [1]:
%run scripts/pre_processing.py
df = pd.read_csv('data/tweet_df_class.csv', index_col='Datetime').drop('Unnamed: 0', axis=1)
tweets_text = df.original_text
prep_tweets = tweets_text.apply(pre_processing)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rober\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rober\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\rober\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Creating two datasets: one of "malaria tweets" and one of "not malaria tweets"

In [2]:
df['preproc'] = prep_tweets
nodup_df = df.drop_duplicates(subset='original_text')
tmp_tweets = nodup_df[['preproc', 'class']].reset_index(drop=True)
cases_tweets = tmp_tweets[tmp_tweets['class']==1]
not_cases_tweets = tmp_tweets[tmp_tweets['class']==0]

Creating a series with the most frequently used tokens in "malaria tweets" and their frequencies
(bag of words is used)

In [3]:
vocab, bow_matrix = bag_of_words(cases_tweets.preproc)
bow_df = pd.DataFrame(bow_matrix, columns = vocab)
common_words = bow_df.sum(axis=0).sort_values()[-286:]

Creating the dataframe with oversampled tweets

In [4]:
#reading the labeled common words dataframe
words_df = pd.read_csv('data/common_words.csv')

#creating a dictionary with key = part of sentence and value = dataframe of tokens related to that part of sentence
#a is adjective, n is noun, v is verb and r is "other"
words_split = {}
for i, words in words_df.groupby('part'):
    words.freq = words.freq/sum(words.freq) #you may change words.freq with the frequency of only test data tokens
    words_split.update({str(i) : words.drop('part', axis=1)})

#absolute frequencies of each category
ss = words_df.groupby('part').sum()

#creating a series of uncommon words (index) and their relative frequencies 
noise_words = bow_df.sum(axis=0).sort_values()[:-286] #here too you can use noises from only test data tokens
noise_words /= sum(noise_words)

#generating fake tweets from the oversampling function

def fake_tweetter(n):

    fake_tweets = pd.DataFrame(np.array([oversampler(words_split, noise_words) for i in range(n)]).T,
            columns = ['preproc'])
    fake_tweets.preproc = fake_tweets.preproc.str.split()
    fake_tweets.insert(1, 'class', 1)

    return fake_tweets

#tweets with oversampling
z = 1000
oversampled_tweets = pd.concat([tmp_tweets, fake_tweetter(z)]).reset_index(drop=True)

Splitting the dataset

In [5]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, recall_score, precision_score

# slitting variables with target and dependent variables
X = oversampled_tweets['preproc']
y = oversampled_tweets['class']

#bag of words
vocab, X = bag_of_words(X)

# Splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X[:-z], y[:-z], test_size=0.3)

#oversampling
fakes = np.c_[X[-z:], y[-z:]]
i_fakes = np.random.choice(range(z), len(y_train[y_train==0])-len(y_train[y_train==1]), replace=False)
r_fakes = fakes[i_fakes]
X_train = np.r_[X_train, r_fakes[:, :-1]]
y_train = np.r_[y_train, r_fakes[:, -1]]

Random Forest

In [6]:
# training random forest classifier 
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)

# score of the test
accuracy = clf.score(X_test, y_test)
print("Accuracy : ", accuracy)

Accuracy :  0.8084291187739464


SVM classifier

In [7]:
# training svm classifier SVM
clf = SVC(kernel='linear')
clf.fit(X_train, y_train)

# accuracy of the classifier
accuracy = clf.score(X_test, y_test)

# accuracy
print("Accuracy : ", accuracy)

NameError: name 'SVC' is not defined

XGB classifier

In [None]:
# convert format to DMatrix for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Definig parameters for XGBoost
params = {
    'objective': 'binary:logistic',
    'max_depth': 5,
    'eta': 0.1,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'min_child_weight': 5,
    'silent': 0,
}

# training XGBoost model
bst = xgb.train(params, dtrain, num_boost_round=100)

# Faire des prédictions sur le jeu de test
y_pred = bst.predict(dtest)

accuracy = bst.score(X_test, y_test)

In [13]:
X.shape[0]

1870

Function to do k-folds crossvalidation with oversampling and four metrics

In [18]:
def cross_score(model, X, y, k):

    #create k-folds (test)
    folds = []
    real = np.c_[X[:-z], y[:-z]]
    for kf in range(k):
        i_real = np.random.choice(range(real.shape[0]),int(X[:-z].shape[0]/k),replace=False)
        r_real = real[i_real]
        folds.append(r_real)
        
        #removing selected tweets
        real = np.delete(real, i_real, axis=0)

    #running models
    accuracies = []
    precisions = []
    recalls = []
    F1s = []
    for i in range(k):
        test = folds[i]
        train = np.vstack([folds[j] for j in range(k) if j!=i])
        
        #oversampling
        fakes = np.c_[X[-z:], y[-z:]]
        i_fakes = np.random.choice(range(z), len(train[:, -1][train[:, -1]==0])-len(train[:, -1][train[:, -1]==1]), replace=False)
        r_fakes = fakes[i_fakes]
        train = np.r_[train, r_fakes]
        
        #models
        model.fit(train[:, :-1], train[:, -1])
        pred = model.predict(test[:, :-1])

        #metrics
        precision = precision_score(test[:, -1], pred)
        recall = recall_score(test[:, -1], pred)
        accuracy = accuracy_score(test[:, -1], pred)
        precisions.append(precision)
        recalls.append(recall)
        accuracies.append(accuracy)
        F1s.append((2*recall*precision)/(recall+precision))


    return np.array(accuracies), np.array(precisions), np.array(recalls), np.array(F1s)

Random Forest with cross-correlation

In [19]:
# random forest classifier with cross validation
clf = RandomForestClassifier(n_estimators=100)
scores = cross_val_score(clf, X[:-z], y[:-z], cv=8)
accuracies, precisions, recalls, F1s = cross_score(clf, X, y, 8)

# score of the rfcv test
print("Scores : ", accuracies)

# Afficher la moyenne et l'écart-type des scores
print("Accuracy : %0.2f (+/- %0.2f)" % (accuracies.mean(), accuracies.std() * 2))

Scores :  [0.81481481 0.69444444 0.7962963  0.78703704 0.78703704 0.75925926
 0.76851852 0.74074074]
Accuracy : 0.77 (+/- 0.07)


SVM classifier with cross-correlation

In [None]:
# training SVM with cv
clf = SVC(kernel='linear')
scores = cross_val_score(clf, X[:-250], y[:-250], cv=5)

# scores
print("Scores : ", scores)

# scrores with sd
print("Accuracy : %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))