In [None]:
import pandas as pd
import re
import itertools
import nltk
import numpy as np
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.corpus import stopwords
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV

# Read file

In [None]:
# Input directory holding the manually labeled dataset with extracted features.
# NOTE(review): absolute, machine-specific paths - adjust them before running elsewhere.
path = "C:/Users/iliapl/Documents/CoronaVirusProject/repo/corona_healthcare_workers/manual_labeling/datasets/Dataset_ready_for_classification/"
# Output directory for classification results; earlier prediction files are also read from here.
output_path = "C:/Users/iliapl/Documents/CoronaVirusProject/repo/corona_healthcare_workers/manual_labeling/datasets/Classification_results/"
# Number of completed manual labeling rounds; it selects the input file name below.
num_rounds_finished = 20
healthcare_workers_df = pd.read_csv(path + "classified_hcp_manual_labeling_{0}_rounds_with_features.csv".format(num_rounds_finished))

In [None]:
# When True, keep only accounts manually labeled 'Individual' plus unlabeled
# accounts whose earlier account-type prediction falls under the threshold below.
individuals_only = True
INDIVIDUAL_CONFIDENCE_PERCENT = 90

# Upper bound on 'confidence_to_class_1' for an unlabeled account to be kept
# (presumably class 1 is "Organization" in that earlier classifier - confirm).
confidence_percentile = (100 - INDIVIDUAL_CONFIDENCE_PERCENT) / 100

if individuals_only:
    account_type_column = 'Account_Type (Individual/Organization/Other)'

    # labeled individuals
    is_labeled_individual = healthcare_workers_df[account_type_column] == 'Individual'
    labeled_individuals_df = healthcare_workers_df[is_labeled_individual].reset_index(drop=True)

    # unlabeled individuals
    unlabeled_individuals_df = pd.read_csv(output_path + 'unlabeled_predictions_Account_Type_using_author_full_name_and_description_and_SVM_classifier_balanced.csv')
    is_below_threshold = unlabeled_individuals_df['confidence_to_class_1'] <= confidence_percentile
    unlabeled_individuals_df = unlabeled_individuals_df[is_below_threshold]

    # Match the retained predictions back to the main frame by screen name.
    is_predicted_individual = healthcare_workers_df['username'].isin(unlabeled_individuals_df['author_screen_name'])
    healthcare_workers_df = pd.concat(
        [labeled_individuals_df, healthcare_workers_df[is_predicted_individual]]
    ).reset_index(drop=True)
    


## Set target field and binary values

In [None]:
# target_field indicates type of classifier (target variable)
# mapping_dict maps the textual labels found in that column to numeric class ids.

# Alternative configuration (account-type classifier), kept for reference:
#target_field = "Account_Type (Individual/Organization/Other)"
#mapping_dict = {
#    "Individual": 0,
#    "Organization": 1
#}

target_field = "Occupation_Type (HCP/Not HCP)"
mapping_dict = {
    "Hcp": 0,
    "Not hcp": 1
}


# Reverse lookup (numeric class id -> textual label), used to label predictions.
inverse_mapping_dict = {v: k for k, v in mapping_dict.items()}

# Convert category to numbers

In [None]:
# Convert Category to numbers (values missing from mapping_dict become NaN here)
healthcare_workers_df['author_type_numeric'] = healthcare_workers_df[target_field].map(mapping_dict)

# Combined text field "<full name> <description>".
# na_rep='' is required: without it str.cat returns NaN whenever either part is
# missing, so an account without a description would lose its name as well and
# end up as an empty document after the NaN replacement below.
healthcare_workers_df['author_full_name_and_description'] = healthcare_workers_df['author_full_name'].str.cat(
    healthcare_workers_df['description'], sep=" ", na_rep='')

# Replace every remaining NaN (including unmapped labels) with an empty string.
healthcare_workers_df = healthcare_workers_df.replace(np.nan, '', regex=True)

In [None]:
# Rows whose numeric label is 0 or 1 are "labeled"; every other row is "unlabeled".
numeric_label = healthcare_workers_df["author_type_numeric"]
has_numeric_label = (numeric_label == 0) | (numeric_label == 1)

labeled_df = healthcare_workers_df[has_numeric_label]
labeled_indexes = labeled_df.index

is_labeled_row = healthcare_workers_df.index.isin(labeled_indexes)
unlabeled_df = healthcare_workers_df.loc[~is_labeled_row]
unlabeled_indexes = unlabeled_df.index

# Target vector for all rows, and its integer-typed labeled subset.
y = healthcare_workers_df['author_type_numeric']
y_labeled = y[labeled_indexes].astype('int')

# Set experiments

In [None]:
# experiment_types, extraction types, classifiers, etc. create cartesian products of each configuration

# Text columns to build features from; every entry yields its own experiment.
targeted_fields = ['description', 'author_full_name_and_description']
#targeted_fields = ['description']
#targeted_fields = ['author_full_name_and_description']
#experiment_types = ['balanced']
experiment_types = ["balanced"] # there is also inbalanced - if both are set then two separate experiments are run (cartesian products)
#targeted_fields = ['author_full_name_and_description']
is_remove_stopwords = [False]
#feature_extraction_types = ['Doc2Vec']
#feature_extraction_types = ['bag_of_words']
feature_extraction_types = ['tf_idf']
classifiers = ['SVM']
#classifiers = ['SVM']
#vector_sizes = [2, 3 , 5, 10, 20, 50, 100, 300]
# vector_sizes / window_sizes are only passed on to the Doc2Vec branch of the experiment loop.
vector_sizes = [100]
window_sizes = [2]
#k_folds = [1, 10]
# k == 1 selects leave-one-out in the experiment loop; any other value is the number of CV folds.
k_folds = [10]
#author_features = ['statuses_count', 'followers_count', 'favourites_count', 'friends_count', 'listed_count','verified']
# Numeric account columns appended to the text features when with_author_features is True.
author_features = ['favourites_count', 'listed_count']
# Columns that get a min-max scaled '<name>_normalized' copy when with_normalized_features is True.
normalize_features = ['statuses_count', 'followers_count', 'favourites_count', 'friends_count', 'listed_count']

with_author_features = False
with_normalized_features = False

# The order here must match the unpacking of each combination in the experiment loop.
experiment_params = [targeted_fields, is_remove_stopwords, feature_extraction_types, classifiers, vector_sizes, 
                     window_sizes, k_folds, experiment_types]
combinations = list(itertools.product(*experiment_params))

In [None]:
# Normalized columns are only produced when author features are enabled,
# so this flag combination is rejected up front.
if with_normalized_features and not with_author_features:
    raise ValueError('Please set with_normalized_features to False')

In [None]:
# add normalized columns for author features if necessary

if with_author_features:
    # friends/followers ratio: a division by zero gives inf (x/0) or NaN (0/0);
    # both are replaced by the largest finite ratio before min-max scaling.
    ratio = healthcare_workers_df['friends_count'] / healthcare_workers_df['followers_count']
    ratio = ratio.replace(np.inf, np.nan)
    ratio = ratio.fillna(ratio.max())
    healthcare_workers_df['friends_followers_ratio'] = (ratio - ratio.min()) / (ratio.max() - ratio.min())

    # Guarded so that re-running this cell does not add the feature twice.
    if 'friends_followers_ratio' not in author_features:
        author_features.append('friends_followers_ratio')

    if with_normalized_features:
        for column in normalize_features:
            series = healthcare_workers_df[column]
            healthcare_workers_df['{}_normalized'.format(column)] = (series - series.min()) / (series.max() - series.min())

        # Point the feature list at the scaled copies where they exist.
        author_features = ['{}{}'.format(feature, '_normalized' if feature in normalize_features else '') for feature in author_features]

        print(author_features)
    

# Create TF-IDF features

In [None]:
def create_tf_idf_features(df, targeted_field):
    """Fit a TF-IDF vectorizer on one column of ``df`` and return the matrix.

    Args:
        df: frame holding the text column; it is modified in place, a
            ``selected_field`` column with the stringified text is added.
        targeted_field: name of the column to vectorize.

    Returns:
        The result of ``TfidfVectorizer().fit_transform`` on the column values,
        in the row order of ``df``.
    """
    # Values are passed through str() so non-string entries can be vectorized.
    df['selected_field'] = df[targeted_field].map(str)
    documents = df['selected_field'].to_list()
    return TfidfVectorizer().fit_transform(documents)

In [None]:
def make_cross_validation(clf, X, y, k_fold):
    """Run k-fold cross-validation and average the per-fold test scores.

    Args:
        clf: estimator handed to ``cross_validate``.
        X: feature matrix.
        y: target labels.
        k_fold: value passed as ``cv`` to ``cross_validate``.

    Returns:
        Tuple ``(accuracy, f1, precision, recall)``; the last three are
        macro-averaged scores, each averaged over the folds.
    """
    metric_names = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
    fold_scores = cross_validate(clf, X, y, cv=k_fold, scoring=metric_names, return_train_score=False)
    print(fold_scores)

    def fold_mean(metric):
        # cross_validate stores the test scores under 'test_<metric>'.
        return fold_scores['test_' + metric].mean()

    return (fold_mean('accuracy'), fold_mean('f1_macro'),
            fold_mean('precision_macro'), fold_mean('recall_macro'))

def make_leave_one_out(clf, X, y):
    """Evaluate ``clf`` with leave-one-out cross-validation.

    Args:
        clf: estimator exposing ``fit`` and ``predict``; it is refitted per fold.
        X: feature matrix, either a scipy sparse matrix or an array-like.
        y: labels aligned with the rows of ``X`` by position. A pandas Series
            with an arbitrary index is accepted.

    Returns:
        Tuple ``(accuracy, f1, precision, recall)`` averaged over the folds;
        F1, precision and recall use ``average='weighted'``.
    """
    # The splitter yields positions, so both inputs must be indexed by
    # position. Indexing a Series with these integers would look them up as
    # labels, and a COO sparse matrix cannot be row-indexed at all.
    X_rows = X.tocsr() if hasattr(X, "tocsr") else np.asarray(X)
    y_values = np.asarray(y)

    accuracies, f1s, precisions, recalls = [], [], [], []
    for train_index, test_index in LeaveOneOut().split(X_rows):
        clf.fit(X_rows[train_index], y_values[train_index])
        predictions = clf.predict(X_rows[test_index])
        y_test = y_values[test_index]

        # NOTE(review): each test fold holds a single sample, so these per-fold
        # scores can only be 0 or 1; pooling the predictions and scoring once
        # may be more informative.
        accuracies.append(accuracy_score(y_test, predictions))
        f1s.append(f1_score(y_test, predictions, average='weighted'))
        precisions.append(precision_score(y_test, predictions, average='weighted'))
        recalls.append(recall_score(y_test, predictions, average='weighted'))

    avg_accuracy = float(np.mean(accuracies))
    avg_f1 = float(np.mean(f1s))
    avg_precision = float(np.mean(precisions))
    avg_recall = float(np.mean(recalls))

    return avg_accuracy, avg_f1, avg_precision, avg_recall

In [None]:
import os
import scipy
from tqdm import tqdm

# Runs every configuration in `combinations` and collects one result row per
# configuration. The outer loop allows repeating the whole grid.
#for i in range(10):
for i in range(1):

    results = []

    for combination in tqdm(combinations):
        # Same order as `experiment_params`.
        (targeted_field, is_remove_stop_words, feature_extraction_type, selected_classifier,
         vector_size, window_size, k, experiment_type) = combination

        # The arguments follow the placeholder order ({2} is the feature
        # extraction type, {3} the classifier).
        print("targeted_field: {0}, is_remove_stop_words: {1}, feature_extraction_type: {2}, \
                selected_classifier: {3}, vector_size: {4}, window_size:{5}, \
                k:{6}, experiment_type: {7}".format(targeted_field, is_remove_stop_words, feature_extraction_type, selected_classifier,
                              vector_size, window_size, k, experiment_type))

        y = healthcare_workers_df['author_type_numeric']

        class_0_df = healthcare_workers_df[y == 0]
        num_of_class_0 = class_0_df.shape[0]
        print(num_of_class_0)

        class_1_df = healthcare_workers_df[y == 1]
        num_of_class_1 = class_1_df.shape[0]
        print(num_of_class_1)

        if experiment_type == "inbalanced":
            # Use every labeled row, whatever the class sizes are.
            labeled_df = healthcare_workers_df[(y == 0) | (y == 1)]
            labeled_indexes = labeled_df.index

        elif experiment_type == "balanced":
            if num_of_class_0 != num_of_class_1:
                min_num = min(num_of_class_0, num_of_class_1)
                # for balanced process we want the same amount of instances for each class
                # NOTE(review): sampling is unseeded, so results differ between runs;
                # consider passing random_state if reproducibility is required.
                class_0_df = class_0_df.sample(n=min_num)
                class_1_df = class_1_df.sample(n=min_num)

            class_0_index = class_0_df.index
            class_1_index = class_1_df.index

            labeled_indexes = class_0_index.union(class_1_index)

            # Class sizes after balancing; these are the ones stored in the results.
            num_of_class_0 = class_0_index.size
            print(num_of_class_0)
            num_of_class_1 = class_1_index.size
            print(num_of_class_1)

        else:
            # Fail loudly instead of reusing indexes left over from a previous run.
            raise ValueError("Unknown experiment_type: {0}".format(experiment_type))

        y_labeled = y[labeled_indexes].astype('int')

        if feature_extraction_type == "tf_idf":
            X = create_tf_idf_features(healthcare_workers_df, targeted_field)

        elif feature_extraction_type == "Doc2Vec":
            # NOTE(review): the helpers and frames used in this branch are not
            # defined in the visible part of the notebook - confirm they exist
            # before enabling 'Doc2Vec'.
            healthcare_workers_df = make_preprocessing(healthcare_workers_df, targeted_field)
            healthcare_workers_df = handle_stop_words(healthcare_workers_df, is_remove_stop_words)
            X, y = create_doc2vec_classifier_and_train_results(users_and_organizations_df, twitter_users_targeted_field_df, vector_size, window_size)

        else:
            raise ValueError("Unknown feature_extraction_type: {0}".format(feature_extraction_type))

        X_labeled = X[labeled_indexes]
        if with_author_features:
            # Rows are taken in labeled_indexes order, matching X_labeled.
            # format='csr' keeps the matrix row-indexable (the default result
            # is COO, which the leave-one-out path cannot slice).
            author_feature_values = healthcare_workers_df.loc[labeled_indexes, author_features].values
            X_labeled = scipy.sparse.hstack([X_labeled, scipy.sparse.csr_matrix(author_feature_values)],
                                            format='csr')
            print('Added author features')

        if selected_classifier == 'SVM':
            clf = svm.SVC(kernel='linear', C=1, random_state=0)
        elif selected_classifier == 'RandomForest':
            clf = RandomForestClassifier(max_depth=3, random_state=0)
        elif selected_classifier == 'NB':
            clf = GaussianNB()
            # The features are densified for GaussianNB.
            X_labeled = X_labeled.toarray()
        else:
            raise ValueError("Unknown classifier: {0}".format(selected_classifier))

        if k != 1:
            avg_accuracy, avg_f1, avg_precision, avg_recall = make_cross_validation(clf, X_labeled, y_labeled, k)
            print('ACCURACY:', avg_accuracy)
        else: # LEAVE ONE OUT
            avg_accuracy, avg_f1, avg_precision, avg_recall = make_leave_one_out(clf, X_labeled, y_labeled)


        result = (target_field, num_of_class_0, num_of_class_1, targeted_field, is_remove_stop_words, 
                  feature_extraction_type, vector_size, window_size, selected_classifier, k, 
                      avg_accuracy, avg_f1, avg_precision, avg_recall)
        results.append(result)

    results_df = pd.DataFrame(results, columns=['target_class_field', '#Class_0', '#Class_1', 'Targeted_Field', 'Remove_Stopwords', 
                                                'Feature_Extraction_Type', 'Vector_Size', 'Window_Size', 'Classifier', 
                                                'K', 'Accuracy', 'F1', 'Precision', 'Recall'])
    
    now = datetime.now()
    date_time = now.strftime("%Y%m%d_%H_%M_%S")

    # Make sure the per-round output directory exists.
    os.makedirs(output_path + 'Round_{0}'.format(num_rounds_finished), exist_ok=True)

    # NOTE(review): the export below is commented out, so results_df is not written to disk.
    # add target field and round number to file name
    #results_df.to_csv(output_path + "Round_{0}/classifications_round_{1}_target_{2}{3}{4}_{5}{6}.csv".format(num_rounds_finished, 
    #                                                                                                num_rounds_finished,
    #                                                                                      target_field[:target_field.index(' ')],
    #                                                                                            '_individuals_only' if individuals_only else '',
    #                                                                                      '_with_author_features' if with_author_features else '',
    #                                                                                                   'normalized_' if with_normalized_features else '',
    #                                                                                                   date_time), index=False)

    print("Done!")


# Train best classifier and predict on unlabeled

In [None]:
# Configuration of the final model that is fitted on the labeled rows.
targeted_field = "author_full_name_and_description"
#targeted_field = "description"
selected_classifier = "SVM"
experiment_type = "balanced"

if selected_classifier == 'SVM':
    # probability=True because the next cell calls predict_proba.
    clf = svm.SVC(kernel='linear', C=1, random_state=0, probability=True)
elif selected_classifier == 'RandomForest':
    clf = RandomForestClassifier(max_depth=3, random_state=0)
else:
    # Fail loudly instead of fitting a classifier left over from another cell.
    raise ValueError("Unknown classifier: {0}".format(selected_classifier))

X = create_tf_idf_features(healthcare_workers_df, targeted_field)
y = healthcare_workers_df['author_type_numeric']

if experiment_type == "balanced":
    class_0_df = healthcare_workers_df[healthcare_workers_df['author_type_numeric'] == 0]
    num_of_class_0 = class_0_df.shape[0]
    print(num_of_class_0)

    class_1_df = healthcare_workers_df[healthcare_workers_df['author_type_numeric'] == 1]
    num_of_class_1 = class_1_df.shape[0]
    print(num_of_class_1)

    if num_of_class_0 != num_of_class_1:
        # Down-sample both classes to the size of the smaller one.
        min_num = min(num_of_class_0, num_of_class_1)
        class_0_df = class_0_df.sample(n=min_num)
        class_1_df = class_1_df.sample(n=min_num)

    # Taken outside the `if` so the indexes are also defined when the classes
    # already have equal size.
    class_0_index = class_0_df.index
    class_1_index = class_1_df.index

    labeled_indexes = class_0_index.union(class_1_index)
    print(labeled_indexes.shape)

elif experiment_type == "inbalanced":
    labeled_df = healthcare_workers_df[(healthcare_workers_df["author_type_numeric"] == 0) |
                                       (healthcare_workers_df["author_type_numeric"] == 1)]
    labeled_indexes = labeled_df.index

else:
    raise ValueError("Unknown experiment_type: {0}".format(experiment_type))

y_labeled = y[labeled_indexes].astype('int')

X_labeled = X[labeled_indexes]
if with_author_features:
    # Dense frame: TF-IDF columns followed by the author feature columns.
    X_labeled = pd.concat([pd.DataFrame(X_labeled.toarray()),
                           healthcare_workers_df[author_features][healthcare_workers_df.index.isin(labeled_indexes)].reset_index(drop=True)], axis=1)
    print('Added author features')

clf.fit(X_labeled, y_labeled)

In [None]:
# Score the rows without a manual label using the classifier fitted above.
X_unlabeled = X[unlabeled_indexes]
if with_author_features:
    unlabeled_author_features = healthcare_workers_df[author_features][healthcare_workers_df.index.isin(unlabeled_indexes)].reset_index(drop=True)
    X_unlabeled = pd.concat([pd.DataFrame(X_unlabeled.toarray()), unlabeled_author_features], axis=1)
    print('Added author features')

pred = clf.predict_proba(X_unlabeled)

# Second column of predict_proba (presumably the probability of class 1 - confirm via clf.classes_).
predict_pobabilities = pred[:, 1]

df = pd.DataFrame({'predict_pob': predict_pobabilities})

# Hard prediction: 1 when the probability reaches 0.5, otherwise 0.
df['automatic_prediction'] = df['predict_pob'].apply(lambda probability: int(probability >= 0.5))
predictions_series = df['automatic_prediction']

In [None]:
# Display the probability / hard-prediction table built in the previous cell.
df

In [None]:
# Identifying columns of the unlabeled accounts, selected by index label.
author_screen_names = healthcare_workers_df.loc[unlabeled_indexes, 'author_screen_name']
author_full_names = healthcare_workers_df.loc[unlabeled_indexes, 'author_full_name']
descriptions = healthcare_workers_df.loc[unlabeled_indexes, 'description']
author_full_name_and_descriptions = healthcare_workers_df.loc[unlabeled_indexes, 'author_full_name_and_description']
author_osn_ids = healthcare_workers_df.loc[unlabeled_indexes, 'author_osn_id']

# The index is reset so that the positional prediction frame `df` lines up row by row.
unlabeled_predictions_df = pd.DataFrame({
    'author_screen_name': author_screen_names,
    'author_full_name': author_full_names,
    'description': descriptions,
    'author_full_name_and_description': author_full_name_and_descriptions,
    'author_osn_id': author_osn_ids,
}).reset_index(drop=True)

unlabeled_predictions_df['confidence_to_class_1'] = df['predict_pob']
unlabeled_predictions_df['automatic_prediction'] = df['automatic_prediction']
unlabeled_predictions_df['str_automatic_prediction'] = df['automatic_prediction'].map(inverse_mapping_dict)


#short_unlabeled_predictions_df = unlabeled_predictions_df[((unlabeled_predictions_df['confidence_to_class_1'] >= 0.4) & 
#                                                             (unlabeled_predictions_df['confidence_to_class_1'] < 0.6))]


#unlabeled_predictions_df = unlabeled_predictions_df.sort_values(by=['confidence_to_class_1'])


# File name parts: target name up to its first space, optional individuals-only
# marker, text field, classifier and experiment type.
predictions_file_name = "unlabeled_predictions_{0}{1}_using_{2}_and_{3}_classifier_{4}.csv".format(
    target_field[:target_field.index(' ')],
    '_individuals_only' if individuals_only else '',
    targeted_field,
    selected_classifier,
    experiment_type)
unlabeled_predictions_df.to_csv(output_path + predictions_file_name, index=False)

In [None]:
# Display the predictions table that was just written to CSV.
unlabeled_predictions_df

## Check performance on users of iterations 1, 2, 3 separately, train on all others

In [None]:
# Accumulates one accuracy value per run of the hold-out evaluation cell below;
# re-running this cell resets the collection.
accuracy_scores = []

In [None]:
# Labeling round held out for validation; every other round is used for training.
validation_iteration = 1

# One agreed-labels file per manual labeling round (both placeholders take the round number).
agreed_round_file = 'C:/Users/iliapl/Documents/CoronaVirusProject/repo/corona_healthcare_workers/manual_labeling/datasets/Round_{}/agreed_round_{}_after_summit.csv'

# Read all training rounds first and concatenate once. Series.append was
# removed in pandas 2.0, and growing a Series inside a loop is quadratic.
train_user_parts = [pd.read_csv(agreed_round_file.format(i, i))['username']
                    for i in range(1, num_rounds_finished + 1)
                    if i != validation_iteration]
# pd.concat rejects an empty list, so fall back to an empty Series.
train_users = pd.concat(train_user_parts) if train_user_parts else pd.Series(dtype=object)

train_users_df = healthcare_workers_df[healthcare_workers_df['username'].isin(train_users)]

validation_users = pd.read_csv(agreed_round_file.format(validation_iteration, validation_iteration))['username']
validation_users_df = healthcare_workers_df[healthcare_workers_df['username'].isin(validation_users)]
# Accounts labeled 'Other' are excluded from validation.
validation_users_df = validation_users_df[validation_users_df['Account_Type (Individual/Organization/Other)'] != 'Other']

In [None]:
import os
import scipy
from tqdm import tqdm

# Train on the users of all other rounds (balanced by down-sampling) and
# measure accuracy on the users of the held-out round.
results = []

targeted_field = 'author_full_name_and_description'
is_remove_stop_words = False
feature_extraction_type = 'tf-idf'
selected_classifier = 'SVM'
vector_size = 100
window_size = 2

class_0_df = train_users_df[train_users_df['author_type_numeric'] == 0]
num_of_class_0 = class_0_df.shape[0]
print(num_of_class_0)

class_1_df = train_users_df[train_users_df['author_type_numeric'] == 1]
num_of_class_1 = class_1_df.shape[0]
print(num_of_class_1)

if num_of_class_0 != num_of_class_1:
    min_num = min(num_of_class_0, num_of_class_1)
    # for balanced process we want the same amount of instances for each class
    class_0_df = class_0_df.sample(n=min_num)
    class_1_df = class_1_df.sample(n=min_num)

# Taken outside the `if` so the indexes are also defined when the classes
# already have equal size (otherwise stale values from another cell are used).
class_0_index = class_0_df.index
class_1_index = class_1_df.index

labeled_indexes = class_0_index.union(class_1_index)

y_labeled = y[labeled_indexes]
y_labeled = y_labeled.astype('int')

num_of_class_0 = class_0_index.size
print(num_of_class_0)
num_of_class_1 = class_1_index.size
print(num_of_class_1)


# NOTE(review): the vectorizer is fitted on all rows, including the validation
# users - confirm that this is intended.
X = create_tf_idf_features(healthcare_workers_df, targeted_field)

X_labeled = X[labeled_indexes]
if with_author_features:
    X_labeled = pd.concat([pd.DataFrame(X_labeled.toarray()),
                           train_users_df[author_features][train_users_df.index.isin(labeled_indexes)].reset_index(drop=True)], axis=1)
    print('Added author features')

if selected_classifier == 'SVM':
    clf = svm.SVC(kernel='linear', C=1, random_state=0, probability=True)
elif selected_classifier == 'RandomForest':
    clf = RandomForestClassifier(max_depth=2, random_state=0)

clf.fit(X_labeled, y_labeled)

# A model fitted on sparse input exposes coef_ as a sparse matrix; convert it so
# that svm_coef is a plain 1-D array and slices such as svm_coef[-10:] return
# individual weights.
coefficients = clf.coef_
if hasattr(coefficients, 'toarray'):
    coefficients = coefficients.toarray()
svm_coef = np.asarray(coefficients)[0]

# From here on "unlabeled" refers to the held-out validation users.
unlabeled_indexes = validation_users_df.index

X_unlabeled = X[unlabeled_indexes]
if with_author_features:
    X_unlabeled = pd.concat([pd.DataFrame(X_unlabeled.toarray()),
                           healthcare_workers_df[author_features][healthcare_workers_df.index.isin(unlabeled_indexes)].reset_index(drop=True)], axis=1)
    print('Added author features')

pred = clf.predict_proba(X_unlabeled)

predict_pobabilities = pred[:, 1]

df = pd.DataFrame(predict_pobabilities, columns=['predict_pob'])

df['automatic_prediction'] = df['predict_pob'].apply(lambda x:1 if x>=0.5 else 0)
predictions_series = df['automatic_prediction']

# unlabeled_indexes holds index labels, so select with .loc rather than .iloc.
validation_labels = healthcare_workers_df.loc[unlabeled_indexes, 'author_type_numeric'].astype('int')
accuracy_scores.append(accuracy_score(validation_labels, df['automatic_prediction']))
print('Got {} accuracy scores'.format(len(accuracy_scores)))

In [None]:
# Inspect the tail of the coefficients stored by the evaluation cell
# (presumably the author-feature weights when those are appended last - confirm).
svm_coef[-10:]

In [None]:
# Summarise the collected accuracies under a separate name, so `accuracy_scores`
# stays a list and the evaluation cell can keep appending when it is re-run.
accuracy_scores_series = pd.Series(accuracy_scores, dtype=float)
print('Mean accuracy:', accuracy_scores_series.mean())
print('Std accuracy:', accuracy_scores_series.std())