In [None]:
# Colab-specific setup: mount Google Drive at /content/drive so later cells can read from it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import sys
# Extend the import search path with a folder on the mounted Drive.
# NOTE(review): later cells use `from util import ...` / `from util.util import ...`, which
# presumably requires the directory that CONTAINS the `util` package to be on sys.path —
# confirm that this path is at the right level.
sys.path.append('/content/drive/MyDrive/util')

In [None]:
import os

# List the entries of the current working directory (shown as the cell's output).
os.listdir()

# LOAD DATASET

In [9]:
import json
import os

import networkx as nx
from networkx.readwrite import json_graph

from util import constants
from util.util import tweet_node


def construct_tweet_node_from_json(json_data):
    """Build a linked ``tweet_node`` tree from a nested JSON propagation tree.

    Args:
        json_data: Tree-shaped data as accepted by ``json_graph.tree_graph``.

    Returns:
        The ``tweet_node`` object created for the root of the tree; its descendants are
        attached through the retweet/reply child links.
    """
    new_graph = json_graph.tree_graph(json_data)

    # tree_graph keys the root node by the top-level "id" value of the data, so read it from
    # there instead of assuming it is 0. Fall back to 0 (the former hard-coded id) if absent.
    root_node = json_data.get("id", 0) if isinstance(json_data, dict) else 0

    node_id_obj_dict = dict()
    dfs_node_construction_helper(root_node, new_graph, set(), node_id_obj_dict)
    return node_id_obj_dict[root_node]


def dfs_node_construction_helper(node_id, graph: nx.DiGraph, visited: set, node_id_obj_dict: dict):
    """Depth-first walk that creates a tweet node object per graph node and links children.

    Args:
        node_id: Id of the graph node to process.
        graph: Directed graph holding the node attributes and edges.
        visited: Ids already processed; updated in place.
        node_id_obj_dict: Maps node id -> created tweet node object; updated in place.

    Returns:
        None. Results are delivered through ``node_id_obj_dict``.
    """
    if node_id in visited:
        return None

    visited.add(node_id)
    node_id_obj_dict[node_id] = construct_tweet_node_from_nx_node(node_id, graph)

    for successor_id in graph.successors(node_id):
        if successor_id in visited:
            continue
        # Build the child's whole subtree first, then hang it under the current node.
        dfs_node_construction_helper(successor_id, graph, visited, node_id_obj_dict)
        add_node_object_edge(node_id, successor_id, node_id_obj_dict)


def add_node_object_edge(parent_node_id: int, child_node_id: int, node_id_obj_dict: dict):
    """Attach the child tweet node to its parent using the edge kind implied by the child type.

    Retweet children get a retweet edge, reply children a reply edge, and any other child
    type gets both edges (retweet first, then reply).
    """
    parent = node_id_obj_dict[parent_node_id]
    child = node_id_obj_dict[child_node_id]

    is_retweet = child.node_type == constants.RETWEET_NODE
    is_reply = (not is_retweet) and child.node_type == constants.REPLY_NODE

    if not is_reply:
        parent.add_retweet_child(child)
    if not is_retweet:
        parent.add_reply_child(child)


def construct_tweet_node_from_nx_node(node_id, graph: nx.DiGraph):
    """Create a ``tweet_node`` from the attribute dict stored on a graph node.

    Args:
        node_id: Id of the node inside ``graph``.
        graph: Graph whose node attributes include ``tweet_id``, ``time``, ``type`` and
            ``user``; ``bot_score`` and ``sentiment`` are optional and default to None.

    Returns:
        The newly constructed ``tweet_node``.

    Raises:
        KeyError: If the node or one of the required attributes is missing.
    """
    # ``Graph.node`` was removed in networkx 2.4; ``Graph.nodes`` is the supported accessor.
    attributes = graph.nodes[node_id]

    return tweet_node(tweet_id=attributes['tweet_id'],
                      created_time=attributes['time'],
                      node_type=attributes['type'],
                      user_id=attributes['user'],
                      botometer_score=attributes.get('bot_score', None),
                      sentiment=attributes.get('sentiment', None))


def get_dataset_sample_ids(news_source, news_label, dataset_dir="data/sample_ids"):
    """Read the sample ids listed (one per line) in ``<dataset_dir>/<source>_<label>_ids_list.txt``.

    Returns:
        A list with every line of the file, stripped of surrounding whitespace.
    """
    ids_path = "{}/{}_{}_ids_list.txt".format(dataset_dir, news_source, news_label)
    with open(ids_path) as file:
        return [line.strip() for line in file]


def load_from_nx_graphs(dataset_dir: str, news_source: str, news_label: str,
                        sample_ids_dir: str = "data/sample_ids"):
    """Load the sampled propagation graphs of one source/label as ``tweet_node`` trees.

    Args:
        dataset_dir: Directory containing the ``<source>_<label>`` graph folders.
        news_source: News source name used in the folder and id-list file names.
        news_label: Label name used in the folder and id-list file names.
        sample_ids_dir: Directory holding the ``*_ids_list.txt`` files. Defaults to the
            location that used to be hard-coded here.

    Returns:
        A list with the root ``tweet_node`` of every sampled graph, in id-list order.
    """
    news_dataset_dir = "{}/{}_{}".format(dataset_dir, news_source, news_label)

    tweet_node_objects = []
    for sample_id in get_dataset_sample_ids(news_source, news_label, sample_ids_dir):
        with open("{}/{}.json".format(news_dataset_dir, sample_id)) as file:
            tweet_node_objects.append(construct_tweet_node_from_json(json.load(file)))

    return tweet_node_objects


def load_networkx_graphs(dataset_dir: str, news_source: str, news_label: str):
    """Load every graph file of one source/label folder as a networkx tree graph.

    Args:
        dataset_dir: Directory containing the ``<source>_<label>`` graph folders.
        news_source: News source name used in the folder name.
        news_label: Label name used in the folder name.

    Returns:
        A list of graphs built with ``json_graph.tree_graph``, ordered by file name.
    """
    news_dataset_dir = "{}/{}_{}".format(dataset_dir, news_source, news_label)
    news_samples = []

    # os.listdir returns entries in arbitrary order; sort so the sample order is reproducible.
    for news_file in sorted(os.listdir(news_dataset_dir)):
        file_path = news_dataset_dir + "/" + news_file
        # Skip sub-directories (e.g. notebook checkpoint folders) that open() cannot read.
        if not os.path.isfile(file_path):
            continue
        with open(file_path) as file:
            news_samples.append(json_graph.tree_graph(json.load(file)))

    return news_samples


def load_dataset(dataset_dir: str, news_source: str):
    """Load the fake and the real graphs of a news source.

    Returns:
        A ``(fake_news_samples, real_news_samples)`` tuple of graph lists.
    """
    fake_graphs, real_graphs = (
        load_networkx_graphs(dataset_dir, news_source, label) for label in ("fake", "real")
    )
    return fake_graphs, real_graphs


# Load the PolitiFact fake/real graphs when this code runs as the main program
# (presumably also the case when the cell is executed in the notebook kernel).
if __name__ == '__main__':
    fake_samples, real_samples = load_dataset("data/nx_network_data", "politifact")


In [None]:
!pip install vaderSentiment

# LINGUISTIC TEST

In [20]:
import pickle
import queue
from pathlib import Path

import numpy as np
from scipy.spatial.distance import cosine
from tqdm import tqdm
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

def get_deepest_cascade_reply_nodes_avg_sentiment(prop_graph: tweet_node):
    """Average sentiment of the reply nodes inside the deepest cascade of a news graph.

    Returns:
        The value of ``get_reply_nodes_average_sentiment`` for the deepest cascade, or 0 when
        the graph has no cascade at all.
    """
    deep_cascade, _ = get_post_tweet_deepest_cascade(prop_graph)

    # No post tweet under the news node means there is no cascade to inspect.
    if deep_cascade is None:
        return 0

    return get_reply_nodes_average_sentiment(deep_cascade)


def get_deepest_cascade_first_level_reply_sentiment(prop_graph: tweet_node):
    """Average sentiment of the first-level replies inside the deepest cascade of a news graph.

    Returns:
        The value of ``get_first_reply_nodes_average_sentiment`` for the deepest cascade, or 0
        when the graph has no cascade at all.
    """
    deep_cascade, _ = get_post_tweet_deepest_cascade(prop_graph)

    # No post tweet under the news node means there is no cascade to inspect.
    if deep_cascade is None:
        return 0

    return get_first_reply_nodes_average_sentiment(deep_cascade)


def get_first_reply_nodes_average_sentiment(prop_graph: tweet_node):
    """Mean sentiment of the replies made directly to post nodes.

    The graph is walked breadth-first along reply edges; a reply contributes when its parent
    is a post node and its sentiment is available.

    Returns:
        The mean sentiment, or 0 when no such reply carries a sentiment value.
    """
    q = queue.Queue()
    q.put(prop_graph)
    first_level_sentiments = list()

    while not q.empty():
        node = q.get()
        for child in node.reply_children:
            q.put(child)

            if child.node_type == REPLY_NODE and node.node_type == POST_NODE:
                # Only a missing score is skipped; a neutral 0.0 is a real value and a plain
                # truthiness test would silently drop it from the average.
                if child.sentiment is not None:
                    first_level_sentiments.append(child.sentiment)

    if len(first_level_sentiments) == 0:
        return 0

    return np.mean(np.array(first_level_sentiments))


def get_reply_nodes_average_sentiment(prop_graph: tweet_node):
    """Mean sentiment over all reply nodes reachable through reply edges.

    Returns:
        The mean sentiment, or 0 when no reply node carries a sentiment value.
    """
    q = queue.Queue()
    q.put(prop_graph)
    reply_sentiments = list()

    while not q.empty():
        node = q.get()
        for child in node.reply_children:
            q.put(child)

        if node.node_type == REPLY_NODE:
            # Only a missing score is skipped; a neutral 0.0 is a real value and a plain
            # truthiness test would silently drop it from the average.
            if node.sentiment is not None:
                reply_sentiments.append(node.sentiment)

    if len(reply_sentiments) == 0:
        return 0

    return np.mean(np.array(reply_sentiments))


def get_cosine_similarity(reply_node1, reply_node2, reply_id_index_dict, reply_lat_embeddings):
    """Cosine similarity between the latent embeddings of two replies.

    Args:
        reply_node1: Id of the first reply.
        reply_node2: Id of the second reply.
        reply_id_index_dict: Maps reply id -> row index into ``reply_lat_embeddings``.
        reply_lat_embeddings: Indexable collection of embedding vectors.

    Returns:
        The cosine similarity of the two vectors, or 0 when either reply has no embedding or
        the lookup fails.
    """
    try:
        reply1_idx = reply_id_index_dict[reply_node1]
        reply2_idx = reply_id_index_dict[reply_node2]

        # scipy's ``cosine`` is the cosine DISTANCE (1 - similarity); convert it so that larger
        # values mean "more similar", as the function name and its callers expect.
        return 1 - cosine(reply_lat_embeddings[reply1_idx], reply_lat_embeddings[reply2_idx])
    except (KeyError, IndexError, TypeError, ValueError):
        # Best-effort feature: an unknown id or an unusable embedding counts as "no similarity".
        return 0


def get_supporting_opposing_replies_ratio(prop_graph: tweet_node, news_source, label):
    """Ratio of "supporting" to "opposing" reply pairs in a propagation graph.

    Every reply -> reply edge is scored with ``get_cosine_similarity`` on the pre-computed
    embeddings; a score above 0.5 counts as supporting, anything else as opposing. Both
    counters start at 1 so the ratio is always defined.

    Args:
        prop_graph: Root of the propagation graph.
        news_source: News source name used in the embedding file names.
        label: Label name used in the embedding file names.

    Returns:
        ``supporting / opposing``, or 0 when the graph has no reply -> reply edge.
    """
    q = queue.Queue()
    q.put(prop_graph)
    similarity_values = list()

    # NOTE(security): pickle.load can execute arbitrary code; only use trusted feature files.
    # The files are opened in ``with`` blocks so the handles are closed after loading.
    with open("data/pre_process_data/elmo_features/{}_{}_reply_id_latent_mat_index.pkl".format(news_source, label),
              "rb") as index_file:
        reply_id_index_dict = pickle.load(index_file)

    with open("data/pre_process_data/elmo_features/{}_{}_elmo_lat_embeddings.pkl".format(news_source, label),
              "rb") as embeddings_file:
        reply_content_latent_embeddings = pickle.load(embeddings_file)

    while not q.empty():
        node = q.get()
        for child in node.reply_children:
            q.put(child)

            if node.node_type == REPLY_NODE and child.node_type == REPLY_NODE:
                similarity_values.append(get_cosine_similarity(node.tweet_id, child.tweet_id,
                                                               reply_id_index_dict, reply_content_latent_embeddings))

    if len(similarity_values) == 0:
        return 0

    supporting = 1 + sum(1 for value in similarity_values if value > 0.5)
    opposing = 1 + sum(1 for value in similarity_values if not value > 0.5)

    return supporting / opposing


def get_reply_nodes_sentiment_ratio(prop_graph: tweet_node):
    """Ratio of positive to negative reply nodes in a propagation graph.

    A reply is positive when its sentiment is above 0.05 and negative when it is below -0.05;
    values in between count for neither side. Both counters start at 1 so the ratio is always
    defined.

    Returns:
        ``positive / negative``, or 0 when no reply node carries a sentiment value.
    """
    q = queue.Queue()
    q.put(prop_graph)
    reply_sentiments = list()

    while not q.empty():
        node = q.get()
        for child in node.reply_children:
            q.put(child)

        # Replies without a sentiment score are skipped: comparing None with a float below
        # would raise a TypeError.
        if node.node_type == REPLY_NODE and node.sentiment is not None:
            reply_sentiments.append(node.sentiment)

    if len(reply_sentiments) == 0:
        return 0

    positive_sentiment = 1
    negative_sentiment = 1
    for value in reply_sentiments:
        if value > 0.05:
            positive_sentiment += 1
        elif value < -0.05:
            negative_sentiment += 1

    return positive_sentiment / negative_sentiment


def get_stats_for_features(news_graps: list, get_feature_fun_ref, print=False, feature_name=None):
    """Evaluate a feature function on every graph and optionally print summary statistics.

    Note: a function with this same name is defined again in the temporal-analysis cell.

    Args:
        news_graps: Graphs to evaluate.
        get_feature_fun_ref: Callable taking one graph and returning its feature value.
        print: When truthy, the values are passed to ``print_stat_values``.
        feature_name: Label handed to ``print_stat_values``.

    Returns:
        The list of feature values, one per graph, in input order.
    """
    result = [get_feature_fun_ref(graph) for graph in news_graps]

    if print:
        print_stat_values(feature_name, result)

    return result


def get_all_linguistic_features(news_graphs, micro_features, macro_features):
    """Compute the linguistic feature matrix for a list of news graphs.

    There are currently no macro-level functions; the micro-level ones are the four reply
    sentiment averages, and their statistics are printed while they are computed.

    Returns:
        ``np.transpose`` of the array built by ``get_numpy_array`` from the per-feature lists.
    """
    # Each entry: (feature functions, whether to print their statistics).
    selected_groups = []

    if macro_features:
        selected_groups.append(([], False))

    if micro_features:
        selected_groups.append(([get_reply_nodes_average_sentiment,
                                 get_first_reply_nodes_average_sentiment,
                                 get_deepest_cascade_reply_nodes_avg_sentiment,
                                 get_deepest_cascade_first_level_reply_sentiment], True))

    all_features = [
        get_stats_for_features(news_graphs, function_reference, print=show_stats, feature_name=None)
        for function_references, show_stats in selected_groups
        for function_reference in function_references
    ]

    return np.transpose(get_numpy_array(all_features))


def dump_tweet_reply_sentiment(data_dir, out_dir):
    """Score every reply text with VADER and pickle the results.

    Reads the ``<source>_<label>_reply_id_content_dict.pkl`` files of both news sources and
    both labels from ``data_dir``, merges them (later files override earlier ones on duplicate
    ids) and writes ``all_reply_id_sentiment_result.pkl`` into ``out_dir``.

    Args:
        data_dir: Directory containing the reply-content pickles.
        out_dir: Directory receiving the sentiment pickle.
    """
    reply_id_content_dict = dict()

    # NOTE(security): pickle.load can execute arbitrary code; only use trusted input files.
    for news_source, label in (("politifact", "fake"), ("politifact", "real"),
                               ("gossipcop", "fake"), ("gossipcop", "real")):
        with open("{}/{}_{}_reply_id_content_dict.pkl".format(data_dir, news_source, label), "rb") as content_file:
            reply_id_content_dict.update(pickle.load(content_file))

    print("Total no. of replies : {}".format(len(reply_id_content_dict)))

    analyzer = SentimentIntensityAnalyzer()

    reply_id_sentiment_output = dict()
    for reply_id, content in tqdm(reply_id_content_dict.items()):
        reply_id_sentiment_output[reply_id] = analyzer.polarity_scores(content)

    # ``with`` guarantees the output is flushed and closed even if pickling fails midway.
    with open("{}/all_reply_id_sentiment_result.pkl".format(out_dir), "wb") as out_file:
        pickle.dump(reply_id_sentiment_output, out_file)


class LinguisticFeatureHelper(BaseFeatureHelper):
    """Feature helper for the linguistic (reply sentiment) feature group."""

    def get_feature_group_name(self):
        """Return the short identifier of this feature group."""
        return "ling"

    def get_micro_feature_method_references(self):
        """Return the micro-level feature functions, in feature order."""
        method_refs = [get_reply_nodes_sentiment_ratio,
                       get_reply_nodes_average_sentiment,
                       get_first_reply_nodes_average_sentiment,
                       get_deepest_cascade_reply_nodes_avg_sentiment,
                       get_deepest_cascade_first_level_reply_sentiment]

        return method_refs

    def get_micro_feature_method_names(self):
        """Return the display names matching ``get_micro_feature_method_references``."""
        feature_names = ["Sentiment ratio of all replies",
                         "Average sentiment of all replies",
                         "Average sentiment of first level replies",
                         "Average sentiment of replies in deepest cascade",
                         "Average setiment of first level replies in deepest cascade"]

        return feature_names

    def get_micro_feature_short_names(self):
        """Return the short labels matching ``get_micro_feature_method_references``."""
        # One label per micro feature: there are five feature functions, so five labels.
        feature_names = ["L1", "L2", "L3", "L4", "L5"]
        return feature_names

    def get_macro_feature_method_references(self):
        """Return the macro-level feature functions (none for this group)."""
        method_refs = []

        return method_refs

    def get_macro_feature_method_names(self):
        """Return the display names of the macro-level features (none for this group)."""
        feature_names = []

        return feature_names

    # NOTE(review): class-level attribute that no method of this class reads; kept in case
    # external code accesses it.
    feature_names = []

    def get_macro_feature_short_names(self):
        """Return the short labels of the macro-level features (none for this group)."""
        feature_names = []
        return feature_names

    def get_features_array(self, prop_graphs, micro_features, macro_features, news_source=None, label=None,
                           file_dir="data/features", use_cache=False):
        """Compute (or load from cache) the linguistic feature matrix of the given graphs.

        Args:
            prop_graphs: Propagation graphs to extract features from.
            micro_features: Whether to include the micro-level features.
            macro_features: Only used to build the cache file name here.
            news_source: News source name, used for the cache file name.
            label: Label name, used for the cache file name.
            file_dir: Directory of the cache file.
            use_cache: When true and the cache file exists, its content is returned directly.

        Returns:
            The transposed feature array, or None when no feature function is selected. A
            freshly computed array is also written to the cache file.
        """
        function_refs = []

        file_name = self.get_dump_file_name(news_source, micro_features, macro_features, label, file_dir)
        data_file = Path(file_name)

        if use_cache and data_file.is_file():
            # NOTE(security): pickle.load can execute arbitrary code; only use trusted caches.
            with open(file_name, "rb") as cache_file:
                return pickle.load(cache_file)

        if micro_features:
            function_refs.extend(self.get_micro_feature_method_references())

        if len(function_refs) == 0:
            return None

        all_features = [get_sample_feature_value(prop_graphs, function_ref) for function_ref in function_refs]

        feature_array = np.transpose(get_numpy_array(all_features))

        # ``with`` makes sure the cache file is flushed and closed before returning.
        with open(file_name, "wb") as cache_file:
            pickle.dump(feature_array, cache_file)

        return feature_array


def get_feature_involving_additional_args(prop_graphs, function_reference, news_source, label):
    """Apply a feature function that also needs the news source and label to every graph.

    Returns:
        The list of feature values, one per graph, in input order.
    """
    return [function_reference(prop_graph, news_source, label) for prop_graph in prop_graphs]


# TEMPORAL ANALYSIS

In [23]:
import queue

import numpy as np


def get_avg_retweet_time_deepest_cascade(news_graph: tweet_node):
    """Average time between adjacent retweets inside the deepest cascade of a news graph.

    Returns:
        The value of ``get_avg_time_between_retweets`` for the deepest cascade, or 0 when the
        graph has no cascade at all.
    """
    deep_cascade, _ = get_post_tweet_deepest_cascade(news_graph)

    # No post tweet under the news node means there is no cascade to measure.
    if deep_cascade is None:
        return 0

    return get_avg_time_between_retweets(deep_cascade)


def get_time_diff_post_time_last_retweet_time_deepest_cascade(news_graph: tweet_node):
    """Time between the post tweet of the deepest cascade and the latest node below it.

    Returns:
        ``get_last_retweet_by_time(cascade) - cascade.created_time``, or 0 when the graph has
        no cascade at all.
    """
    deep_cascade, _ = get_post_tweet_deepest_cascade(news_graph)

    # No post tweet under the news node means there is no cascade to measure.
    if deep_cascade is None:
        return 0

    first_post_time = deep_cascade.created_time
    last_retweet_time = get_last_retweet_by_time(deep_cascade)

    return last_retweet_time - first_post_time


def get_avg_time_between_replies(prop_graph: tweet_node):
    """Mean time gap between a reply node and each of its reply children.

    The graph is walked breadth-first over ``children``.

    Returns:
        The mean of ``child.created_time - parent.created_time`` over all reply -> reply
        pairs, or 0 when there is no such pair.
    """
    pending = queue.Queue()
    pending.put(prop_graph)
    gaps = []

    while not pending.empty():
        parent = pending.get()

        for child in parent.children:
            pending.put(child)

            if parent.node_type == REPLY_NODE and child.node_type == REPLY_NODE:
                gaps.append(child.created_time - parent.created_time)

    if not gaps:
        return 0

    return np.mean(np.array(gaps))


def get_avg_time_between_retweets(prop_graph: tweet_node):
    """Mean time gap between a retweet node and each of its retweet children.

    The graph is walked breadth-first over ``retweet_children``.

    Returns:
        The mean of ``child.created_time - parent.created_time`` over all retweet -> retweet
        pairs, or 0 when there is no such pair.
    """
    pending = queue.Queue()
    pending.put(prop_graph)
    gaps = []

    while not pending.empty():
        parent = pending.get()

        for child in parent.retweet_children:
            pending.put(child)

            if parent.node_type == RETWEET_NODE and child.node_type == RETWEET_NODE:
                gaps.append(child.created_time - parent.created_time)

    if not gaps:
        return 0

    return np.mean(np.array(gaps))


def get_last_retweet_by_time(news_graph: tweet_node):
    """Latest ``created_time`` found in the subtree reached through retweet edges.

    The node itself is included regardless of its type; nodes without a creation time are
    ignored.

    Returns:
        The maximum creation time, or 0 for a missing/falsy node or when no node has a time.
    """
    if not news_graph:
        return 0

    candidate_times = [0]
    candidate_times.extend(get_last_retweet_by_time(child) for child in news_graph.retweet_children)

    if news_graph.created_time is not None:
        candidate_times.append(news_graph.created_time)

    return max(candidate_times)


def get_last_reply_by_time(news_graph: tweet_node):
    """Latest ``created_time`` among the reply nodes reached through reply edges.

    Args:
        news_graph: Root of the (sub)tree to inspect; may be None.

    Returns:
        The maximum creation time over the reply nodes, or 0 when there is none.
    """
    max_time = 0

    if news_graph:
        # Follow reply edges and recurse into this function. The earlier version walked the
        # retweet edges through get_last_retweet_by_time, so it reported retweet times.
        for node in news_graph.reply_children:
            max_time = max(max_time, get_last_reply_by_time(node))

        if news_graph.created_time is not None and news_graph.node_type == REPLY_NODE:
            max_time = max(max_time, news_graph.created_time)

    return max_time


def get_avg_time_between_replies_deepest_cascade(news_graph: tweet_node):
    """Average time between adjacent replies inside the deepest reply cascade of a news graph.

    Returns:
        The value of ``get_avg_time_between_replies`` for the deepest cascade (measured along
        reply edges), or 0 when the graph has no cascade at all.
    """
    deep_cascade, _ = get_post_tweet_deepest_cascade(news_graph, edge_type=REPLY_EDGE)

    # No post tweet under the news node means there is no cascade to measure.
    if deep_cascade is None:
        return 0

    return get_avg_time_between_replies(deep_cascade)


def get_time_diff_post_time_last_reply_time_deepest_cascade(news_graph: tweet_node):
    """Time between the post tweet of the deepest reply cascade and its last reply.

    Returns:
        ``last_reply_time - cascade.created_time``, or 0 when the graph has no cascade or the
        cascade has no reply time.
    """
    deep_cascade, _ = get_post_tweet_deepest_cascade(news_graph, edge_type=REPLY_EDGE)

    # No post tweet under the news node means there is no cascade to measure.
    if deep_cascade is None:
        return 0

    first_post_time = deep_cascade.created_time
    last_reply_time = get_last_reply_by_time(deep_cascade)

    # get_last_reply_by_time reports 0 when it found no reply time.
    if last_reply_time == 0:
        return 0

    return last_reply_time - first_post_time


def get_time_diff_first_post_last_reply(news_graph: tweet_node):
    """Time between the first post of the news graph and its last reply.

    Returns:
        ``last_reply_time - first_post_time``, or 0 when ``get_last_reply_by_time`` reports 0.
    """
    first_post_time = get_first_post_time(news_graph)
    last_reply_time = get_last_reply_by_time(news_graph)

    return 0 if last_reply_time == 0 else last_reply_time - first_post_time


def get_first_reply_by_time(news_graph: tweet_node):
    """Earliest ``created_time`` among the reply nodes reached through reply edges.

    Args:
        news_graph: Root of the (sub)tree to inspect; may be None.

    Returns:
        The minimum creation time over the reply nodes, or ``float("inf")`` when there is none.
    """
    min_time = float("inf")

    if news_graph:
        # Follow reply edges and recurse into this function. The earlier version walked the
        # retweet edges through get_first_retweet_by_time, so it reported retweet times.
        for node in news_graph.reply_children:
            min_time = min(min_time, get_first_reply_by_time(node))

        if news_graph.created_time is not None and news_graph.node_type == REPLY_NODE:
            min_time = min(min_time, news_graph.created_time)

    return min_time


def get_first_retweet_by_time(news_graph: tweet_node):
    """Earliest ``created_time`` among the retweet nodes reached through retweet edges.

    Returns:
        The minimum creation time over the retweet nodes, or ``float("inf")`` when there is
        none (including a missing/falsy node).
    """
    earliest = float("inf")

    if not news_graph:
        return earliest

    for child in news_graph.retweet_children:
        earliest = min(earliest, get_first_retweet_by_time(child))

    if news_graph.created_time is not None and news_graph.node_type == RETWEET_NODE:
        earliest = min(earliest, news_graph.created_time)

    return earliest


def get_time_diff_first_post_last_retweet(news_graph: tweet_node):
    """Time between the first post of the news graph and the value of ``get_last_retweet_by_time``."""
    post_time = get_first_post_time(news_graph)
    latest_time = get_last_retweet_by_time(news_graph)
    time_span = latest_time - post_time
    return time_span


def get_time_diff_first_post_first_retweet(news_graph: tweet_node):
    """Time between the first post of the news graph and its first retweet.

    Returns:
        ``first_retweet_time - first_post_time``, or 0 when no retweet time was found
        (``get_first_retweet_by_time`` returned infinity).
    """
    first_post_time = get_first_post_time(news_graph)
    first_retweet_time = get_first_retweet_by_time(news_graph)

    no_retweet_found = first_retweet_time == float("inf")
    return 0 if no_retweet_found else first_retweet_time - first_post_time


def get_time_diff_first_last_post_tweet(news_graph: tweet_node):
    """Time between the earliest and the latest direct child (post tweet) of the news node.

    Returns:
        The creation-time difference between the last and first element of the time-sorted
        children, or 0 when there are fewer than two children.
    """
    post_tweets = list(news_graph.children)

    if len(post_tweets) > 1:
        ordered_posts = sort_tweet_node_object_by_created_time(post_tweets)
        return ordered_posts[len(ordered_posts) - 1].created_time - ordered_posts[0].created_time

    return 0


def get_average_time_between_post_tweets(news_graph: tweet_node):
    """Mean gap between consecutive direct children (post tweets) ordered by creation time.

    Returns:
        The mean of the consecutive creation-time differences, or 0 when there are fewer than
        two children.
    """
    post_tweets = list(news_graph.children)

    if len(post_tweets) > 1:
        ordered_posts = sort_tweet_node_object_by_created_time(post_tweets)
        gaps = [ordered_posts[position].created_time - ordered_posts[position - 1].created_time
                for position in range(1, len(ordered_posts))]
        return np.mean(gaps)

    return 0


def get_stats_for_features(news_graps: list, get_feature_fun_ref, print=False, feature_name=None):
    """Evaluate a feature function on every graph and optionally print summary statistics.

    Note: a function with this same name is also defined in the linguistic cell.

    Args:
        news_graps: Graphs to evaluate.
        get_feature_fun_ref: Callable taking one graph and returning its feature value.
        print: When truthy, the values are passed to ``print_stat_values``.
        feature_name: Label handed to ``print_stat_values``.

    Returns:
        The list of feature values, one per graph, in input order.
    """
    feature_values = list(map(get_feature_fun_ref, news_graps))

    if print:
        print_stat_values(feature_name, feature_values)

    return feature_values


def print_stat_values(feature_name, values):
    """Print the minimum, maximum and mean of a feature's values between two divider lines."""
    divider = "========================================="

    print(divider)
    print("Feature : {}".format(feature_name))

    summaries = (("Min value : {}", min),
                 ("Max value : {}", max),
                 ("Mean value : {}", lambda numbers: np.mean(np.array(numbers))))
    for template, summarize in summaries:
        print(template.format(summarize(values)))

    print(divider)


def graph_has_retweet(news_graph: tweet_node):
    """Return True when at least one direct child of the news node has a retweet child."""
    return any(len(post.retweet_children) > 0 for post in news_graph.children)


def count_graph_with_no_retweets(news_graphs: list):
    """Print how many of the given graphs have no retweet under any of their post tweets."""
    graphs_without_retweets = sum(1 for prop_graph in news_graphs if not graph_has_retweet(prop_graph))

    print("Graph with no retweets : {}".format(graphs_without_retweets))


def get_all_temporal_features(prop_graphs, micro_features, macro_features):
    """Compute the temporal feature matrix for a list of propagation graphs.

    Macro (retweet/post based) features come first when enabled, followed by the micro
    (reply based) ones.

    Returns:
        ``np.transpose`` of the array built by ``get_numpy_array`` from the per-feature lists.
    """
    function_refs = []

    if macro_features:
        function_refs += [get_average_time_between_post_tweets,
                          get_time_diff_first_last_post_tweet,
                          get_time_diff_first_post_last_retweet,
                          get_time_diff_first_post_first_retweet,
                          get_avg_time_between_retweets,
                          get_avg_retweet_time_deepest_cascade,
                          get_time_diff_post_time_last_retweet_time_deepest_cascade]

    if micro_features:
        function_refs += [get_avg_time_between_replies,
                          get_time_diff_first_post_last_reply,
                          get_time_diff_post_time_last_reply_time_deepest_cascade]

    all_features = [get_stats_for_features(prop_graphs, function_reference, print=False, feature_name=None)
                    for function_reference in function_refs]

    return np.transpose(get_numpy_array(all_features))


def time_difference_between_first_post_node_with_max_out_degree_macro(prop_graph):
    """Time between the first post and the node with the highest retweet out-degree.

    Returns:
        ``node.created_time - first_post_time``, or 0 when no max-out-degree node exists.
    """
    busiest_node, _ = get_max_out_degree_node(prop_graph, RETWEET_EDGE)
    first_post_time = get_first_post_time(prop_graph)

    return 0 if busiest_node is None else busiest_node.created_time - first_post_time


def get_time_diff_first_post_first_reply(news_graph):
    """Time between the first post of the news graph and its first reply.

    Returns:
        ``first_reply_time - first_post_time``, or 0 when no reply time was found
        (``get_first_reply_by_time`` returned infinity).
    """
    first_reply_time = get_first_reply_by_time(news_graph)
    first_post_time = get_first_post_time(news_graph)

    no_reply_found = first_reply_time == float("inf")
    return 0 if no_reply_found else first_reply_time - first_post_time


class TemporalFeatureHelper(BaseFeatureHelper):
    """Feature helper for the temporal feature group.

    The three micro lists are aligned with each other position by position, as are the three
    macro lists.
    """

    def get_feature_group_name(self):
        """Return the short identifier of this feature group."""
        return "temp"

    def get_micro_feature_method_references(self):
        """Return the reply-based (micro) feature functions, in feature order."""
        return [
            get_avg_time_between_replies,
            get_time_diff_first_post_first_reply,
            get_time_diff_first_post_last_reply,
            get_avg_time_between_replies_deepest_cascade,
            get_time_diff_post_time_last_reply_time_deepest_cascade,
        ]

    def get_micro_feature_method_names(self):
        """Return the display names of the micro features."""
        return [
            "Average time diff between adjacent replies",
            "Time diff between first tweet posting node and first reply node",
            "Time diff between first tweet posting news and last reply node",
            "Average time between adjacent reply nodes in the deepest cascade",
            "Time diff between first tweet posting news and last reply node in the deepest cascade",
        ]

    def get_micro_feature_short_names(self):
        """Return the short labels of the micro features."""
        return ["T9", "T10", "T11", "T12", "T13"]

    def get_macro_feature_method_references(self):
        """Return the retweet/post-based (macro) feature functions, in feature order."""
        return [
            get_avg_time_between_retweets,
            get_time_diff_first_post_last_retweet,
            time_difference_between_first_post_node_with_max_out_degree_macro,
            get_time_diff_first_last_post_tweet,
            get_time_diff_post_time_last_retweet_time_deepest_cascade,
            get_avg_retweet_time_deepest_cascade,
            get_average_time_between_post_tweets,
            get_time_diff_first_post_first_retweet,
        ]

    def get_macro_feature_method_names(self):
        """Return the display names of the macro features."""
        return [
            "Average time diff between the adjacent retweet nodes in macro network",
            "Time diff between first tweet and  most recent node in macro network",
            "Time diff between first tweet posting news and max out degree node",
            "Time difference between the first and last tweet posting news",
            "Time diff between tweet posting news and latest retweet node in the deepest cascade",
            "Average time diff between the adjacent retweet nodes in deepest cascade",
            "Average time between the tweets posted related to news",
            "Avg time diff between the tweet post time and the first retweet time",
        ]

    def get_macro_feature_short_names(self):
        """Return the short labels of the macro features."""
        return ["T1", "T2", "T3", "T4", "T5", "T6", "T7", "T8"]


# STRUCTURAL ANALYSIS

In [21]:
import pickle
import queue
import time
from pathlib import Path

import numpy as np


def get_post_tweet_deepest_cascade(prop_graph: tweet_node, edge_type=RETWEET_EDGE):
    """Find the direct child of the news node whose subtree is the tallest.

    Heights are measured with ``get_tree_height`` along ``edge_type``; on ties the child
    encountered first wins.

    Returns:
        A ``(node, height)`` tuple, or ``(None, 0)`` when no child has a height above 0.
    """
    deepest = (None, 0)

    for candidate in prop_graph.children:
        candidate_height = get_tree_height(candidate, edge_type)
        if candidate_height > deepest[1]:
            deepest = (candidate, candidate_height)

    return deepest


def get_num_cascade(node: tweet_node, edge_type="retweet"):
    """Number of direct children of ``node`` for the given edge type.

    ``"retweet"`` and ``"reply"`` select the matching child collection; any other value
    selects ``children``.
    """
    if edge_type == "retweet":
        cascades = node.retweet_children
    elif edge_type == "reply":
        cascades = node.reply_children
    else:
        cascades = node.children

    return len(cascades)


def get_temp_num_cascade(node: tweet_node, edge_type="retweet", max_time=None):
    """Number of direct children of ``node`` created no later than ``max_time``.

    Args:
        node: Node whose direct children are counted.
        edge_type: ``"retweet"`` or ``"reply"`` select the matching child collection; any other
            value selects ``children``.
        max_time: Upper bound (inclusive) on ``created_time``. Defaults to the current time,
            evaluated on every call rather than once when the function is defined.

    Returns:
        The count of qualifying children.
    """
    if max_time is None:
        max_time = time.time()

    if edge_type == "retweet":
        children = node.retweet_children
    elif edge_type == "reply":
        children = node.reply_children
    else:
        children = node.children

    return sum(1 for child in children if child.created_time <= max_time)


def get_node_count_deepest_cascade(news_graphs: tweet_node, edge_type):
    """Node count of the deepest cascade of every graph in ``news_graphs``.

    NOTE(review): the deepest cascade is selected with the default edge type of
    ``get_post_tweet_deepest_cascade``; ``edge_type`` only affects the counting — confirm
    this is intended.

    Returns:
        A list with one node count per graph, in input order.
    """
    return [get_nodes_count(deepest_node, edge_type)
            for deepest_node, _ in map(get_post_tweet_deepest_cascade, news_graphs)]


def get_max_outdegree(node: tweet_node, edge_type="retweet"):
    """Largest out-degree found in the subtree of ``node`` along ``edge_type``.

    The news root node itself does not count with its own degree.

    Returns:
        The maximum number of children of any counted node, or 0 for a missing node.
    """
    if node is None:
        return 0

    if edge_type == "retweet":
        children = node.retweet_children
    elif edge_type == "reply":
        children = node.reply_children
    else:
        children = node.children

    own_degree = 0 if node.node_type == NEWS_ROOT_NODE else len(children)

    return max([own_degree] + [get_max_outdegree(child, edge_type) for child in children])


def get_max_out_degree_node(node: tweet_node, edge_type=RETWEET_EDGE):
    """Find the node with the largest out-degree in the subtree of ``node``.

    The news root node itself is never selected. On ties the node encountered first wins.

    Args:
        node: Root of the subtree to search; may be None.
        edge_type: ``"retweet"`` or ``"reply"`` select the matching child collection; any other
            value selects ``children``.

    Returns:
        A ``(node, out_degree)`` tuple; ``(None, 0)`` when ``node`` is None or nothing
        qualifies.
    """
    if node is None:
        # Always hand back a 2-tuple: every caller unpacks the result, so a bare None would
        # raise a TypeError there.
        return None, 0

    if edge_type == "retweet":
        children = node.retweet_children
    elif edge_type == "reply":
        children = node.reply_children
    else:
        children = node.children

    if node.node_type == NEWS_ROOT_NODE:
        max_outdegree_node, max_out_degree = None, 0
    else:
        max_outdegree_node, max_out_degree = node, len(children)

    for child in children:
        child_max_out_degree_node, child_max_out_degree = get_max_out_degree_node(child, edge_type)
        if child_max_out_degree > max_out_degree:
            max_out_degree = child_max_out_degree
            max_outdegree_node = child_max_out_degree_node

    return max_outdegree_node, max_out_degree


def get_target_node_level(root_node: tweet_node, target_node, level=0):
    """Depth at which a node with the target's ``tweet_id`` is found below ``root_node``.

    Children are searched depth-first in iteration order; the first non-zero result wins.

    Returns:
        ``level`` plus the number of edges down to the match, or 0 when the target is not
        found (0 is also the result when the root itself matches at level 0).
    """
    if root_node is None:
        return 0

    if root_node.tweet_id == target_node.tweet_id:
        return level

    # Lazy generator: subtrees after the first hit are not searched.
    child_levels = (get_target_node_level(child, target_node, level + 1) for child in root_node.children)
    return next((found_level for found_level in child_levels if found_level != 0), 0)


def get_depth_of_max_degree_node(prop_graph, edge_type=RETWEET_EDGE):
    """Depth of the node with the largest out-degree, or 0 when there is no such node."""
    busiest_node, _ = get_max_out_degree_node(prop_graph, edge_type)

    if busiest_node is not None:
        return get_target_node_level(prop_graph, busiest_node, 0)

    return 0


def get_max_out_degree_depths(prop_graphs, edge_type=RETWEET_EDGE):
    """Depth of the max-out-degree node for every graph, in input order."""
    return [get_depth_of_max_degree_node(news_node, edge_type) for news_node in prop_graphs]


def get_tree_height(node, edge_type="retweet"):
    """Height (in nodes) of the subtree rooted at ``node`` along ``edge_type``.

    ``"retweet"`` and ``"reply"`` select the matching child collection; any other value
    selects ``children``.

    Returns:
        0 for a missing node, 1 for a leaf, otherwise one more than the tallest child subtree.
    """
    if node is None:
        return 0

    if edge_type == "retweet":
        children = node.retweet_children
    elif edge_type == "reply":
        children = node.reply_children
    else:
        children = node.children

    return 1 + max((get_tree_height(child, edge_type) for child in children), default=0)


def get_nodes_count(node: tweet_node, edge_type="retweet"):
    """Number of nodes in the subtree rooted at ``node`` along ``edge_type``.

    ``"retweet"`` and ``"reply"`` select the matching child collection; any other value
    selects ``children``.

    Returns:
        0 for a missing node, otherwise 1 plus the counts of all child subtrees.
    """
    if node is None:
        return 0

    if edge_type == "retweet":
        children = node.retweet_children
    elif edge_type == "reply":
        children = node.reply_children
    else:
        children = node.children

    return sum(get_nodes_count(child, edge_type) for child in children) + 1


def get_temporal_nodes_count(node: tweet_node, edge_type="retweet", max_time=None):
    """Number of nodes in the subtree of ``node`` that existed at ``max_time``.

    A node created after ``max_time`` is dropped together with its whole subtree; nodes
    without a creation time are always counted.

    Args:
        node: Root of the subtree; may be None.
        edge_type: ``"retweet"`` or ``"reply"`` select the matching child collection; any other
            value selects ``children``.
        max_time: Upper bound (inclusive) on ``created_time``. Defaults to the current time,
            evaluated on every call rather than once when the function is defined.

    Returns:
        The number of counted nodes (0 for a missing or too-recent root).
    """
    if max_time is None:
        max_time = time.time()

    if node is None or (node.created_time is not None and node.created_time > max_time):
        return 0

    if edge_type == "retweet":
        children = node.retweet_children
    elif edge_type == "reply":
        children = node.reply_children
    else:
        children = node.children

    return 1 + sum(get_temporal_nodes_count(child, edge_type, max_time) for child in children)


def get_node_size_by_time(prop_graphs: list, edge_type: str, time_interval_sec: list):
    """Node counts of every graph at several time offsets after its first post.

    Returns:
        One list per graph holding, for each offset in ``time_interval_sec``, the result of
        ``get_temporal_nodes_count`` at ``first_post_time + offset``.
    """
    sizes_per_graph = []

    for news_node in prop_graphs:
        start_time = get_first_post_time(news_node)
        sizes_per_graph.append([get_temporal_nodes_count(news_node, edge_type, start_time + offset)
                                for offset in time_interval_sec])

    return sizes_per_graph


def get_temporal_tree_height(node: tweet_node, edge_type="retweet", max_time=None):
    """Height (in nodes) of the subtree of ``node`` as it existed at ``max_time``.

    A node created after ``max_time`` is dropped together with its whole subtree; nodes
    without a creation time are always kept.

    Args:
        node: Root of the subtree; may be None.
        edge_type: ``"retweet"`` or ``"reply"`` select the matching child collection; any other
            value selects ``children``.
        max_time: Upper bound (inclusive) on ``created_time``. Defaults to the current time,
            evaluated on every call rather than once when the function is defined.

    Returns:
        The height of the remaining tree (0 for a missing or too-recent root).
    """
    if max_time is None:
        max_time = time.time()

    if node is None or (node.created_time is not None and node.created_time > max_time):
        return 0

    if edge_type == "retweet":
        children = node.retweet_children
    elif edge_type == "reply":
        children = node.reply_children
    else:
        children = node.children

    max_child_height = 0
    for child in children:
        max_child_height = max(max_child_height, get_temporal_tree_height(child, edge_type, max_time))

    return max_child_height + 1


def get_num_cascades_by_time(prop_graphs: list, edge_type: str, time_interval_sec: list):
    """Cascade counts of every graph at several time offsets after its first post.

    Returns:
        One list per graph holding, for each offset in ``time_interval_sec``, the result of
        ``get_temp_num_cascade`` at ``first_post_time + offset``.
    """
    cascades_per_graph = []

    for news_node in prop_graphs:
        start_time = get_first_post_time(news_node)
        cascades_per_graph.append([get_temp_num_cascade(news_node, edge_type, start_time + offset)
                                   for offset in time_interval_sec])

    return cascades_per_graph


def get_tree_heights(news_graphs: list, edge_type):
    """Tree height of every graph along ``edge_type``, in input order."""
    return [get_tree_height(news_node, edge_type) for news_node in news_graphs]


def analyze_height(news_graphs: list, edge_type):
    """Print the maximum, minimum and average tree height of the given graphs."""
    heights = get_tree_heights(news_graphs, edge_type)

    print("----HEIGHT-----")

    for label, summarize in (("max", max), ("min", min), ("avg", np.mean)):
        print(label, summarize(heights))


def get_max_outdegrees(news_graphs: list, edge_type):
    """Maximum out-degree of every graph along ``edge_type``, in input order."""
    return [get_max_outdegree(news_node, edge_type) for news_node in news_graphs]


def analyze_max_outdegree(news_graphs: list, edge_type):
    """Print the maximum, minimum and average of the graphs' maximum out-degrees."""
    max_outdegrees = get_max_outdegrees(news_graphs, edge_type)

    print("-----MAX - OUT DEGREE -----")

    for label, summarize in (("max", max), ("min", min), ("avg", np.mean)):
        print(label, summarize(max_outdegrees))


def get_prop_graps_cascade_num(news_graphs: list, edge_type):
    """Return ``get_num_cascade(graph, edge_type)`` for each graph, in input order."""
    return [get_num_cascade(news_node, edge_type) for news_node in news_graphs]


def analyze_cascade(news_graphs: list, edge_type):
    """Print the max, min and mean number of cascades over ``news_graphs``."""
    cascades_per_graph = get_prop_graps_cascade_num(news_graphs, edge_type)

    print("-----CASCADE-----")
    for label, reducer in (("max", max), ("min", min), ("avg", np.mean)):
        print(label, reducer(cascades_per_graph))


def get_prop_graphs_node_counts(news_graphs: list, edge_type):
    """Return ``get_nodes_count(graph, edge_type)`` for each graph, in input order."""
    return [get_nodes_count(news_node, edge_type) for news_node in news_graphs]


def analyze_node_count(news_graphs: list, edge_type):
    """Print the max, min and mean node count over ``news_graphs``."""
    sizes = get_prop_graphs_node_counts(news_graphs, edge_type)

    print("----NODE SIZE-----")

    for label, reducer in (("max", max), ("min", min), ("avg", np.mean)):
        print(label, reducer(sizes))


def get_height_by_time(prop_graphs: list, edge_type: str, time_interval_sec: list):
    """For every graph, compute the temporal tree height at each offset after its first post.

    Returns a list with one inner list per graph; the inner list has one height per
    entry of ``time_interval_sec``, in the same order.
    """

    def _heights_for(news_node):
        # The first-post time is looked up once per graph and shared by all offsets.
        start_time = get_first_post_time(news_node)
        return [get_temporal_tree_height(news_node, edge_type, start_time + offset)
                for offset in time_interval_sec]

    return [_heights_for(news_node) for news_node in prop_graphs]


def analyze_height_by_time(prop_graphs: list, edge_type: str, time_interval_sec: list):
    """Print min/max/mean temporal tree height across graphs for each time limit."""
    height_matrix = np.array(
        [np.array(row) for row in get_height_by_time(prop_graphs, edge_type, time_interval_sec)])

    report_lines = (("Min height : {}", np.min),
                    ("Max height : {}", np.max),
                    ("Mean height : {}", np.mean))

    for column, time_limit_sec in enumerate(time_interval_sec):
        # One column holds the heights of all graphs at this time limit.
        heights_at_limit = height_matrix[:, column]
        print("Time limit: {}".format(time_limit_sec))
        for template, reducer in report_lines:
            print(template.format(reducer(heights_at_limit)))
        print(flush=True)


def analyze_cascade_num_by_time(prop_graphs: list, edge_type: str, time_interval_sec: list):
    """Print min/max/mean number of cascades across graphs for each time limit."""
    cascade_matrix = np.array(
        [np.array(row) for row in get_num_cascades_by_time(prop_graphs, edge_type, time_interval_sec)])

    report_lines = (("Min num cascade : {}", np.min),
                    ("Max num cascade : {}", np.max),
                    ("Mean num cascade : {}", np.mean))

    for column, time_limit_sec in enumerate(time_interval_sec):
        # One column holds the cascade counts of all graphs at this time limit.
        cascades_at_limit = cascade_matrix[:, column]
        print("Time limit: {}".format(time_limit_sec))
        for template, reducer in report_lines:
            print(template.format(reducer(cascades_at_limit)))
        print(flush=True)


def analyze_node_size_by_time(prop_graphs: list, edge_type: str, time_interval_sec: list):
    """Print min/max/mean node size across graphs for each time limit."""
    size_matrix = np.array(
        [np.array(row) for row in get_node_size_by_time(prop_graphs, edge_type, time_interval_sec)])

    report_lines = (("Min node size : {}", np.min),
                    ("Max node size : {}", np.max),
                    ("Mean node size : {}", np.mean))

    for column, time_limit_sec in enumerate(time_interval_sec):
        # One column holds the node sizes of all graphs at this time limit.
        sizes_at_limit = size_matrix[:, column]
        print("Time limit: {}".format(time_limit_sec))
        for template, reducer in report_lines:
            print(template.format(reducer(sizes_at_limit)))
        print(flush=True)


def get_first_post_time(node: tweet_node):
    """Return the earliest ``created_time`` among the direct children of ``node``.

    Children whose ``created_time`` is ``None`` are skipped, because comparing
    ``None`` with a number raises ``TypeError``. If no child carries a timestamp
    (or there are no children) the current time is returned.
    """
    first_post_time = time.time()

    for child in node.children:
        if child.created_time is not None:
            first_post_time = min(first_post_time, child.created_time)

    return first_post_time


def get_num_of_cascades_with_retweets(root_node: tweet_node):
    """Count the direct retweet children of ``root_node`` that themselves have a retweet child."""
    return sum(1 for cascade_root in root_node.retweet_children
               if len(cascade_root.retweet_children) > 0)


def get_prop_graphs_num_of_cascades_with_retweets(prop_graphs, edge_type=RETWEET_EDGE):
    """Apply ``get_num_of_cascades_with_retweets`` to every graph; ``edge_type`` is not used."""
    per_graph_counts = get_sample_feature_value(prop_graphs, get_num_of_cascades_with_retweets)
    return per_graph_counts


def get_fraction_of_cascades_with_retweets(root_node: tweet_node):
    """Return the fraction of direct retweet children of ``root_node`` that have a retweet child.

    Returns 0.0 when ``root_node`` has no retweet children, instead of raising
    ``ZeroDivisionError``.
    """
    total_cascades = len(root_node.retweet_children)
    if total_cascades == 0:
        return 0.0

    cascade_with_retweet = sum(1 for node in root_node.retweet_children
                               if len(node.retweet_children) > 0)

    return cascade_with_retweet / total_cascades


def get_prop_graphs_fraction_of_cascades_with_retweets(prop_graphs, edge_type=RETWEET_EDGE):
    """Apply ``get_fraction_of_cascades_with_retweets`` to every graph; ``edge_type`` is not used."""
    per_graph_fractions = get_sample_feature_value(prop_graphs, get_fraction_of_cascades_with_retweets)
    return per_graph_fractions


def get_num_of_cascades_with_replies(root_node: tweet_node):
    """Count the direct reply children of ``root_node`` that themselves have a reply child."""
    return sum(1 for cascade_root in root_node.reply_children
               if len(cascade_root.reply_children) > 0)


def get_prop_graphs_num_of_cascades_with_replies(prop_graphs, edge_type=RETWEET_EDGE):
    """Apply ``get_num_of_cascades_with_replies`` to every graph; ``edge_type`` is not used."""
    per_graph_counts = get_sample_feature_value(prop_graphs, get_num_of_cascades_with_replies)
    return per_graph_counts


def get_fraction_of_cascades_with_replies(root_node: tweet_node):
    """Return the fraction of direct reply children of ``root_node`` that have a reply child.

    Returns 0.0 when ``root_node`` has no reply children, instead of raising
    ``ZeroDivisionError``.
    """
    total_cascades = len(root_node.reply_children)
    if total_cascades == 0:
        return 0.0

    cascade_with_replies = sum(1 for node in root_node.reply_children
                               if len(node.reply_children) > 0)

    return cascade_with_replies / total_cascades


def get_users_in_network(prop_graph: tweet_node, edge_type=None):
    """Collect the ``user_id`` of every node reachable below ``prop_graph``.

    ``edge_type`` selects which child list is followed: ``RETWEET_EDGE`` uses
    ``retweet_children``, ``REPLY_EDGE`` uses ``reply_children``, anything else
    uses ``children``. The root's own user id is not included, ``None`` ids are
    skipped, and duplicates are kept.
    """
    pending = queue.Queue()
    pending.put(prop_graph)

    collected_users = []

    while not pending.empty():
        current = pending.get()

        if edge_type == RETWEET_EDGE:
            next_nodes = current.retweet_children
        elif edge_type == REPLY_EDGE:
            next_nodes = current.reply_children
        else:
            next_nodes = current.children

        for child in next_nodes:
            pending.put(child)
            if child.user_id is None:
                continue
            collected_users.append(child.user_id)

    return collected_users


def get_users_replying_in_prop_graph(prop_graph: tweet_node):
    """Collect user ids of ``REPLY_NODE`` nodes reached through ``reply_children``.

    ``None`` user ids are skipped; duplicates are kept.
    """
    pending = queue.Queue()
    pending.put(prop_graph)

    replying_users = []

    while not pending.empty():
        current = pending.get()

        for child in current.reply_children:
            pending.put(child)
            if child.node_type != REPLY_NODE or child.user_id is None:
                continue
            replying_users.append(child.user_id)

    return replying_users


def get_users_retweeting_in_prop_graph(prop_graph: tweet_node):
    """Collect user ids of ``RETWEET_NODE`` nodes reached through ``retweet_children``.

    ``None`` user ids are skipped; duplicates are kept.
    """
    pending = queue.Queue()
    pending.put(prop_graph)

    retweeting_users = []

    while not pending.empty():
        current = pending.get()

        for child in current.retweet_children:
            pending.put(child)
            if child.node_type != RETWEET_NODE or child.user_id is None:
                continue
            retweeting_users.append(child.user_id)

    return retweeting_users


def get_user_names_retweeting_in_prop_graph(prop_graph: tweet_node):
    """Collect ``user_name`` of ``RETWEET_NODE`` nodes reached through ``retweet_children``.

    ``None`` names are skipped; duplicates are kept.
    """
    pending = queue.Queue()
    pending.put(prop_graph)

    retweeting_names = []

    while not pending.empty():
        current = pending.get()

        for child in current.retweet_children:
            pending.put(child)
            if child.node_type != RETWEET_NODE or child.user_name is None:
                continue
            retweeting_names.append(child.user_name)

    return retweeting_names


def get_num_user_retweet_and_reply(prop_graph: tweet_node):
    """Return how many distinct users appear both as retweeters and as repliers."""
    retweeters = set(get_users_retweeting_in_prop_graph(prop_graph))
    repliers = set(get_users_replying_in_prop_graph(prop_graph))

    users_doing_both = retweeters & repliers
    return len(users_doing_both)


def get_ratio_of_retweet_to_reply(prop_graph: tweet_node):
    """Return (distinct retweeters + 1) / (distinct repliers + 1).

    The +1 on both sides keeps the ratio defined when there are no repliers.
    """
    num_retweeters = len(set(get_users_retweeting_in_prop_graph(prop_graph)))
    num_repliers = len(set(get_users_replying_in_prop_graph(prop_graph)))

    return (num_retweeters + 1) / (num_repliers + 1)


def get_prop_graphs_num_user_retweet_and_reply(prop_graphs, edge_type=None):
    """Apply ``get_num_user_retweet_and_reply`` to every graph; ``edge_type`` is not used."""
    per_graph_counts = get_sample_feature_value(prop_graphs, get_num_user_retweet_and_reply)
    return per_graph_counts


def get_prop_graphs_ratio_of_retweet_to_reply(prop_graphs, edge_type=None):
    """Apply ``get_ratio_of_retweet_to_reply`` to every graph; ``edge_type`` is not used."""
    per_graph_ratios = get_sample_feature_value(prop_graphs, get_ratio_of_retweet_to_reply)
    return per_graph_ratios


def get_unique_users_in_graph(prop_graph: tweet_node, edge_type=None):
    """Return the number of distinct user ids found by ``get_users_in_network``."""
    distinct_users = set(get_users_in_network(prop_graph, edge_type))
    return len(distinct_users)


def get_fraction_of_unique_users(prop_graph: tweet_node, edge_type=None):
    """Return distinct users / total user occurrences found by ``get_users_in_network``.

    When the network yields no users the original diagnostic line is printed and 0
    is returned. Any other error now propagates instead of being hidden by a bare
    ``except``.
    """
    user_list = get_users_in_network(prop_graph, edge_type)

    if len(user_list) == 0:
        # Empty network: the ratio is undefined, so keep the historical fallback value.
        print("Exception in fraction of unique users")
        return 0

    return len(set(user_list)) / len(user_list)


def get_num_bot_users(prop_graph: tweet_node):
    """Count retweet nodes whose ``botometer_score`` is above 0.5.

    Only ``RETWEET_NODE`` nodes with a non-``None`` ``user_id`` reached through
    ``retweet_children`` are considered. Each matching node counts once, so a
    user with several retweets is counted several times.
    """
    pending = queue.Queue()
    pending.put(prop_graph)

    bot_count = 0

    while not pending.empty():
        current = pending.get()

        for child in current.retweet_children:
            pending.put(child)
            if child.node_type != RETWEET_NODE or child.user_id is None:
                continue
            # A missing (falsy) score never counts as a bot.
            if child.botometer_score and child.botometer_score > 0.5:
                bot_count += 1

    return bot_count


def get_fraction_of_bot_users_retweeting(prop_graph: tweet_node):
    """Return the smoothed share of retweet nodes whose ``botometer_score`` exceeds 0.5.

    Both the bot and the human counters start at 1, so a graph with no scored
    retweets yields 0.5. Only ``RETWEET_NODE`` nodes with a non-``None``
    ``user_id`` and a truthy ``botometer_score`` are counted.
    """
    pending = queue.Queue()
    pending.put(prop_graph)

    # Counters start at 1 so the ratio is always defined.
    bots = 1
    humans = 1

    while not pending.empty():
        current = pending.get()

        for child in current.retweet_children:
            pending.put(child)
            if child.node_type != RETWEET_NODE or child.user_id is None:
                continue
            # NOTE(review): a score of exactly 0 is falsy and is therefore skipped rather
            # than counted as human - confirm this is intended.
            if not child.botometer_score:
                continue
            if child.botometer_score > 0.5:
                bots += 1
            else:
                humans += 1

    return bots / (humans + bots)


def get_prop_graphs_num_bot_users_retweeting(prop_graphs: tweet_node, edge_type=None):
    """Apply ``get_num_bot_users`` to every graph in ``prop_graphs``; ``edge_type`` is not used.

    The former ``global user_id_bot_score_dict`` declaration was removed: the name
    was never read or assigned in this function, so the statement had no effect.
    """
    return get_sample_feature_value(prop_graphs, get_num_bot_users)


def get_prop_graphs_fraction_of_bot_users_retweeting(prop_graphs: tweet_node, edge_type=None):
    """Apply ``get_fraction_of_bot_users_retweeting`` to every graph; ``edge_type`` is not used."""
    per_graph_fractions = get_sample_feature_value(prop_graphs, get_fraction_of_bot_users_retweeting)
    return per_graph_fractions


def get_breadth_at_each_level(prop_graph, edge_type=RETWEET_EDGE):
    """Return the maximum number of nodes found on any single level of the tree.

    The root level (one node) is included. ``edge_type`` selects the child list:
    ``RETWEET_EDGE`` uses ``retweet_children``, ``REPLY_EDGE`` uses
    ``reply_children``, anything else uses ``children``.
    """
    frontier = queue.Queue()
    frontier.put(prop_graph)

    widest_level = 0

    while not frontier.empty():
        # Everything queued right now belongs to the same level.
        level_size = frontier.qsize()
        widest_level = max(widest_level, level_size)

        for _ in range(level_size):
            current = frontier.get()

            if edge_type == RETWEET_EDGE:
                next_nodes = current.retweet_children
            elif edge_type == REPLY_EDGE:
                next_nodes = current.reply_children
            else:
                next_nodes = current.children

            for child in next_nodes:
                frontier.put(child)

    return widest_level


def get_prop_graphs_max_breadth(prop_graphs, edge_type=RETWEET_EDGE):
    """Return the maximum level breadth of every graph for the requested ``edge_type``.

    ``edge_type`` is forwarded to ``get_breadth_at_each_level``. Previously it was
    dropped, so the retweet breadth was returned whatever the caller asked for.
    The default still gives the retweet breadth.
    """
    return [get_breadth_at_each_level(graph, edge_type) for graph in prop_graphs]


def get_prop_graphs_num_unique_users(prop_graphs, edge_type=RETWEET_EDGE):
    """Return ``get_unique_users_in_graph(graph, edge_type)`` for each graph, in input order."""
    return [get_unique_users_in_graph(graph, edge_type) for graph in prop_graphs]


def get_prop_graphs_fraction_of_unique_users(prop_graphs, edge_type=RETWEET_EDGE):
    """Return ``get_fraction_of_unique_users(graph, edge_type)`` for each graph, in input order."""
    return [get_fraction_of_unique_users(graph, edge_type) for graph in prop_graphs]


def get_prop_graphs_fraction_of_cascades_with_replies(prop_graphs, edge_type=RETWEET_EDGE):
    """Apply ``get_fraction_of_cascades_with_replies`` to every graph; ``edge_type`` is not used."""
    per_graph_fractions = get_sample_feature_value(prop_graphs, get_fraction_of_cascades_with_replies)
    return per_graph_fractions


def get_prop_graphs_min_time_to_reach_level_1(news_graphs: list, edge_type=None):
    """Apply ``get_min_time_to_reach_level_1`` to every graph; ``edge_type`` is not used."""
    per_graph_times = get_sample_feature_value(news_graphs, get_min_time_to_reach_level_1)
    return per_graph_times


def get_prop_graphs_min_time_to_reach_level_2(news_graphs: list, edge_type=None):
    """Apply ``get_min_time_to_reach_level_2`` to every graph; ``edge_type`` is not used."""
    per_graph_times = get_sample_feature_value(news_graphs, get_min_time_to_reach_level_2)
    return per_graph_times


def get_min_time_to_reach_level_1(new_graph: tweet_node):
    """Shortcut for ``get_min_time_to_reach_level`` with a target depth of 1."""
    target_depth = 1
    return get_min_time_to_reach_level(new_graph, target_depth)


def get_min_time_to_reach_level_2(news_graph: tweet_node):
    """Shortcut for ``get_min_time_to_reach_level`` with a target depth of 2."""
    target_depth = 2
    return get_min_time_to_reach_level(news_graph, target_depth)


def get_min_time_to_reach_level(new_graph: tweet_node, target_depth):
    """Return the mean, over cascades, of the delay until depth ``target_depth`` is first reached.

    Each retweet child of ``new_graph`` is treated as a cascade root at depth 0.
    For a cascade that has nodes at ``target_depth``, the delay is the smallest
    ``created_time`` at that depth minus the cascade root's ``created_time``.
    Cascades that never reach the depth are ignored; 0 is returned if none do.
    """
    delays = []
    for post_node in new_graph.retweet_children:
        root_time = post_node.created_time
        times_at_depth = dfs_traverse(post_node, 0, target_depth)
        if len(times_at_depth) > 0:
            delays.append(min(times_at_depth) - root_time)

    if len(delays) == 0:
        return 0
    return np.mean(delays)


def get_unique_users_untill_level(new_graph: tweet_node, target_depth):
    """Return the set of values collected by ``dfs_traverse_get_users`` down to ``target_depth``.

    The previous version passed two arguments to the three-parameter helper, so it
    always raised ``TypeError``, and it discarded the helper's result. The traversal
    now starts at level 0 and the de-duplicated result is returned. An empty set is
    returned when the helper yields nothing.
    """
    users = dfs_traverse_get_users(new_graph, 0, target_depth)
    return set(users) if users else set()


def dfs_traverse(node: tweet_node, level: int, target: int):
    """Collect ``created_time`` of the nodes exactly ``target - level`` retweet hops below ``node``.

    Returns ``None`` when ``level`` is already beyond ``target``, a one-element list
    when ``node`` itself sits at the target level, and otherwise the concatenation
    of the results from its retweet children (possibly empty).
    """
    if level > target:
        return None

    if level == target:
        return [node.created_time]

    collected_times = []
    for child in node.retweet_children:
        deeper_times = dfs_traverse(child, level + 1, target)
        if deeper_times:
            collected_times.extend(deeper_times)

    return collected_times


def get_num_unique_users_under_level_2(node: tweet_node, edge_type=None):
    """Return the number of distinct values collected by ``dfs_traverse_get_users`` down to level 2.

    The collected list is de-duplicated before counting. Previously its raw length
    was returned, so repeated entries were counted more than once. ``edge_type``
    is not used.
    """
    return len(set(dfs_traverse_get_users(node, 0, 2)))


def get_num_unique_users_under_level_4(node: tweet_node, edge_type=None):
    """Return the number of distinct values collected by ``dfs_traverse_get_users`` down to level 4.

    The collected list is de-duplicated before counting. Previously its raw length
    was returned, so repeated entries were counted more than once. ``edge_type``
    is not used.
    """
    return len(set(dfs_traverse_get_users(node, 0, 4)))


def get_prop_graphs_num_unique_user_under_level_2(prop_graphs, edge_type=RETWEET_EDGE):
    """Apply ``get_num_unique_users_under_level_2`` to every graph; ``edge_type`` is not used."""
    per_graph_counts = get_sample_feature_value(prop_graphs, get_num_unique_users_under_level_2)
    return per_graph_counts


def get_prop_graphs_num_unique_user_under_level_4(prop_graphs, edge_type=RETWEET_EDGE):
    """Apply ``get_num_unique_users_under_level_4`` to every graph; ``edge_type`` is not used."""
    per_graph_counts = get_sample_feature_value(prop_graphs, get_num_unique_users_under_level_4)
    return per_graph_counts


def dfs_traverse_get_users(node: tweet_node, level: int, target: int):
    """Collect user ids of ``node`` and its retweet descendants down to level ``target``.

    Args:
        node: subtree root; ``level`` is the depth assigned to it.
        level: depth of ``node`` in the traversal.
        target: deepest level (inclusive) whose users are collected.

    Returns:
        ``None`` when ``level`` is beyond ``target``; otherwise a list of user ids
        in depth-first order. Duplicates are kept and ``None`` ids are skipped, as
        in the other user collectors in this file.
    """
    if level > target:
        return None

    result = list()

    if node.user_id is not None:
        result.append(node.user_id)

    for child in node.retweet_children:
        # Recurse into this function. The earlier call to ``dfs_traverse`` returned
        # creation timestamps of target-level nodes instead of user ids.
        child_users = dfs_traverse_get_users(child, level + 1, target)
        if child_users:
            result.extend(child_users)

    return result


def get_all_structural_features(news_graphs, micro_features, macro_features):
    """Build the structural feature matrix for ``news_graphs``.

    Macro features are computed with ``RETWEET_EDGE`` and micro features with
    ``REPLY_EDGE``. Every feature function is called as
    ``function(news_graphs, edge_type)`` and must return one value per graph.

    Returns:
        The transposed array of all feature lists, i.e. one row per graph.
    """
    all_features = []

    if macro_features:
        # Use the list-level ``get_prop_graphs_*`` wrappers here: the per-node functions
        # ``get_num_of_cascades_with_retweets`` / ``get_fraction_of_cascades_with_retweets``
        # take a single root node and raise TypeError when called with (graphs, edge_type).
        retweet_function_references = [get_tree_heights, get_prop_graphs_node_counts, get_prop_graps_cascade_num,
                                       get_max_outdegrees, get_prop_graphs_num_of_cascades_with_retweets,
                                       get_prop_graphs_fraction_of_cascades_with_retweets]
        for function_ref in retweet_function_references:
            all_features.append(function_ref(news_graphs, RETWEET_EDGE))

    if micro_features:
        reply_function_references = [get_tree_heights, get_prop_graphs_node_counts, get_max_outdegrees]
        for function_ref in reply_function_references:
            all_features.append(function_ref(news_graphs, REPLY_EDGE))

    return np.transpose(get_numpy_array(all_features))


class StructureFeatureHelper(BaseFeatureHelper):
    """Feature helper for the structural ("struct") feature group.

    Micro features are computed on reply edges and macro features on retweet
    edges. Every referenced feature function is called as
    ``function(prop_graphs, edge_type)``.
    """

    def get_feature_group_name(self):
        """Return the tag used for this group in cache file names."""
        return "struct"

    def get_micro_feature_method_references(self):
        """Return the feature functions evaluated with ``REPLY_EDGE``."""
        method_refs = [get_tree_heights, get_prop_graphs_node_counts, get_max_outdegrees,
                       get_prop_graphs_num_of_cascades_with_replies,
                       get_prop_graphs_fraction_of_cascades_with_replies]

        return method_refs

    def get_micro_feature_method_names(self):
        """Return display names, aligned with the micro feature functions."""
        feature_names = ["Micro - Tree depth", "Micro - No of nodes", "Micro - Maximum out degree",
                         "No. of cascades with replies", "Fraction of cascades with replies"]
        return feature_names

    def get_micro_feature_short_names(self):
        """Return short codes, aligned with the micro feature functions."""
        feature_names = ["S10", "S11", "S12", "S13", "S14"]
        return feature_names

    def get_macro_feature_method_references(self):
        """Return the feature functions evaluated with ``RETWEET_EDGE``."""
        method_refs = [get_tree_heights, get_prop_graphs_node_counts, get_max_outdegrees, get_prop_graps_cascade_num,
                       get_max_out_degree_depths,
                       get_prop_graphs_num_of_cascades_with_retweets,
                       get_prop_graphs_fraction_of_cascades_with_retweets,
                       get_prop_graphs_num_bot_users_retweeting,
                       get_prop_graphs_fraction_of_bot_users_retweeting,
                       ]

        return method_refs

    def get_macro_feature_method_names(self):
        """Return display names, aligned with the macro feature functions."""
        feature_names = ["Macro - Tree depth",
                         "Macro - No of nodes",
                         "Macro - Maximum out degree",
                         "Macro - No of cascades",
                         "Macro - Max out degree node's level",
                         "No. of cascades with retweets",
                         "Fraction of cascades with retweets",
                         "No. of bot users retweeting",
                         "Fraction of bot user retweeting"]

        return feature_names

    # Class-level attribute kept for compatibility; nothing in this class reads it.
    feature_names = []

    def get_macro_feature_short_names(self):
        """Return short codes, aligned with the macro feature functions."""
        feature_names = ["S1", "S2", "S3", "S4", "S5", "S6", "S7", "S8", "S9"]
        return feature_names

    def get_features_array(self, prop_graphs, micro_features, macro_features, news_source=None, label=None,
                           file_dir="data/features", use_cache=False):
        """Compute (or load from cache) the structural feature matrix.

        Args:
            prop_graphs: propagation graphs handed to every feature function.
            micro_features: include the reply-edge features when truthy.
            macro_features: include the retweet-edge features when truthy.
            news_source, label: tags used to build the cache file name.
            file_dir: directory of the cache file.
            use_cache: when truthy and the cache file exists, return its content
                without recomputing.

        Returns:
            Array with one row per graph and one column per feature (micro columns
            first). The freshly computed array is also pickled to the cache file.
        """
        all_features = []

        file_name = self.get_dump_file_name(news_source, micro_features, macro_features, label, file_dir)
        data_file = Path(file_name)

        if use_cache and data_file.is_file():
            # NOTE: unpickling executes arbitrary code; only load cache files written
            # by this pipeline.
            with open(file_name, "rb") as cache_file:
                return pickle.load(cache_file)

        if micro_features:
            for function_ref in self.get_micro_feature_method_references():
                all_features.append(function_ref(prop_graphs, REPLY_EDGE))

        if macro_features:
            for function_ref in self.get_macro_feature_method_references():
                all_features.append(function_ref(prop_graphs, RETWEET_EDGE))

        feature_array = np.transpose(get_numpy_array(all_features))

        # Make sure the cache directory exists, and close the file deterministically
        # so the pickle is fully flushed to disk.
        data_file.parent.mkdir(parents=True, exist_ok=True)
        with open(file_name, "wb") as cache_file:
            pickle.dump(feature_array, cache_file)

        return feature_array


NameError: ignored

#STAT TEST

In [14]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats


def perform_t_test(samples1, samples2):
    """Run an equal-variance two-sample t-test and print the statistic and p-value."""
    outcome = stats.ttest_ind(samples1, samples2, equal_var=True)
    t_val, p_val = outcome
    report = ("t-Statistic value : {}".format(t_val),
              "p - value : {}".format(p_val),
              "=====================================")
    for line in report:
        print(line)


def plot_normal_distributions(samples1, samples2):
    """Plot the fitted normal density of each sample set (first in red, second in blue).

    Each curve is the normal pdf parameterised by the sample mean and standard
    deviation, evaluated at the sorted sample values.

    Two defects of the earlier version are fixed here: the second curve used the
    cdf rather than the pdf, and both curves were evaluated on the unsorted samples
    but plotted against the sorted ones, pairing x and y values incorrectly.
    """
    sorted1 = sorted(samples1)
    sorted2 = sorted(samples2)

    fit1 = stats.norm.pdf(sorted1, np.mean(samples1), np.std(samples1))
    fit2 = stats.norm.pdf(sorted2, np.mean(samples2), np.std(samples2))

    plt.plot(sorted1, fit1, 'red')
    plt.plot(sorted2, fit2, 'blue')
    plt.show()


def get_box_plots(samples1, samples2, save_folder, title=None, file_name=None):
    """Draw a "Fake" vs "Real" box plot and save it as ``<save_folder>/<file_name>.png``.

    The plot title is ``file_name``. A bold superscript asterisk is appended when
    an equal-variance two-sample t-test on the two samples gives p <= 0.05.

    Args:
        samples1: values drawn as the "Fake" box.
        samples2: values drawn as the "Real" box.
        save_folder: directory the PNG is written to.
        title: accepted for call compatibility; not used by this function.
        file_name: used both as the plot title and as the PNG base name.
    """
    all_data = [samples1, samples2]
    labels = ['Fake', 'Real']

    font = {'family': 'normal',
            'weight': 'semibold',
            'size': 13}
    matplotlib.rc('font', **font)

    plt.tight_layout()
    # NOTE(review): this figure does not appear to be used or closed below - confirm
    # whether it can be dropped.
    plt.figure(figsize=(1.5, 4))

    fig = plt.figure(1, figsize=(1, 3), frameon=False)

    ax1 = fig.add_subplot(111)
    bplot1 = ax1.boxplot(all_data,
                         vert=True,  # vertical box alignment
                         patch_artist=True,  # fill with color
                         labels=labels,  # will be used to label x-ticks
                         showfliers=False,
                         positions=[0, 0.5])

    _, p_val = stats.ttest_ind(samples1, samples2, equal_var=True)

    if p_val > 0.05:
        ax1.set_title(file_name, fontdict={'fontweight': 'bold', 'fontsize': 16})
    else:
        # The mathtext suffix must be a raw string: the old r'' prefix only applied to
        # the empty literal, leaving '\m' as an invalid escape sequence.
        ax1.set_title(file_name + r' $\mathbf{^{*}}$', fontdict={'fontweight': 'bold', 'fontsize': 16})

    # fill with colors
    colors = ['pink', 'lightblue', 'lightgreen']
    for patch, color in zip(bplot1['boxes'], colors):
        patch.set_facecolor(color)

    fig.savefig('{}/{}.png'.format(save_folder, file_name))

    fig.show()
    plt.close()


def get_box_plots_mod(samples1, samples2, save_folder, file_name=None):
    """Draw a seaborn "Fake" vs "Real" box plot, save it as a PNG and show it.

    The figure is written to ``<save_folder>/<file_name>.png``. The title is
    ``file_name``, with a bold superscript asterisk appended when an
    equal-variance two-sample t-test on the two samples gives p <= 0.05.

    Args:
        samples1: values for the "Fake" column.
        samples2: values for the "Real" column; stacked with ``samples1`` into a
            two-column frame, so both are expected to have the same length.
        save_folder: directory the PNG is written to.
        file_name: plot title and PNG base name.
    """
    import seaborn as sns
    import matplotlib.pyplot as plt

    all_data = np.transpose(np.array([samples1, samples2]))
    labels = ['Fake', 'Real']
    df = pd.DataFrame(all_data, columns=labels)

    fig, ax = plt.subplots(figsize=(3, 3.5))

    my_pal = {"Fake": "pink", "Real": "lightblue", }

    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

    ax = sns.boxplot(data=df, width=0.3, palette=my_pal, showfliers=False)

    # Re-apply the face colours on whatever box artists seaborn exposes.
    colors = ['pink', 'lightblue']
    for patch, color in zip(ax.artists, colors):
        patch.set_facecolor(color)

    _, p_val = stats.ttest_ind(samples1, samples2, equal_var=True)

    if p_val > 0.05:
        plt.title(file_name, fontdict={'fontweight': 'bold', 'fontsize': 16})
    else:
        # The mathtext suffix must be a raw string: the old r'' prefix only applied to
        # the empty literal, leaving '\m' as an invalid escape sequence.
        plt.title(file_name + r' $\mathbf{^{*}}$', fontdict={'fontweight': 'bold', 'fontsize': 16})

    plt.savefig('{}/{}.png'.format(save_folder, file_name), bbox_inches="tight")

    plt.show()
    # The matplotlib-only variant that used to follow an unconditional ``return`` here
    # was unreachable and has been removed; ``get_box_plots`` provides that rendering.


if __name__ == "__main__":
    # Demo: violin plot of two random samples using the Fake/Real palette.
    import seaborn as sns

    all_data = np.transpose(np.array([np.random.rand(2000, ), np.random.rand(2000, )]))
    labels = ['Fake', 'Real']
    df = pd.DataFrame(all_data, columns=labels)
    my_pal = {"Fake": "pink", "Real": "lightblue", }

    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

    # The unused ``sns.load_dataset("tips")`` call was removed: it fetched a dataset
    # whose result was never read.
    # NOTE(review): ``showfliers`` is a box-plot option; confirm the installed seaborn
    # accepts it for ``violinplot``.
    ax = sns.violinplot(data=df, palette=my_pal, width=0.3, showfliers=False)

    plt.show()


#ANALYSIS UTIL

In [19]:
import errno
import os
import pickle
from abc import ABCMeta, abstractmethod
from pathlib import Path

import numpy as np
from sklearn.utils import resample


class BaseFeatureHelper(metaclass=ABCMeta):
    """Abstract base for a group of per-graph features.

    Subclasses supply the feature functions plus their display and short names
    for the "micro" and "macro" variants. This class provides caching of the
    feature matrix, printing of summary statistics, box plots and significance
    tests.
    """

    @abstractmethod
    def get_feature_group_name(self):
        """Return the tag identifying this feature group in cache file names."""
        pass

    @abstractmethod
    def get_micro_feature_method_references(self):
        """Return the list of micro feature functions."""
        pass

    @abstractmethod
    def get_micro_feature_method_names(self):
        """Return display names aligned with the micro feature functions."""
        pass

    @abstractmethod
    def get_micro_feature_short_names(self):
        """Return short codes aligned with the micro feature functions."""
        pass

    @abstractmethod
    def get_macro_feature_method_references(self):
        """Return the list of macro feature functions."""
        pass

    @abstractmethod
    def get_macro_feature_method_names(self):
        """Return display names aligned with the macro feature functions."""
        pass

    @abstractmethod
    def get_macro_feature_short_names(self):
        """Return short codes aligned with the macro feature functions."""
        pass

    def get_dump_file_name(self, news_source, micro_features, macro_features, label, file_dir):
        """Return ``<file_dir>/<tags joined by "_">.pkl`` for the given configuration.

        Tags are ``news_source``, ``label`` and the feature group name, followed by
        ``"micro"`` and/or ``"macro"`` when enabled. ``None`` tags are left out, so
        the defaults used by ``get_features_array`` no longer make ``str.join``
        raise ``TypeError``.
        """
        file_tags = [str(tag) for tag in (news_source, label, self.get_feature_group_name())
                     if tag is not None]
        if micro_features:
            file_tags.append("micro")

        if macro_features:
            file_tags.append("macro")

        return "{}/{}.pkl".format(file_dir, "_".join(file_tags))

    def get_features_array(self, prop_graphs, micro_features, macro_features, news_source=None, label=None,
                           file_dir="data/features", use_cache=False):
        """Compute (or load from cache) the feature matrix for ``prop_graphs``.

        Each feature function is applied to every graph individually through
        ``get_sample_feature_value``.

        Returns:
            ``None`` when neither micro nor macro features yield any function;
            otherwise an array with one row per graph and one column per feature
            (micro columns first). A freshly computed array is also pickled to the
            cache file.
        """
        function_refs = []

        file_name = self.get_dump_file_name(news_source, micro_features, macro_features, label, file_dir)
        data_file = Path(file_name)

        if use_cache and data_file.is_file():
            # NOTE: unpickling executes arbitrary code; only load cache files written
            # by this pipeline.
            with open(file_name, "rb") as cache_file:
                return pickle.load(cache_file)

        if micro_features:
            function_refs.extend(self.get_micro_feature_method_references())

        if macro_features:
            function_refs.extend(self.get_macro_feature_method_references())

        if len(function_refs) == 0:
            return None

        all_features = [get_sample_feature_value(prop_graphs, function_reference)
                        for function_reference in function_refs]

        feature_array = np.transpose(get_numpy_array(all_features))

        # Make sure the cache directory exists, and close the file deterministically
        # so the pickle is fully flushed to disk.
        data_file.parent.mkdir(parents=True, exist_ok=True)
        with open(file_name, "wb") as cache_file:
            pickle.dump(feature_array, cache_file)

        return feature_array

    def get_feature_names(self, micro_features, macro_features):
        """Return ``(display_names, short_names)`` for the enabled variants, micro first."""
        features_names = []
        short_feature_names = []

        if micro_features:
            features_names.extend(self.get_micro_feature_method_names())
            short_feature_names.extend(self.get_micro_feature_short_names())

        if macro_features:
            features_names.extend(self.get_macro_feature_method_names())
            short_feature_names.extend(self.get_macro_feature_short_names())

        return features_names, short_feature_names

    def print_statistics_for_all_features(self, feature_array=None, prop_graphs=None, micro_features=None,
                                          macro_features=None):
        """Print min/max/mean of every feature column, computing the matrix if not supplied."""

        if feature_array is None:
            feature_array = self.get_features_array(prop_graphs, micro_features, macro_features)

        [feature_names, short_feature_names] = self.get_feature_names(micro_features, macro_features)

        for idx in range(len(feature_names)):
            feature_values = feature_array[:, idx]
            print_stat_values(feature_names[idx], feature_values, short_feature_names[idx])

    def save_blox_plots_for_features(self, fake_feature_array=None, real_feature_array=None, fake_prop_graphs=None,
                                     real_prop_graphs=None, micro_features=None, macro_features=None, save_folder=None):
        """Save one fake-vs-real box plot per feature into ``save_folder``.

        Each plot is named after the feature's short name. A feature matrix that is
        not supplied is computed from the corresponding propagation graphs.
        """

        # Each matrix is filled in independently, so passing only one of them works.
        if fake_feature_array is None:
            fake_feature_array = self.get_features_array(fake_prop_graphs, micro_features, macro_features)
        if real_feature_array is None:
            real_feature_array = self.get_features_array(real_prop_graphs, micro_features, macro_features)

        [feature_names, short_feature_names] = self.get_feature_names(micro_features, macro_features)

        for idx in range(len(feature_names)):
            fake_feature_values = fake_feature_array[:, idx]
            real_feature_values = real_feature_array[:, idx]

            # ``get_box_plots_mod`` takes (samples1, samples2, save_folder, file_name).
            # The previous call also passed the long feature name as an extra positional
            # argument, which raised TypeError.
            get_box_plots_mod(fake_feature_values, real_feature_values, save_folder,
                              file_name=short_feature_names[idx])

    def get_feature_significance_t_tests(self, fake_feature_array, real_feature_array, micro_features=None,
                                         macro_features=None):
        """Print a two-sample t-test (via ``perform_t_test``) for every feature column."""
        [feature_names, short_feature_names] = self.get_feature_names(micro_features, macro_features)

        for idx in range(len(feature_names)):
            fake_feature_values = fake_feature_array[:, idx]
            real_feature_values = real_feature_array[:, idx]
            print("Feature {} : {}".format(short_feature_names[idx], feature_names[idx]))
            perform_t_test(fake_feature_values, real_feature_values)

    def get_feature_significance_bootstrap_tests(self, fake_feature_array, real_feature_array, micro_features=None,
                                                 macro_features=None):
        """Print a resampling-based p-value for the difference of means of every feature.

        For each feature the fake and real values are pooled. 10000 times, two
        resamples of the original group sizes are drawn from the pool and the
        difference of their means is recorded. The printed p-value is the share
        of recorded differences that are >= the observed difference.
        """
        num_resamples = 10000

        [feature_names, short_feature_names] = self.get_feature_names(micro_features, macro_features)

        for idx in range(len(feature_names)):
            fake_feature_values = fake_feature_array[:, idx]
            real_feature_values = real_feature_array[:, idx]

            perms_fake = []
            perms_real = []

            combined = np.concatenate((fake_feature_values, real_feature_values), axis=0)

            print("combined shape : ", combined.shape)

            for i in range(num_resamples):
                # Re-seed per iteration so every run draws the same resamples.
                np.random.seed(i)
                perms_fake.append(resample(combined, n_samples=len(fake_feature_values)))
                perms_real.append(resample(combined, n_samples=len(real_feature_values)))

            dif_bootstrap_means = (np.mean(perms_fake, axis=1) - np.mean(perms_real, axis=1))
            print("diff bootstrap means : ", dif_bootstrap_means.shape)

            obs_difs = (np.mean(fake_feature_values) - np.mean(real_feature_values))

            p_value = dif_bootstrap_means[dif_bootstrap_means >= obs_difs].shape[0] / num_resamples

            print("Feature {} : {}".format(short_feature_names[idx], feature_names[idx]))
            # The value printed under "t- value" is the observed difference of means.
            print("t- value : {}   p-value : {}".format(obs_difs, p_value))

def get_sample_feature_value(news_graps: list, get_feature_fun_ref):
    """Apply ``get_feature_fun_ref`` to each graph and return the results in input order."""
    return [get_feature_fun_ref(graph) for graph in news_graps]


def create_dir(dir_name):
    """Create ``dir_name`` (including parents) unless the path already exists."""
    if os.path.exists(dir_name):
        return

    try:
        os.makedirs(dir_name)
    except OSError as exc:
        # Another process may have created the directory in the meantime; that is fine.
        if exc.errno == errno.EEXIST:
            return
        raise



def get_epoch_timestamp_from_retweet(retweet):
    """Convert the retweet's ``"created_at"`` value with ``twitter_datetime_str_to_object``."""
    created_at = retweet["created_at"]
    return twitter_datetime_str_to_object(created_at)


def sort_retweet_object_by_time(retweets: list):
    """Sort ``retweets`` in place by their creation timestamp and return the same list."""
    retweets.sort(key=lambda retweet: get_epoch_timestamp_from_retweet(retweet))
    return retweets


def get_noise_news_ids():
    """Read ``data/news_id_ignore_list`` and return its lines with surrounding whitespace stripped."""
    ignored_ids = []
    with open("data/news_id_ignore_list") as file:
        for raw_line in file:
            ignored_ids.append(raw_line.strip())
    return ignored_ids


def load_prop_graph(data_folder, news_source, news_label):
    """Load the pickled propagation graphs for one news source and label.

    The file read is ``<data_folder>/<news_source>_<news_label>_news_prop_graphs.pkl``.
    It is opened in a ``with`` block so the handle is closed even if unpickling
    fails.
    """
    file_path = "{}/{}_{}_news_prop_graphs.pkl".format(data_folder, news_source, news_label)
    # NOTE: unpickling executes arbitrary code; only load files produced by this pipeline.
    with open(file_path, "rb") as graph_file:
        news_graphs = pickle.load(graph_file)
    return news_graphs


def remove_prop_graph_noise(news_graphs, noise_ids):
    """Return the graphs whose ``tweet_id`` is not listed in ``noise_ids``, keeping input order."""
    blocked_ids = set(noise_ids)

    kept_graphs = []
    for graph in news_graphs:
        if graph.tweet_id in blocked_ids:
            continue
        kept_graphs.append(graph)

    return kept_graphs


def sort_tweet_node_object_by_created_time(tweet_nodes: list):
    """Sort ``tweet_nodes`` in place by ``created_time`` (ascending) and return the same list."""

    def _created_time(tweet):
        return tweet.created_time

    tweet_nodes.sort(key=_created_time)
    return tweet_nodes


def equal_samples(sample1, sample2):
    """Shuffle both samples in place (fixed seed 0) and truncate them to the shorter length.

    Returns a tuple of two slices of equal length. The inputs themselves are
    left shuffled.
    """
    np.random.seed(0)

    # Order matters: sample1 is shuffled first, then sample2, from the same seeded stream.
    for sample in (sample1, sample2):
        np.random.shuffle(sample)

    keep = min(len(sample1), len(sample2))
    return sample1[:keep], sample2[:keep]


# def get_propagation_graphs(data_folder, news_source):
#     fake_propagation_graphs = load_prop_graph(data_folder, news_source, "fake")
#     real_propagation_graphs = load_prop_graph(data_folder, news_source, "real")
#
#     print("Before filtering no. of FAKE prop graphs: {}".format(len(fake_propagation_graphs)))
#     print("Before filtering no. of REAL prop graphs: {}".format(len(real_propagation_graphs)))
#
#     fake_propagation_graphs = remove_prop_graph_noise(fake_propagation_graphs, get_noise_news_ids())
#     real_propagation_graphs = remove_prop_graph_noise(real_propagation_graphs, get_noise_news_ids())
#
#     print("After filtering no. of FAKE prop graphs: {}".format(len(fake_propagation_graphs)))
#     print("After filtering no. of REAL prop graphs: {}".format(len(real_propagation_graphs)))
#     print(flush=True)
#
#     return fake_propagation_graphs, real_propagation_graphs


def get_numpy_array(list_of_list):
    """Convert a sequence of sequences into a NumPy array.

    Each inner sequence is converted with ``np.array`` first, then the resulting
    list of arrays is wrapped in an outer ``np.array``.

    :param list_of_list: iterable of array-like rows.
    :return: ``np.ndarray`` built from the converted rows.
    """
    return np.array([np.array(inner) for inner in list_of_list])


def print_stat_values(feature_name, values, short_feature_name=""):
    """Print the minimum, maximum and mean of ``values`` between two banner lines.

    :param feature_name: descriptive feature name shown in the header line.
    :param values: non-empty sequence of numbers.
    :param short_feature_name: optional short identifier shown before the name.
    """
    banner = "========================================="
    # Each statistic is computed right before its line is printed.
    report = (
        ("Min value : {}", min),
        ("Max value : {}", max),
        ("Mean value : {}", lambda seq: np.mean(np.array(seq))),
    )

    print(banner)
    print("Feature {} : {}".format(short_feature_name, feature_name))
    for template, statistic in report:
        print(template.format(statistic(values)))
    print(banner)


# CONSTRUCT SAMPLE FEATURES

In [24]:
import pickle
import queue
from pathlib import Path

import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split


def get_features(news_graphs, micro_features, macro_features):
    """Build one feature matrix by joining temporal, structural and linguistic features.

    The three feature groups are computed in that order by the
    ``get_all_*_features`` helpers and concatenated column-wise (``axis=1``).

    :param news_graphs: propagation graphs, forwarded unchanged to each helper.
    :param micro_features: forwarded unchanged to each helper.
    :param macro_features: forwarded unchanged to each helper.
    :return: array produced by ``np.concatenate`` over the three groups.
    """
    helper_args = (news_graphs, micro_features, macro_features)
    return np.concatenate(
        [
            get_all_temporal_features(*helper_args),
            get_all_structural_features(*helper_args),
            get_all_linguistic_features(*helper_args),
        ],
        axis=1,
    )


def get_dataset(news_source, load_dataset=False, micro_features=True, macro_features=True,
                data_folder="data/nx_network_data"):
    """Build, or reload from disk, the feature matrix and labels of one news source.

    When ``load_dataset`` is true the two pickle files
    ``<news_source>_samples_features.pkl`` and ``<news_source>_target_labels.pkl``
    are read from the current working directory. Otherwise the propagation
    graphs are loaded, balanced with ``equal_samples``, converted to features
    with ``get_features`` and the result is written to those same two files.

    :param news_source: dataset name; also the prefix of the pickle file names.
    :param load_dataset: reload previously dumped arrays instead of recomputing.
    :param micro_features: forwarded to ``get_features``.
    :param macro_features: forwarded to ``get_features``.
    :param data_folder: folder handed to ``get_nx_propagation_graphs``. The
        previous code omitted this required argument, so the compute path
        always raised ``TypeError``; the default matches the folder used by
        ``get_dataset_feature_array``.
    :return: tuple ``(sample_features, target_labels)``; fake samples come first
        and are labelled 1, real samples follow and are labelled 0.
    """
    features_path = "{}_samples_features.pkl".format(news_source)
    labels_path = "{}_target_labels.pkl".format(news_source)

    if load_dataset:
        # NOTE: unpickling can execute arbitrary code - only load files written by this notebook.
        with open(features_path, "rb") as features_file:
            sample_features = pickle.load(features_file)
        with open(labels_path, "rb") as labels_file:
            target_labels = pickle.load(labels_file)
        return sample_features, target_labels

    fake_prop_graph, real_prop_graph = get_nx_propagation_graphs(data_folder, news_source)
    fake_prop_graph, real_prop_graph = equal_samples(fake_prop_graph, real_prop_graph)

    print("fake samples len : {} real samples len : {}".format(len(fake_prop_graph), len(real_prop_graph)))

    fake_news_samples = get_features(fake_prop_graph, micro_features, macro_features)
    real_news_samples = get_features(real_prop_graph, micro_features, macro_features)

    print("Fake feature array ")
    print(fake_news_samples.shape)

    print("real feature array")
    print(real_news_samples.shape)

    sample_features = np.concatenate([fake_news_samples, real_news_samples], axis=0)
    target_labels = np.concatenate([np.ones(len(fake_news_samples)), np.zeros(len(real_news_samples))], axis=0)

    # Context managers flush and close the files even if pickling fails midway.
    with open(features_path, "wb") as features_file:
        pickle.dump(sample_features, features_file)
    with open(labels_path, "wb") as labels_file:
        pickle.dump(target_labels, labels_file)

    return sample_features, target_labels


def get_train_test_split(samples_features, target_labels):
    """Split features and labels into train and test parts with ``train_test_split``.

    The split is stratified on ``target_labels``, holds out 20% of the samples
    for testing and uses the fixed ``random_state`` 42.

    :param samples_features: feature matrix, one row per sample.
    :param target_labels: labels aligned with ``samples_features``.
    :return: tuple ``(X_train, X_test, y_train, y_test)``.
    """
    split_options = dict(stratify=target_labels, test_size=0.2, random_state=42)
    train_x, test_x, train_y, test_y = train_test_split(samples_features, target_labels, **split_options)
    return train_x, test_x, train_y, test_y


def perform_pca(train_data, target_labels):
    """Fit a PCA model on the training data and return it.

    The number of components is 20, or the length of the first training row
    when that is smaller.

    :param train_data: indexable collection of rows; ``train_data[0]`` must support ``len``.
    :param target_labels: passed as the second argument of ``PCA.fit``.
    :return: the fitted ``PCA`` instance.
    """
    row_width = len(train_data[0])
    component_count = min(20, row_width)

    reducer = PCA(n_components=component_count)
    reducer.fit(train_data, target_labels)
    return reducer


def get_dataset_file_name(file_dir, news_source, include_micro=True, include_macro=True, include_structural=True,
                          include_temporal=True,
                          include_linguistic=True):
    """Compose the pickle file path that identifies a feature configuration.

    The file name is ``news_source`` followed by one tag per enabled option,
    joined with underscores, e.g. ``politifact_micro_macro_struct.pkl``.

    :param file_dir: directory part of the returned path.
    :param news_source: leading part of the file name.
    :param include_micro: adds the ``micro`` tag when truthy.
    :param include_macro: adds the ``macro`` tag when truthy.
    :param include_structural: adds the ``struct`` tag when truthy.
    :param include_temporal: adds the ``temp`` tag when truthy.
    :param include_linguistic: adds the ``linguistic`` tag when truthy.
    :return: string ``"<file_dir>/<name>.pkl"``.
    """
    # Tag order is fixed; it determines the final file name.
    tagged_options = (
        (include_micro, "micro"),
        (include_macro, "macro"),
        (include_structural, "struct"),
        (include_temporal, "temp"),
        (include_linguistic, "linguistic"),
    )
    name_parts = [news_source] + [tag for enabled, tag in tagged_options if enabled]

    return "{}/{}.pkl".format(file_dir, "_".join(name_parts))


def get_TPNF_dataset(out_dir, news_source, include_micro=True, include_macro=True, include_structural=None,
                     include_temporal=None,
                     include_linguistic=None, time_interval=None, use_cache=False):
    """Return the combined fake+real feature matrix, using a pickle file as cache.

    The cache path comes from ``get_dataset_file_name``. When ``time_interval``
    is given, ``_<time_interval>h`` is appended to the file stem so that a
    time-filtered feature matrix is never written over, or read back as, the
    unfiltered one (previously every interval shared a single cache file).

    :param out_dir: directory of the cache file; created if it does not exist.
    :param news_source: dataset name.
    :param include_micro: forwarded to the feature builders.
    :param include_macro: forwarded to the feature builders.
    :param include_structural: include the structural feature group.
    :param include_temporal: include the temporal feature group.
    :param include_linguistic: include the linguistic feature group.
    :param time_interval: forwarded to ``get_dataset_feature_array``; ``None`` means unfiltered.
    :param use_cache: when true and the cache file exists, load it instead of recomputing.
    :return: the fake sample features stacked on top of the real sample features (``axis=0``).
    """
    data_file = Path(get_dataset_file_name(out_dir, news_source, include_micro, include_macro, include_structural,
                                           include_temporal, include_linguistic))

    if time_interval is not None:
        # Keep one cache file per time interval.
        data_file = data_file.with_name("{}_{}h{}".format(data_file.stem, time_interval, data_file.suffix))

    if use_cache and data_file.is_file():
        # NOTE: unpickling can execute arbitrary code - only load caches written by this notebook.
        with open(data_file, "rb") as cache_file:
            return pickle.load(cache_file)

    fake_sample_features, real_sample_features = get_dataset_feature_array(news_source, include_micro,
                                                                           include_macro, include_structural,
                                                                           include_temporal, include_linguistic,
                                                                           time_interval)

    sample_features = np.concatenate([fake_sample_features, real_sample_features], axis=0)

    # Make sure the expensive result can be stored even on a fresh checkout.
    data_file.parent.mkdir(parents=True, exist_ok=True)
    with open(data_file, "wb") as cache_file:
        pickle.dump(sample_features, cache_file)

    return sample_features


def get_dataset_feature_names(include_micro=True, include_macro=True, include_structural=None,
                              include_temporal=None,
                              include_linguistic=None):
    """Collect the long and short feature names of the selected feature groups.

    Groups are visited in the order structural, temporal, linguistic; a group
    is skipped when its ``include_*`` flag is falsy.

    :param include_micro: forwarded to each helper's ``get_feature_names``.
    :param include_macro: forwarded to each helper's ``get_feature_names``.
    :param include_structural: include ``StructureFeatureHelper`` names.
    :param include_temporal: include ``TemporalFeatureHelper`` names.
    :param include_linguistic: include ``LinguisticFeatureHelper`` names.
    :return: tuple ``(feature_names, short_feature_names)`` of two lists.
    """
    # Factories are wrapped in lambdas so a helper is only created when selected.
    helper_factories = (
        (include_structural, lambda: StructureFeatureHelper()),
        (include_temporal, lambda: TemporalFeatureHelper()),
        (include_linguistic, lambda: LinguisticFeatureHelper()),
    )
    selected_helpers = [make_helper() for wanted, make_helper in helper_factories if wanted]

    long_names, short_names = [], []
    for helper in selected_helpers:
        helper_long_names, helper_short_names = helper.get_feature_names(include_micro, include_macro)
        long_names += helper_long_names
        short_names += helper_short_names

    return long_names, short_names


def is_valid_graph(prop_graph: tweet_node, retweet=True, reply=True):
    """Check whether the propagation graph has at least one retweet or reply.

    Every direct child of ``prop_graph`` is inspected. The ``retweet`` flag
    controls whether a non-empty ``retweet_children`` list counts, and the
    ``reply`` flag controls whether a non-empty ``reply_children`` list counts.
    (The two flags used to be paired with the opposite lists.)

    :param prop_graph: root node whose ``children`` are inspected.
    :param retweet: accept graphs in which some child has retweets.
    :param reply: accept graphs in which some child has replies.
    :return: ``True`` if any child satisfies an enabled criterion, else ``False``.
    """
    for post_node in prop_graph.children:
        has_retweet = retweet and len(post_node.retweet_children) > 0
        has_reply = reply and len(post_node.reply_children) > 0
        if has_retweet or has_reply:
            return True

    return False


def remove_node_by_time(graph: tweet_node, limit_time):
    """Prune, in place, every node created later than ``limit_time`` after the first post.

    The cutoff is ``get_first_post_time(graph) + limit_time``. The graph is
    walked breadth-first from ``graph``; a child whose ``created_time`` exceeds
    the cutoff is removed from its parent's ``children``, ``retweet_children``
    and ``reply_children`` and is not visited further. The root itself is never
    checked against the cutoff.

    Note: each visited node's ``retweet_children`` and ``reply_children`` are
    rebuilt from sets, so duplicates are dropped and list order is not kept.

    :param graph: root node of the propagation graph; it is modified in place.
    :param limit_time: offset added to the first post time to obtain the cutoff.
    :return: the same ``graph`` object.
    """
    cutoff_time = get_first_post_time(graph) + limit_time

    pending = queue.Queue()
    pending.put(graph)

    while not pending.empty():
        current = pending.get()

        all_children = current.children
        kept_retweets = set(current.retweet_children)
        kept_replies = set(current.reply_children)

        # Iterate over a copy because late children are removed while looping.
        for child in all_children.copy():
            if child.created_time <= cutoff_time:
                pending.put(child)
                continue

            current.children.remove(child)
            # discard() ignores children that are absent from a given collection.
            kept_retweets.discard(child)
            kept_replies.discard(child)

        current.retweet_children = list(kept_retweets)
        current.reply_children = list(kept_replies)

    return graph


def filter_propagation_graphs(graphs, limit_time):
    """Trim every graph to the time window and keep only those that remain valid.

    Each graph is first passed through ``remove_node_by_time`` and then checked
    with ``is_valid_graph`` (default flags) before the next graph is processed.

    :param graphs: iterable of propagation graphs.
    :param limit_time: time limit forwarded to ``remove_node_by_time``.
    :return: list of the trimmed graphs that passed the validity check, in input order.
    """
    # Lazy generator: trimming and validation stay interleaved per graph.
    trimmed_graphs = (remove_node_by_time(prop_graph, limit_time) for prop_graph in graphs)
    return [trimmed for trimmed in trimmed_graphs if is_valid_graph(trimmed)]


def get_nx_propagation_graphs(data_folder, news_source):
    """Load the fake and the real propagation graphs of one news source.

    Both sets are read with ``load_from_nx_graphs``, the ``"fake"`` label first
    and the ``"real"`` label second.

    :param data_folder: folder forwarded to ``load_from_nx_graphs``.
    :param news_source: dataset name forwarded to ``load_from_nx_graphs``.
    :return: tuple ``(fake_propagation_graphs, real_propagation_graphs)``.
    """
    fake_graphs, real_graphs = (
        load_from_nx_graphs(data_folder, news_source, label) for label in ("fake", "real")
    )
    return fake_graphs, real_graphs


def get_dataset_feature_array(news_source, include_micro=True, include_macro=True, include_structural=None,
                              include_temporal=None,
                              include_linguistic=None, time_interval=None):
    """Compute the fake and real feature matrices for the selected feature groups.

    Graphs are loaded from ``data/nx_network_data`` and balanced with
    ``equal_samples``. If ``time_interval`` is given it is multiplied by 3600
    to obtain a limit in seconds; the graphs are filtered with
    ``filter_propagation_graphs`` and balanced again. Feature groups are then
    evaluated in the order structural, temporal, linguistic, and a group is
    skipped when either of its two feature arrays is ``None``.

    :param news_source: dataset name.
    :param include_micro: forwarded to the feature helpers as ``micro_features``.
    :param include_macro: forwarded to the feature helpers as ``macro_features``.
    :param include_structural: use ``StructureFeatureHelper``.
    :param include_temporal: use ``TemporalFeatureHelper``.
    :param include_linguistic: use ``LinguisticFeatureHelper``.
    :param time_interval: optional time window; ``None`` disables filtering.
    :return: tuple ``(fake_features, real_features)``, each the column-wise
        concatenation (``axis=1``) of the retained groups.
    """
    fake_prop_graph, real_prop_graph = get_nx_propagation_graphs("data/nx_network_data", news_source)
    fake_prop_graph, real_prop_graph = equal_samples(fake_prop_graph, real_prop_graph)

    if time_interval is not None:
        time_limit = time_interval * 60 * 60
        print("Time limit in seconds : {}".format(time_limit))

        fake_prop_graph = filter_propagation_graphs(fake_prop_graph, time_limit)
        real_prop_graph = filter_propagation_graphs(real_prop_graph, time_limit)

        print("After time based filtering ")
        print("No. of fake samples : {}  No. of real samples: {}".format(len(fake_prop_graph), len(real_prop_graph)))

        # Filtering may drop a different number of graphs per class, so rebalance.
        fake_prop_graph, real_prop_graph = equal_samples(fake_prop_graph, real_prop_graph)

    # (group name, helper instance) pairs for every requested feature group.
    selected_groups = []
    if include_structural:
        selected_groups.append(("Structural", StructureFeatureHelper()))
    if include_temporal:
        selected_groups.append(("Temporal", TemporalFeatureHelper()))
    if include_linguistic:
        selected_groups.append(("Linguistic", LinguisticFeatureHelper()))

    helper_options = dict(micro_features=include_micro, macro_features=include_macro, news_source=news_source)

    fake_blocks, real_blocks = [], []
    for group_name, helper in selected_groups:
        fake_features = helper.get_features_array(fake_prop_graph, **helper_options, label="fake")
        real_features = helper.get_features_array(real_prop_graph, **helper_options, label="real")

        print(helper.get_feature_names(micro_features=include_micro, macro_features=include_macro))

        if fake_features is None or real_features is None:
            continue

        fake_blocks.append(fake_features)
        real_blocks.append(real_features)

        print("Feature group : {}".format(group_name))
        print(len(fake_features))
        print(len(real_features), flush=True)

    return np.concatenate(fake_blocks, axis=1), np.concatenate(real_blocks, axis=1)


def get_dataset_statistics(news_source):
    """Print statistics, save box plots and run t-tests for every feature group.

    Graphs are loaded from ``data/saved_new_no_filter`` and balanced with
    ``equal_samples``. For each of the structural, temporal and linguistic
    helpers the fake and real feature arrays are computed (micro and macro
    features enabled), box plots are saved under
    ``data/feature_images/<news_source>``, significance t-tests are run and
    per-class statistics are printed.

    :param news_source: dataset name used to locate the graphs and name the image folder.
    """
    fake_prop_graph, real_prop_graph = get_nx_propagation_graphs("data/saved_new_no_filter", news_source)

    fake_prop_graph, real_prop_graph = equal_samples(fake_prop_graph, real_prop_graph)

    feature_helpers = [StructureFeatureHelper(), TemporalFeatureHelper(), LinguisticFeatureHelper()]
    feature_group_names = ["StructureFeatureHelper", "TemporalFeatureHelper", "LinguisticFeatureHelper"]

    for group_name, feature_helper in zip(feature_group_names, feature_helpers):
        print("Feature group : {}".format(group_name))

        fake_features = feature_helper.get_features_array(fake_prop_graph, micro_features=True,
                                                          macro_features=True, news_source=news_source, label="fake")
        real_features = feature_helper.get_features_array(real_prop_graph, micro_features=True,
                                                          macro_features=True, news_source=news_source, label="real")

        feature_helper.save_blox_plots_for_features(fake_feature_array=fake_features,
                                                    real_feature_array=real_features, micro_features=True,
                                                    macro_features=True,
                                                    save_folder="data/feature_images/{}".format(news_source))

        feature_helper.get_feature_significance_t_tests(fake_features, real_features, micro_features=True,
                                                        macro_features=True)

        # Print the statistics of the dataset
        print("------------Fake------------")
        feature_helper.print_statistics_for_all_features(feature_array=fake_features, prop_graphs=fake_prop_graph,
                                                         micro_features=True, macro_features=True)

        print("------------Real------------")
        # The real features must be reported together with the real graphs
        # (the fake graphs were passed here before).
        feature_helper.print_statistics_for_all_features(feature_array=real_features, prop_graphs=real_prop_graph,
                                                         micro_features=True, macro_features=True)


if __name__ == "__main__":
    # Cell entry point: produce the feature statistics report for both news
    # sources, PolitiFact first and GossipCop second. Each call reads its graphs
    # from "data/saved_new_no_filter" (see get_dataset_statistics).
    get_dataset_statistics("politifact")
    get_dataset_statistics("gossipcop")


FileNotFoundError: ignored

# BASIC MODEL

In [11]:
import time

import matplotlib
import numpy as np
from sklearn import preprocessing, svm
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

matplotlib.use('agg')
import matplotlib.pyplot as plt


def get_classifier_by_name(classifier_name):
    """Create a fresh, unfitted classifier for the given display name.

    Supported names and the estimators they build:

    * ``"GaussianNB"`` - ``GaussianNB()``
    * ``"LogisticRegression"`` - ``LogisticRegression(solver='lbfgs')``
    * ``"DecisionTreeClassifier"`` - ``DecisionTreeClassifier()``
    * ``"RandomForestClassifier"`` - ``RandomForestClassifier(n_estimators=50)``
    * ``"SVM -linear kernel"`` - ``svm.SVC(kernel='linear')``

    :param classifier_name: one of the names listed above.
    :return: a new estimator instance.
    :raises ValueError: if ``classifier_name`` is not supported. Previously the
        function silently returned ``None``, which only failed later when the
        caller tried to call ``.fit`` on it.
    """
    # Factories are lambdas so that only the requested estimator is instantiated.
    classifier_factories = {
        "GaussianNB": lambda: GaussianNB(),
        "LogisticRegression": lambda: LogisticRegression(solver='lbfgs'),
        "DecisionTreeClassifier": lambda: DecisionTreeClassifier(),
        "RandomForestClassifier": lambda: RandomForestClassifier(n_estimators=50),
        "SVM -linear kernel": lambda: svm.SVC(kernel='linear'),
    }

    if classifier_name not in classifier_factories:
        raise ValueError("Unknown classifier name {!r}; expected one of {}".format(
            classifier_name, sorted(classifier_factories)))

    return classifier_factories[classifier_name]()


def train_model(classifier_name, X_train, X_test, y_train, y_test):
    """Train the named classifier five times and print the averaged test metrics.

    Every round creates a new classifier with ``get_classifier_by_name``, fits
    it on the training data and scores its test predictions with
    ``get_metrics`` (``one_hot_rep=False``). The mean accuracy, precision,
    recall and F1 over the five rounds are passed to ``print_metrics``.

    :param classifier_name: name understood by ``get_classifier_by_name``.
    :param X_train: training features.
    :param X_test: test features.
    :param y_train: training labels.
    :param y_test: test labels.
    """
    accuracies, precisions, recalls, f1_scores = [], [], [], []

    for _ in range(5):
        model = get_classifier_by_name(classifier_name)
        model.fit(X_train, y_train)

        predictions = model.predict(X_test)
        accuracy, precision, recall, f1_score_val = get_metrics(y_test, predictions, one_hot_rep=False)

        for collected, value in ((accuracies, accuracy), (precisions, precision),
                                 (recalls, recall), (f1_scores, f1_score_val)):
            collected.append(value)

    averaged = [np.mean(series) for series in (accuracies, precisions, recalls, f1_scores)]
    print_metrics(averaged[0], averaged[1], averaged[2], averaged[3])


def print_metrics(accuracy, precision, recall, f1_score_val):
    """Print the four classification metrics, one labelled line each.

    :param accuracy: value shown after ``Accuracy :``.
    :param precision: value shown after ``Precision :``.
    :param recall: value shown after ``Recall :``.
    :param f1_score_val: value shown after ``F1 :``.
    """
    labelled_values = (
        ("Accuracy : {}", accuracy),
        ("Precision : {}", precision),
        ("Recall : {}", recall),
        ("F1 : {}", f1_score_val),
    )
    for template, value in labelled_values:
        print(template.format(value))


def get_metrics(target, logits, one_hot_rep=True):
    """Compute accuracy, precision, recall and F1 for a set of predictions.

    With ``one_hot_rep`` enabled both inputs are reduced to class indices with
    ``np.argmax(..., axis=1)`` before scoring; otherwise they are scored as given.

    :param target: ground-truth labels (2-D rows when ``one_hot_rep`` is true).
    :param logits: predictions in the same representation as ``target``.
    :param one_hot_rep: whether the inputs need the ``argmax`` reduction.
    :return: tuple ``(accuracy, precision, recall, f1_score_val)``.
    """
    label, predict = target, logits
    if one_hot_rep:
        label = np.argmax(target, axis=1)
        predict = np.argmax(logits, axis=1)

    return (
        accuracy_score(label, predict),
        precision_score(label, predict),
        recall_score(label, predict),
        f1_score(label, predict),
    )


def get_basic_model_results(X_train, X_test, y_train, y_test):
    """Standardize the features and evaluate every baseline classifier.

    A ``StandardScaler`` is fitted on the training features only and applied to
    both splits. Each classifier name is then handed to ``train_model``, which
    builds the estimator through ``get_classifier_by_name``.

    The list of pre-built estimator instances that used to live here was never
    used for training and disagreed with the estimators actually built (for
    example 100 vs. 50 trees and a default-kernel vs. linear SVC), so it has
    been removed.

    :param X_train: training features.
    :param X_test: test features.
    :param y_train: training labels.
    :param y_test: test labels.
    """
    scaler = preprocessing.StandardScaler().fit(X_train)

    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    classifier_names = ["GaussianNB", "LogisticRegression", "DecisionTreeClassifier", "RandomForestClassifier",
                        "SVM -linear kernel"]

    for classifier_name in classifier_names:
        print("======={}=======".format(classifier_name))
        train_model(classifier_name, X_train, X_test, y_train, y_test)


def get_classificaton_results_tpnf(data_dir, news_source, time_interval, use_cache=False):
    """Build the full feature set for a news source and evaluate the baseline models.

    All feature options (micro, macro, structural, temporal, linguistic) are
    enabled. Labels are derived from the row count: the first half of the rows
    is labelled 1 and the second half 0.

    :param data_dir: directory forwarded to ``get_TPNF_dataset``.
    :param news_source: dataset name.
    :param time_interval: time window forwarded to ``get_TPNF_dataset`` (may be ``None``).
    :param use_cache: forwarded to ``get_TPNF_dataset``.
    """
    sample_feature_array = get_TPNF_dataset(
        data_dir, news_source,
        True,  # include_micro
        True,  # include_macro
        True,  # include_structural
        True,  # include_temporal
        True,  # include_linguistic
        time_interval, use_cache=use_cache)

    print("Sample feature array dimensions")
    print(sample_feature_array.shape, flush=True)

    samples_per_class = int(len(sample_feature_array) / 2)
    target_labels = np.concatenate([np.ones(samples_per_class), np.zeros(samples_per_class)], axis=0)

    train_x, test_x, train_y, test_y = get_train_test_split(sample_feature_array, target_labels)
    get_basic_model_results(train_x, test_x, train_y, test_y)


def plot_feature_importances(coef, names):
    """Draw a horizontal bar chart of importances and save it as ``feature_importance.png``.

    Bars are ordered by ascending importance (ties broken by name).

    :param coef: importance value of each feature.
    :param names: feature names aligned with ``coef``.
    """
    ranked_pairs = sorted(zip(coef, names))
    sorted_importances, sorted_names = zip(*ranked_pairs)

    bar_positions = range(len(sorted_names))
    plt.barh(bar_positions, sorted_importances, align='center')
    plt.yticks(bar_positions, sorted_names)

    plt.savefig('feature_importance.png', bbox_inches='tight')
    plt.show()


def dump_random_forest_feature_importance(data_dir, news_source):
    """Rank the features with an extra-trees forest and save a bar chart of the ranking.

    The full feature set is obtained through ``get_TPNF_dataset`` with
    ``use_cache=True``; its last column (and the matching last feature name) is
    dropped. An ``ExtraTreesClassifier`` (100 trees, ``random_state=0``) is
    fitted on the training split, the ranking is printed, and the chart is
    saved as ``<news_source>_feature_importance.png``.

    :param data_dir: directory forwarded to ``get_TPNF_dataset``.
    :param news_source: dataset name; also the prefix of the saved image.
    """
    # micro, macro, structural, temporal, linguistic - everything enabled.
    feature_flags = (True, True, True, True, True)

    sample_feature_array = get_TPNF_dataset(data_dir, news_source, *feature_flags, use_cache=True)
    sample_feature_array = sample_feature_array[:, :-1]

    _long_names, short_feature_names = get_dataset_feature_names(*feature_flags)
    short_feature_names = short_feature_names[:-1]

    # First half of the rows is labelled 1, second half 0.
    samples_per_class = int(len(sample_feature_array) / 2)
    target_labels = np.concatenate([np.ones(samples_per_class), np.zeros(samples_per_class)], axis=0)

    X_train, _X_test, y_train, _y_test = get_train_test_split(sample_feature_array, target_labels)

    # Build a forest and compute the feature importances
    forest = ExtraTreesClassifier(n_estimators=100, random_state=0)
    forest.fit(X_train, y_train)

    importances = forest.feature_importances_
    per_tree_importances = [tree.feature_importances_ for tree in forest.estimators_]
    std = np.std(per_tree_importances, axis=0)
    indices = np.argsort(importances)[::-1]  # most important feature first

    feature_count = X_train.shape[1]

    # Print the feature ranking
    print("Feature ranking:")
    for rank in range(1, feature_count + 1):
        feature_index = indices[rank - 1]
        print("%d. feature %d (%f)" % (rank, feature_index, importances[feature_index]))

    matplotlib.rcParams['figure.figsize'] = 5, 2

    # Plot the feature importances of the forest
    bar_positions = range(feature_count)
    plt.figure()
    plt.bar(bar_positions, importances[indices],
            color="b", yerr=std[indices], align="center")
    plt.xticks(bar_positions, np.array(short_feature_names)[indices], rotation=75, fontsize=9.5)
    plt.xlim([-1, feature_count])
    plt.savefig('{}_feature_importance.png'.format(news_source), bbox_inches='tight')

    plt.show()


def get_classificaton_results_tpnf_by_time(news_source: str):
    """Run the baseline classification once per time window and report the run time.

    The time intervals 3, 6, 12, 24, 36, 48, 60, 72, 84 and 96 are passed, one
    after another, to ``get_classificaton_results_tpnf`` with the feature
    directory ``data/features``; the wall-clock duration of each run is printed.

    :param news_source: dataset name.
    """
    for time_interval in (3, 6, 12, 24, 36, 48, 60, 72, 84, 96):
        print("=============Time Interval : {}  ==========".format(time_interval))

        started_at = time.time()
        get_classificaton_results_tpnf("data/features", news_source, time_interval)
        elapsed = time.time() - started_at

        print("\n\n================Exectuion time - {} ==================================\n".format(elapsed))


if __name__ == "__main__":
    # Cell entry point: evaluate the baseline classifiers on the complete
    # (not time-filtered) feature set of each news source. use_cache=False
    # forces the features to be recomputed and the cache file to be rewritten.
    get_classificaton_results_tpnf("data/features", "politifact", time_interval=None, use_cache=False)

    get_classificaton_results_tpnf("data/features", "gossipcop", time_interval=None, use_cache=False)

    # Filter the graphs by time interval (for early fake news detection) and get the classification results
    # get_classificaton_results_tpnf_by_time("politifact")
    # get_classificaton_results_tpnf_by_time("gossipcop")


NameError: ignored