<a href="https://colab.research.google.com/github/YuxuanHuang455/PS-Microexpression/blob/main/Explanation_NLP_network_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Part I Query Literature:


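Retrieve scholarly articles from arXiv for the chosen search queries (the first cell below uses "Microexpression" and "prediction"; the second cell prompts for a query interactively) and store their metadata (title, abstract, year, and venue) in a pandas DataFrame.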

In [None]:
import requests
import pandas as pd
from tqdm import tqdm
import time

def fetch_arxiv_data(query, max_results=100):
    """Query the arXiv export API and return the raw response text.

    Args:
        query: Search expression. It is sent with an ``all:`` prefix as the
            ``search_query`` parameter.
        max_results: Maximum number of entries to request (default 100).

    Returns:
        The body of the HTTP response as a string.

    Raises:
        Exception: If the server answers with a status code other than 200.
            Network errors and timeouts raised by ``requests`` propagate
            unchanged.
    """
    base_url = 'http://export.arxiv.org/api/query'
    # Let requests build the query string so that spaces and other special
    # characters in the query are percent-encoded instead of being pasted
    # into the URL verbatim.
    params = {
        'search_query': f'all:{query}',
        'start': 0,
        'max_results': max_results,
    }
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(base_url, params=params, timeout=30)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch data: {response.status_code}")
    return response.text

def parse_arxiv_response(response):
    """Convert an arXiv Atom feed into a list of per-article dictionaries.

    Args:
        response: Atom XML text, e.g. the string returned by the arXiv API.

    Returns:
        A list with one dict per ``<entry>`` element, holding the keys
        ``title``, ``abstract``, ``year`` (int, taken from the part of the
        ``published`` timestamp before the first '-') and ``venue`` (the
        journal reference text, or 'N/A' when the entry has none).
    """
    import xml.etree.ElementTree as ET
    root = ET.fromstring(response)
    ns = {
        'atom': 'http://www.w3.org/2005/Atom',
        # journal_ref is an arXiv extension element: it lives in the arXiv
        # namespace, not in the Atom namespace.
        'arxiv': 'http://arxiv.org/schemas/atom',
    }
    entries = []
    for entry in root.findall('atom:entry', ns):
        title = entry.find('atom:title', ns).text
        abstract = entry.find('atom:summary', ns).text
        published = entry.find('atom:published', ns).text
        year = published.split('-')[0]
        journal_ref = entry.find('arxiv:journal_ref', ns)
        if journal_ref is None:
            # Fall back to the Atom namespace for feeds that place it there.
            journal_ref = entry.find('atom:journal_ref', ns)
        venue = journal_ref.text if journal_ref is not None else 'N/A'
        entries.append({
            'title': title,
            'abstract': abstract,
            'year': int(year),
            'venue': venue,
        })
    return entries

# Search terms submitted to arXiv, one request per term.
queries = ['Microexpression', 'prediction']

# Collects the parsed entries of every query into one flat list.
all_entries = []

for query in queries:
    print(f"Fetching data for query: {query}")
    response = fetch_arxiv_data(query)
    entries = parse_arxiv_response(response)
    all_entries += entries
    # Pause between requests to respect arXiv's rate limits.
    time.sleep(3)

# Tabulate the combined results and persist them next to the notebook.
df = pd.DataFrame(all_entries)
df.to_csv('literature_data.csv', index=False)
print("Data saved to literature_data.csv")
df.head()

In [None]:
import requests
import pandas as pd
from tqdm import tqdm
import time

def fetch_arxiv_data(query, max_results=100):
    """Query the arXiv export API and return the raw response text.

    Args:
        query: Search expression. It is sent with an ``all:`` prefix as the
            ``search_query`` parameter.
        max_results: Maximum number of entries to request (default 100).

    Returns:
        The body of the HTTP response as a string.

    Raises:
        Exception: If the server answers with a status code other than 200.
            Network errors and timeouts raised by ``requests`` propagate
            unchanged.
    """
    base_url = 'http://export.arxiv.org/api/query'
    # Let requests build the query string so that spaces and other special
    # characters in the query are percent-encoded instead of being pasted
    # into the URL verbatim.
    params = {
        'search_query': f'all:{query}',
        'start': 0,
        'max_results': max_results,
    }
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(base_url, params=params, timeout=30)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch data: {response.status_code}")
    return response.text

def parse_arxiv_response(response):
    """Convert an arXiv Atom feed into a list of per-article dictionaries.

    Args:
        response: Atom XML text, e.g. the string returned by the arXiv API.

    Returns:
        A list with one dict per ``<entry>`` element, holding the keys
        ``title``, ``abstract``, ``year`` (int, taken from the part of the
        ``published`` timestamp before the first '-') and ``venue`` (the
        journal reference text, or 'N/A' when the entry has none).
    """
    import xml.etree.ElementTree as ET
    root = ET.fromstring(response)
    ns = {
        'atom': 'http://www.w3.org/2005/Atom',
        # journal_ref is an arXiv extension element: it lives in the arXiv
        # namespace, not in the Atom namespace.
        'arxiv': 'http://arxiv.org/schemas/atom',
    }
    entries = []
    for entry in root.findall('atom:entry', ns):
        title = entry.find('atom:title', ns).text
        abstract = entry.find('atom:summary', ns).text
        published = entry.find('atom:published', ns).text
        year = published.split('-')[0]
        journal_ref = entry.find('arxiv:journal_ref', ns)
        if journal_ref is None:
            # Fall back to the Atom namespace for feeds that place it there.
            journal_ref = entry.find('atom:journal_ref', ns)
        venue = journal_ref.text if journal_ref is not None else 'N/A'
        entries.append({
            'title': title,
            'abstract': abstract,
            'year': int(year),
            'venue': venue,
        })
    return entries

# Ask the user which search expression to send to arXiv.
query = input("Enter your query for arXiv search (e.g., blockchain AND sustainability): ")

# Download the feed for that query and turn it into a list of records.
print(f"Fetching data for query: {query}")
response = fetch_arxiv_data(query)
entries = parse_arxiv_response(response)

# Tabulate the records, persist them as CSV, and show a preview.
df = pd.DataFrame(entries)
df.to_csv('literature_data.csv', index=False)
print("Data saved to literature_data.csv")
print("Here are the first few results:")
preview = df.head()
print(preview)


# Part II Natural Language Processing (NLP) Analysis

Perform analyses such as word cloud generation and sentiment analysis on the collected abstracts.

## a. Word Cloud Generation
Visualize the most frequent words in the abstracts.

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Join every non-missing abstract into one space-separated string.
abstracts = df['abstract'].dropna()
text = ' '.join(abstracts)

# Build the word cloud image from that combined text.
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate(text)

# Render the image without axes.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of Abstracts')
plt.show()


## b. Sentiment Analysis
Analyze the sentiment of each abstract using the TextBlob library.

In [None]:
# Import necessary libraries
from textblob import TextBlob
import pandas as pd
import matplotlib.pyplot as plt

# Improved sentiment analysis function
def analyze_sentiment(text):
    """Label *text* by the sign of its TextBlob polarity score.

    Returns 'Positive' for a polarity above zero, 'Negative' for a polarity
    below zero, and 'Neutral' otherwise.
    """
    score = TextBlob(text).sentiment.polarity
    if score > 0:
        return 'Positive'
    return 'Negative' if score < 0 else 'Neutral'

# Example DataFrame with abstracts (Replace with your data)
# NOTE: this rebinds `df`, replacing any DataFrame built by earlier cells.
data = {'abstract': ["This is amazing!", "I don't like this.", "It's okay.", None]}
df = pd.DataFrame(data)

# Apply sentiment analysis to the abstracts. Missing abstracts are dropped
# before scoring, so their rows end up with NaN in both new columns.
df['sentiment_category'] = df['abstract'].dropna().apply(analyze_sentiment)

# Calculate sentiment polarity scores for histogram
df['sentiment_polarity'] = df['abstract'].dropna().apply(lambda x: TextBlob(x).sentiment.polarity)

# Display the results
print(df[['abstract', 'sentiment_category', 'sentiment_polarity']])

# Plot the sentiment distribution
plt.figure(figsize=(10, 5))
df['sentiment_polarity'].hist(bins=20, edgecolor='black', color='lightblue')
plt.title('Sentiment Polarity Distribution of Abstracts')
plt.xlabel('Sentiment Polarity')
plt.ylabel('Frequency')
plt.show()

# Plot sentiment category distribution.
# value_counts() orders categories by frequency, so colors are looked up per
# category name; a fixed color list would be paired with whichever category
# happens to be most frequent.
category_colors = {'Positive': 'green', 'Neutral': 'gray', 'Negative': 'red'}
category_counts = df['sentiment_category'].value_counts()
plt.figure(figsize=(7, 5))
category_counts.plot(kind='bar',
                     color=[category_colors[name] for name in category_counts.index],
                     edgecolor='black')
plt.title('Sentiment Category Distribution of Abstracts')
plt.xlabel('Sentiment Category')
plt.ylabel('Count')
plt.xticks(rotation=0)
plt.show()


## c. Network Visualization
Visualize relationships between key terms in the abstracts using NetworkX and matplotlib.

In [None]:
import requests
import pandas as pd
from tqdm import tqdm
import time

def fetch_arxiv_data(query, max_results=100):
    """Query the arXiv export API and return the raw response text.

    Args:
        query: Search expression. It is sent with an ``all:`` prefix as the
            ``search_query`` parameter.
        max_results: Maximum number of entries to request (default 100).

    Returns:
        The body of the HTTP response as a string.

    Raises:
        Exception: If the server answers with a status code other than 200.
            Network errors and timeouts raised by ``requests`` propagate
            unchanged.
    """
    base_url = 'http://export.arxiv.org/api/query'
    # Let requests build the query string so that spaces and other special
    # characters in the query are percent-encoded instead of being pasted
    # into the URL verbatim.
    params = {
        'search_query': f'all:{query}',
        'start': 0,
        'max_results': max_results,
    }
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(base_url, params=params, timeout=30)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch data: {response.status_code}")
    return response.text

def parse_arxiv_response(response):
    """Convert an arXiv Atom feed into a list of per-article dictionaries.

    Args:
        response: Atom XML text, e.g. the string returned by the arXiv API.

    Returns:
        A list with one dict per ``<entry>`` element, holding the keys
        ``title``, ``abstract``, ``year`` (int, taken from the part of the
        ``published`` timestamp before the first '-') and ``venue`` (the
        journal reference text, or 'N/A' when the entry has none).
    """
    import xml.etree.ElementTree as ET
    root = ET.fromstring(response)
    ns = {
        'atom': 'http://www.w3.org/2005/Atom',
        # journal_ref is an arXiv extension element: it lives in the arXiv
        # namespace, not in the Atom namespace.
        'arxiv': 'http://arxiv.org/schemas/atom',
    }
    entries = []
    for entry in root.findall('atom:entry', ns):
        title = entry.find('atom:title', ns).text
        abstract = entry.find('atom:summary', ns).text
        published = entry.find('atom:published', ns).text
        year = published.split('-')[0]
        journal_ref = entry.find('arxiv:journal_ref', ns)
        if journal_ref is None:
            # Fall back to the Atom namespace for feeds that place it there.
            journal_ref = entry.find('atom:journal_ref', ns)
        venue = journal_ref.text if journal_ref is not None else 'N/A'
        entries.append({
            'title': title,
            'abstract': abstract,
            'year': int(year),
            'venue': venue,
        })
    return entries

# Ask the user which search expression to send to arXiv.
query = input("Enter your query for arXiv search (e.g., blockchain AND sustainability): ")

# Download the feed for that query and turn it into a list of records.
print(f"Fetching data for query: {query}")
response = fetch_arxiv_data(query)
entries = parse_arxiv_response(response)

# Tabulate the records, persist them as CSV, and show a preview.
df = pd.DataFrame(entries)
df.to_csv('literature_data.csv', index=False)
print("Data saved to literature_data.csv")
print("Here are the first few results:")
preview = df.head()
print(preview)


In [None]:
# Quick sanity check on the 'abstract' column before building the network.
abstract_column = df['abstract']
print(abstract_column.head())  # Fails with KeyError if the column is missing
print(abstract_column.isna().sum())  # Number of missing abstracts

In [None]:
import networkx as nx
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import matplotlib.pyplot as plt
from networkx.algorithms import community

# Tokenize the abstracts into a document-term count matrix, then derive the
# term-term co-occurrence matrix from it.
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['abstract'].dropna())
# Use the explicit matrix-product operator: `*` is only a matrix product for
# the legacy scipy sparse-matrix type and is element-wise for sparse arrays.
Xc = X.T @ X  # Co-occurrence matrix
Xc.setdiag(0)  # Set diagonal to zero so a term is not linked to itself

# Create graph from co-occurrence matrix (nodes are integer term indices)
G = nx.from_scipy_sparse_array(Xc)

# Map indices to words
terms = vectorizer.get_feature_names_out()
mapping = dict(enumerate(terms))
G = nx.relabel_nodes(G, mapping)

# Filter edges by weight (co-occurrence count); only pairs above the
# threshold are kept, and only terms touching such an edge appear in H.
threshold = 100  # Adjust this threshold based on your dataset
edges = [(u, v, d) for u, v, d in G.edges(data=True) if d['weight'] > threshold]
H = nx.Graph()
H.add_edges_from(edges)

# Compute node degrees
degrees = dict(H.degree())

# Detect communities and record, for every term, the index of its community
communities = community.greedy_modularity_communities(H)
community_map = {}
for i, com in enumerate(communities):
    for name in com:
        community_map[name] = i

# Assign colors to communities (one integer per node, mapped through the cmap)
colors = [community_map[node] for node in H.nodes()]

# Draw the network
plt.figure(figsize=(15, 15))
pos = nx.spring_layout(H, k=0.15, seed=42)  # Seed for reproducibility

# Draw nodes with sizes proportional to degree
nx.draw_networkx_nodes(H, pos,
                       node_size=[degrees[node] * 10 for node in H.nodes()],
                       node_color=colors,
                       cmap=plt.cm.tab20,
                       alpha=0.7)

# Draw edges with widths proportional to weight
nx.draw_networkx_edges(H, pos,
                       width=[d['weight'] * 0.01 for (u, v, d) in H.edges(data=True)],
                       alpha=0.5)

# Draw node labels
nx.draw_networkx_labels(H, pos, font_size=12, font_color='black')

# Draw edge labels showing the co-occurrence count of each kept pair
edge_labels = {(u, v): d['weight'] for u, v, d in H.edges(data=True)}
nx.draw_networkx_edge_labels(H, pos, edge_labels=edge_labels, font_size=8)

plt.title('Enhanced Co-occurrence Network of Terms in Abstracts')
plt.axis('off')
plt.show()


Network Visualization:

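- **Co-occurrence Network Construction**: We employed the CountVectorizer from scikit-learn to tokenize the abstracts and create a term-document matrix. By computing the co-occurrence matrix, we identified pairs of terms that appear together in the same abstracts; pairs whose co-occurrence count exceeds a chosen threshold become the edges of the network.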
- **Graph Creation and Enhancement:** Using NetworkX, we constructed a graph where nodes represented terms, and edges indicated co-occurrence relationships. To enhance the visualization:

 - **Node Sizing**: Nodes were sized proportionally to their degree, emphasizing terms with more connections.
 - **Community Detection**: We applied the greedy modularity algorithm to detect communities within the network, assigning distinct colors to each cluster to highlight related groups of terms.
 - **Edge Weighting**: Edge widths were adjusted based on co-occurrence frequency, underscoring stronger associations between terms.



### Centrality Measures: Identify the most influential terms in the network using centrality measures.

In [None]:
# Three centrality scores for every term in the filtered graph H.
degree_centrality = nx.degree_centrality(H)
betweenness_centrality = nx.betweenness_centrality(H)
closeness_centrality = nx.closeness_centrality(H)

# Ten highest-scoring terms by degree centrality, kept as (term, score) pairs.
ranked_by_degree = sorted(degree_centrality.items(), key=lambda item: item[1], reverse=True)
top_terms = ranked_by_degree[:10]
print("Top terms by degree centrality:", [name for name, _ in top_terms])

# Plot H with each node scaled by its degree centrality.
plt.figure(figsize=(15, 15))
pos = nx.spring_layout(H, k=0.15, seed=42)
degree_sizes = [degree_centrality[node] * 2000 for node in H.nodes()]
nx.draw_networkx_nodes(H, pos, node_size=degree_sizes, node_color='skyblue', alpha=0.7)
nx.draw_networkx_edges(H, pos, alpha=0.5)
nx.draw_networkx_labels(H, pos, font_size=10)
plt.title('Network with Node Size Proportional to Degree Centrality')
plt.axis('off')
plt.show()


### Clustering Coefficient Analysis: Investigate the local clustering of terms.

In [None]:
# Local clustering coefficient of every term in H.
clustering_coefficients = nx.clustering(H)

# Terms whose coefficient exceeds 0.5.
high_clustering_nodes = [
    node for node in clustering_coefficients
    if clustering_coefficients[node] > 0.5
]
print("Nodes with high clustering coefficients:", high_clustering_nodes)

# Plot H with node color encoding the clustering coefficient.
plt.figure(figsize=(15, 15))
pos = nx.spring_layout(H, k=0.15, seed=42)
coefficient_colors = [clustering_coefficients[node] for node in H.nodes()]
nx.draw_networkx_nodes(H, pos,
                       node_size=100,
                       node_color=coefficient_colors,
                       cmap=plt.cm.viridis,
                       alpha=0.7)
nx.draw_networkx_edges(H, pos, alpha=0.5)
nx.draw_networkx_labels(H, pos, font_size=10)
plt.title('Network Colored by Clustering Coefficient')
plt.axis('off')
plt.show()


## Eigenvector Centrality: Highlight nodes based on their influence in the network.

In [None]:
# Eigenvector centrality of every term in H.
eigenvector_centrality = nx.eigenvector_centrality(H)

# Ten highest-scoring terms, kept as (term, score) pairs.
ranked_by_eigenvector = sorted(eigenvector_centrality.items(), key=lambda item: item[1], reverse=True)
top_eigenvector_terms = ranked_by_eigenvector[:10]
print("Top terms by eigenvector centrality:", [name for name, _ in top_eigenvector_terms])

# Plot H with each node scaled by its eigenvector centrality.
plt.figure(figsize=(15, 15))
pos = nx.spring_layout(H, k=0.15, seed=42)
eigenvector_sizes = [eigenvector_centrality[node] * 2000 for node in H.nodes()]
nx.draw_networkx_nodes(H, pos, node_size=eigenvector_sizes, node_color='lightgreen', alpha=0.7)
nx.draw_networkx_edges(H, pos, alpha=0.5)
nx.draw_networkx_labels(H, pos, font_size=10)
plt.title('Network with Node Size Proportional to Eigenvector Centrality')
plt.axis('off')
plt.show()


## Core-Periphery Analysis: Identify core and peripheral nodes in the network.

In [None]:
# Perform a core-periphery analysis: core_number maps each node to its
# k-core number.
core_number = nx.core_number(H)

# Highlight core nodes: those whose core number equals the maximum observed.
max_core = max(core_number.values())
core_nodes = [node for node, core in core_number.items() if core == max_core]
print("Core nodes:", core_nodes)

# A set gives constant-time membership tests when splitting off the
# periphery; testing against the list would rescan it for every node.
core_node_set = set(core_nodes)
peripheral_nodes = [node for node in H.nodes() if node not in core_node_set]

# Draw the network with core nodes highlighted
plt.figure(figsize=(15, 15))
pos = nx.spring_layout(H, k=0.15, seed=42)
nx.draw_networkx_nodes(H, pos,
                       nodelist=core_nodes,
                       node_size=300,
                       node_color='red',
                       alpha=0.7,
                       label='Core Nodes')
nx.draw_networkx_nodes(H, pos,
                       nodelist=peripheral_nodes,
                       node_size=100,
                       node_color='grey',
                       alpha=0.7,
                       label='Peripheral Nodes')
nx.draw_networkx_edges(H, pos, alpha=0.5)
nx.draw_networkx_labels(H, pos, font_size=10)
plt.title('Core-Periphery Structure in the Network')
plt.legend()
plt.axis('off')
plt.show()


## Hierarchical Community Detection: Use hierarchical clustering to detect communities.

In [None]:
# Endpoints: the two highest-ranked terms by degree centrality.
node1 = top_terms[0][0]
node2 = top_terms[1][0]

# Shortest path in H between the two endpoints.
shortest_path = nx.shortest_path(H, source=node1, target=node2)
print(f"Shortest path between {node1} and {node2}:", shortest_path)

# Consecutive node pairs along the path, i.e. the edges to highlight.
path_edges = list(zip(shortest_path, shortest_path[1:]))

# Plot the whole network, then overdraw the path edges in red.
plt.figure(figsize=(15, 15))
pos = nx.spring_layout(H, k=0.15, seed=42)
nx.draw_networkx_nodes(H, pos, node_size=100, alpha=0.7)
nx.draw_networkx_edges(H, pos, alpha=0.5)
nx.draw_networkx_edges(H, pos, edgelist=path_edges,
                       edge_color='red', width=2, alpha=0.8, label='Shortest Path')
nx.draw_networkx_labels(H, pos, font_size=10)
plt.title('Network Highlighting the Shortest Path')
plt.legend()
plt.axis('off')
plt.show()


## Shortest Path Analysis: Visualize shortest paths between key nodes.

In [None]:
# Endpoints: the two highest-ranked terms by degree centrality.
node1 = top_terms[0][0]
node2 = top_terms[1][0]

# Shortest path in H between the two endpoints.
shortest_path = nx.shortest_path(H, source=node1, target=node2)
print(f"Shortest path between {node1} and {node2}:", shortest_path)

# Consecutive node pairs along the path, i.e. the edges to highlight.
path_edges = list(zip(shortest_path, shortest_path[1:]))

# Plot the whole network, then overdraw the path edges in red.
plt.figure(figsize=(15, 15))
pos = nx.spring_layout(H, k=0.15, seed=42)
nx.draw_networkx_nodes(H, pos, node_size=100, alpha=0.7)
nx.draw_networkx_edges(H, pos, alpha=0.5)
nx.draw_networkx_edges(H, pos, edgelist=path_edges,
                       edge_color='red', width=2, alpha=0.8, label='Shortest Path')
nx.draw_networkx_labels(H, pos, font_size=10)
plt.title('Network Highlighting the Shortest Path')
plt.legend()
plt.axis('off')
plt.show()


# Part III: More on NLP Analysis

In [None]:
# Notebook shell command: install the third-party packages listed below.
!pip install pandas transformers sentence-transformers scikit-learn matplotlib


In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

# Load the pre-trained SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate one embedding vector per abstract
embeddings = model.encode(df['abstract'].tolist())

# Number of clusters for topic modeling
num_clusters = 3
kmeans = KMeans(n_clusters=num_clusters, random_state=0)
kmeans.fit(embeddings)
# Store each abstract's cluster label alongside it
df['topic'] = kmeans.labels_

# Display the topics. KMeans labels run from 0 to num_clusters - 1, so the
# loop must start at 0; starting at 1 would never print the first topic.
for i in range(num_clusters):
    print(f"Topic {i}:")
    print(df[df['topic'] == i]['abstract'].tolist())
    print()


In [None]:
# Notebook shell command: install tqdm, which the next cell imports for its progress bar.
!pip install tqdm


In [None]:
from transformers import pipeline
from tqdm import tqdm

# Initialize the summarization pipeline
summarizer = pipeline('summarization', model='facebook/bart-large-cnn')

# Only summarize the first 5 abstracts. Take an explicit copy so that adding
# the 'summary' column below modifies an independent DataFrame rather than a
# slice of `df` (which triggers pandas' SettingWithCopyWarning).
df_test = df.head(5).copy()

# Apply summarization with a progress bar
summaries = []
for abstract in tqdm(df_test['abstract'], desc="Summarizing Abstracts (Test)"):
    # The pipeline returns a list with one dict per input; keep its text.
    summary = summarizer(abstract, max_length=50, min_length=25, do_sample=False)[0]['summary_text']
    summaries.append(summary)

# Add summaries to the test DataFrame
df_test['summary'] = summaries

# Display original abstracts and their summaries
for idx, row in df_test.iterrows():
    print(f"Original Abstract: {row['abstract']}")
    print(f"Summary: {row['summary']}")
    print()
