<a href="https://colab.research.google.com/github/andrybrew/IHT-SEM1302-30Okt/blob/main/practice_material/001_social_network_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Social Network Analysis**

## **Importing required libraries**

In [None]:
!pip install 'networkx'

In [None]:
import pandas as pd
import networkx as nx
from community import community_louvain
import matplotlib.pyplot as plt
import re

## **Importing Dataset**

In [None]:
# Location of the dataset CSV on GitHub
data_url = "https://raw.githubusercontent.com/andrybrew/IHT-SEM1302-30Okt/main/data/001_suku-bunga.csv"

# Read the remote CSV straight into a DataFrame
df_tweet = pd.read_csv(filepath_or_buffer=data_url)

# Display the text, username and in_reply_to columns
df_tweet.loc[:, ['text', 'username', 'in_reply_to']]

## **Building the Network Graph**

In [None]:
# Build the raw edgelist in a single chain:
#   1. keep the 'username' and 'in_reply_to' columns,
#   2. rename them to 'source' and 'target',
#   3. drop rows with a missing value,
#   4. drop rows whose target is the literal '[]' or an empty string ''.
edgelist = (
    df_tweet[['username', 'in_reply_to']]
    .rename(columns={'username': 'source', 'in_reply_to': 'target'})
    .dropna()
    .loc[lambda frame: ~frame['target'].isin(['[]', ''])]
)

# Function to extract all usernames from a string
def extract_mentions(s):
    """Return every '@'-prefixed handle found in ``s``.

    Parameters
    ----------
    s : str
        Text to scan for handles.

    Returns
    -------
    list of str
        The handles in order of appearance, each still carrying its
        leading '@'. An empty list is returned when ``s`` contains no
        handle or is not a string (e.g. None or NaN).
    """
    # Non-string values would make re.findall raise a TypeError
    if not isinstance(s, str):
        return []
    # A handle is '@' followed by one or more word characters
    # (letters, digits, underscore)
    return re.findall(r'@[\w_]+', s)

# Apply extract_mentions to the 'target' column to obtain a list of usernames per row.
# assign() returns a new frame instead of writing into a filtered slice.
edgelist = edgelist.assign(target=edgelist['target'].apply(extract_mentions))

# Separate list entries into individual rows (one row per source/target pair)
edgelist = edgelist.explode('target')

# Rows whose mention list was empty come out of explode() as NaN;
# drop them so they do not end up as a NaN node when the graph is built
edgelist = edgelist.dropna(subset=['target'])

# Remove self-loops
# NOTE(review): targets keep their leading '@'. If the 'username' values do not
# carry one, a self-reply will not be detected here and the same account will
# appear under two different node names - confirm against the data.
edgelist = edgelist[edgelist['source'] != edgelist['target']]

# Display the edgelist
edgelist

In [None]:
# Save the edgelist to 'edgelist.csv', leaving out the DataFrame index
edgelist.to_csv(path_or_buf='edgelist.csv', index=False)

In [None]:
# Build the graph: every edgelist row becomes one source-target edge
G = nx.from_pandas_edgelist(df=edgelist, source='source', target='target')

# Quick look at the graph with node labels, positioned by the Kamada-Kawai layout
nx.draw(
    G,
    with_labels=True,
    font_size=7,
    pos=nx.kamada_kawai_layout(G),
)

### **Network Analysis: Core Properties**

In [None]:
# Size of the full graph
num_nodes = nx.number_of_nodes(G)
num_edges = nx.number_of_edges(G)

# Density of the full graph
density = nx.density(G)

# Path length and diameter are measured on the Largest Connected Component (LCC),
# i.e. the connected component containing the most nodes
largest_cc = max(nx.connected_components(G), key=len)
subgraph = G.subgraph(largest_cc)
lcc_avg_path_length = nx.average_shortest_path_length(subgraph)
lcc_diameter = nx.diameter(subgraph)

# Collect the results in a one-row summary table
network_properties = pd.DataFrame([{
    'num_nodes': num_nodes,
    'num_edges': num_edges,
    'average_path_length': lcc_avg_path_length,
    'diameter': lcc_diameter,
    'density': density,
}])

# Show network properties
network_properties

### **Centrality Measures: Identifying Influencers**

In [None]:
# Calculate degree centrality
degree_centrality = nx.degree_centrality(G)

# Calculate betweenness centrality
betweenness_centrality = nx.betweenness_centrality(G)

# Calculate closeness centrality
closeness_centrality = nx.closeness_centrality(G)

# Calculate eigenvector centrality.
# max_iter is raised above the library default to give the power iteration
# more room to converge before it raises a convergence error.
eigenvector_centrality = nx.eigenvector_centrality(G, max_iter=1000)

# Create a dataframe based on the results: each centrality dict becomes one row,
# and the transpose turns that into one row per node, one column per measure
centralities = pd.DataFrame([degree_centrality, betweenness_centrality,
                             closeness_centrality, eigenvector_centrality]).T
centralities.columns = ['degree_centrality', 'betweenness_centrality',
                        'closeness_centrality', 'eigenvector_centrality']

# Show centralities
centralities

### **Community Detection: Analyzing Modularity**

In [None]:
# Detect communities with the Louvain method.
# The algorithm is randomised, so random_state is fixed to make the
# partition (and the plots coloured by it) the same on every run.
partition = community_louvain.best_partition(G, random_state=42)

# Retrieve the community number for each node, in the graph's node order
values = [partition.get(node) for node in G.nodes()]

# Display the dictionary containing the node-community mappings
partition

### **Advanced Visualization**

In [None]:
# Draw the labelled graph, colouring each node by its community number
nx.draw_networkx(
    G,
    pos=nx.kamada_kawai_layout(G),
    node_color=values,
    cmap=plt.get_cmap('jet'),
    with_labels=True,
    font_size=7,
)

In [None]:
# Create the large canvas first: plt.figure() opens a new, empty figure,
# so it has to be called before anything is drawn for the size to apply
plt.figure(figsize=(30, 30))

# Convert degree centrality to a list of sizes for nodes
node_sizes = [degree_centrality[node] * 1000 for node in G.nodes()]

# Convert degree centrality to a dict of font sizes for labels
label_sizes = {node: degree_centrality[node] * 150 for node in G.nodes()}

# Draw the network with node sizes based on degree centrality.
# The layout is randomly initialised; a fixed seed keeps node positions stable across runs.
pos = nx.fruchterman_reingold_layout(G, seed=42)
nx.draw(G, pos, with_labels=False, node_size=node_sizes, cmap = plt.get_cmap('jet'), node_color = values)

# Add labels with sizes based on degree centrality
for node, (x, y) in pos.items():
    plt.text(x, y, node, fontsize=label_sizes[node])

plt.show()