## **Importing the Necessary modules**

In [None]:
import pandas as pd                                     # pandas: reading the dataset file and tabular manipulation
import numpy as np                                      # numpy: numerical helpers (used to average centrality values)
import sklearn                                          # scikit-learn; NOTE(review): not referenced in the visible cells
import seaborn as sns                                   # seaborn: data visualization; NOTE(review): not referenced in the visible cells
import matplotlib.pyplot as plt                         # matplotlib: figure and graph plotting
import networkx as nx                                   # networkx: graph construction, analysis and drawing
from itertools import combinations                      # combinations: enumerate every unordered pair of nodes
import random                                           # random: picking a random pair of nodes for path analysis
from multiprocessing import Pool                        # NOTE(review): not referenced in the visible cells
from networkx.algorithms.community import greedy_modularity_communities  # Community detection by greedy modularity maximization
from community import community_louvain                 # Community detection using Louvain algorithm


pd.set_option("display.max_columns", None)              # Show every column when a DataFrame is displayed
pd.set_option("display.max_rows", 200)                  # Show at most 200 rows when a DataFrame is displayed

## **Loading the social network dataset file from open source**

In [None]:
# NOTE(review): absolute path -- looks like a Colab upload location; adjust it when running elsewhere.
dataset_path = '/content/SocialMediaUsersDataset.csv'
data = pd.read_csv(dataset_path)      # Read the CSV file into a pandas DataFrame
data.head()                           # Preview the first rows

In [None]:
data.shape                # (rows, columns) of the full dataset

#### **Observations**

The full dataset contains a very large number of records (nodes), so running the analysis on all of them would take a very long time. Our analysis needs at least 1k to 1.5k nodes, so we sample 2k nodes from the dataset below to keep the analysis tractable.

## **1. Network Construction Process**



#### **1.1 Sampling 2k nodes**

In [None]:
# Draw a reproducible random sample of at most 2000 records (nodes).
# DataFrame.sample raises ValueError when n exceeds the number of rows,
# so the requested size is clamped to the size of the dataset.
sample_size = min(2000, len(data))
df = data.sample(n=sample_size, random_state=42)
df.head()

In [None]:
df.shape                # (rows, columns) of the sampled DataFrame built above

#### **1.2 Determining Nodes and Edges Relationship**

In [None]:
G = nx.Graph()                                                             # Create an empty undirected graph

# One node per sampled user; the profile columns are stored as node attributes.
for _, row in df.iterrows():
    G.add_node(row['UserID'], Gender=row['Gender'], DOB=row['DOB'],
               Interests=row['Interests'], City=row['City'], Country=row['Country'])

# Parse every user's interest list exactly once instead of re-splitting it for
# each of the ~n^2/2 node pairs. Missing values (pandas reads empty cells as
# NaN, a float) and empty tokens are treated as "no interests" rather than
# crashing on .split() or creating edges over an empty-string "interest".
interest_sets = {
    user: ({token for token in attrs['Interests'].split(', ') if token}
           if isinstance(attrs['Interests'], str) else set())
    for user, attrs in G.nodes(data=True)
}

# Connect two users when they share at least one interest; the edge weight is
# the number of interests they have in common.
for (user1, interests1), (user2, interests2) in combinations(interest_sets.items(), 2):
    shared = interests1 & interests2
    if shared:
        G.add_edge(user1, user2, weight=len(shared))

print("Summary of Network:")                                                 # Summarize key information
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")



##### **Observations**

The sampled network has 2k nodes in total and 528850 edges connecting them. This shows that the nodes are densely connected: on average each node is linked to about 529 other nodes (2 × 528850 / 2000).

#### **1.3 Additional: Edge weights and network visualization**

In [None]:
# Collect the per-node 'Gender' attribute and the per-edge 'weight' attribute from the graph.
node_attributes = nx.get_node_attributes(G, 'Gender')
edge_weights = nx.get_edge_attributes(G, 'weight')

# Report the mean edge weight; nothing is printed when no edge carries a weight.
if len(edge_weights) > 0:
    total_weight = sum(edge_weights.values())
    avg_edge_weight = total_weight / len(edge_weights)
    print(f"Average edge weight: {avg_edge_weight:.2f}")

# Basic visualization of the whole network; the layout is seeded for reproducibility.
pos = nx.spring_layout(G, seed=42)
plt.figure(figsize=(10, 7))
draw_options = dict(node_size=20, edge_color='lightgreen', with_labels=False)
nx.draw(G, pos, **draw_options)
plt.title('Network Visualization')
plt.show()

## **2. Network Analysis Process**



#### **2.1 Degree distribution analysis**

In [None]:
# Degree of every node, in node-iteration order.
degrees = [node_degree for _, node_degree in G.degree()]

# Histogram of the degree distribution.
plt.hist(degrees)
plt.xlabel('Degree')
plt.ylabel('Number of Nodes')
plt.title('Degree Distribution')
plt.show()


In [None]:
# Node degrees, largest first.
degree_sequence = [node_degree for _, node_degree in G.degree()]
degree_sequence.sort(reverse=True)

# Number of nodes observed for each degree value, ordered by degree.
degree_count = pd.Series(degree_sequence).value_counts().sort_index()

# Bar chart: one bar per distinct degree.
plt.figure(figsize=(12, 7))
plt.bar(degree_count.index, degree_count.values)
plt.title('Node Degree Distribution')
plt.xlabel('Degree')
plt.ylabel('Number of Nodes')
plt.show()

#### **2.2 Connected components analysis**

In [None]:
# Sizes of the connected components, largest first.
component_sizes = (len(component) for component in nx.connected_components(G))
connected_components = sorted(component_sizes, reverse=True)
print("Number of Connected Components:", len(connected_components))


#### **2.3 Path analysis**

In [None]:
# Path analysis example: shortest path between two randomly chosen nodes.
nodes = list(G.nodes())

# Use a dedicated, seeded generator so the chosen pair is the same on every
# run (matching the seeded sampling and layout used earlier) without touching
# the global `random` state.
rng = random.Random(42)
source, target = rng.sample(nodes, 2)

try:
    shortest_path = nx.shortest_path(G, source=source, target=target)
    print(f"Shortest path from {source} to {target}: {shortest_path}")
except nx.NetworkXNoPath:
    # The two nodes lie in different connected components.
    print(f"No path exists between {source} and {target}.")


In [None]:
# Path analysis visualization: highlight the shortest path between the first
# two nodes of the graph on top of the full-network drawing.
# Relies on `pos`, the spring layout of G computed in the network
# visualization cell (section 1.3).
node_list = list(G.nodes())
source_node = node_list[0]
target_node = node_list[1]

try:
    shortest_path = nx.shortest_path(G, source=source_node, target=target_node)
except nx.NetworkXNoPath:
    # The two nodes are in different components: draw the network without a highlight.
    shortest_path = []
    print(f"No path exists between {source_node} and {target_node}.")

# Consecutive node pairs along the path are the edges to highlight.
path_edges = list(zip(shortest_path, shortest_path[1:]))

plt.figure(figsize=(12, 7))
nx.draw(G, pos, node_size=20, edge_color='lightgreen', with_labels=False)
if shortest_path:
    nx.draw_networkx_nodes(G, pos, nodelist=shortest_path, node_color='darkblue', node_size=20)
    nx.draw_networkx_edges(G, pos, edgelist=path_edges, edge_color='red', width=2)
plt.title('Shortest Path Visualization')
plt.show()

#### **2.4 Clustering Coefficient and Density analysis**

In [None]:
avg_clustering = nx.average_clustering(G)                         # Mean of the per-node clustering coefficients of G
print(f"Average clustering coefficient: {avg_clustering}")

In [None]:
density = nx.density(G)                                           # Network density of G as computed by networkx
print(f"Network density: {density}")

#### **2.5 Centrality analysis**
Statistics compared with those of

(i) ER,

(ii) BA, and

(iii) WS

graphs having a similar number of nodes and edges.

In [None]:
# Degree centrality of every node in the original graph.
degree_centrality = nx.degree_centrality(G)

# Rank (node, score) pairs from highest to lowest score and keep the top 10.
ranked_nodes = sorted(degree_centrality.items(), key=lambda item: item[1], reverse=True)
top_10 = ranked_nodes[:10]
print("Top 10 nodes by degree centrality:", top_10)

In [None]:
def average_centrality(centrality):
    """Return the arithmetic mean of the values of a centrality mapping.

    Parameters
    ----------
    centrality : dict
        Mapping whose values are numeric centrality scores (for example the
        result of ``nx.degree_centrality``). The keys are ignored.

    Returns
    -------
    float
        Mean of the scores, or ``nan`` when the mapping is empty.
    """
    scores = list(centrality.values())
    if not scores:
        # np.mean([]) also evaluates to nan, but emits a RuntimeWarning;
        # return nan directly so an empty graph is handled quietly.
        return float("nan")
    return np.mean(scores)


In [None]:
degree_centrality = nx.degree_centrality(G)                                 # Degree centrality per node of the original graph (same call as in the top-10 cell above)
closeness_centrality = nx.closeness_centrality(G)                           # Closeness centrality per node of the original graph


In [None]:
# Report the average of each centrality measure for the original graph.
print("Original Graph:")
for measure_name, measure in (("Degree", degree_centrality), ("Closeness", closeness_centrality)):
    print(f"Average {measure_name} Centrality:", average_centrality(measure))


##### **(i) ER Comparison**


In [None]:
# Erdős-Rényi reference graph with the same number of nodes and the same
# expected number of edges as the original graph.
n = G.number_of_nodes()
m = G.number_of_edges()

# Edge probability = observed edges / possible edges; guarded so a graph with
# fewer than two nodes does not divide by zero.
p = 2 * m / (n * (n - 1)) if n > 1 else 0.0

# Seeded so the random graph (and every statistic derived from it) is reproducible.
G_er = nx.erdos_renyi_graph(n, p, seed=42)


In [None]:
degree_centrality_er = nx.degree_centrality(G_er)                # Degree centrality per node of the ER graph
closeness_centrality_er = nx.closeness_centrality(G_er)          # Closeness centrality per node of the ER graph


In [None]:
# Report the average of each centrality measure for the ER graph.
print("Erdős-Rényi Graph:")
for measure_name, measure in (("Degree", degree_centrality_er), ("Closeness", closeness_centrality_er)):
    print(f"Average {measure_name} Centrality:", average_centrality(measure))


##### **(ii) BA Comparison**


In [None]:
# Barabási-Albert reference graph: each new node attaches with as many edges
# as the original graph's edges-per-node ratio (m / n, rounded down).
# The count is clamped to at least 1 because networkx rejects 0 attachments,
# and the generator is seeded so the graph is reproducible.
# Uses `n` and `m` from the Erdős-Rényi cell above.
edges_per_new_node = max(1, m // n)
G_ba = nx.barabasi_albert_graph(n, edges_per_new_node, seed=42)


In [None]:
degree_centrality_ba = nx.degree_centrality(G_ba)                  # Degree centrality per node of the BA graph
closeness_centrality_ba = nx.closeness_centrality(G_ba)            # Closeness centrality per node of the BA graph


In [None]:
# Report the average of each centrality measure for the BA graph.
print("Barabási-Albert Graph:")
for measure_name, measure in (("Degree", degree_centrality_ba), ("Closeness", closeness_centrality_ba)):
    print(f"Average {measure_name} Centrality:", average_centrality(measure))


##### **(iii) WS Comparison**


In [None]:
# Watts-Strogatz reference graph.
# k is the average degree of the original graph (2m / n, rounded down): each
# node starts connected to its k nearest neighbours in a ring.
# Uses `n` and `m` from the Erdős-Rényi cell above.
k = int(2*m/n)
p_ws = 0.1                                           # Rewiring probability

# Seeded so the rewiring (and every statistic derived from it) is reproducible.
G_ws = nx.watts_strogatz_graph(n, k, p_ws, seed=42)


In [None]:
degree_centrality_ws = nx.degree_centrality(G_ws)         # Degree centrality per node of the WS graph
closeness_centrality_ws = nx.closeness_centrality(G_ws)   # Closeness centrality per node of the WS graph


In [None]:
# Report the average of each centrality measure for the WS graph.
print("Watts-Strogatz Graph:")
for measure_name, measure in (("Degree", degree_centrality_ws), ("Closeness", closeness_centrality_ws)):
    print(f"Average {measure_name} Centrality:", average_centrality(measure))


## **3. Open Question**


#### **3.1 Our Question based on our dataset analysis**

**Question:** How do shared interests influence the clustering of users within specific geographical locations?


##### **Solution**

##### **Step 1:** We should identify subgraphs for users within the same city or country. Below we do this with Python code and visualize the resulting graph.

In [None]:
# Detect communities by greedy modularity maximization.
communities = greedy_modularity_communities(G)
print(f"Number of communities: {len(communities)}")

# Visualize the largest detected community as an induced subgraph of G.
largest_community = max(communities, key=len)
subgraph = G.subgraph(largest_community)

# Seeded layout so the picture is the same on every run.
# NOTE: this rebinds `pos`, which previously held the layout of the full graph G.
pos = nx.spring_layout(subgraph, seed=42)
nx.draw(subgraph, pos, node_size=10, edge_color="lightgreen", node_color="blue", with_labels=False)
plt.title('Largest Community (Greedy Modularity)')
plt.show()

##### **Step 2:** We should compute and compare the clustering coefficients and density within these subgraphs. Below we do this with Python code.

In [None]:
# Node degrees, largest first.
degree_sequence = [node_degree for _, node_degree in G.degree()]
degree_sequence.sort(reverse=True)

# Number of nodes observed for each degree value, ordered by degree.
degree_count = pd.Series(degree_sequence).value_counts().sort_index()

# Bar chart: one bar per distinct degree.
plt.figure(figsize=(10, 6))
plt.bar(degree_count.index, degree_count.values)
plt.title('Node Degree Distribution')
plt.xlabel('Degree')
plt.ylabel('Number of Nodes')
plt.show()

In [None]:
# Clustering coefficient of every node, keyed by node.
clustering_coeffs = nx.clustering(G)
coefficient_values = list(clustering_coeffs.values())

# Histogram of the per-node clustering coefficients.
plt.figure(figsize=(10, 6))
plt.hist(coefficient_values)
plt.title('Clustering Coefficient Distribution')
plt.xlabel('Clustering Coefficient')
plt.ylabel('Number of Nodes')
plt.show()

##### **Step 3:** We should identify any noticeable patterns in how users with shared interests cluster geographically. Below we do this with Python code and visualize the connectivity.

In [None]:
# Louvain community detection. The algorithm is randomized, so a fixed
# random_state keeps the community ids stable between runs (the next cell
# selects a community by id).
partition = community_louvain.best_partition(G, random_state=42)

# Store each node's community id as the 'Community' node attribute.
nx.set_node_attributes(G, partition, 'Community')


In [None]:
# Visualize one Louvain community; change the id to inspect another one.
community_id_to_visualize = 0

# Nodes whose 'Community' attribute matches the chosen id. `.get` is used so
# a node without the attribute is skipped instead of raising KeyError.
community_nodes = [node for node, attrs in G.nodes(data=True)
                   if attrs.get('Community') == community_id_to_visualize]

# Edges with both endpoints inside the community. Membership is tested against
# a set: testing against the list would scan it for every edge of G.
community_node_set = set(community_nodes)
community_edges = [(u, v) for u, v in G.edges()
                   if u in community_node_set and v in community_node_set]

# Build a standalone graph holding only the community's nodes and edges.
community_graph = nx.Graph()
community_graph.add_nodes_from(community_nodes)
community_graph.add_edges_from(community_edges)

# Draw the community network with a seeded (reproducible) layout.
# NOTE: this rebinds `pos` to the layout of the community graph.
plt.figure(figsize=(10, 8))
pos = nx.spring_layout(community_graph, seed=42)
nx.draw(community_graph, pos, with_labels=True, node_color='skyblue', node_size=200, edge_color='gray', linewidths=0.5)
plt.title(f'Community {community_id_to_visualize} Network')
plt.show()