In [4]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
from community import community_louvain
import os

In [6]:
# Multipliers for the abbreviated counts used in the source CSV (e.g. "1.5M").
_COUNT_SUFFIXES = {'K': 1_000, 'M': 1_000_000, 'B': 1_000_000_000}


def _parse_count(text):
    """Convert one abbreviated count string ('12K', '1.5M', '2000', '') to a float.

    An empty string maps to 0; the string 'nan' (what ``astype(str)`` produces
    for a missing cell) maps to NaN via ``float('nan')``.
    """
    text = text.strip()
    if not text:
        return 0.0
    multiplier = _COUNT_SUFFIXES.get(text[-1])
    if multiplier is not None:
        return float(text[:-1]) * multiplier
    return float(text)


def load_and_clean_data(file_path):
    """Load the influencer CSV and derive numeric engagement columns.

    Steps:
      1. Drop rows missing any of 'Name', 'Country', 'Subscribers', 'Category_2'.
      2. Convert 'Subscribers', 'Avg. views', 'Avg. likes' and 'Avg Comments'
         from strings such as '1,200', '15K' or '2.3M' to floats.
      3. Replace missing views / likes / comments with 0.
      4. Add 'Engagement Rate' (likes / views) and 'Comment Rate'
         (comments / views).

    Rows with zero average views get NaN for both rates instead of ``inf``;
    an infinite rate would make the column mean and standard deviation
    unusable for any later Z-score computation.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file, passed to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Cleaned frame with the two additional rate columns.
    """
    df = pd.read_csv(file_path)
    # .copy() so the column assignments below never write into a view.
    df = df.dropna(subset=['Name', 'Country', 'Subscribers', 'Category_2']).copy()

    for col in ['Subscribers', 'Avg. views', 'Avg. likes', 'Avg Comments']:
        df[col] = (
            df[col]
            .astype(str)
            .str.replace(',', '', regex=False)  # strip thousands separators
            .map(_parse_count)
        )

    df = df.fillna({'Avg Comments': 0, 'Avg. likes': 0, 'Avg. views': 0})

    # Treat zero views as "rate undefined" (NaN) rather than dividing by zero.
    views = df['Avg. views'].replace(0, np.nan)
    df['Engagement Rate'] = df['Avg. likes'] / views
    df['Comment Rate'] = df['Avg Comments'] / views
    return df

def detect_anomalies(df, z_threshold=3):
    """Flag rows whose engagement or comment rate is a statistical outlier.

    Adds three columns to ``df`` in place and returns the same frame:
    'Engagement_Z' and 'Comment_Z' (each rate standardised with the column's
    own mean and standard deviation) and 'Anomaly' (1 when the absolute value
    of either Z-score is greater than ``z_threshold``, otherwise 0).

    A summary of the flagged rows is printed as a side effect.
    """
    z_columns = {'Engagement Rate': 'Engagement_Z', 'Comment Rate': 'Comment_Z'}
    for rate_col, z_col in z_columns.items():
        rate = df[rate_col]
        df[z_col] = (rate - rate.mean()) / rate.std()

    # A row is anomalous as soon as one of its Z-scores leaves the allowed band.
    out_of_band = df[list(z_columns.values())].abs().gt(z_threshold)
    df['Anomaly'] = out_of_band.any(axis=1).astype(int)

    flagged = df.loc[df['Anomaly'].eq(1)]
    print(f"\n Detected {len(flagged)} suspicious influencers (potential fake engagement):")
    print(flagged[['Name', 'Engagement Rate', 'Comment Rate', 'Engagement_Z', 'Comment_Z']])
    return df

def build_graph(df):
    """Build an undirected similarity graph with one node per influencer.

    Every row becomes a node keyed by 'Name' and carrying the row's
    subscriber, category, country, view, like, comment and rate values as
    node attributes.

    Each unordered pair of rows receives a similarity score that is the sum
    of the weights of the criteria the pair satisfies:

    * same 'Category_2'                                   -> 0.5
    * same 'Country'                                      -> 0.05
    * log10(subscribers + 1) differ by less than 1        -> 0.15
    * 'Engagement Rate' differs by less than 0.1          -> 0.3

    An edge with ``weight=similarity`` is added only when the score is
    strictly greater than 0.2.

    Pairs are enumerated once each in row order, which matches the previous
    ``i < j`` index comparison for a frame with a unique, increasing index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns read below ('Name', 'Subscribers',
        'Category_2', 'Country', 'Avg. views', 'Avg. likes', 'Avg Comments',
        'Engagement Rate', 'Comment Rate').

    Returns
    -------
    networkx.Graph
    """
    # Function-scope import keeps this cell runnable on its own.
    from itertools import combinations

    G = nx.Graph()
    CATEGORY_WEIGHT = 0.5
    COUNTRY_WEIGHT = 0.05
    SUBSCRIBER_WEIGHT = 0.15
    ENGAGEMENT_WEIGHT = 0.3
    LOG_SUBSCRIBER_TOLERANCE = 1   # max gap in log10(subscribers + 1)
    ENGAGEMENT_TOLERANCE = 0.1     # max gap in engagement rate
    EDGE_THRESHOLD = 0.2           # similarity must exceed this to create an edge

    for _, row in df.iterrows():
        G.add_node(row['Name'], subscribers=row['Subscribers'], category=row['Category_2'],
                   country=row['Country'], views=row['Avg. views'], likes=row['Avg. likes'],
                   comments=row['Avg Comments'], engagement_rate=row['Engagement Rate'],
                   comment_rate=row['Comment Rate'])

    # Pull the compared columns out once; indexing plain lists inside the pair
    # loop avoids rebuilding a pandas Series for every one of the n^2 visits.
    names = df['Name'].tolist()
    categories = df['Category_2'].tolist()
    countries = df['Country'].tolist()
    log_subscribers = np.log10(df['Subscribers'].to_numpy(dtype=float) + 1).tolist()
    engagement = df['Engagement Rate'].tolist()

    for a, b in combinations(range(len(names)), 2):
        similarity = 0

        if categories[a] == categories[b]:
            similarity += CATEGORY_WEIGHT
        if countries[a] == countries[b]:
            similarity += COUNTRY_WEIGHT
        if abs(log_subscribers[a] - log_subscribers[b]) < LOG_SUBSCRIBER_TOLERANCE:
            similarity += SUBSCRIBER_WEIGHT
        # A NaN engagement rate never satisfies this comparison.
        if abs(engagement[a] - engagement[b]) < ENGAGEMENT_TOLERANCE:
            similarity += ENGAGEMENT_WEIGHT

        if similarity > EDGE_THRESHOLD:
            G.add_edge(names[a], names[b], weight=similarity)
    return G

def calculate_network_metrics(G, random_state=42):
    """Compute per-node centrality scores and a Louvain community assignment.

    Parameters
    ----------
    G : networkx.Graph
        The influencer similarity graph.
    random_state : int, optional
        Seed forwarded to the Louvain partitioning so that community labels,
        and everything derived from them, are reproducible between runs.
        Pass ``None`` to restore unseeded behaviour.

    Returns
    -------
    dict
        Keys 'degree', 'betweenness', 'closeness', 'eigenvector', 'pagerank'
        and 'community'; each value is a dict mapping node -> score (or
        community id for 'community'). All networkx calls use their library
        defaults.
    """
    return {
        'degree': nx.degree_centrality(G),
        'betweenness': nx.betweenness_centrality(G),
        'closeness': nx.closeness_centrality(G),
        'eigenvector': nx.eigenvector_centrality_numpy(G),
        'pagerank': nx.pagerank(G),
        'community': community_louvain.best_partition(G, random_state=random_state)
    }

def find_similar_influencers(G, metrics, influencer_name, top_n=8):
    """Rank the other nodes of ``G`` by similarity to ``influencer_name``.

    The score of a candidate is::

        -euclidean_distance(centrality vectors)
        + 1   if the candidate shares the target's community
        + 0.5 if the candidate is directly connected to the target

    where the centrality vector consists of every entry of ``metrics`` except
    'community'.

    Returns
    -------
    (list of (node, score), subgraph)
        The ``top_n`` best candidates in descending score order and the
        subgraph induced by the target plus those candidates. ``(None, None)``
        is returned, after printing a message, when the target is not a node
        of ``G``.
    """
    if influencer_name not in G.nodes():
        print("Influencer not found.")
        return None, None

    centrality_keys = [key for key in metrics if key != 'community']
    target_values = {key: metrics[key][influencer_name] for key in centrality_keys}
    communities = metrics['community']
    target_community = communities[influencer_name]

    ranked = []
    for candidate in G.nodes():
        if candidate == influencer_name:
            continue
        squared_gap = sum(
            (metrics[key][candidate] - target_values[key]) ** 2
            for key in centrality_keys
        )
        same_community = 1 if communities[candidate] == target_community else 0
        edge_bonus = 0.5 if G.has_edge(influencer_name, candidate) else 0
        ranked.append((candidate, -np.sqrt(squared_gap) + same_community + edge_bonus))

    # Stable sort: candidates with equal scores keep their node order.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    top_matches = ranked[:top_n]

    members = [influencer_name]
    members.extend(name for name, _ in top_matches)
    return top_matches, G.subgraph(members)

def create_metrics_table(G, metrics, similar_influencers, target_influencer):
    """Tabulate the network metrics of the target and its recommendations.

    The first row is ``target_influencer``; the remaining rows follow the
    order of ``similar_influencers`` (a sequence whose items start with the
    influencer name). ``G`` is accepted for signature compatibility and is
    not read.

    Returns a DataFrame with the columns 'Influencer', 'Degree Centrality',
    'Betweenness Centrality', 'Closeness Centrality',
    'Eigenvector Centrality', 'PageRank' and 'Community'.
    """
    names = [target_influencer]
    names.extend(match[0] for match in similar_influencers)

    # (column header, key in `metrics`) in output column order.
    column_sources = (
        ('Degree Centrality', 'degree'),
        ('Betweenness Centrality', 'betweenness'),
        ('Closeness Centrality', 'closeness'),
        ('Eigenvector Centrality', 'eigenvector'),
        ('PageRank', 'pagerank'),
        ('Community', 'community'),
    )

    table = {'Influencer': names}
    for header, key in column_sources:
        lookup = metrics[key]
        table[header] = [lookup[name] for name in names]
    return pd.DataFrame(table)

def visualize_network(G, subgraph, target_influencer, metrics, similar_influencers):
    """Draw the target influencer together with its recommended partners.

    Nodes are coloured by Louvain community (``metrics['community']``), the
    target node is drawn larger than the others, and edge width is
    proportional to the edge's 'weight' attribute. A legend in the upper
    right corner maps each colour to its community id.

    The figure is written to 'influencer_network.png' (300 dpi) and then
    shown. ``G`` and ``similar_influencers`` are accepted for signature
    compatibility and are not read.
    """
    plt.figure(figsize=(12, 10))
    pos = nx.spring_layout(subgraph, seed=42)
    communities = [metrics['community'][node] for node in subgraph.nodes()]
    unique_communities = sorted(set(communities))
    color_map = {comm: i for i, comm in enumerate(unique_communities)}
    node_colors = [color_map[comm] for comm in communities]

    # Pin the colour scale explicitly so the legend swatches below use exactly
    # the colours the nodes get. The range 0..k-1 is what matplotlib would
    # auto-scale to; max(..., 1) covers the single-community case.
    cmap = plt.cm.tab10
    color_max = max(len(unique_communities) - 1, 1)
    norm = plt.Normalize(vmin=0, vmax=color_max)

    nx.draw_networkx_nodes(subgraph, pos, node_color=node_colors,
                           node_size=[300 if node == target_influencer else 200 for node in subgraph.nodes()],
                           alpha=0.8, cmap=cmap, vmin=0, vmax=color_max)
    edge_weights = [subgraph[u][v].get('weight', 0.1) * 2 for u, v in subgraph.edges()]
    nx.draw_networkx_edges(subgraph, pos, width=edge_weights, alpha=0.5)
    nx.draw_networkx_labels(subgraph, pos, font_size=10, font_weight='bold')

    # One legend entry per community present in this subgraph.
    legend_handles = [
        plt.Line2D([], [], marker='o', linestyle='', markersize=10,
                   color=cmap(norm(index)), alpha=0.8, label=f"Community {comm}")
        for comm, index in color_map.items()
    ]
    plt.legend(handles=legend_handles, loc='upper right')

    plt.title(f"Network of Similar Influencers to {target_influencer}", fontsize=15)
    plt.axis('off')
    plt.tight_layout()
    plt.savefig('influencer_network.png', dpi=300)
    plt.show()

def visualize_full_network(G, metrics):
    """Draw the complete influencer graph, coloured by community.

    Node colour follows ``metrics['community']``; node size is the node's
    'subscribers' attribute (10000 when absent) rescaled linearly into the
    range 20-300. When every node has the same subscriber count, or the graph
    has no nodes, a uniform size of 50 is used.

    The figure is written to 'full_influencer_network.png' (300 dpi) and then
    shown.
    """
    plt.figure(figsize=(20, 16))
    pos = nx.spring_layout(G, k=0.15, seed=42)
    communities = [metrics['community'][node] for node in G.nodes()]
    unique_communities = sorted(set(communities))
    color_map = {comm: i for i, comm in enumerate(unique_communities)}
    node_colors = [color_map[comm] for comm in communities]
    subscriber_counts = [G.nodes[node].get('subscribers', 10000) for node in G.nodes()]
    min_size, max_size = 20, 300

    # Compute the extremes once instead of once per node, and tolerate an
    # empty graph (min()/max() raise on an empty sequence).
    if subscriber_counts:
        smallest, largest = min(subscriber_counts), max(subscriber_counts)
    else:
        smallest = largest = 0

    if largest > smallest:
        normalized_sizes = [
            min_size + (s - smallest) * (max_size - min_size) / (largest - smallest)
            for s in subscriber_counts
        ]
    else:
        normalized_sizes = [50] * len(subscriber_counts)

    nx.draw_networkx_nodes(G, pos, node_color=node_colors,
                           node_size=normalized_sizes, alpha=0.7, cmap=plt.cm.tab10)
    nx.draw_networkx_edges(G, pos, width=0.5, alpha=0.2)
    plt.title("Full Network of YouTube Influencers", fontsize=20)
    plt.axis('off')
    plt.tight_layout()
    plt.savefig('full_influencer_network.png', dpi=300)
    plt.show()
    print("Full network saved as 'full_influencer_network.png'")

In [None]:
# Fallback data location, used only when neither an argument nor the
# INFLUENCER_CSV environment variable supplies one.
DEFAULT_DATA_PATH = r'C:\Users\ashle\OneDrive\바탕 화면\MMA\INSY670 Social Media\Group Assignment\social media influencers - Youtube sep-2022 (2).csv'


def main(file_path=None):
    """Run the recommendation pipeline end to end.

    Loads and cleans the CSV, flags anomalous influencers, builds the
    similarity graph from the non-flagged rows, computes network metrics,
    prompts for a target influencer, prints and saves the recommendations
    ('influencer_metrics.csv', 'influencer_network.png') and finally draws
    the full network.

    Parameters
    ----------
    file_path : str or path-like, optional
        CSV to analyse. When omitted, the INFLUENCER_CSV environment variable
        is used, falling back to ``DEFAULT_DATA_PATH``.

    Returns
    -------
    (df, G, metrics)
        The full frame including the anomaly columns, the similarity graph
        and the metrics dict, so that later notebook cells can use them.
    """
    if file_path is None:
        file_path = os.environ.get('INFLUENCER_CSV', DEFAULT_DATA_PATH)

    df = load_and_clean_data(file_path)
    df = detect_anomalies(df)
    df_clean = df[df['Anomaly'] == 0]

    # Store suspicious names
    suspicious_names = set(df[df['Anomaly'] == 1]['Name'])

    G = build_graph(df_clean)
    metrics = calculate_network_metrics(G)

    # Input loop: repeat until a non-flagged influencer present in the graph is entered
    while True:
        target = input("\nEnter the name of the influencer you want recommendations for: ").strip()

        if target in suspicious_names:
            print("This influencer has been flagged for suspicious engagement. Please choose another.")
            continue
        elif target not in G.nodes():
            print("Influencer not found. Please enter a valid name from the cleaned network.")
            continue
        else:
            break

    similar_influencers, subgraph = find_similar_influencers(G, metrics, target)

    if similar_influencers:
        print(f"\nTop recommended collaboration partners for {target}:")
        for i, (name, score) in enumerate(similar_influencers, 1):
            print(f"{i}. {name} (Score: {score:.4f})")

        table = create_metrics_table(G, metrics, similar_influencers, target)
        print("\nNetwork Metrics:")
        print(table.to_string(index=False))

        visualize_network(G, subgraph, target, metrics, similar_influencers)
        table.to_csv('influencer_metrics.csv', index=False)

    visualize_full_network(G, metrics)
    print("\n Done!")
    return df, G, metrics

if __name__ == "__main__":
    # Bind the results at notebook level; the analysis cells further down read
    # `df`, `G` and `metrics` and fail with NameError if they stay local to main().
    df, G, metrics = main()


 Detected 9 suspicious influencers (potential fake engagement):
                      Name  Engagement Rate  Comment Rate  Engagement_Z  \
13               BANGTANTV         0.190590      0.004051      3.821278   
14             HYBE LABELS         0.173100      0.004500      3.356755   
41        whinderssonnunes         0.167569      0.004560      3.209856   
60         Kimberly Loaiza         0.194850      0.002050      3.934430   
65             CarryMinati         0.178138      0.006919      3.490554   
411       Pop Chartbusters         0.273713      0.001003      6.029005   
653               Piuzinho         0.242540      0.004591      5.201069   
676   Bispo Bruno Leonardo         0.353650      0.274343      8.152108   
780  Pastor Antônio Júnior         0.227563      0.021600      4.803283   

     Comment_Z  
13    0.122613  
14    0.162055  
41    0.167303  
60   -0.053299  
65    0.374687  
411  -0.145355  
653   0.170025  
676  23.881101  
780   1.665158  


In [1]:
# Number of nodes and edges
# Requires `G` and `metrics` to be defined in the notebook's global namespace
# before this cell runs (the recorded output shows a NameError when they are not).
print(f"Graph Summary:")
print(f"Total nodes (influencers): {G.number_of_nodes()}")
print(f"Total edges (connections): {G.number_of_edges()}")

# Community detection (Louvain already in your pipeline):
# count the distinct community ids assigned across all nodes.
num_communities = len(set(metrics['community'].values()))
print(f"Total Louvain communities detected: {num_communities}")

Graph Summary:


NameError: name 'G' is not defined

In [None]:
# Partners recommended for T-Series in an earlier run of the pipeline
recommended_names = [
    'Lady Diana', 'Niana Guerrero', 'Rans Entertainment', 'SonyMusicIndiaVEVO',
    'gymvirtual', 'Speed Records', 'Dan‑Sa / Daniel Saboya', 'KAROL G',
]

# Target first, followed by its recommendations
target_and_recs = ['T-Series', *recommended_names]

# Keep only the name and engagement rate of those influencers
engagement_subset = df.loc[df['Name'].isin(target_and_recs), ['Name', 'Engagement Rate']]

# Show the selection, highest engagement rate first
print("\n Engagement Rates of T-Series & Top Recommendations:")
print(
    engagement_subset
    .sort_values(by='Engagement Rate', ascending=False)
    .to_string(index=False)
)

mean_engagement = engagement_subset['Engagement Rate'].mean()
print(f"\n Average Engagement Rate: {mean_engagement:.4f}")

In [None]:
import networkx as nx

# Average clustering coefficient of the graph `G`
# (`G` must already exist in the notebook's global namespace).
avg_clustering = nx.average_clustering(G)
print(f"\n Average Clustering Coefficient: {avg_clustering:.4f}")

In [None]:
from collections import Counter

# Count how many nodes were assigned to each community id
# (`metrics` must already exist in the notebook's global namespace).
community_counts = Counter(metrics['community'].values())
print("\n Top 5 Louvain Communities by Size:")
# most_common(5) yields the five largest communities, biggest first
for comm_id, count in community_counts.most_common(5):
    print(f"Community {comm_id}: {count} members")

### Components of the Similarity Score
In the find_similar_influencers function, the similarity score between the target influencer and other influencers is calculated using three main components:

Network Metrics Distance
This calculates the Euclidean distance between the network metrics of two influencers:
Degree Centrality: Measures how many direct connections an influencer has. Influencers with high degree centrality are well-connected within the network.
Betweenness Centrality: Measures how often an influencer lies on the shortest path between other influencers. High betweenness indicates an influencer who bridges different communities.
Closeness Centrality: Measures how close an influencer is to all other influencers in the network. High closeness means an influencer can quickly reach others.
Eigenvector Centrality: Measures an influencer's influence based on the influence of their connections. Being connected to other influential influencers increases this metric.
PageRank: Similar to eigenvector centrality but with additional considerations for the "importance" of connections.

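A smaller distance means the influencers have similar positions and roles in the network. Because the distance is subtracted from the score, candidates whose metrics are closer to the target's rank higher.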

Community Similarity
This is a binary score (0 or 1) that checks if two influencers belong to the same community. Communities are detected using the Louvain method, which identifies groups of influencers that are more densely connected to each other than to the rest of the network.
Influencers in the same community often share similar characteristics, audiences, or content types, making them potentially good collaboration partners.

Direct Connection Bonus
This adds a bonus score (0.5) if the two influencers are already directly connected in the network.
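A direct connection indicates that the influencers already have some similarity based on the edge creation criteria (same category, country, subscriber range, or engagement patterns). An edge is only created when the weighted sum of the matching criteria exceeds 0.2, so in practice a connection requires a shared category or a similar engagement rate; country and subscriber range alone are not enough.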

These criteria are not equally weighted. When building the graph, each factor is assigned a fixed weight to reflect its relative importance:

> Category: 0.5

> Engagement Rate: 0.3

> Subscriber Range: 0.15

> Country: 0.05

These weights prioritize content relevance and engagement over geography, leading to more meaningful recommendation connections.

Additionally, the system performs anomaly detection using Z-scores on engagement rate and comment rate. Influencers whose engagement rate or comment rate lies more than 3 standard deviations from the mean (the default threshold) are flagged as suspicious and excluded from the network to ensure recommendation quality.

To interpret the graph:
Key Elements of the Graph

> Nodes –
Each node represents an influencer
Size: The target influencer is represented by a larger node than the recommended influencers
Color: Nodes are colored according to their community membership (influencers in the same community have the same color)

> Edges (Lines) –
Edges connect influencers who have similarity based on the criteria defined in the build_graph function
Thickness: Thicker edges indicate stronger similarity between influencers
The similarity is based on shared category, country, subscriber range, and engagement patterns

> Labels –
Each node is labeled with the influencer's name
These labels help you identify specific influencers in the network

> Legend –
Located in the upper right corner
Shows which color corresponds to which community number
Helps you identify which influencers belong to the same community

How to Interpret the Graph
> Central Position
Influencers positioned more centrally in the visualization are typically more connected within this specific subgraph
The layout algorithm (spring_layout) positions nodes based on their connections, with more connected nodes often appearing more central

> Clusters
Groups of nodes positioned close together often represent influencers that are more similar to each other
These clusters might indicate potential collaboration groups beyond just pairwise collaborations

> Edge Density
Areas with many interconnected edges indicate groups of influencers that are all similar to each other
Sparse connections might indicate influencers that are similar to the target but not necessarily to each other

> Community Colors
Influencers with the same color belong to the same community in the larger network
If the recommended influencers have diverse colors, it suggests the target influencer could bridge different communities
If most recommended influencers share the same color as the target, it suggests the target is firmly embedded in one community