In [6]:
import networkx as nx
import pandas as pd
import os
import random
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the edge list
DATA_PATH = os.path.join('data')
EDGES_TRAIN_PATH = os.path.join(DATA_PATH, 'edges_train.edgelist')
ATTRIBUTES_PATH = os.path.join(DATA_PATH, 'attributes.csv')
SOLUTION_INPUT_PATH = os.path.join(DATA_PATH, 'solutionInput.csv')

# Best performing Model

In [16]:
def create_features(G, node_pairs, attributes):
    features = []
    for node1, node2 in node_pairs:
        common_neighbors = len(list(nx.common_neighbors(G, node1, node2)))
        jaccard = list(nx.jaccard_coefficient(G, [(node1, node2)]))[0][2]
        pa = list(nx.preferential_attachment(G, [(node1, node2)]))[0][2]
        aa = list(nx.adamic_adar_index(G, [(node1, node2)]))[0][2]
        
        attr1 = attributes.loc[attributes['ID'] == node1, 'attribute'].values[0]
        attr2 = attributes.loc[attributes['ID'] == node2, 'attribute'].values[0]
        same_attribute = int(attr1 == attr2)
        
        degree_diff = abs(G.degree(node1) - G.degree(node2))
        total_degree = G.degree(node1) + G.degree(node2)

        clustering1 = nx.clustering(G, node1)
        clustering2 = nx.clustering(G, node2)
        avg_clustering = (clustering1 + clustering2) / 2

        communities = nx.community.label_propagation_communities(G)
        community_dict = {node: i for i, community in enumerate(communities) for node in community}
        same_community = int(community_dict.get(node1, -1) == community_dict.get(node2, -2))
        
        features.append([common_neighbors, jaccard, pa, aa, same_attribute, degree_diff, total_degree, avg_clustering, same_community])
    
    return np.array(features)

# Load data
edges = pd.read_csv(EDGES_TRAIN_PATH, names=['source', 'target'], sep=',')
G = nx.from_pandas_edgelist(edges, 'source', 'target')
attributes = pd.read_csv(ATTRIBUTES_PATH)

# Create positive and negative examples
positive_examples = list(G.edges())
non_edges = list(nx.non_edges(G))
negative_examples = random.sample(non_edges, len(positive_examples))

all_examples = positive_examples + negative_examples
labels = [1] * len(positive_examples) + [0] * len(negative_examples)

features = create_features(G, all_examples, attributes)

# Scale features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

X_train, X_test, y_train, y_test = train_test_split(features_scaled, labels, test_size=0.2, random_state=42)


rf_param_grid = {
    'n_estimators': [100, 200, 300, 400, 500, 1000],  # Default: 100
    'max_depth': [None, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],  # Default: None
    'min_samples_split': [2, 5, 10, 15, 20, 25, 30],  # Default: 2
    'min_samples_leaf': [1, 2, 4, 6, 8, 10],  # Default: 1
    'max_features': [None, 'sqrt', 'log2'], # Default: None
    'max_leaf_nodes': [None, 10, 20, 50, 100, 200, 500],  # Default: None
    'min_impurity_decrease': [0.0, 0.01, 0.05, 0.1, 0.15, 0.2],  # Default: 0.0
    'bootstrap': [True, False],  # Default: True
    'oob_score': [True, False],  # Default: False
    'class_weight': [None, 'balanced', 'balanced_subsample'],  # Default: None
    'ccp_alpha': [0.0, 0.01, 0.05, 0.1],  # Default: 0.0
    'max_samples': [None, 0.5, 0.7, 0.9]  # Default: None
}

# Random Forest tuning
rf_random = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=rf_param_grid,
    n_iter=200,  
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
    scoring='f1'  # Using F1 score for optimization
)
rf_random.fit(X_train, y_train)

print("Best Random Forest parameters:", rf_random.best_params_)
print("Best Random Forest cross-validation score:", rf_random.best_score_)


best_rf = rf_random.best_estimator_

y_pred = best_rf.predict(X_test)
print("\nTest Set Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")


feature_names = ['Common Neighbors', 'Jaccard Coefficient', 'Preferential Attachment', 'Adamic-Adar Index', 
                 'Same Attribute', 'Degree Difference', 'Total Degree', 'Avg Clustering', 'Same Community']
importances = best_rf.feature_importances_
for name, importance in zip(feature_names, importances):
    print(f"{name}: {importance:.4f}")


test_pairs = pd.read_csv(SOLUTION_INPUT_PATH)
test_features = create_features(G, zip(test_pairs['int1'], test_pairs['int2']), attributes)
test_features_scaled = scaler.transform(test_features)

final_predictions = best_rf.predict(test_features_scaled)

submission = pd.DataFrame({
    'ID': test_pairs['ID'],
    'Prediction': final_predictions
})

submission.to_csv('rf_tuned_submission.csv', index=False)

print(f"\nNumber of predicted links: {sum(final_predictions)}")
print("Submission file created: rf_tuned_submission.csv")

Fitting 5 folds for each of 200 candidates, totalling 1000 fits
[CV] END bootstrap=False, ccp_alpha=0.01, class_weight=None, max_depth=20, max_features=log2, max_leaf_nodes=20, max_samples=None, min_impurity_decrease=0.01, min_samples_leaf=2, min_samples_split=5, n_estimators=300, oob_score=True; total time=   0.0s
[CV] END bootstrap=False, ccp_alpha=0.01, class_weight=None, max_depth=20, max_features=log2, max_leaf_nodes=20, max_samples=None, min_impurity_decrease=0.01, min_samples_leaf=2, min_samples_split=5, n_estimators=300, oob_score=True; total time=   0.0s
[CV] END bootstrap=False, ccp_alpha=0.01, class_weight=None, max_depth=20, max_features=log2, max_leaf_nodes=20, max_samples=None, min_impurity_decrease=0.01, min_samples_leaf=2, min_samples_split=5, n_estimators=300, oob_score=True; total time=   0.0s
[CV] END bootstrap=False, ccp_alpha=0.01, class_weight=None, max_depth=20, max_features=None, max_leaf_nodes=200, max_samples=0.7, min_impurity_decrease=0.01, min_samples_leaf=8

540 fits failed out of a total of 1000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/envs/debates_analysis/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/envs/debates_analysis/lib/python3.9/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/opt/anaconda3/envs/debates_analysis/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 450, in fit
    raise ValueError("Out of bag estimation only available if bootstrap=True")
ValueError: Out of bag estima

Best Random Forest parameters: {'oob_score': False, 'n_estimators': 500, 'min_samples_split': 20, 'min_samples_leaf': 4, 'min_impurity_decrease': 0.0, 'max_samples': 0.7, 'max_leaf_nodes': 50, 'max_features': None, 'max_depth': 90, 'class_weight': None, 'ccp_alpha': 0.0, 'bootstrap': True}
Best Random Forest cross-validation score: 0.900984814301863

Test Set Performance:
Accuracy: 0.8955
Precision: 0.8836
Recall: 0.9092
F1 Score: 0.8962
Common Neighbors: 0.0013
Jaccard Coefficient: 0.0045
Preferential Attachment: 0.0493
Adamic-Adar Index: 0.0138
Same Attribute: 0.0013
Degree Difference: 0.0082
Total Degree: 0.0404
Avg Clustering: 0.0387
Same Community: 0.8425

Number of predicted links: 705
Submission file created: rf_tuned_submission.csv


In [19]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

rf = RandomForestClassifier(
    n_estimators=500, min_samples_split=20, min_samples_leaf=4, 
    max_samples=0.7, max_leaf_nodes=50, max_features=None, 
    max_depth=90, bootstrap=True, random_state=42
)

gb = GradientBoostingClassifier(random_state=42)
xgb = XGBClassifier(random_state=42)
lr = LogisticRegression(random_state=42)

ensemble = VotingClassifier(
    estimators=[('rf', rf), ('gb', gb), ('xgb', xgb), ('lr', lr)],
    voting='soft'
)

ensemble.fit(X_train, y_train)
y_pred = ensemble.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Ensemble Performance:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Make predictions on the submission set
test_features = create_features(G, zip(test_pairs['int1'], test_pairs['int2']), attributes)
test_features_scaled = scaler.transform(test_features)

ensemble_predictions = ensemble.predict(test_features_scaled)

submission = pd.DataFrame({
    'ID': test_pairs['ID'],
    'Prediction': ensemble_predictions
})

submission.to_csv('ensemble_submission.csv', index=False)

print(f"\nNumber of predicted links: {sum(ensemble_predictions)}")
print("Submission file created: ensemble_submission.csv")

Ensemble Performance:
Accuracy: 0.8962
Precision: 0.8760
Recall: 0.9214
F1 Score: 0.8981

Number of predicted links: 715
Submission file created: ensemble_submission.csv
