In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random
import networkx as nx
import scipy.sparse as sp
from scipy.stats import randint as sp_randint
from sklearn  import preprocessing
from sklearn.preprocessing  import StandardScaler
from sklearn.metrics import accuracy_score, average_precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

from time import time
from sklearn.decomposition import PCA
from sklearn import svm

In [2]:
# Mount Google Drive at /content/drive; every CSV read/written below lives under that path.
# Requires the google.colab package, so this cell only runs inside Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 讀取所需資料集

In [3]:
# Load the train / validation feature tables and report their row counts.
x_train_final = pd.read_csv("/content/drive/My Drive/dataset/SNA_HW01/X_train_train.csv")
x_test_final = pd.read_csv("/content/drive/My Drive/dataset/SNA_HW01/X_train_valid.csv")
print("train： %s, test： %s" % (len(x_train_final), len(x_test_final)))

# Keep the 'link' column as the target before it is removed from the features.
y_train_final, y_test_final = x_train_final['link'], x_test_final['link']

# Remove the columns that are not used as model inputs.
x_train_final = x_train_final.drop(columns=['node1', 'node2', 'link', 'shortest_path'])
x_test_final = x_test_final.drop(columns=['node1', 'node2', 'link', 'shortest_path'])

train： 22910, test： 5728


In [4]:
# Load the train / test feature tables and report their row counts.
x_train_final2 = pd.read_csv("/content/drive/My Drive/dataset/SNA_HW01/X_train.csv")
x_test_final2 = pd.read_csv("/content/drive/My Drive/dataset/SNA_HW01/X_test.csv")
print("train： %s, test： %s" % (len(x_train_final2), len(x_test_final2)))

# Keep the 'link' column as the target before it is removed from the features.
y_train_final2, y_test_final2 = x_train_final2['link'], x_test_final2['link']

# Remove the columns that are not used as model inputs.
x_train_final2 = x_train_final2.drop(columns=['node1', 'node2', 'link', 'shortest_path'])
x_test_final2 = x_test_final2.drop(columns=['node1', 'node2', 'link', 'shortest_path'])

train： 28638, test： 12276


In [5]:
# Load the full training table and the table of pairs to predict; report row counts.
x_train_final3 = pd.read_csv("/content/drive/My Drive/dataset/SNA_HW01/train_all.csv")
x_pred_final = pd.read_csv("/content/drive/My Drive/dataset/SNA_HW01/predict_all.csv")
print("train： %s, predict： %s" % (len(x_train_final3), len(x_pred_final)))

# Only the training table contributes a target vector.
y_train_final3 = x_train_final3['link']

# Remove the columns that are not used as model inputs (same four in both tables).
x_train_final3 = x_train_final3.drop(columns=['node1', 'node2', 'link', 'shortest_path'])
x_pred_final = x_pred_final.drop(columns=['node1', 'node2', 'link', 'shortest_path'])

train： 40914, predict： 10231


## 用 XGBoost 預測

In [17]:
from xgboost.sklearn import XGBClassifier

In [18]:
# Convert the DataFrames / Series to NumPy arrays before handing them to XGBoost.
xg_X_train, xg_X_test, xg_y_train, xg_y_test = (
    table.to_numpy()
    for table in (x_train_final, x_test_final, y_train_final, y_test_final)
)

In [19]:
# Train a seeded XGBoost classifier with otherwise default settings.
Xgbc = XGBClassifier(random_state=2019)
Xgbc.fit(xg_X_train, xg_y_train)

# Predict the validation split and measure accuracy.
y_xgbc_pred = Xgbc.predict(xg_X_test)
Xgbc_score = accuracy_score(y_true=xg_y_test, y_pred=y_xgbc_pred)

In [20]:
# Display the validation accuracy computed in the previous cell.
Xgbc_score

0.977304469273743

In [21]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the XGBoost predictions on the validation split.
print(classification_report(y_true=xg_y_test, y_pred=y_xgbc_pred))

              precision    recall  f1-score   support

         0.0       0.97      0.98      0.98      2864
         1.0       0.98      0.97      0.98      2864

    accuracy                           0.98      5728
   macro avg       0.98      0.98      0.98      5728
weighted avg       0.98      0.98      0.98      5728



In [27]:
# Score the rows to be submitted with the fitted XGBoost model.
xg_predic = x_pred_final.to_numpy()
y_pred = Xgbc.predict(xg_predic)

# Show the predicted labels as the cell output.
y_pred

array([1., 1., 0., ..., 1., 0., 1.])

In [26]:
# Fill the submission template with the current predictions and save it.
answer = pd.read_csv("/content/drive/My Drive/dataset/SNA_HW01/ans_example.csv")
answer['ans'] = y_pred.astype(int)  # float labels -> integer labels
answer.to_csv(
    "/content/drive/My Drive/dataset/SNA_HW01/ans_30_xgb_node2vec.csv",
    index=False,
)

## 用 RandomForest 做預測

In [7]:
# Randomized hyper-parameter search for a random forest:
# 5 sampled candidates, each scored by accuracy with 10-fold cross-validation.
start_time = time()
param_dist = dict(
    n_estimators=sp_randint(100, 150),
    max_depth=sp_randint(10, 20),
)

clf = RandomForestClassifier(n_jobs=-1, oob_score=True, random_state=25)
rf_random = RandomizedSearchCV(
    clf,
    param_distributions=param_dist,
    n_iter=5,
    cv=10,
    scoring='accuracy',
    random_state=25,
)
rf_random.fit(x_train_final, y_train_final)

# Mean CV accuracy per candidate, then wall-clock time of the whole search.
print('mean test scores', '\n', rf_random.cv_results_['mean_test_score'])
print("--- %s seconds ---" % (time() - start_time))

mean test scores 
 [0.98529027 0.98520297 0.98498472 0.98520297 0.98502837]
--- 162.19949078559875 seconds ---


In [8]:
# Take the best estimator found by the search and show its parameters.
clf = rf_random.best_estimator_
print(clf)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=14, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=126,
                       n_jobs=-1, oob_score=True, random_state=25, verbose=0,
                       warm_start=False)


In [9]:
# Fit the selected forest on the training split and report its out-of-bag score.
# NOTE(review): clf is the search's best_estimator_, which is presumably already
# fitted on this same data (refit default) - confirm before dropping this fit.
clf.fit(x_train_final, y_train_final)
print(clf.oob_score_)

# Accuracy on the validation split.
y_pred = clf.predict(x_test_final)
print("Validation accuracy score: ", accuracy_score(y_true=y_test_final, y_pred=y_pred))

0.9845918812745525
Validation accuracy score:  0.9769553072625698


In [10]:
# Refit the same estimator object on the train/test split data and report its OOB score.
clf.fit(x_train_final2, y_train_final2)
print(clf.oob_score_)

# Accuracy on the test split.
y_pred = clf.predict(x_test_final2)
print('Testing accuracy score: ', accuracy_score(y_true=y_test_final2, y_pred=y_pred))

0.9845310426705776
Testing accuracy score:  0.9713261648745519


In [11]:
# Refit on the full training table and report the out-of-bag score.
clf.fit(x_train_final3, y_train_final3)
print(clf.oob_score_)

# Predict the rows to be submitted; y_pred is consumed by the cells below.
y_pred = clf.predict(x_pred_final)

0.9860927799775138


In [12]:
# Sum of the predicted labels (for 0/1 labels this is the number of positive predictions).
sum(y_pred)

5525.0

In [16]:
# Fill the submission template with the current predictions and save it.
answer = pd.read_csv("/content/drive/My Drive/dataset/SNA_HW01/ans_example.csv")
answer['ans'] = y_pred.astype(int)  # float labels -> integer labels
answer.to_csv(
    "/content/drive/My Drive/dataset/SNA_HW01/ans_30_rf_node2vec.csv",
    index=False,
)

## 用 RandomForest 搭配 PCA 降維做預測

### train-validation

In [6]:
def train_test_std(x_train_final, x_test_final):
    """Standardize two feature tables using statistics learned from the first.

    A ``StandardScaler`` is fitted on ``x_train_final`` only and then applied
    to both frames, so the second frame never influences the mean / scale.

    Parameters
    ----------
    x_train_final : pandas.DataFrame
        Frame the scaler is fitted on.
    x_test_final : pandas.DataFrame
        Frame scaled with the statistics of ``x_train_final``. Columns are
        matched by position, so it must have the same columns in the same
        order.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(x_train_final_std, x_test_final_std)``; each keeps the column
        names of its input and gets a fresh default index.
    """
    # Fit on the raw array, exactly like the transforms below. Fitting on the
    # DataFrame while transforming ndarrays makes scikit-learn >= 1.0 warn
    # "X does not have valid feature names" on every call; the fitted
    # mean / scale values are identical either way.
    scaler = preprocessing.StandardScaler().fit(x_train_final.values)
    x_train_final_std = pd.DataFrame(
        scaler.transform(x_train_final.values), columns=x_train_final.columns
    )
    x_test_final_std = pd.DataFrame(
        scaler.transform(x_test_final.values), columns=x_test_final.columns
    )
    return x_train_final_std, x_test_final_std

In [7]:
# Standardize the train / validation features; the scaler is fitted on the training frame only.
x_train_final_std, x_test_final_std = train_test_std(x_train_final, x_test_final)

In [8]:
# PCA: reduce the standardized features to 6 components (fitted on the training frame).
pca = PCA(n_components=6)
x_train_pca = pca.fit_transform(x_train_final_std)
x_test_pca = pca.transform(x_test_final_std)

# Random forest with default hyper-parameters on the PCA features.
clf = RandomForestClassifier(n_jobs=-1, oob_score=True, random_state=25)
clf.fit(x_train_pca, y_train_final)

rf_pred = clf.predict(x_test_pca)
print('Validation accuracy score: ', accuracy_score(y_true=y_test_final, y_pred=rf_pred))

Validation accuracy score:  0.971717877094972


In [9]:
# Baseline without PCA: same forest trained on all standardized features.
clf = RandomForestClassifier(n_jobs=-1, oob_score=True, random_state=25)
clf.fit(x_train_final_std, y_train_final)

rf_pred = clf.predict(x_test_final_std)
print('Validation accuracy score: ', accuracy_score(y_true=y_test_final, y_pred=rf_pred))

Validation accuracy score:  0.9769553072625698


### train-test

In [10]:
# Standardize the train / test features. This rebinds x_train_final_std and
# x_test_final_std, replacing the train/validation versions created earlier.
x_train_final_std, x_test_final_std = train_test_std(x_train_final2, x_test_final2)

In [11]:
# PCA: reduce the standardized features to 6 components (fitted on the training frame).
pca = PCA(n_components=6)
x_train_pca = pca.fit_transform(x_train_final_std)
x_test_pca = pca.transform(x_test_final_std)

# Random forest with default hyper-parameters on the PCA features.
clf = RandomForestClassifier(n_jobs=-1, oob_score=True, random_state=25)
clf.fit(x_train_pca, y_train_final2)

rf_pred = clf.predict(x_test_pca)
print('Test accuracy score: ', accuracy_score(y_true=y_test_final2, y_pred=rf_pred))

Test accuracy score:  0.9707559465623982


In [12]:
# Baseline without PCA: same forest trained on all standardized features.
clf = RandomForestClassifier(n_jobs=-1, oob_score=True, random_state=25)
clf.fit(x_train_final_std, y_train_final2)

rf_pred = clf.predict(x_test_final_std)
print('Test accuracy score: ', accuracy_score(y_true=y_test_final2, y_pred=rf_pred))

Test accuracy score:  0.9716520039100685


### train_all-predict

In [40]:
# Standardize the full training table and the rows to predict; the scaler is
# fitted on the training table only. x_train_final_std is rebound once more here.
x_train_final_std, x_pred_final_std = train_test_std(x_train_final3, x_pred_final)

In [41]:
# PCA: fit 6 components on the standardized training table,
# then project the rows to predict with the same components.
pca = PCA(n_components=6)
x_train_pca = pca.fit_transform(x_train_final_std)
x_pred_pca = pca.transform(x_pred_final_std)

In [42]:
# Randomized hyper-parameter search for a random forest on the PCA features:
# 5 sampled candidates, each scored by accuracy with 10-fold cross-validation.
# NOTE(review): the scaler and PCA were fitted on the whole training table before
# this search, so every CV validation fold has already influenced the transform.
start_time = time()
param_dist = dict(
    n_estimators=sp_randint(100, 150),
    max_depth=sp_randint(10, 20),
)

clf = RandomForestClassifier(n_jobs=-1, oob_score=True, random_state=25)
rf_random = RandomizedSearchCV(
    clf,
    param_distributions=param_dist,
    n_iter=5,
    cv=10,
    scoring='accuracy',
    random_state=25,
)
rf_random.fit(x_train_pca, y_train_final3)

# Mean CV accuracy per candidate, then wall-clock time of the whole search.
print('mean test scores', '\n', rf_random.cv_results_['mean_test_score'])
print("--- %s seconds ---" % (time() - start_time))

mean test scores 
 [0.98181544 0.98193761 0.98125323 0.98183988 0.98179103]
--- 386.13140320777893 seconds ---


In [43]:
# Take the best estimator found by the PCA search and show its parameters.
clf = rf_random.best_estimator_
print(clf)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=16, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=123,
                       n_jobs=-1, oob_score=True, random_state=25, verbose=0,
                       warm_start=False)


In [45]:
# Fit the selected forest on the PCA-transformed training table and report the OOB score.
clf.fit(x_train_pca, y_train_final3)
print(clf.oob_score_)

# Predict the PCA-transformed rows to be submitted.
y_pred = clf.predict(x_pred_pca)


0.9816199833797722


In [46]:
# Fill the submission template with the current predictions and save it.
answer = pd.read_csv("/content/drive/My Drive/dataset/SNA_HW01/ans_example.csv")
answer['ans'] = y_pred.astype(int)  # float labels -> integer labels
answer.to_csv(
    "/content/drive/My Drive/dataset/SNA_HW01/ans_30_rf_PCA_node2vec.csv",
    index=False,
)