In [600]:
import numpy as np

class PUAdapter(object):
    """
    Adapts any probabilistic binary classifier to positive-unlabled learning using the PosOnly method proposed by
    Elkan and Noto:

    Elkan, Charles, and Keith Noto. \"Learning classifiers from only positive and unlabeled data.\"
    Proceeding of the 14th ACM SIGKDD international conference on Knowledge discovery and data mining. ACM, 2008.
    """
    
    def __init__(self, estimator, hold_out_ratio=0.1, precomputed_kernel=False):
        """
        estimator -- An estimator of p(s=1|x) that must implement:
                     * predict_proba(X): Takes X, which can be a list of feature vectors or a precomputed
                                         kernel matrix and outputs p(s=1|x) for each example in X
                     * fit(X,y): Takes X, which can be a list of feature vectors or a precomputed
                                 kernel matrix and takes y, which are the labels associated to the
                                 examples in X
        hold_out_ratio -- The ratio of training examples that must be held out of the training set of examples
                          to estimate p(s=1|y=1) after training the estimator
        precomputed_kernel -- Specifies if the X matrix for predict_proba and fit is a precomputed kernel matrix
        """
        self.estimator = estimator
        self.c = 1.0
        self.hold_out_ratio = hold_out_ratio
        
        if precomputed_kernel:
            self.fit = self.__fit_precomputed_kernel
        else:
            self.fit = self.__fit_no_precomputed_kernel

        self.estimator_fitted = False
        
    def __str__(self):
        return 'Estimator:' + str(self.estimator) + '\n' + 'p(s=1|y=1,x) ~= ' + str(self.c) + '\n' + \
            'Fitted: ' + str(self.estimator_fitted)
    
    
    def __fit_precomputed_kernel(self, X, y):
        """
        Fits an estimator of p(s=1|x) and estimates the value of p(s=1|y=1) using a subset of the training examples
        X -- Precomputed kernel matrix
        y -- Labels associated to each example in X (Positive label: 1.0, Negative label: -1.0)
        """
        positives = np.where(y == 1.)[0]
        hold_out_size = np.ceil(len(positives) * self.hold_out_ratio)

        if len(positives) <= hold_out_size:
            raise('Not enough positive examples to estimate p(s=1|y=1,x). Need at least ' + str(hold_out_size + 1) + '.')
        
        np.random.shuffle(positives)
        hold_out = positives[:hold_out_size]
        
        #Hold out test kernel matrix
        X_test_hold_out = X[hold_out]
        keep = list(set(np.arange(len(y))) - set(hold_out))
        X_test_hold_out = X_test_hold_out[:,keep]
        
        #New training kernel matrix
        X = X[:, keep]
        X = X[keep]

        y = np.delete(y, hold_out)
        
        self.estimator.fit(X, y)
        
        hold_out_predictions = self.estimator.predict_proba(X_test_hold_out)
        
        try:
            hold_out_predictions = hold_out_predictions[:,1]
        except:
            pass
        
        c = np.mean(hold_out_predictions)
        self.c = c
        
        self.estimator_fitted = True
        
        
    def __fit_no_precomputed_kernel(self, X, y):
        """
        Fits an estimator of p(s=1|x) and estimates the value of p(s=1|y=1,x)

        X -- List of feature vectors
        y -- Labels associated to each feature vector in X (Positive label: 1.0, Negative label: -1.0)
        """
        positives = np.where(y == 1.)[0]
        hold_out_size = int(np.ceil(len(positives) * self.hold_out_ratio))

        if len(positives) <= hold_out_size:
            raise('Not enough positive examples to estimate p(s=1|y=1,x). Need at least ' + str(hold_out_size + 1) + '.')
        
        np.random.shuffle(positives)
        print(len(positives))
        print(hold_out_size)
        hold_out = positives[:hold_out_size]
        print(max(hold_out))
        print(len(X))
        X_hold_out = X[hold_out]
        X = np.delete(X, hold_out,0)
        
        y_hold_out = y[hold_out]
        y = np.delete(y, hold_out)
        
        self.estimator.fit(X, y)
        
        hold_out_predictions = self.estimator.predict(X_hold_out)
        
        try:
            hold_out_predictions = hold_out_predictions[:,1]
        except:
            pass
        
        c = np.mean(hold_out_predictions)
        self.c = c
        
        print("### HOLD OUT PREDICTIONS")
        print(hold_out_predictions)
        class_hold_out_prediction = hold_out_predictions
        #class_hold_out_prediction[class_hold_out_prediction> 0.5] = 1
        #class_hold_out_prediction[class_hold_out_prediction< 0.5] = -1
        
        #print(len(hold_out_predictions[hold_out_predictions < 0.5]))
        #print(class_hold_out_prediction)
        print("## YHOLDOUT")
        print(y_hold_out)
        
        from sklearn.metrics import confusion_matrix
        from sklearn.metrics import precision_score
        from sklearn.metrics import classification_report
        print(classification_report(y_hold_out, class_hold_out_prediction))
        print(confusion_matrix(y_hold_out, class_hold_out_prediction))
        
        self.estimator_fitted = True
        
    
    def predict_proba(self, X):
        """
        Predicts p(y=1|x) using the estimator and the value of p(s=1|y=1) estimated in fit(...)

        X -- List of feature vectors or a precomputed kernel matrix
        """
        if not self.estimator_fitted:
            raise Exception('The estimator must be fitted before calling predict_proba(...).')

        probabilistic_predictions = self.estimator.predict_proba(X)
        
        try:
            probabilistic_predictions = probabilistic_predictions[:,1]
        except:
            pass
        
        return probabilistic_predictions / self.c
    
    
    def predict(self, X, treshold=0.5):
        """
        Assign labels to feature vectors based on the estimator's predictions

        X -- List of feature vectors or a precomputed kernel matrix
        treshold -- The decision treshold between the positive and the negative class
        """
        if not self.estimator_fitted:
            raise Exception('The estimator must be fitted before calling predict(...).')

        return np.array([1. if p > treshold else -1. for p in self.predict_proba(X)])

In [601]:
X_positive = positive_cases

In [602]:
X_positive.head()

Unnamed: 0,AuthorId,ArticleId,ShortestDistance,RandomWalkProbability,CurrentScoringMethod,PathLength=1,PathLength=2,PathLength=3,PathLength=4,Cited
0,a_71187,571857,2,0.032341,0.009708,0,5,81,2676,1.0
15,a_71187,571863,2,0.044821,0.015211,0,11,84,2001,1.0
17,a_71187,571862,2,0.030942,0.010616,0,8,60,1482,1.0
88,a_27227,1250962,2,0.003465,0.001258,0,1,2,97,1.0
89,a_27227,1250961,2,0.003534,0.001278,0,1,2,61,1.0


In [603]:
X_unlabled = unlabled_cases.sample(8000)

In [604]:
X_unlabled.head()

Unnamed: 0,AuthorId,ArticleId,ShortestDistance,RandomWalkProbability,CurrentScoringMethod,PathLength=1,PathLength=2,PathLength=3,PathLength=4,Cited
29699,a_286489,2348477,-1,0.0,0.0,0,0,0,0,0.0
111788,a_175613,1066292,4,2e-06,3.926343e-07,0,0,0,3,0.0
65160,a_310165,1871252,3,0.003098,0.0009132975,0,0,2,17,0.0
71598,a_273513,1871217,4,1.8e-05,4.536501e-06,0,0,0,2,0.0
111299,a_253457,337398,4,0.002315,0.0005787037,0,0,0,1,0.0


In [605]:
X = X_unlabled.append(X_positive)

In [606]:
X = X.reset_index(drop=True)

In [607]:
X.head()

Unnamed: 0,AuthorId,ArticleId,ShortestDistance,RandomWalkProbability,CurrentScoringMethod,PathLength=1,PathLength=2,PathLength=3,PathLength=4,Cited
0,a_286489,2348477,-1,0.0,0.0,0,0,0,0,0.0
1,a_175613,1066292,4,2e-06,3.926343e-07,0,0,0,3,0.0
2,a_310165,1871252,3,0.003098,0.0009132975,0,0,2,17,0.0
3,a_273513,1871217,4,1.8e-05,4.536501e-06,0,0,0,2,0.0
4,a_253457,337398,4,0.002315,0.0005787037,0,0,0,1,0.0


In [608]:
X.Cited.replace(0,-1, inplace=True)

In [609]:
y = X['Cited']

In [610]:
y.head()

0   -1.0
1   -1.0
2   -1.0
3   -1.0
4   -1.0
Name: Cited, dtype: float64

In [611]:
X = X.drop(['Cited', 'AuthorId', 'ArticleId'], axis=1)

In [612]:
from sklearn.naive_bayes import BernoulliNB

In [613]:
from sklearn.ensemble import AdaBoostClassifier
adaBoost = AdaBoostClassifier(n_estimators=50)

In [614]:
nb_estimator = BernoulliNB()

In [615]:
pu_estimator = PUAdapter(adaBoost, hold_out_ratio=0.1)

In [616]:
pu_estimator.fit(X.as_matrix(),y.as_matrix())

3893
390
11890
11893
### HOLD OUT PREDICTIONS
[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1. -1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1. 

  'recall', 'true', average, warn_for)


In [516]:
print(pu_estimator)

Estimator:AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=100, random_state=None)
p(s=1|y=1,x) ~= 0.984615384615
Fitted: True


In [618]:
predicted_values = pu_estimator.predict(unlabled_cases.drop(['Cited', 'AuthorId', 'ArticleId'], axis=1).as_matrix())

In [623]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

true_class = unlabled_cases.Cited.replace(0,-1)
print(classification_report(true_class, predicted_values))
print(confusion_matrix(true_class, predicted_values))

             precision    recall  f1-score   support

       -1.0       1.00      0.95      0.97    159994
        1.0       0.00      0.00      0.00         0

avg / total       1.00      0.95      0.97    159994

[[151853   8141]
 [     0      0]]


  'recall', 'true', average, warn_for)


In [182]:
print("Comparison of estimator and PUAdapter(estimator):")
print("Number of disagreements: ", len(np.where((pu_estimator.predict(X) == nb_estimator.predict(X)) == False)[0]))
print("Number of agreements: ", len(np.where((pu_estimator.predict(X) == nb_estimator.predict(X)) == True)[0]))

Comparison of estimator and PUAdapter(estimator):
Number of disagreements:  0
Number of agreements:  163887


In [98]:
X.loc[[12,13]]

Unnamed: 0,AuthorId,ArticleId,ShortestDistance,RandomWalkProbability,CurrentScoringMethod,PathLength=1,PathLength=2,PathLength=3,PathLength=4
12,a_100769,223948,4,0.000137,3.4e-05,0,0,0,5
13,a_164482,258588,4,0.000281,7e-05,0,0,0,5


In [18]:
X_unlabled = unlabled_cases.drop('Cited', axis=1)

In [19]:
X_unlabled.head()

Unnamed: 0,AuthorId,ArticleId,ShortestDistance,RandomWalkProbability,CurrentScoringMethod,PathLength=1,PathLength=2,PathLength=3,PathLength=4
1,a_71187,571835,4,0.000303,7.6e-05,0,0,0,26
2,a_71187,571856,4,0.000861,0.000215,0,0,0,87
3,a_71187,571834,4,0.000224,5.6e-05,0,0,0,7
4,a_71187,571877,-1,0.0,0.0,0,0,0,0
5,a_71187,571833,4,0.000337,8.4e-05,0,0,0,6


### PU-Learning using Naive Bayes and EM
Two Step Method
(1) Find the reliable negative documents from the data
(2) 


In [183]:
from sklearn.naive_bayes import BernoulliNB

In [185]:
NB_bernouli = BernoulliNB()

In [186]:
from sklearn import metrics
from sklearn.model_selection import cross_val_score

In [190]:

scores = cross_val_score(NB_bernouli, X, y, cv=5, scoring='')

ValueError: 'confusion_matrix' is not a valid scoring value. Valid options are ['accuracy', 'adjusted_rand_score', 'average_precision', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc']

In [190]:
import pandas as pd
import numpy as np

In [191]:
input_data = pd.read_csv('/Users/anwar/java_workspace/Graph-Mining-Project/output/graph_features_max_depth_6/feature_vector_Wclass.csv')

In [192]:
input_data.head()

Unnamed: 0,AuthorId,ArticleId,ShortestDistance,RandomWalkProbability,CurrentScoringMethod,PathLength=1,PathLength=2,PathLength=3,PathLength=4,PathLength=5,PathLength=6,Cited
0,a_27227,1250919,4,0.009957,0.001788,0,0,0,4,106,4456,0.0
1,a_27227,1250918,4,0.086772,0.014963,0,0,0,25,921,30697,0.0
2,a_27227,1250955,4,0.123706,0.022744,0,0,2,36,1143,29539,0.0
3,a_27227,1250954,2,0.078837,0.028206,1,1,9,239,3046,40411,0.0
4,a_27227,1250913,3,0.206209,0.037707,0,0,6,100,3149,71164,0.0


In [193]:
positive_cases = input_data[input_data['Cited'] == 1]

In [194]:
positive_cases.head()

Unnamed: 0,AuthorId,ArticleId,ShortestDistance,RandomWalkProbability,CurrentScoringMethod,PathLength=1,PathLength=2,PathLength=3,PathLength=4,PathLength=5,PathLength=6,Cited
45,a_27227,1250962,2,0.03633,0.007067,0,1,2,97,1982,45263,1.0
46,a_27227,1250961,2,0.031296,0.006219,0,1,2,61,1323,29245,1.0
49,a_19843,2076766,2,0.045978,0.008802,0,1,4,82,1434,34798,1.0
55,a_19843,2076746,2,0.147899,0.026586,0,2,5,111,2165,74796,1.0
93,a_320660,1631278,2,0.114434,0.022963,0,4,23,661,11074,269910,1.0


In [195]:
unlabled_cases = input_data[input_data['Cited'] == 0]

In [196]:
unlabled_cases.head()

Unnamed: 0,AuthorId,ArticleId,ShortestDistance,RandomWalkProbability,CurrentScoringMethod,PathLength=1,PathLength=2,PathLength=3,PathLength=4,PathLength=5,PathLength=6,Cited
0,a_27227,1250919,4,0.009957,0.001788,0,0,0,4,106,4456,0.0
1,a_27227,1250918,4,0.086772,0.014963,0,0,0,25,921,30697,0.0
2,a_27227,1250955,4,0.123706,0.022744,0,0,2,36,1143,29539,0.0
3,a_27227,1250954,2,0.078837,0.028206,1,1,9,239,3046,40411,0.0
4,a_27227,1250913,3,0.206209,0.037707,0,0,6,100,3149,71164,0.0


In [197]:
from sklearn.naive_bayes import MultinomialNB
nb_classifier = MultinomialNB(fit_prior=False)

In [262]:
from sklearn.ensemble import AdaBoostClassifier
adb_classfier = AdaBoostClassifier(n_estimators=50)

In [263]:
from sklearn.svm import SVC
svc_classifier = SVC()

In [264]:
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier(criterion="entropy", max_depth=5)

In [265]:
from sklearn.neural_network import MLPClassifier
nn_clf = MLPClassifier(hidden_layer_sizes=(20,20,20))

In [266]:
P = positive_cases.reset_index(drop=True)
P_hold_out = P.sample(frac=0.15)
P = P.drop(P_hold_out.index)
P.head()
P_hold_out.head()

Unnamed: 0,AuthorId,ArticleId,ShortestDistance,RandomWalkProbability,CurrentScoringMethod,PathLength=1,PathLength=2,PathLength=3,PathLength=4,PathLength=5,PathLength=6,Cited
97,a_201027,1944752,2,0.11673,0.021883,0,2,29,908,25215,816415,1.0
633,a_240280,266108,2,0.15117,0.032358,0,6,67,1936,31071,624963,1.0
479,a_215844,1141923,2,0.119335,0.027021,0,8,275,7631,206486,6005607,1.0
404,a_23911,1276447,2,0.048435,0.009579,0,2,10,507,19909,906352,1.0
903,a_251449,1871302,2,0.013079,0.002374,0,1,6,104,2253,69588,1.0


In [267]:
U = unlabled_cases.reset_index(drop=True)
U.Cited.replace(0,-1, inplace = True)
U_hold_out = U.sample(frac=0.02)
U = U.drop(U_hold_out.index)
U.head()
U_hold_out.head()

Unnamed: 0,AuthorId,ArticleId,ShortestDistance,RandomWalkProbability,CurrentScoringMethod,PathLength=1,PathLength=2,PathLength=3,PathLength=4,PathLength=5,PathLength=6,Cited
26288,a_50066,1866181,4,0.004837,0.00084,0,0,0,8,1023,80215,-1.0
7919,a_299770,1277841,4,0.079556,0.014195,0,0,14,336,9689,345989,-1.0
15484,a_17483,859623,3,0.030299,0.005792,0,0,11,881,32992,1284675,-1.0
24248,a_151763,1526782,4,1.326168,0.228626,0,0,0,51,4329,214505,-1.0
26687,a_43772,1835902,4,0.031486,0.005567,0,0,0,1,127,4655,-1.0


In [268]:
X_input = P.append(U)
X_input = X_input.reset_index(drop= True)
y_input = X_input['Cited']
X_input = X_input.drop(['Cited', 'AuthorId', 'ArticleId'], axis=1)
X_input.ShortestDistance.replace(-1, 9999, inplace=True)
X_input.head()

Unnamed: 0,ShortestDistance,RandomWalkProbability,CurrentScoringMethod,PathLength=1,PathLength=2,PathLength=3,PathLength=4,PathLength=5,PathLength=6
0,2,0.03633,0.007067,0,1,2,97,1982,45263
1,2,0.031296,0.006219,0,1,2,61,1323,29245
2,2,0.045978,0.008802,0,1,4,82,1434,34798
3,2,0.147899,0.026586,0,2,5,111,2165,74796
4,2,0.114434,0.022963,0,4,23,661,11074,269910


In [269]:
isConverged = False
U_Input = U.reset_index(drop=True)
U_Input.ShortestDistance.replace(-1, 9999)
RN = U_Input
RN_threshold = 0.99999
iterationCount = 0
while(not isConverged):
    X_input = P.append(RN)
    X_input = X_input.reset_index(drop= True)
    y_input = X_input['Cited']
    X_input = X_input.drop(['Cited', 'AuthorId', 'ArticleId'], axis=1)
    X_input.ShortestDistance.replace(-1, 9999, inplace=True)
    #print(y_input)
    adb_classfier.fit(X_input, y_input)
    U_Input = U_Input.reset_index(drop=True)
    U_probabilities = adb_classfier.predict_proba(U_Input.drop(['Cited', 'AuthorId', 'ArticleId'], axis=1))
    print(U_probabilities)
    RN_index = np.where(U_probabilities[:,0]>RN_threshold)[0]
    print(RN_index)
    RN = U_Input.iloc[RN_index,:]
    U_Input = U_Input.drop(RN_index)
    print("Number of RN- ", len(RN))
    iterationCount+=1
    RN_threshold *= 0.9
    if(len(RN) <1):
        isConverged = True
        print("NO RN Found")
    if(iterationCount >30):
        isConverged = True
        print("Max Iteration Count Reached")
        
        

[[ 0.67479746  0.32520254]
 [ 0.66734934  0.33265066]
 [ 0.66867404  0.33132596]
 ..., 
 [ 0.66947413  0.33052587]
 [ 0.67306359  0.32693641]
 [ 0.67220844  0.32779156]]
[]
Number of RN-  0
NO RN Found


In [270]:
X_test = P_hold_out.append(U_hold_out)
X_test = X_test.reset_index(drop=True)
Y_test = X_test['Cited']
X_test = X_test.drop(['Cited', 'AuthorId', 'ArticleId'],  axis =1)

Y_test_predict = adb_classfier.predict(X_test)

In [271]:
from sklearn.metrics import classification_report
print(classification_report(Y_test, Y_test_predict))

             precision    recall  f1-score   support

       -1.0       0.88      1.00      0.94       862
        1.0       0.90      0.23      0.37       149

avg / total       0.88      0.88      0.85      1011



In [272]:
from sklearn.metrics import confusion_matrix

confusion_matrix(Y_test, Y_test_predict,labels=[-1,1])

array([[858,   4],
       [114,  35]])

In [273]:
adb_classfier.classes_

array([-1.,  1.])

test = U_Input.iloc[[2,5,7],:]
test

U_probabilities1 = np.array([[1,2],[3,4],[5,6]])
U_probabilities1

b= np.where(U_probabilities1>5)[0]
print(b)

U_probabilities[:,1]

print(U_probabilities)