In [None]:
import numpy as np

class PUAdapter(object):
    """
    Adapts any probabilistic binary classifier to positive-unlabled learning using the PosOnly method proposed by
    Elkan and Noto:

    Elkan, Charles, and Keith Noto. \"Learning classifiers from only positive and unlabeled data.\"
    Proceeding of the 14th ACM SIGKDD international conference on Knowledge discovery and data mining. ACM, 2008.
    """
    
    def __init__(self, estimator, hold_out_ratio=0.1, precomputed_kernel=False):
        """
        estimator -- An estimator of p(s=1|x) that must implement:
                     * predict_proba(X): Takes X, which can be a list of feature vectors or a precomputed
                                         kernel matrix and outputs p(s=1|x) for each example in X
                     * fit(X,y): Takes X, which can be a list of feature vectors or a precomputed
                                 kernel matrix and takes y, which are the labels associated to the
                                 examples in X
        hold_out_ratio -- The ratio of training examples that must be held out of the training set of examples
                          to estimate p(s=1|y=1) after training the estimator
        precomputed_kernel -- Specifies if the X matrix for predict_proba and fit is a precomputed kernel matrix
        """
        self.estimator = estimator
        self.c = 1.0
        self.hold_out_ratio = hold_out_ratio
        
        if precomputed_kernel:
            self.fit = self.__fit_precomputed_kernel
        else:
            self.fit = self.__fit_no_precomputed_kernel

        self.estimator_fitted = False
        
    def __str__(self):
        return 'Estimator:' + str(self.estimator) + '\n' + 'p(s=1|y=1,x) ~= ' + str(self.c) + '\n' + \
            'Fitted: ' + str(self.estimator_fitted)
    
    
    def __fit_precomputed_kernel(self, X, y):
        """
        Fits an estimator of p(s=1|x) and estimates the value of p(s=1|y=1) using a subset of the training examples
        X -- Precomputed kernel matrix
        y -- Labels associated to each example in X (Positive label: 1.0, Negative label: -1.0)
        """
        positives = np.where(y == 1.)[0]
        hold_out_size = np.ceil(len(positives) * self.hold_out_ratio)

        if len(positives) <= hold_out_size:
            raise('Not enough positive examples to estimate p(s=1|y=1,x). Need at least ' + str(hold_out_size + 1) + '.')
        
        np.random.shuffle(positives)
        hold_out = positives[:hold_out_size]
        
        #Hold out test kernel matrix
        X_test_hold_out = X[hold_out]
        keep = list(set(np.arange(len(y))) - set(hold_out))
        X_test_hold_out = X_test_hold_out[:,keep]
        
        #New training kernel matrix
        X = X[:, keep]
        X = X[keep]

        y = np.delete(y, hold_out)
        
        self.estimator.fit(X, y)
        
        hold_out_predictions = self.estimator.predict_proba(X_test_hold_out)
        
        try:
            hold_out_predictions = hold_out_predictions[:,1]
        except:
            pass
        
        c = np.mean(hold_out_predictions)
        self.c = c
        
        self.estimator_fitted = True
        
        
    def __fit_no_precomputed_kernel(self, X, y):
        """
        Fits an estimator of p(s=1|x) and estimates the value of p(s=1|y=1,x)

        X -- List of feature vectors
        y -- Labels associated to each feature vector in X (Positive label: 1.0, Negative label: -1.0)
        """
        positives = np.where(y == 1.)[0]
        hold_out_size = int(np.ceil(len(positives) * self.hold_out_ratio))

        if len(positives) <= hold_out_size:
            raise('Not enough positive examples to estimate p(s=1|y=1,x). Need at least ' + str(hold_out_size + 1) + '.')
        
        np.random.shuffle(positives)
        print(len(positives))
        print(hold_out_size)
        hold_out = positives[:hold_out_size]
        print(max(hold_out))
        print(len(X))
        X_hold_out = X[hold_out]
        X = np.delete(X, hold_out,0)
        
        y_hold_out = y[hold_out]
        y = np.delete(y, hold_out)
        
        self.estimator.fit(X, y)
        
        hold_out_predictions = self.estimator.predict(X_hold_out)
        
        try:
            hold_out_predictions = hold_out_predictions[:,1]
        except:
            pass
        
        c = np.mean(hold_out_predictions)
        self.c = c
        
        print("### HOLD OUT PREDICTIONS")
        print(hold_out_predictions)
        class_hold_out_prediction = hold_out_predictions
        #class_hold_out_prediction[class_hold_out_prediction> 0.5] = 1
        #class_hold_out_prediction[class_hold_out_prediction< 0.5] = -1
        
        #print(len(hold_out_predictions[hold_out_predictions < 0.5]))
        #print(class_hold_out_prediction)
        print("## YHOLDOUT")
        print(y_hold_out)
        
        from sklearn.metrics import confusion_matrix
        from sklearn.metrics import precision_score
        from sklearn.metrics import classification_report
        print(classification_report(y_hold_out, class_hold_out_prediction))
        print(confusion_matrix(y_hold_out, class_hold_out_prediction))
        
        self.estimator_fitted = True
        
    
    def predict_proba(self, X):
        """
        Predicts p(y=1|x) using the estimator and the value of p(s=1|y=1) estimated in fit(...)

        X -- List of feature vectors or a precomputed kernel matrix
        """
        if not self.estimator_fitted:
            raise Exception('The estimator must be fitted before calling predict_proba(...).')

        probabilistic_predictions = self.estimator.predict_proba(X)
        
        try:
            probabilistic_predictions = probabilistic_predictions[:,1]
        except:
            pass
        
        return probabilistic_predictions / self.c
    
    
    def predict(self, X, treshold=0.5):
        """
        Assign labels to feature vectors based on the estimator's predictions

        X -- List of feature vectors or a precomputed kernel matrix
        treshold -- The decision treshold between the positive and the negative class
        """
        if not self.estimator_fitted:
            raise Exception('The estimator must be fitted before calling predict(...).')

        return np.array([1. if p > treshold else -1. for p in self.predict_proba(X)])

In [None]:
X_positive = positive_cases

In [None]:
X_positive.head()

In [None]:
X_unlabled = unlabled_cases.sample(8000)

In [None]:
X_unlabled.head()

In [None]:
X = X_unlabled.append(X_positive)

In [None]:
X = X.reset_index(drop=True)

In [None]:
X.head()

In [None]:
X.Cited.replace(0,-1, inplace=True)

In [None]:
y = X['Cited']

In [None]:
y.head()

In [None]:
X = X.drop(['Cited', 'AuthorId', 'ArticleId'], axis=1)

In [None]:
from sklearn.naive_bayes import BernoulliNB

In [None]:
from sklearn.ensemble import AdaBoostClassifier
adaBoost = AdaBoostClassifier(n_estimators=50)

In [None]:
nb_estimator = BernoulliNB()

In [None]:
pu_estimator = PUAdapter(adaBoost, hold_out_ratio=0.1)

In [None]:
pu_estimator.fit(X.as_matrix(),y.as_matrix())

In [None]:
print(pu_estimator)

In [None]:
predicted_values = pu_estimator.predict(unlabled_cases.drop(['Cited', 'AuthorId', 'ArticleId'], axis=1).as_matrix())

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

true_class = unlabled_cases.Cited.replace(0,-1)
print(classification_report(true_class, predicted_values))
print(confusion_matrix(true_class, predicted_values))

In [None]:
print("Comparison of estimator and PUAdapter(estimator):")
print("Number of disagreements: ", len(np.where((pu_estimator.predict(X) == nb_estimator.predict(X)) == False)[0]))
print("Number of agreements: ", len(np.where((pu_estimator.predict(X) == nb_estimator.predict(X)) == True)[0]))

In [None]:
X.loc[[12,13]]

In [None]:
X_unlabled = unlabled_cases.drop('Cited', axis=1)

In [None]:
X_unlabled.head()

### PU-Learning using Naive Bayes and EM
Two Step Method
(1) Find the reliable negative documents from the data
(2) 


In [None]:
from sklearn.naive_bayes import BernoulliNB

In [None]:
NB_bernouli = BernoulliNB()

In [None]:
from sklearn import metrics
from sklearn.model_selection import cross_val_score

In [None]:

scores = cross_val_score(NB_bernouli, X, y, cv=5, scoring='')

In [452]:
import pandas as pd
import numpy as np

In [453]:
input_data = pd.read_csv('/Users/anwar/java_workspace/Graph-Mining-Project/output/graph_features_max_depth_4/feature_vector_complete.csv')

input_data = input_data.fillna(np.mean(input_data['D2VAvgCosDist']))

input_data.isnull().sum()

AuthorId                 0
ArticleId                0
ShortestDistance         0
RandomWalkProbability    0
CurrentScoringMethod     0
PathLength=1             0
PathLength=2             0
PathLength=3             0
PathLength=4             0
Cited                    0
N2VAvgCosDist            0
D2VAvgCosDist            0
dtype: int64

In [454]:
positive_cases = input_data[input_data['Cited'] == 1]

input_data.head()

Unnamed: 0,AuthorId,ArticleId,ShortestDistance,RandomWalkProbability,CurrentScoringMethod,PathLength=1,PathLength=2,PathLength=3,PathLength=4,Cited,N2VAvgCosDist,D2VAvgCosDist
0,a_71187,571857,2,0.032341,0.009708,0,5,81,2676,1.0,0.386456,0.295523
1,a_71187,571835,4,0.000303,7.6e-05,0,0,0,26,0.0,0.350226,0.30325
2,a_71187,571856,4,0.000861,0.000215,0,0,0,87,0.0,0.448654,0.26944
3,a_71187,571834,4,0.000224,5.6e-05,0,0,0,7,0.0,0.449872,0.343334
4,a_71187,571877,-1,0.0,0.0,0,0,0,0,0.0,0.44469,0.264319


In [455]:
positive_cases.head()
positive_cases.shape

(3893, 12)

In [456]:
unlabled_cases = input_data[input_data['Cited'] == 0]

In [457]:
unlabled_cases.head()
unlabled_cases.shape

(159994, 12)

In [458]:
P = positive_cases.reset_index(drop=True)
P_hold_out = P.sample(frac=0.15)
P = P.drop(P_hold_out.index)
P.head()
P_hold_out.head()

U = unlabled_cases.reset_index(drop=True)
U.Cited.replace(0,-1, inplace = True)
U_hold_out = U.sample(frac=0.025)
U = U.drop(U_hold_out.index)
U.head()
U_hold_out.head()

X_input = P.append(U)
X_input = X_input.reset_index(drop= True)
y_input = X_input['Cited']
X_input = X_input.drop(['Cited', 'AuthorId', 'ArticleId'], axis=1)
X_input.ShortestDistance.replace(-1, 9999, inplace=True)
X_input.head()

Unnamed: 0,ShortestDistance,RandomWalkProbability,CurrentScoringMethod,PathLength=1,PathLength=2,PathLength=3,PathLength=4,N2VAvgCosDist,D2VAvgCosDist
0,2,0.032341,0.009708,0,5,81,2676,0.386456,0.295523
1,2,0.044821,0.015211,0,11,84,2001,0.355503,0.268981
2,2,0.003465,0.001258,0,1,2,97,0.278734,0.163345
3,2,0.001315,0.000453,0,1,17,356,0.443134,0.298214
4,2,0.009318,0.003383,0,5,61,1518,0.433034,0.298214


In [459]:
classifier = adb_classfier

In [460]:
isConverged = False
U_Input = U.reset_index(drop=True)
U_Input.ShortestDistance.replace(-1, 9999)
RN = U_Input
RN_threshold = 0.95
iterationCount = 0
while(not isConverged):
    X_input = P.append(RN)
    X_input = X_input.reset_index(drop= True)
    y_input = X_input['Cited']
    X_input = X_input.drop(['Cited', 'AuthorId', 'ArticleId'], axis=1)
    X_input.ShortestDistance.replace(-1, 9999, inplace=True)
    #print(y_input)
    classifier.fit(X_input, y_input)
    U_Input = U_Input.reset_index(drop=True)
    U_probabilities = classifier.predict_proba(U_Input.drop(['Cited', 'AuthorId', 'ArticleId'], axis=1))
    print(U_probabilities)
    RN_index = np.where(U_probabilities[:,0]>RN_threshold)[0]
    print(RN_index)
    RN = U_Input.iloc[RN_index,:]
    U_Input = U_Input.drop(RN_index)
    print("Number of RN- ", len(RN))
    iterationCount+=1
    RN_threshold *= 0.9
    if(len(RN) <1):
        isConverged = True
        print("NO RN Found")
    if(iterationCount >30):
        isConverged = True
        print("Max Iteration Count Reached")
        
        

[[ 0.97153007  0.02846993]
 [ 0.97153007  0.02846993]
 [ 0.97153007  0.02846993]
 ..., 
 [ 0.97518052  0.02481948]
 [ 0.97828708  0.02171292]
 [ 0.97654459  0.02345541]]
[     0      1      2 ..., 155991 155992 155993]
Number of RN-  147988
[[  2.22044605e-16   1.00000000e+00]
 [  2.22044605e-16   1.00000000e+00]
 [  2.22044605e-16   1.00000000e+00]
 ..., 
 [  2.22044605e-16   1.00000000e+00]
 [  2.22044605e-16   1.00000000e+00]
 [  2.22044605e-16   1.00000000e+00]]
[]
Number of RN-  0
NO RN Found


In [461]:
X_test = P_hold_out.append(U_hold_out)
X_test = X_test.reset_index(drop=True)
Y_test = X_test['Cited']
X_test = X_test.drop(['Cited', 'AuthorId', 'ArticleId'],  axis =1)

Y_test_predict = classifier.predict(X_test)

from sklearn.metrics import classification_report
print(classification_report(Y_test, Y_test_predict))

In [463]:
from sklearn.metrics import confusion_matrix

confusion_matrix(Y_test, Y_test_predict,labels=[-1,1])

array([[3800,  200],
       [   0,  584]])

In [363]:
nb_classifier.classes_

AttributeError: 'MultinomialNB' object has no attribute 'classes_'

test = U_Input.iloc[[2,5,7],:]
test

In [252]:
from sklearn.naive_bayes import MultinomialNB
nb_classifier = MultinomialNB(fit_prior=False)

from sklearn.ensemble import AdaBoostClassifier
adb_classfier = AdaBoostClassifier(n_estimators=10)

from sklearn.svm import SVC
svc_classifier = SVC()

from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier(criterion="entropy", max_depth=6)

from sklearn.neural_network import MLPClassifier
nn_clf = MLPClassifier(hidden_layer_sizes=(20,20,20))

U_probabilities1 = np.array([[1,2],[3,4],[5,6]])
U_probabilities1

b= np.where(U_probabilities1>5)[0]
print(b)

U_probabilities[:,1]

print(U_probabilities)

In [364]:
from sklearn import tree
tree.export_graphviz(dt_clf, out_file='/Users/anwar/jupyter_workspace/tree1.dot')                



In [349]:
import pydot

In [350]:
from sklearn.externals.six import StringIO
import pydot 

dot_data = StringIO() 
tree.export_graphviz(dt_clf, out_file=dot_data) 
graph = pydot.graph_from_dot_data(dot_data.getvalue()) 
graph.write_pdf("/Users/anwar/jupyter_workspace/graph.pdf") 

NameError: name 'dot_parser' is not defined