## Basic Co-Training Implementation
**Based on the Blum and Mitchell, 1998 paper**

In [3]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
import math
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score


In [4]:
#setA = ['rec.sport.baseball', 'rec.sport.hockey']
setA = ['comp.os.ms-windows.misc','comp.sys.ibm.pc.hardware']
setB = ['talk.politics.misc','talk.politics.guns']

newsA_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), categories=setA)
newsB_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), categories=setB)

newsA_y_train = newsA_train.target
newsB_y_train = newsB_train.target

vectorizer = TfidfVectorizer()
newsA_X_train = vectorizer.fit_transform(newsA_train.data)
newsB_X_train = vectorizer.fit_transform(newsB_train.data)


In [8]:
print(newsA_y_train.shape)
print(newsA_X_train.shape)
print(newsB_y_train.shape)
print(newsB_X_train.shape)

[0 1 1 ..., 0 0 1]
(1181, 39223)
[0 1 1 1 0 0 0 1 0 0 0 1 1 0 0 0 1 1 0 1 1 0 0 0 1 0 0 0 1 0 0 1 1 1 1 1 0
 1 0 0 0 1 0 0 1 1 1 1 0 1]
(1011, 19825)


In [20]:
#print(np.argsort(newsA_y_train))
orderA = np.argsort(newsA_y_train)
newsA_X_train = newsA_X_train[orderA]
newsA_y_train = newsA_y_train[orderA]
print(newsA_y_train.shape)

orderB = np.argsort(newsB_y_train)
newsB_X_train = newsB_X_train[orderB]
newsB_y_train = newsB_y_train[orderB]
print(newsB_y_train.shape)

# cut off the extras so both views same size
newsA_X_train = newsA_X_train[:min(len(newsA_y_train),len(newsB_y_train))]
newsA_y_train = newsA_y_train[:min(len(newsA_y_train),len(newsB_y_train))]

print(newsA_y_train.shape)



(1181,)
(1011,)
(1011,)


In [21]:
print((newsA_train.keys()))

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR', 'description'])


In [83]:
# split into labeled and unlabeled
Lsize = 12

np.random.seed(8)
mask = np.random.choice([False, True], (newsA_X_train.shape[0]), p=[((newsA_X_train.shape[0]) - Lsize)/(newsA_X_train.shape[0]), Lsize/(newsA_X_train.shape[0])])


newsA_X_train_L = newsA_X_train[mask,:]
newsA_y_train_L = newsA_y_train[mask]
newsA_X_train_U = newsA_X_train[mask==False,:]

newsB_X_train_L = newsB_X_train[mask,:]
newsB_y_train_L = newsB_y_train[mask]
newsB_X_train_U = newsB_X_train[mask==False,:]

print(newsA_y_train_L)
print(newsA_X_train_L.shape)
print(newsA_X_train_U.shape)


[0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1]
(16, 39223)
(995, 39223)


In [227]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
import numpy as np
class MVCoTrain:
    
    def __init__(self, model_type="Naive_Bayes"):
        if model_type == "Naive_Bayes":
            self.h1 = GaussianNB()
            self.h2 = GaussianNB()
        else:
            raise Exception("Bad model_type")
        self.model_type = model_type
        
    # return the most confident row indices for positive and negative
    def pred_self(self, modelnum, Z_X_UU, n, p):
        if modelnum=="h1":
            probs = self.h1.predict_proba(Z_X_UU)
            y_pred = self.h1.predict(Z_X_UU)
        elif modelnum=="h2":
            probs = self.h2.predict_proba(Z_X_UU)
            y_pred = self.h2.predict(Z_X_UU)
                    
        probs_class0 = probs[:,0]
        probs_class1 = probs[:,1]
        topN_class0 = np.argsort(probs_class0)[-n:]
        topN_pred0 = y_pred[topN_class0]
#         print(" hhhh ")
#         print(y_pred)
#         print(probs)
        
#         print("here")
#         print(probs_class0[topN_class0])
#         print(topN_pred0)
        topP_class1 = np.argsort(probs_class1)[-p:]
        topP_pred1 = y_pred[topP_class1]
#         print(probs_class1[topP_class1])
#         print(topP_pred1)
        best_locs = np.transpose(np.hstack([topN_class0, topP_class1]))
        preds = np.transpose(np.hstack([topN_pred0, topP_pred1]))
#         print(best_locs)
#         print(preds)
        return (best_locs, preds)
    
    
    def fit_full(self, A_X_L, A_y_L, B_X_L, B_y_L, A_X_U, B_X_U, n=30, p=30, max_iters=20, seed=10):
        # randomly get subgroup of the unlabeled data
        UUsize = 4 * n + 4 * p
        np.random.seed(seed)
        mask = np.random.choice([False, True], (A_X_U.shape[0]), p=[((A_X_U.shape[0]) - UUsize)/(A_X_U.shape[0]), UUsize/(A_X_U.shape[0])])
        print(mask[:100])
        mask = np.flatnonzero(mask)
        print(mask)
        A_X_UU = A_X_U[mask,:]
        B_X_UU = B_X_U[mask,:]
        A_X_U = A_X_U[mask,:]
        B_X_U = B_X_U[mask,:]
        
        
        A_X_L = A_X_L.toarray()
        B_X_L = B_X_L.toarray()
        A_X_U = A_X_U.toarray()
        B_X_U = B_X_U.toarray()
        A_X_UU = A_X_UU.toarray()
        B_X_UU = B_X_UU.toarray()
           
        # iterate and fit
        for n_iter in range(2):
            # fit h1 on the labeled data A_X_L
            self.h1.fit(A_X_L, A_y_L)
            
            # fit h2 on labeled data B_X_L
            self.h2.fit(B_X_L, B_y_L)
            
            # get h1 predictions of A_X_UU
            best_rows1, best_preds1 = self.pred_self("h1", A_X_UU, n, p)
            print(best_rows1.shape)
                    
            # get the h2 predictions of B_X_UU
            best_rows2, best_preds2 = self.pred_self("h2", B_X_UU, n, p)
            print(best_rows2.shape)
            
            # union these and put these best predictions into B_X_L, then replenish both UU
            best_rows = np.array(list(set(np.hstack([best_rows1, best_rows2]))))
            print(best_rows.shape)
            
            y_L_new = np.zeros_like(best_rows)
            remove_mask = np.zeros_like(best_rows)
            for i,row in enumerate(best_rows):
                remove_mask[i] = row
                if row in best_rows1:
                    y_L_new[i] = best_preds1[np.where(best_rows1==row)]
                else:
                    y_L_new[i] = best_preds2[np.where(best_rows2==row)]
                    
            
            print(y_L_new)
            
            # add to labeled sets
            print(A_X_L.shape)
            print(A_X_UU[best_rows,:].shape)
            
            A_X_L = np.vstack((A_X_L, A_X_UU[best_rows,:]))
            print(A_X_L.shape)
            B_X_L = np.vstack((B_X_L, B_X_UU[best_rows,:]))
            
            # Add labels########################
            
            # remove from unlabeled set
            print(remove_mask.shape)
            mask = np.ones((A_X_UU.shape[0],))
            mask[remove_mask] = 0
            print(A_X_UU.shape)
            A_X_UU = A_X_UU[mask==1,:]
            B_X_UU = B_X_UU[mask==1,:]
            print(A_X_UU.shape)
            
            # replenish UU sets
            
            
            
        
        
        

In [228]:
x = np.array([1,2,2,3,4,5,5,5,5,5,6])
y = np.array(list(set(x)))
print(y)

[1 2 3 4 5 6]


In [229]:
MVtest1 = MVCoTrain()
MVtest1.fit_full(newsA_X_train_L, newsA_y_train_L, newsB_X_train_L, newsB_y_train_L, newsA_X_train_U, newsB_X_train_U)

[ True False False False False False False  True False False False  True
 False False  True False False False  True False False False False False
 False False False False False False  True False  True False False False
 False  True False False False  True False  True False False False  True
 False False False False  True False False  True  True False  True False
 False  True False False False False False False False  True False False
 False False False False  True  True  True False False False False False
 False False False False False False False False False  True False False
  True False False False]
[  0   7  11  14  18  30  32  37  41  43  47  52  55  56  58  61  69  76
  77  78  93  96 101 104 118 134 135 139 141 143 153 155 168 170 171 173
 181 184 197 200 201 203 206 211 212 214 221 224 227 231 232 239 242 245
 248 252 254 258 263 267 268 269 270 272 276 282 283 286 291 294 295 302
 312 314 315 320 327 335 339 343 349 351 353 357 358 362 366 383 395 397
 401 404 409 410 415 417 

ValueError: Found input variables with inconsistent numbers of samples: [106, 16]

In [172]:
print(newsA_train.data)

AttributeError: 'list' object has no attribute 'shape'

### Using Diabetes Dataset

In [3]:
full_data = pd.read_csv("Diabetes_Dataset.csv")
full_labels = full_data['1']
full_data.head()
full_data = full_data.drop('1',1)

Unnamed: 0,6,148,72,35,0,33.6,0.627,50,1
0,1,85,66,29,0,26.6,0.351,31,0
1,8,183,64,0,0,23.3,0.672,32,1
2,1,89,66,23,94,28.1,0.167,21,0
3,0,137,40,35,168,43.1,2.288,33,1
4,5,116,74,0,0,25.6,0.201,30,0


In [5]:
full_data.head()

Unnamed: 0,6,148,72,35,0,33.6,0.627,50
0,1,85,66,29,0,26.6,0.351,31
1,8,183,64,0,0,23.3,0.672,32
2,1,89,66,23,94,28.1,0.167,21
3,0,137,40,35,168,43.1,2.288,33
4,5,116,74,0,0,25.6,0.201,30


In [9]:
X = full_data.as_matrix()
Y = full_labels.as_matrix()

In [14]:
print(X.shape)

(767, 8)


In [164]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=2)

In [165]:
# Split into two groups ("views")
Lsize = 20
X_train_L = X_train[:Lsize,:]
X_train_U = X_train[Lsize:,:]
y_train_L = y_train[:Lsize]
y_train_U = np.zeros_like(y_train[Lsize:])

In [166]:
print(X_train_L.shape)
print(X_train_U.shape)

(20, 8)
(670, 8)


In [167]:
from sklearn.naive_bayes import GaussianNB

# test fitting to a single view
gnb = GaussianNB()
gnb.fit(X_train_L, y_train_L)
probs = gnb.predict_proba(X_train_U)
y_pred = gnb.predict(X_train_U)
probs_class0 = probs[y_pred==0][:,0]
probs_class1 = probs[y_pred==1][:,1]
print("Accuracy iter 1 = ")
print(accuracy_score(y_test,gnb.predict(X_test)))
n = 30
new_labels = np.transpose(np.hstack([np.zeros(n,),np.ones(n,)]))
print(new_labels.shape)
topN_class0 = np.argsort(probs_class0)[-n:]
topN_class1 = np.argsort(probs_class1)[-n:]
top2N_class = np.transpose(np.hstack([topN_class0, topN_class1]))
print(top2N_class.shape)
# add these to the labeled set
# print(X_train_L.shape)
# print(X_train_U[topN_class0,:].shape)
# print(X_train_U[topN_class1,:].shape)
X_train_L = np.vstack([X_train_L,X_train_U[topN_class0,:],X_train_U[topN_class1,:]])

y_train_L = np.hstack([y_train_L,new_labels])

X_train_U = np.delete(X_train_U,top2N_class.T,0)
# print(X_train_U.shape)

# Iter 2

gnb.fit(X_train_L, y_train_L)
probs = gnb.predict_proba(X_train_U)
y_pred = gnb.predict(X_train_U)
probs_class0 = probs[y_pred==0][:,0]
probs_class1 = probs[y_pred==1][:,1]
print("Accuracy iter 2 = ")
print(accuracy_score(y_test,gnb.predict(X_test)))
n = 30
new_labels = np.transpose(np.hstack([np.zeros(n,),np.ones(n,)]))
print(new_labels.shape)
topN_class0 = np.argsort(probs_class0)[-n:]
topN_class1 = np.argsort(probs_class1)[-n:]
top2N_class = np.transpose(np.hstack([topN_class0, topN_class1]))
print(top2N_class.shape)
# add these to the labeled set
# print(X_train_L.shape)
# print(X_train_U[topN_class0,:].shape)
# print(X_train_U[topN_class1,:].shape)
X_train_L = np.vstack([X_train_L,X_train_U[topN_class0,:],X_train_U[topN_class1,:]])

y_train_L = np.hstack([y_train_L,new_labels])

X_train_U = np.delete(X_train_U,top2N_class.T,0)
print(X_train_U.shape)
print(X_train_L.shape)
print(y_train_L.shape)

# Iter 3

gnb.fit(X_train_L, y_train_L)
probs = gnb.predict_proba(X_train_U)
y_pred = gnb.predict(X_train_U)
probs_class0 = probs[y_pred==0][:,0]
probs_class1 = probs[y_pred==1][:,1]
print("Accuracy iter 3 = ")
print(accuracy_score(y_test,gnb.predict(X_test)))
n = 30
new_labels = np.transpose(np.hstack([np.zeros(n,),np.ones(n,)]))
print(new_labels.shape)
topN_class0 = np.argsort(probs_class0)[-n:]
topN_class1 = np.argsort(probs_class1)[-n:]
top2N_class = np.transpose(np.hstack([topN_class0, topN_class1]))
print(top2N_class.shape)
# add these to the labeled set
# print(X_train_L.shape)
# print(X_train_U[topN_class0,:].shape)
# print(X_train_U[topN_class1,:].shape)
X_train_L = np.vstack([X_train_L,X_train_U[topN_class0,:],X_train_U[topN_class1,:]])

y_train_L = np.hstack([y_train_L,new_labels])

X_train_U = np.delete(X_train_U,top2N_class.T,0)
print(X_train_U.shape)
print(X_train_L.shape)
print(y_train_L.shape)

# Iter 4

gnb.fit(X_train_L, y_train_L)
probs = gnb.predict_proba(X_train_U)
y_pred = gnb.predict(X_train_U)
probs_class0 = probs[y_pred==0][:,0]
probs_class1 = probs[y_pred==1][:,1]
print("Accuracy iter 4 = ")
print(accuracy_score(y_test,gnb.predict(X_test)))
n = 30
new_labels = np.transpose(np.hstack([np.zeros(n,),np.ones(n,)]))
print(new_labels.shape)
topN_class0 = np.argsort(probs_class0)[-n:]
topN_class1 = np.argsort(probs_class1)[-n:]
top2N_class = np.transpose(np.hstack([topN_class0, topN_class1]))
print(top2N_class.shape)
# add these to the labeled set
# print(X_train_L.shape)
# print(X_train_U[topN_class0,:].shape)
# print(X_train_U[topN_class1,:].shape)
X_train_L = np.vstack([X_train_L,X_train_U[topN_class0,:],X_train_U[topN_class1,:]])

y_train_L = np.hstack([y_train_L,new_labels])

X_train_U = np.delete(X_train_U,top2N_class.T,0)
print(X_train_U.shape)
print(X_train_L.shape)
print(y_train_L.shape)

# Iter 5

gnb.fit(X_train_L, y_train_L)
probs = gnb.predict_proba(X_train_U)
y_pred = gnb.predict(X_train_U)
probs_class0 = probs[y_pred==0][:,0]
probs_class1 = probs[y_pred==1][:,1]
print("Accuracy iter 5 = ")
print(accuracy_score(y_test,gnb.predict(X_test)))
n = 30
new_labels = np.transpose(np.hstack([np.zeros(n,),np.ones(n,)]))
print(new_labels.shape)
topN_class0 = np.argsort(probs_class0)[-n:]
topN_class1 = np.argsort(probs_class1)[-n:]
top2N_class = np.transpose(np.hstack([topN_class0, topN_class1]))
print(top2N_class.shape)
# add these to the labeled set
# print(X_train_L.shape)
# print(X_train_U[topN_class0,:].shape)
# print(X_train_U[topN_class1,:].shape)
X_train_L = np.vstack([X_train_L,X_train_U[topN_class0,:],X_train_U[topN_class1,:]])

y_train_L = np.hstack([y_train_L,new_labels])

X_train_U = np.delete(X_train_U,top2N_class.T,0)
print(X_train_U.shape)
print(X_train_L.shape)
print(y_train_L.shape)

Accuracy iter 1 = 
0.675324675325
(60,)
(60,)
Accuracy iter 2 = 
0.701298701299
(60,)
(60,)
(553, 8)
(140, 8)
(140,)
Accuracy iter 3 = 
0.753246753247
(60,)
(60,)
(496, 8)
(200, 8)
(200,)
Accuracy iter 4 = 
0.727272727273
(60,)
(60,)
(441, 8)
(260, 8)
(260,)
Accuracy iter 5 = 
0.701298701299
(60,)
(60,)
(383, 8)
(320, 8)
(320,)


In [None]:
from sklearn.naive_bayes import GaussianNB
class CoTrainSingleView:
    def __init__(self):
        gnb1 = GaussianNB()
        
    def fit_full(self,X_L,y_L,X_U):
        # fit on the labeled data
        gnb1.fit(X_L,y_L)
        
    
        
        
        