Semi-Supervised Classification Dataset

In [1]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

# build a synthetic 2-feature classification problem with 1000 samples
X, y = make_classification(
    n_samples=1000,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    random_state=42,
)

# hold out half of the samples as the test set, keeping class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42, stratify=y
)

# halve the training portion again: one half keeps its labels, the other
# half (the *_unlab names) plays the role of unlabelled data and is not
# used by this supervised baseline
X_train_lab, X_test_unlab, y_train_lab, y_test_unlab = train_test_split(
    X_train, y_train, test_size=0.5, random_state=42, stratify=y_train
)

# baseline: logistic regression trained on the labelled half only
model = LogisticRegression().fit(X_train_lab, y_train_lab)

# predict the held-out test set
yhat = model.predict(X_test)

# fraction of test samples classified correctly
score = accuracy_score(y_test, yhat)

# report the accuracy as a percentage
print('Accuracy: %.3f' % (score * 100))

Accuracy: 88.600


Label Propagation for Semi-Supervised Learning

In [5]:
from numpy import concatenate
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.semi_supervised import LabelPropagation

# build a synthetic 2-feature classification problem with 1000 samples
X, y = make_classification(
    n_samples=1000,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    random_state=42,
)

# hold out half of the samples as the test set, keeping class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42, stratify=y
)

# halve the training portion again: one half keeps its labels, the other
# half (the *_unlab names) is treated as unlabelled training data
X_train_lab, X_test_unlab, y_train_lab, y_test_unlab = train_test_split(
    X_train, y_train, test_size=0.5, random_state=42, stratify=y_train
)

# stack labelled rows first, then the unlabelled rows
X_train_mixed = concatenate((X_train_lab, X_test_unlab))

# one -1 placeholder per unlabelled row; the true y_test_unlab values are
# deliberately withheld from the model
nolabel = [-1] * len(y_test_unlab)

# label vector in the same row order as X_train_mixed
y_train_mixed = concatenate((y_train_lab, nolabel))

# semi-supervised model fitted on labelled + unlabelled rows together
model = LabelPropagation()
model.fit(X_train_mixed, y_train_mixed)

# predict the held-out test set
yhat = model.predict(X_test)

# fraction of test samples classified correctly
score = accuracy_score(y_test, yhat)

# report the accuracy as a percentage
print('Accuracy: %.3f' % (score * 100))

Accuracy: 92.800


Alternate Approach - Using the estimated labels for the training dataset to fit a supervised learning model

In [6]:
from numpy import concatenate
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.semi_supervised import LabelPropagation
from sklearn.linear_model import LogisticRegression

# build a synthetic 2-feature classification problem with 1000 samples
X, y = make_classification(n_samples=1000, n_features=2, n_informative=2, n_redundant=0, random_state=42)

# hold out half of the samples as the test set, keeping class proportions
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50, random_state=42, stratify=y)

# halve the training portion again: one half keeps its labels, the other
# half (the *_unlab names) is treated as unlabelled training data
X_train_lab, X_test_unlab, y_train_lab, y_test_unlab = train_test_split(X_train, y_train, test_size=0.50, random_state=42, stratify=y_train)

# stack labelled rows first, then the unlabelled rows
X_train_mixed = concatenate((X_train_lab, X_test_unlab))

# one -1 placeholder per unlabelled row; the true y_test_unlab values are
# deliberately withheld from the model
nolabel = [-1] * len(y_test_unlab)

# label vector in the same row order as X_train_mixed
y_train_mixed = concatenate((y_train_lab, nolabel))

# semi-supervised model used only to estimate labels for the training rows
model = LabelPropagation()

# fit on labelled + unlabelled rows together
model.fit(X_train_mixed, y_train_mixed)

# labels the fitted model assigned to every row of X_train_mixed
trans_labels = model.transduction_

# supervised model that will be trained on the estimated labels
model2 = LogisticRegression()

# fit on the full training set using the propagated labels as targets
model2.fit(X_train_mixed, trans_labels)

# predict the held-out test set with the supervised model
yhat = model2.predict(X_test)

# accuracy_score expects (y_true, y_pred); the arguments were previously
# swapped, which is harmless for accuracy but wrong for asymmetric metrics
score = accuracy_score(y_test, yhat)

# report the accuracy as a percentage
print('Accuracy: %.3f' % (score*100))

Accuracy: 88.400
