# Semisupervised Learning

## Label Propagation Algorithm

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.semi_supervised import LabelPropagation
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so it would also hide any warning raised
# while fitting the models below (e.g. a non-convergence warning) — consider
# narrowing it to the specific categories that are known to be noise.
warnings.filterwarnings('ignore')

In [None]:
# Load the CSV, drop the 'SessionLabel2' column, then drop whatever the first three
# remaining columns are. The order matters: the positional drop is applied to the
# frame that no longer contains 'SessionLabel2'.
df = (
    pd.read_csv('data.csv', low_memory=False)
    .drop(columns='SessionLabel2')
    .pipe(lambda frame: frame.drop(columns=frame.columns[:3]))
)
# Bare expression so the notebook renders the resulting frame.
df

In [None]:
# Number of rows per distinct value of the target column 'SessionLabel'.
df.loc[:, 'SessionLabel'].value_counts()

In [None]:
# Target vector: the 'SessionLabel' column as a NumPy array.
y = df['SessionLabel'].to_numpy()
# Feature matrix: the four *_mean columns, in this fixed order.
feature_columns = ['ACC_mean', 'BVP_mean', 'EDA_mean', 'TEMP_mean']
X = df[feature_columns].to_numpy()

In [None]:
# Hold out 30% of the rows as the final test set. The split is stratified on the
# label so the test set keeps the class proportions of the full data, consistent
# with the stratified labelled/unlabelled split applied to the training portion.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
# Split the training portion in half: one half keeps its labels, the other half is
# the pool that will be treated as unlabelled. Stratifying on y_train keeps the
# class mix the same in both halves.
X_labelled, X_unlabelled, y_labelled, y_unlabelled = train_test_split(
    X_train,
    y_train,
    test_size=0.50,
    stratify=y_train,
    random_state=1,
)

In [None]:
# Print the feature-matrix shape and target shape of each split: the two halves of
# the training portion first, then the held-out test set.
for title, features, targets in (
    ('Labelled Train Set:', X_labelled, y_labelled),
    ('Unlabelled Train Set:', X_unlabelled, y_unlabelled),
    ('Test Set:', X_test, y_test),
):
    print(title, features.shape, targets.shape)

In [None]:
# Stack the feature rows: labelled samples first, unlabelled samples after them.
X_train_mixed = np.concatenate((X_labelled, X_unlabelled))
# Targets for the unlabelled rows. -1 is not an arbitrary value: it is the marker
# scikit-learn's semi-supervised estimators use for "this sample has no label",
# so every unlabelled row gets -1 in place of its real class.
nolabel = np.full(len(y_unlabelled), -1)
# Targets in the same row order as X_train_mixed: true labels, then the -1 markers.
y_train_mixed = np.concatenate((y_labelled, nolabel))
# Label propagation over a k-nearest-neighbour graph with 7 neighbours;
# n_jobs=-1 asks for all available processors.
model = LabelPropagation(kernel='knn', n_neighbors=7, n_jobs=-1)

In [None]:
# (rows, columns) of the combined labelled + unlabelled feature matrix.
np.shape(X_train_mixed)

In [None]:
# Row counts in the mixed targets for classes 1, 2 and 0 (in that order);
# rows carrying the -1 "unlabelled" marker are not counted.
tuple(int(np.count_nonzero(y_train_mixed == label)) for label in (1, 2, 0))

In [None]:
# Fit label propagation on the mixed training set (true labels plus -1 markers).
# fit() returns the fitted estimator itself, so `hist` refers to the same object
# as `model`; it is not a training history.
hist = model.fit(X=X_train_mixed, y=y_train_mixed)
# transduction_ holds one label per training row, in the row order of X_train_mixed.
pseudo_labels = hist.transduction_

In [None]:
# Ground-truth labels for the whole training pool, ordered like X_train_mixed:
# the labelled rows first, then the rows whose labels were hidden from the model.
y_check = np.concatenate([y_labelled, y_unlabelled], axis=0)

In [None]:
# Confusion matrix of the propagated labels, computed on the unlabelled rows only.
# Those rows were stacked after the labelled ones, so they are the tail of
# pseudo_labels. The labelled rows are left out on purpose: their true labels were
# handed to the model, so counting them would inflate the diagonal without
# measuring how well the labels propagated.
confusion_matrix(y_unlabelled, pseudo_labels[len(y_labelled):])

In [None]:
# Per-class precision/recall/F1 of the propagated labels, computed on the unlabelled
# rows only (the tail of pseudo_labels, since those rows were stacked after the
# labelled ones). The labelled rows are excluded because their true labels were
# given to the model, so including them would overstate every metric.
print(classification_report(y_unlabelled, pseudo_labels[len(y_labelled):]))

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Shallow random forest (depth 2, fixed seed) trained on the whole training pool,
# using the propagated labels as targets instead of the true ones.
model2 = RandomForestClassifier(max_depth=2, random_state=0).fit(X_train_mixed, pseudo_labels)
# Predict the held-out test rows and compare against their true labels.
yhat = model2.predict(X_test)
score = accuracy_score(y_true=y_test, y_pred=yhat)
# The accuracy is printed as a percentage with three decimals.
print('Accuracy: %.3f' % (100 * score))