## Importing libraries

In [1]:
import numpy as np

from sklearn.datasets import fetch_20newsgroups
from sklearn.preprocessing import FunctionTransformer
from sklearn.semi_supervised import LabelSpreading , LabelPropagation
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score,accuracy_score
from simcse import SimCSE
import re
import string
import nltk 
from sklearn.linear_model import SGDClassifier
import pandas as pd
import torch


## Importing dataset

In [2]:
# Loading dataset containing first five categories
data = fetch_20newsgroups(
    subset="train",
    categories=[
        "alt.atheism",
        "comp.graphics",
        "comp.os.ms-windows.misc",
        "comp.sys.ibm.pc.hardware",
        "comp.sys.mac.hardware",
    ],
)
print("%d documents" % len(data.filenames))
print("%d categories" % len(data.target_names))
print()

2823 documents
5 categories



In [3]:
X, y = data.data, data.target

## data cleaning

In [4]:
# removing emails from the text
def remove_emails(text):
    text = re.sub('\S*@\S*\s?', '', text)
    return text

# remove duplicate spaces and new lines
def remove_spaces(text):
    text = [" ".join(re.split("\s+", word, flags=re.UNICODE)) for word in text]
    return text

# removing punctuations
def remove_punct(text):
    text  = "".join([char for char in text if char not in string.punctuation])
    text = re.sub('[0-9]+', '', text)
    return text


    
# tokenization
def tokenization(text):
    text = re.split('\W+', text)
    return text

# removing stopwords
def remove_stopwords(text):
    stopword = nltk.corpus.stopwords.words('english')
    text = [word for word in text if word not in stopword]
    return text

# texting Lemmitization
def lemmatizer(text):
    wn = nltk.WordNetLemmatizer()
    text = [wn.lemmatize(word) for word in text]
    return text

In [5]:
def clean_text(text):
    # convert the text to lowercase
    text_lower = text.lower() 
    
    # removing emails from the text
    text_without_emails = remove_emails(text_lower)
    
    # remove duplicate spaces and new lines
    text_without_spaces = remove_spaces(text_without_emails)
    
    # remove punctuation
    text_without_punctuations = remove_punct(text_without_spaces)
    
    # text tokenization
    tokens = tokenization(text_without_punctuations)   
    
    # removing stopwords 
    text_without_stopwords = remove_stopwords(tokens)
    
    # text Lemmitization
    text_clean = lemmatizer(text_without_stopwords)
    
    return " ".join(text_clean)

In [6]:
X = [clean_text(x) for x in X]
X[0]

'brian v hughes subject new apple ergomouse replyto organization dartmouth college hanover nh disclaimer personally really dont care think speak moderator recartscomicsinfo line schizophrenia mean never alone writes anyone know open apple ergomouse adb mouse ii mine life near cat true really pick fur tell look like apple welded shut must tried hard opend mine second take look bottom dial turn open much like older adb mouse used bit harder turn first quite simple open also anyone know installing fpus mac lc iii ive heard people saying fried motherboard lc iii well dont match pin correctly problem close look socket give idea proper orientation chip hades '

## Creating the embeddings model and transforming the data

In [7]:
# loading SimCSE embeddings model
embeddings_model = SimCSE("princeton-nlp/sup-simcse-bert-base-uncased")

def encoding_with_embeddings(model,data):
    embeddings = model.encode(data)
    return embeddings
X = encoding_with_embeddings(embeddings_model,X)

  return torch._C._cuda_getDeviceCount() > 0
100%|███████████████████████████████████████████| 45/45 [04:54<00:00,  6.54s/it]


## Splitting the dataset

### Spliting into training data and test data

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

### Creating unlabeled data

In [9]:
# select a mask of 20% of the train dataset
mask_percentage = 0.2
y_mask = np.random.rand(len(y_train)) < mask_percentage


# set the non-masked subset to be unlabeled
y_train[~y_mask] = -1

print(f"LabelSpreading on {mask_percentage*100}% of the data (rest is unlabeled):")

# X_20 and y_20 are the subset of the train dataset indicated by the mask
X_20 = []
y_20 = []
for x, y in zip(X_test, y_train):
    if y != -1:
        X_20.append(x.tolist())
        y_20.append(y.tolist())
X_20 = torch.Tensor(X_20)
y_20 = torch.Tensor(y_20)

LabelSpreading on 20.0% of the data (rest is unlabeled):


## creating the machine learning models

In [10]:
# Label Propagation semi-supervised Model
lp_model1 = LabelPropagation(gamma=10)
lp_model2 = LabelPropagation(gamma=20)
lp_model3 = LabelPropagation(gamma=30)
lp_model4 = LabelPropagation(gamma=40)
lp_model5 = LabelPropagation(gamma=50)



# Label Spreading semi-supervised Model
ls_model1 = LabelSpreading(gamma=10)
ls_model2 = LabelSpreading(gamma=20)
ls_model3 = LabelSpreading(gamma=30)
ls_model4 = LabelSpreading(gamma=40)
ls_model5 = LabelSpreading(gamma=50)

# SGD supervised model
sgd_model = SGDClassifier(alpha=1e-5, penalty="l2", loss="log")

## Evaluating the models

In [11]:
# function for training and evaluating each model
def eval_and_print_metrics(clf, X_train, y_train, X_test, y_test):
    print("Number of training samples:", len(X_train))
    print("Unlabeled samples in training set:", sum(1 for x in y_train if x == -1))
    
    # training the model
    clf.fit(X_train, y_train)
    
    # making predictions
    y_pred = clf.predict(X_test)
    print(
        "accuracy score on test set: %0.3f"
        % accuracy_score(y_test, y_pred)
    )
    print("-" * 10)
    print()

### Label Propagation semi-supervised Model

In [12]:
eval_and_print_metrics(lp_model1, X_train, y_train, X_test, y_test)

Number of training samples: 2117
Unlabeled samples in training set: 1667
accuracy score on test set: 0.649
----------



In [13]:
eval_and_print_metrics(lp_model2, X_train, y_train, X_test, y_test)

Number of training samples: 2117
Unlabeled samples in training set: 1667
accuracy score on test set: 0.707
----------





In [14]:
eval_and_print_metrics(lp_model3, X_train, y_train, X_test, y_test)

Number of training samples: 2117
Unlabeled samples in training set: 1667
accuracy score on test set: 0.707
----------





In [15]:
eval_and_print_metrics(lp_model4, X_train, y_train, X_test, y_test)

Number of training samples: 2117
Unlabeled samples in training set: 1667
accuracy score on test set: 0.698
----------





In [16]:
eval_and_print_metrics(lp_model5, X_train, y_train, X_test, y_test)

Number of training samples: 2117
Unlabeled samples in training set: 1667
accuracy score on test set: 0.693
----------





### Label Spreading semi-supervised Model

In [17]:
eval_and_print_metrics(ls_model1, X_train, y_train, X_test, y_test)

Number of training samples: 2117
Unlabeled samples in training set: 1667
accuracy score on test set: 0.680
----------



In [18]:
eval_and_print_metrics(ls_model2, X_train, y_train, X_test, y_test)

Number of training samples: 2117
Unlabeled samples in training set: 1667
accuracy score on test set: 0.711
----------



In [19]:
eval_and_print_metrics(ls_model3, X_train, y_train, X_test, y_test)

Number of training samples: 2117
Unlabeled samples in training set: 1667
accuracy score on test set: 0.684
----------



In [20]:
eval_and_print_metrics(ls_model4, X_train, y_train, X_test, y_test)

Number of training samples: 2117
Unlabeled samples in training set: 1667
accuracy score on test set: 0.664
----------



In [21]:
eval_and_print_metrics(ls_model5, X_train, y_train, X_test, y_test)

Number of training samples: 2117
Unlabeled samples in training set: 1667
accuracy score on test set: 0.656
----------



### SGD supervised Model

In [22]:
eval_and_print_metrics(sgd_model, X_20, y_20, X_test, y_test)

Number of training samples: 137
Unlabeled samples in training set: 0
accuracy score on test set: 0.217
----------

