## Importing the libraries

In [1]:
import numpy as np 
import pandas as pd
import plotly.express as px
import unidecode

from tensorflow import keras
from keras.preprocessing.text import text_to_word_sequence

import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.preprocessing import FunctionTransformer
from sklearn.semi_supervised import LabelSpreading , LabelPropagation
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score,accuracy_score
from simcse import SimCSE
import re
import string
import nltk 
from sklearn.linear_model import SGDClassifier
import pandas as pd
import torch

## Importing the dataset

In [2]:
# Locations of the pre-cleaned train / test splits (relative to the notebook).
TRAIN_CSV = "dataset/clean_train.csv"
TEST_CSV = "dataset/clean_test.csv"

# Both files are decoded as latin-1.
train = pd.read_csv(TRAIN_CSV, encoding='latin-1')
test = pd.read_csv(TEST_CSV, encoding='latin-1')

In [3]:
# Report (rows, columns) for the train split, then the test split.
for frame in (train, test):
    print(frame.shape)

(41157, 6)
(41157, 6)


In [4]:
# Preview the first rows of the training frame as loaded from disk.
train.head()

Unnamed: 0,UserName,ScreenName,TweetAt,OriginalTweet,Sentiment,label
0,3799,48751,16-03-2020,menyrbi phil gahan chrisitv http ifzfanpa http...,Neutral,0
1,3800,48752,16-03-2020,advic talk neighbour famili exchang phone numb...,Positive,1
2,3801,48753,16-03-2020,coronaviru australia woolworth elderli disabl ...,Positive,1
3,3802,48754,16-03-2020,food stock dont panic food need stay calm stay...,Positive,1
4,3803,48755,16-03-2020,readi supermarket covid outbreak im paranoid f...,Extremely Negative,2


In [5]:
# Keep only the tweet text and its sentiment. errors='ignore' makes the cell
# safe to re-run: without it a second execution raises KeyError because the
# columns have already been removed.
train = train.drop(
    columns=['TweetAt', 'ScreenName', 'UserName', 'label'],
    errors='ignore',
)
train.head()

Unnamed: 0,OriginalTweet,Sentiment
0,menyrbi phil gahan chrisitv http ifzfanpa http...,Neutral
1,advic talk neighbour famili exchang phone numb...,Positive
2,coronaviru australia woolworth elderli disabl ...,Positive
3,food stock dont panic food need stay calm stay...,Positive
4,readi supermarket covid outbreak im paranoid f...,Extremely Negative


In [6]:
# Same column pruning for the test split. errors='ignore' makes the cell
# safe to re-run: without it a second execution raises KeyError because the
# columns have already been removed.
test = test.drop(
    columns=['TweetAt', 'ScreenName', 'UserName', 'label'],
    errors='ignore',
)
test.head()

Unnamed: 0,OriginalTweet,Sentiment
0,menyrbi phil gahan chrisitv http ifzfanpa http...,Neutral
1,advic talk neighbour famili exchang phone numb...,Positive
2,coronaviru australia woolworth elderli disabl ...,Positive
3,food stock dont panic food need stay calm stay...,Positive
4,readi supermarket covid outbreak im paranoid f...,Extremely Negative


In [7]:
# Distinct sentiment classes in the training data and how many there are.
train['Sentiment'].unique(), train['Sentiment'].nunique()

(array(['Neutral', 'Positive', 'Extremely Negative', 'Negative',
        'Extremely Positive'], dtype=object),
 5)

> **Sentiment is categorized into 5 types, so we assign a number to each type to make the data easier to work with.**

In [8]:
# Ordinal encoding of the five sentiment classes, from most negative (1)
# to most positive (5). The outer key restricts the replacement to the
# 'Sentiment' column.
labels = {
    'Sentiment': {
        'Extremely Negative': 1,
        'Negative': 2,
        'Neutral': 3,
        'Positive': 4,
        'Extremely Positive': 5,
    }
}
train = train.replace(labels)
test = test.replace(labels)

In [9]:
# Check that the Sentiment column of the training frame now holds the numeric codes.
train.head()

Unnamed: 0,OriginalTweet,Sentiment
0,menyrbi phil gahan chrisitv http ifzfanpa http...,3
1,advic talk neighbour famili exchang phone numb...,4
2,coronaviru australia woolworth elderli disabl ...,4
3,food stock dont panic food need stay calm stay...,4
4,readi supermarket covid outbreak im paranoid f...,1


In [10]:
# Check that the Sentiment column of the test frame now holds the numeric codes.
test.head()

Unnamed: 0,OriginalTweet,Sentiment
0,menyrbi phil gahan chrisitv http ifzfanpa http...,3
1,advic talk neighbour famili exchang phone numb...,4
2,coronaviru australia woolworth elderli disabl ...,4
3,food stock dont panic food need stay calm stay...,4
4,readi supermarket covid outbreak im paranoid f...,1


> **So I labeled the data as:**
* 1 - Extremely Negative
* 2 - Negative
* 3 - Neutral
* 4 - Positive
* 5 - Extremely Positive


## data cleaning

In [11]:
# removing emails from the text
def remove_emails(text):
    """Strip every whitespace-delimited token that contains an '@'.

    This removes e-mail addresses and, because the pattern only requires an
    '@' somewhere in the token, Twitter-style @mentions as well. One
    whitespace character following the token is removed with it.

    Args:
        text: The string to clean.

    Returns:
        The string with all '@' tokens removed.
    """
    # Raw string: '\S' / '\s' in a normal literal are invalid escape
    # sequences and trigger a SyntaxWarning on recent Python versions.
    return re.sub(r'\S*@\S*\s?', '', text)

# remove duplicate spaces and new lines
def remove_spaces(text):
    """Collapse every run of whitespace (spaces, tabs, newlines) to one space.

    Args:
        text: Either a single string or an iterable of strings.

    Returns:
        A string when given a string; otherwise a list with each element
        collapsed individually.
    """
    def _collapse(chunk):
        return " ".join(re.split(r"\s+", chunk, flags=re.UNICODE))

    # A plain string must be handled as a whole. Iterating over it yields
    # single characters, which never contain a *run* of whitespace, so
    # nothing would be collapsed and a list of characters would be returned.
    if isinstance(text, str):
        return _collapse(text)
    return [_collapse(word) for word in text]

# removing punctuations
def remove_punct(text):
    """Drop ASCII punctuation and digits.

    Args:
        text: A string, or any iterable of string pieces; the pieces that
            survive the punctuation filter are concatenated.

    Returns:
        A single string without punctuation pieces and without digit runs.
    """
    kept = []
    for piece in text:
        if piece in string.punctuation:
            continue
        kept.append(piece)
    return re.sub('[0-9]+', '', "".join(kept))


    
# tokenization
def tokenization(text):
    """Split text into word tokens on runs of non-word characters.

    Args:
        text: The string to tokenize.

    Returns:
        A list of non-empty tokens; an empty list for empty input.
    """
    # re.split yields '' for empty input and at leading/trailing separators;
    # those empty tokens carry no information, so they are filtered out.
    return [token for token in re.split(r'\W+', text) if token]

# removing stopwords
def remove_stopwords(text):
    """Filter NLTK's English stop words out of a token list.

    Args:
        text: An iterable of tokens.

    Returns:
        A list of the tokens that are not English stop words, in order.
    """
    # Fetch the stop-word list once and keep it on the function object as a
    # frozenset: this avoids asking NLTK for the list on every call and
    # turns each membership test into a hash lookup instead of a list scan.
    stopword = getattr(remove_stopwords, "_stopword_cache", None)
    if stopword is None:
        stopword = frozenset(nltk.corpus.stopwords.words('english'))
        remove_stopwords._stopword_cache = stopword
    return [word for word in text if word not in stopword]

# text lemmatization
def lemmatizer(text):
    """Lemmatize every token with NLTK's WordNetLemmatizer.

    Args:
        text: An iterable of tokens.

    Returns:
        A list with the lemmatized form of each token, in order.
    """
    lemmatize = nltk.WordNetLemmatizer().lemmatize
    return list(map(lemmatize, text))

In [12]:
def clean_text(text):
    """Run one tweet through the full cleaning pipeline.

    The input is coerced to ``str`` and lower-cased, then passed through
    each step in order; the surviving tokens are joined with single spaces.

    Args:
        text: The raw value from the tweet column (coerced with ``str``).

    Returns:
        The cleaned tweet as a single space-separated string.
    """
    pipeline = (
        remove_emails,      # drop tokens containing '@'
        remove_spaces,      # whitespace normalisation
        remove_punct,       # drop punctuation and digits
        tokenization,       # string -> list of tokens
        remove_stopwords,   # drop English stop words
        lemmatizer,         # reduce tokens to their lemma
    )
    current = str(text).lower()
    for step in pipeline:
        current = step(current)
    return " ".join(current)

In [13]:
# Look at one tweet (index label 2) before the cleaning pipeline is applied.
train.loc[2, 'OriginalTweet']

'coronaviru australia woolworth elderli disabl dedic shop hour amid covid outbreak http bincavpp'

In [14]:
# Column dtypes of the training frame.
train.dtypes

OriginalTweet    object
Sentiment         int64
dtype: object

In [15]:
# Run the cleaning pipeline over the tweet column of both splits.
for frame in (train, test):
    frame['OriginalTweet'] = frame['OriginalTweet'].map(clean_text)

In [16]:
# Preview the training frame after the cleaning pipeline has been applied.
train.head()

Unnamed: 0,OriginalTweet,Sentiment
0,menyrbi phil gahan chrisitv http ifzfanpa http...,3
1,advic talk neighbour famili exchang phone numb...,4
2,coronaviru australia woolworth elderli disabl ...,4
3,food stock dont panic food need stay calm stay...,4
4,readi supermarket covid outbreak im paranoid f...,1


## Splitting the dataset

In [17]:
# Pull the tweet texts (features) and sentiment codes (targets) out of each
# frame as plain Python lists.
X_train, y_train = list(train['OriginalTweet']), list(train['Sentiment'])
X_test, y_test = list(test['OriginalTweet']), list(test['Sentiment'])

## Creating the embeddings model and transforming the data

In [18]:
# loading SimCSE embeddings model
# The checkpoint is referenced by name; presumably it is downloaded on first
# use — TODO confirm behaviour for offline runs.
embeddings_model = SimCSE("princeton-nlp/sup-simcse-bert-base-uncased")

def encoding_with_embeddings(model, data):
    """Encode ``data`` with ``model``.

    Args:
        model: Any object exposing an ``encode(data)`` method.
        data: The inputs to embed; passed to ``model.encode`` unchanged.

    Returns:
        Whatever ``model.encode(data)`` returns.
    """
    return model.encode(data)
# Embed both splits. NOTE: X_train / X_test are rebound from lists of text to
# the encoder output, so this cell must not be re-run on its own; re-run the
# splitting cell first to restore the text lists.
X_train = encoding_with_embeddings(embeddings_model,X_train)
X_test = encoding_with_embeddings(embeddings_model,X_test)

100%|█████████████████████████████████████████| 644/644 [06:06<00:00,  1.76it/s]
100%|█████████████████████████████████████████| 644/644 [06:30<00:00,  1.65it/s]


## Creating unlabeled data

In [19]:
# Convert both label containers to NumPy arrays so they support boolean-mask
# indexing and assignment.
y_train, y_test = (np.array(split_labels) for split_labels in (y_train, y_test))

In [20]:
# Keep the labels of roughly 20% of the training rows; the rest are hidden.
mask_percentage = 0.2
# Seeded generator so the labeled subset is identical on every run.
mask_rng = np.random.RandomState(42)
y_mask = mask_rng.rand(len(y_train)) < mask_percentage

# Work on a copy. A plain assignment would only alias y_train, so writing -1
# below would also wipe the true labels that the fully supervised baseline
# needs.
y_train_unlabeled = y_train.copy()
# set the non-masked subset to be unlabeled (-1 is the "unlabeled" marker
# counted by eval_and_print_metrics)
y_train_unlabeled[~y_mask] = -1

print(f"LabelSpreading on {mask_percentage*100}% of the data (rest is unlabeled):")

# X_20 and y_20 are the labeled subset of the *training* data selected by the
# mask. Features and labels must come from the same split and the same rows,
# so both are filtered with y_mask.
X_20 = torch.Tensor([x.tolist() for x, keep in zip(X_train, y_mask) if keep])
y_20 = torch.Tensor(y_train[y_mask].tolist())

LabelSpreading on 20.0% of the data (rest is unlabeled):


## creating the machine learning models

In [21]:
# Label Propagation semi-supervised models, one per graph kernel.
# NOTE(review): the rbf kernel works on a dense n_samples x n_samples matrix;
# with ~41k training rows that is presumably very memory-hungry — confirm
# before running the rbf variants.
lp_model1 = LabelPropagation(kernel='knn')
lp_model2 = LabelPropagation(kernel='rbf')

# Label Spreading semi-supervised models, one per graph kernel.
ls_model1 = LabelSpreading(kernel='knn')
ls_model2 = LabelSpreading(kernel='rbf')

# SGD supervised model (logistic regression objective).
# "log_loss" is the current name of the logistic loss; the old spelling "log"
# was deprecated in scikit-learn 1.1 and later removed. On scikit-learn < 1.1
# use loss="log" instead.
sgd_model = SGDClassifier(alpha=1e-5, penalty="l2", loss="log_loss")

## Evaluating the models

In [22]:
# function for training and evaluating each model
def eval_and_print_metrics(clf, X_train, y_train, X_test, y_test):
    """Fit ``clf`` and print its accuracy on the test split.

    Args:
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        y_train: Training targets; entries equal to -1 are counted as
            unlabeled in the printed summary.
        X_test: Test features.
        y_test: True test targets.

    Returns:
        None. Everything is reported on stdout.
    """
    print("Number of training samples:", len(X_train))
    n_unlabeled = sum(1 for label in y_train if label == -1)
    print("Unlabeled samples in training set:", n_unlabeled)

    # Train, then score the predictions on the held-out data.
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    score = accuracy_score(y_test, predictions)

    print("accuracy score on test set: %0.3f" % score)
    print("-" * 10)
    print()

### Label Propagation semi-supervised Model

In [None]:
# Label Propagation with the kNN kernel, trained on the partially labeled targets.
eval_and_print_metrics(lp_model1, X_train, y_train_unlabeled, X_test, y_test)

In [None]:
# Label Propagation with the RBF kernel, trained on the partially labeled targets.
eval_and_print_metrics(lp_model2, X_train, y_train_unlabeled, X_test, y_test)

### Label Spreading semi-supervised Model

In [None]:
# Label Spreading with the kNN kernel, trained on the partially labeled targets.
eval_and_print_metrics(ls_model1, X_train, y_train_unlabeled, X_test, y_test)

In [None]:
# Label Spreading with the RBF kernel, trained on the partially labeled targets.
eval_and_print_metrics(ls_model2, X_train, y_train_unlabeled, X_test, y_test)

### SGD supervised Model

In [None]:
# Supervised baseline: SGD classifier fitted only on the labeled subset
# (X_20, y_20) and scored on the test split.
eval_and_print_metrics(sgd_model, X_20, y_20, X_test, y_test)

In [None]:
# Supervised reference: the same SGD estimator object re-fitted on X_train
# with y_train and scored on the test split.
eval_and_print_metrics(sgd_model, X_train, y_train, X_test, y_test)