In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import Tokenizer,  CountVectorizer, IDF
import os, sys, re, uuid, time, nltk, pandas as pd, numpy as np, sklearn
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
# Walk the Kaggle input directory and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)


In [None]:
from gc import collect
from IPython.display import clear_output
import nltk

# Refresh the NLTK package index before downloading anything.
# NOTE(review): `_update_index` is a private NLTK API and `dler` is a separate
# Downloader instance from the one `nltk.download` uses - confirm this refresh
# is still required.
dler = nltk.downloader.Downloader()
dler._update_index()

# WordNet is read by WordNetLemmatizer and the stop-word corpus by
# stopwords.words('english'); both are used by the preprocessing code that runs
# before any later download cell, so fetch them here.
for corpus in ('wordnet', 'stopwords'):
    nltk.download(corpus)

# Drop the downloader chatter from the cell output.
clear_output()

# Run a garbage-collection pass for each of the three generations.
for generation in range(3):
    collect(generation)

In [None]:
!pip install pyspark

In [None]:
class Config:
    """
    Hyper-parameters, paths and the accuracy metric shared by the notebook.
    """
    TRAIN_SIZE = 0.7             # fraction of the rows kept for training
    MAX_NB_WORDS = 100000        # stored by the Preprocessing class as `max_words`
    MAX_SEQUENCE_LENGTH = 25     # length every token sequence is padded to
    PATH = "/kaggle/input/preprocessed-sentiment/preprocessed.csv"
    NUM_EPOCHS = 10
    HIDDEN_SIZE = 128            # embedding size and LSTM hidden size
    OUTPUT_SIZE = 1              # a single sigmoid output unit
    BATCH_SIZE = 64
    LR = 0.001
    LSTM_LAYERS = 5
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    @staticmethod
    def METRIC(outputs, labels):
        """
        Binary classification accuracy metric.

        Parameters:
            outputs: model outputs. Either one probability per sample
                (shape ``(batch,)`` or ``(batch, 1)``) or one score per class
                (shape ``(batch, n_classes)``).
            labels: ground-truth labels with one entry per sample.
        Returns:
            The number of correctly classified samples as a Python float.
        """
        targets = labels.reshape(-1)
        if outputs.dim() > 1 and outputs.size(-1) > 1:
            # One score per class: the prediction is the best-scoring class.
            predictions = outputs.argmax(1).reshape(-1)
        else:
            # A single sigmoid output: argmax over one column is always 0, and
            # comparing (batch,) with (batch, 1) broadcasts to (batch, batch),
            # so threshold the probability and compare element by element.
            predictions = (outputs.reshape(-1) >= 0.5).to(targets.dtype)
        return (predictions == targets).type(torch.float).sum().item()

class TextPreprocessor:
    """
    Text-cleaning helpers. Every helper is a static method and is meant to be
    called on the class itself, e.g. ``TextPreprocessor.preprocess_text(text)``.
    """

    # NLTK resources are created on first use and then reused, instead of being
    # rebuilt for every single text.
    _stop_words = None
    _stemmer = None
    _lemmatizer = None

    def __init__(self):
        pass

    @staticmethod
    def text_cleaning2(text: str) -> str:
        """
        Lower-cases the text, removes @mentions and URLs and collapses every
        run of non-alphanumeric characters into a single space.
        """
        # Mentions and URLs are removed in their own pass: with a single
        # alternation the "non-alphanumeric" branch consumes the " @" in front
        # of a mention first, which leaves the user name in the text.
        text = re.sub(r"@\S+|https?:\S+|http?:\S", " ", text.lower())
        return re.sub(r"[^A-Za-z0-9]+", " ", text).strip()

    @staticmethod
    def text_cleaning(text: str) -> str:
        """
        Cleans the text: removes links, user mentions, punctuation, digits and
        hashtags, then lower-cases the result.
        """
        text = re.sub(r'http\S+', '', text)
        text = re.sub(r'bit\.ly/\S+', '', text)
        # str.strip('[link]') strips the *characters* [, l, i, n, k, ] from both
        # ends of the text; the placeholder itself has to be replaced.
        text = text.replace('[link]', '')
        # remove users ('-' goes last in the class so it is not read as a range)
        text = re.sub(r'(RT\s@[A-Za-z]+[A-Za-z0-9_-]+)', '', text)
        text = re.sub(r'(@[A-Za-z]+[A-Za-z0-9_-]+)', '', text)
        # remove punctuation (escaped so no character is read as regex syntax)
        my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@â'
        text = re.sub('[' + re.escape(my_punctuation) + ']+', ' ', text)
        # remove numbers
        text = re.sub(r'([0-9]+)', '', text)
        # remove hashtags
        text = re.sub(r'(#[A-Za-z]+[A-Za-z0-9_-]+)', '', text)
        return text.lower()

    @staticmethod
    def remove_stopwords(text: str) -> str:
        """
        Removes the English stopwords from the text.
        """
        if TextPreprocessor._stop_words is None:
            # A frozenset gives constant-time membership tests.
            TextPreprocessor._stop_words = frozenset(stopwords.words('english'))
        stop_words = TextPreprocessor._stop_words
        return ' '.join(word for word in text.split() if word not in stop_words)

    @staticmethod
    def stemming(text: str) -> str:
        """
        Stems every word of the text with the English Snowball stemmer.
        """
        if TextPreprocessor._stemmer is None:
            TextPreprocessor._stemmer = SnowballStemmer("english")
        stemmer = TextPreprocessor._stemmer
        return ' '.join(stemmer.stem(word) for word in text.split())

    @staticmethod
    def lemmatization(text: str) -> str:
        """
        Lemmatization is the process of grouping together the inflected forms of a word.
        Every word is lemmatized as a verb.
        Parameters:
            text: str
        """
        if TextPreprocessor._lemmatizer is None:
            TextPreprocessor._lemmatizer = WordNetLemmatizer()
        lemmatizer = TextPreprocessor._lemmatizer
        return ' '.join(lemmatizer.lemmatize(word, 'v') for word in text.split())

    @staticmethod
    def tokenize(df: "DataFrame", i_p: str) -> "DataFrame":
        """
        Tokenizes a text column of a Spark dataframe into a ``words`` column.
        Parameters:
            df: DataFrame
            i_p: name of the input column.
        """
        # The module-level name `Tokenizer` is rebound by the later Keras
        # import, so the Spark tokenizer is imported explicitly here.
        from pyspark.ml.feature import Tokenizer as SparkTokenizer
        token = SparkTokenizer(inputCol=i_p, outputCol="words")
        return token.transform(df)

    @staticmethod
    def preprocess_text(text: str, stem=True) -> str:
        """
        Preprocesses the text: cleaning, stopword removal, then stemming or
        lemmatization.
        Parameters:
            text: the text to preprocess. Anything that is not a string (for
                example a missing value) yields an empty string.
            stem: if True, stems the text.
            else, lemmatizes the text.
        """
        if not isinstance(text, str):
            return ""
        cleaned = TextPreprocessor.remove_stopwords(TextPreprocessor.text_cleaning2(text))
        if stem:
            return TextPreprocessor.stemming(cleaned)
        return TextPreprocessor.lemmatization(cleaned)

In [None]:
class Preprocessing:
    """
    Pandas-based preprocessing pipeline.

    It works on a Pandas dataframe rather than on a PySpark one because
    converting the PySpark dataframe to Pandas produced missing values.
    """
    def __init__(self):
        """
        Copies the relevant settings from Config and loads the CSV file.
        """
        self.path = Config.PATH
        self.len = Config.MAX_SEQUENCE_LENGTH
        self.max_words = Config.MAX_NB_WORDS
        self.train_size = Config.TRAIN_SIZE
        # load_data reads self.path, so it has to come after the assignments above.
        self.df = self.load_data()

    def load_data(self):
        """
        Reads the CSV file at ``self.path`` (latin-1 encoded) into a dataframe.
        """
        frame = pd.read_csv(self.path, encoding='latin-1')
        return frame

    def preprocesss(self):
        """
        Cleans the ``text`` column (lemmatizing, not stemming) and splits the
        cleaned texts and the ``y`` labels into train and test arrays.
        """
        cleaned = self.df.text.apply(TextPreprocessor.preprocess_text, args=(False,))
        self.df['cleaned'] = cleaned
        self.target = self.df.y.to_numpy()
        self.X = self.df['cleaned'].to_numpy()
        parts = train_test_split(
            self.X,
            self.target,
            test_size=1-self.train_size,
            random_state=42,
        )
        self.X_train, self.X_test, self.y_train, self.y_test = parts
        print("Done preprocessing.")

    def prepare_tokens(self):
        """
        Creates the tokenizer and fits it on the training texts only.
        """
        self.tokenizer = Tokenizer(oov_token='UNK')
        fit = self.tokenizer.fit_on_texts
        fit(self.X_train)

    def get_tokenizer(self):
        """
        Gives access to the fitted tokenizer.
        """
        tokenizer = self.tokenizer
        return tokenizer

    def sequence_to_token(self):
        """
        Replaces the train and test texts with their integer sequences.
        """
        to_sequences = self.tokenizer.texts_to_sequences
        self.X_train = to_sequences(self.X_train)
        self.X_test = to_sequences(self.X_test)

    def padding(self):
        """
        Post-pads the train and test sequences to ``self.len`` entries.
        """
        options = dict(maxlen=self.len, padding='post')
        self.X_train = pad_sequences(self.X_train, **options)
        self.X_test = pad_sequences(self.X_test, **options)

    def adjust_outputs(self):
        """
        Reshapes the label arrays into column vectors of shape (n, 1).
        """
        for attribute in ('y_train', 'y_test'):
            column = getattr(self, attribute).reshape(-1, 1)
            setattr(self, attribute, column)

    def text2seq(self):
        """
        Runs the whole pipeline, from raw text to padded sequences.
        """
        pipeline = (
            self.preprocesss,
            self.prepare_tokens,
            self.sequence_to_token,
            self.padding,
            self.adjust_outputs,
        )
        for step in pipeline:
            step()
        print("Done text2seq.")

    def get_data(self):
        """
        Gives the data back as (X_train, X_test, y_train, y_test).
        """
        features = (self.X_train, self.X_test)
        labels = (self.y_train, self.y_test)
        return features + labels

In [None]:
import nltk

# Fetch the WordNet corpus by running NLTK's command-line downloader in a shell.
!python3 -m nltk.downloader wordnet

In [None]:
import warnings
# Suppress every Python warning from this point on.
warnings.filterwarnings('ignore')

def load_data(path, schema, CSV=True, provide_header=False):
    """
    Reads the file at `path` with the module-level Spark session `sc`.

    Parameters:
        path: location of the file to read.
        schema: schema applied when `provide_header` is True.
        CSV: read the file as CSV when True, as JSON otherwise.
        provide_header: when True the CSV is read without a header row and
            `schema` supplies the columns; when False the header row is used
            and the schema is inferred.
    Returns:
        The persisted dataframe.
    """
    reader = sc.read
    if not CSV:
        frame = reader.json(path)
    elif provide_header:
        csv_reader = reader.format("csv").option("header", "false")
        frame = csv_reader.schema(schema).load(path)
    else:
        frame = reader.csv(path, header=True, inferSchema=True)
    frame.persist()
    return frame

def show_df(df: DataFrame)-> DataFrame:
    """
    Prints the dataframe through its `show` method and hands back whatever
    that call returns.
    """
    shown = df.show()
    return shown

def describe_data(df):
    """
    Displays the summary produced by the dataframe's `describe` method.
    """
    summary = df.describe()
    return summary.show()

def check_nan(df: DataFrame) -> DataFrame:
    """
    Displays, for every column, how many of its values are NaN.
    """
    nan_counts = []
    for c in df.columns:
        # `when` without an `otherwise` yields null for non-NaN rows, and
        # `count` skips nulls, so only the NaN rows are counted.
        nan_counts.append(count(when(isnan(c), c)).alias(c))
    return df.select(nan_counts).show()

def check_nunique(df: DataFrame)-> None:
    """
    Displays the number of distinct values of every column in the dataframe.

    Parameters:
        df: the Spark dataframe to inspect.
    Returns:
        None; the counts are printed by `show`.
    """
    # `df.columns` holds plain strings, so each name is wrapped in `col`
    # (a str has no `isNotNull`); `countDistinct` counts unique values rather
    # than non-null rows.
    distinct_counts = [countDistinct(col(c)).alias(c) for c in df.columns]
    return df.select(distinct_counts).show()

def drop_columns(df: DataFrame, cols: list) -> DataFrame:
    """
    Gives back the dataframe without the columns listed in `cols`.
    """
    trimmed = df.drop(*cols)
    return trimmed

def map_label(y):
    """
    Re-codes a raw label: 0 stays 0, 4 becomes 1 and anything else becomes 2.
    """
    if y == 0:
        return 0
    if y == 4:
        return 1
    return 2

def split(df: "DataFrame", train_size: float, valid_size: float, test_size: float) -> list:
    """
    Splits the dataframe into train, validation and test sets.

    Parameters:
        df: the dataframe to split.
        train_size, valid_size, test_size: fractions of the rows for each set;
            they must add up to 1.0.
    Returns:
        The result of `df.randomSplit` for the three weights (seed 42).
    Raises:
        AssertionError: if the three sizes do not add up to 1.0.
    """
    import math

    total = train_size + valid_size + test_size
    # An exact `== 1.0` rejects valid inputs such as 0.7 + 0.2 + 0.1, which
    # adds up to 0.9999999999999999 in floating point.
    assert math.isclose(total, 1.0, rel_tol=0.0, abs_tol=1e-9), "The sum of train, valid and test sizes must be 1.0"
    return df.randomSplit([train_size, valid_size, test_size], seed=42)

def preprocess_dataframe(df: DataFrame, stem: bool=True) -> DataFrame:
    """
    Adds a `cleaned_text` column holding the cleaned version of `text`.

    This function was previously named `Preprocessing`. That name is also the
    name of the Pandas preprocessing class instantiated by
    `Executing.prepare_data`, so defining the function replaced the class and
    made `Preprocessing()` fail; the function therefore has its own name.

    Parameters:
        df: Spark dataframe with a string column named `text`.
        stem: if True the words are stemmed, otherwise they are lemmatized.
    Returns:
        The dataframe with the additional `cleaned_text` column.
    """
    # Built once on the driver and captured by the UDF closure.
    stop_words = frozenset(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    stemmer = SnowballStemmer("english")

    def text_cleaning2(text: str) -> str:
        """
        Lower-cases the text, removes @mentions and URLs and collapses every
        run of non-alphanumeric characters into a single space.
        """
        # Mentions and URLs are removed in their own pass: with a single
        # alternation the "non-alphanumeric" branch consumes the " @" in front
        # of a mention first, which leaves the user name in the text.
        text = re.sub(r"@\S+|https?:\S+|http?:\S", " ", text.lower())
        return re.sub(r"[^A-Za-z0-9]+", " ", text).strip()

    def preprocess_text(text):
        """
        Cleans one text, removes its stopwords and stems or lemmatizes it,
        depending on the `stem` argument of the enclosing function.
        """
        if text is None:
            # Keep nulls as nulls instead of failing the whole job.
            return None
        words = [word for word in text_cleaning2(text).split() if word not in stop_words]
        if stem:
            words = [stemmer.stem(word) for word in words]
        else:
            words = [lemmatizer.lemmatize(word, 'v') for word in words]
        return ' '.join(words)

    cleaned_text = udf(preprocess_text, StringType())
    return df.withColumn("cleaned_text", cleaned_text("text"))

# Make the Spark workers use the same interpreter as the notebook kernel.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
PATH = "/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv"
print("Starting the program...")
beginning = time.time()
# NOTE: despite its name, `sc` is a SparkSession; `load_data` reads through it.
sc = SparkSession.builder.master("local[*]").appName("Sentiment Analysis").getOrCreate()
# NOTE(review): Sentiment140 tweet ids appear to exceed the 32-bit range of
# IntegerType, so `ids` is declared as LongType - confirm against the data.
schema = StructType([
    StructField("y", IntegerType(), True),
    StructField("ids", LongType(), True),
    StructField("date", StringType(), True),
    StructField("flag", StringType(), True),
    StructField("user", StringType(), True),
    StructField("text", StringType(), True)])
y_schema = StructType([
    StructField("y", IntegerType(), True)])
DROPPED_COLS = ['ids', 'date', 'flag', 'user']
# The CSV has no header row, so the schema above supplies the column names.
df = load_data(PATH, schema, provide_header=True)
df = drop_columns(df, DROPPED_COLS)
df = preprocess_dataframe(df, True)
label = udf(map_label, IntegerType())
df = df.withColumn("y", label("y"))
final_df = df.toPandas()
final_df.to_csv("preprocessed.csv", index=False)
print("The program has finished. Preprocessing took {} seconds".format(time.time() - beginning))

In [None]:
class Classifier(nn.Module):
    """
    Embedding layer, stacked LSTM and two linear layers ending in a sigmoid.
    """
    def __init__(self, input_size):
        """
        Builds the layers.

        Parameters:
            input_size: number of rows of the embedding table (vocabulary size).
        """
        super().__init__()
        hidden = Config.HIDDEN_SIZE
        self.input_size = input_size
        self.hidden_dim = hidden
        self.output_size = Config.OUTPUT_SIZE
        self.LSTM_layers = Config.LSTM_LAYERS
        # Created here but not applied anywhere in forward().
        self.dropout = nn.Dropout(0.5)
        # Index 0 is the padding index.
        self.embedding = nn.Embedding(input_size, hidden, padding_idx=0)
        self.lstm = nn.LSTM(
            input_size=hidden,
            hidden_size=hidden,
            num_layers=self.LSTM_layers,
            batch_first=True,
        )
        self.fc1 = nn.Linear(in_features=hidden, out_features=128)
        self.fc2 = nn.Linear(128, self.output_size)

    def forward(self, x):
        """
        Forward pass.

        Parameters:
            x: batch of token indices (batch first).
        Returns:
            One sigmoid output per sample.
        """
        # Zero initial hidden and cell state, created on the input's device.
        state_shape = (self.LSTM_layers, x.size(0), self.hidden_dim)
        h0 = torch.zeros(state_shape, dtype=torch.float32, device=x.device)
        c0 = torch.zeros(state_shape, dtype=torch.float32, device=x.device)
        embedded = self.embedding(x)
        sequence_out, _ = self.lstm(embedded, (h0, c0))
        # Only the output of the last time step feeds the linear layers.
        last_step = sequence_out[:, -1, :]
        hidden = torch.relu(self.fc1(last_step))
        return torch.sigmoid(self.fc2(hidden))


In [None]:
class Executing:
    """
    The Execution Class: prepares the data, trains the classifier and
    validates it once per epoch.
    """
    def __init__(self):
        """
        Initializes the executing class from the values in Config.
        """
        self.batch_size = Config.BATCH_SIZE
        self.epochs = Config.NUM_EPOCHS
        self.lr = Config.LR
        self.metric = Config.METRIC
        self.device = Config.DEVICE
        self.history = []

    def on_epoch_start(self, epoch):
        """
        Prints the (1-based) number of the epoch that is about to start.
        """
        print(f'Epoch {epoch+1}/{self.epochs}')

    def prepare_batches(self):
        """
        Turns the arrays returned by `self.df.get_data()` into tensors,
        datasets and data loaders.
        """
        self.X_train, self.X_test, self.y_train, self.y_test = self.df.get_data()
        self.X_train = torch.tensor(self.X_train, dtype=torch.long)
        self.y_train = torch.tensor(self.y_train, dtype=torch.float32)
        self.X_test = torch.tensor(self.X_test, dtype=torch.long)
        self.y_test = torch.tensor(self.y_test, dtype=torch.float32)
        self.train_dataset = torch.utils.data.TensorDataset(self.X_train, self.y_train)
        self.test_dataset = torch.utils.data.TensorDataset(self.X_test, self.y_test)
        # Training batches are reshuffled every epoch; evaluation needs no
        # shuffling but does need a batch size (the default is one sample).
        self.train_loader = torch.utils.data.DataLoader(
            self.train_dataset, batch_size=self.batch_size, shuffle=True)
        self.test_loader = torch.utils.data.DataLoader(
            self.test_dataset, batch_size=self.batch_size, shuffle=False)

    def prepare_data(self):
        """
        Runs the preprocessing pipeline, builds the model and the data loaders.
        """
        start = time.time()
        self.df = Preprocessing()
        self.df.text2seq()
        tokenizer = self.df.get_tokenizer()
        # +1 because index 0 is used for padding.
        self.input_size = len(tokenizer.word_index) + 1
        self.model = Classifier(self.input_size)
        self.prepare_batches()
        print("Done preparing data, done in {:.2f} seconds".format(time.time() - start))

    def _evaluate(self, criterion):
        """
        Runs the model over the test loader without tracking gradients.

        Returns:
            (sum of the batch losses, sum of the metric over all batches).
        """
        self.model.eval()
        loss_sum, metric_sum = 0.0, 0.0
        with torch.no_grad():
            for x_test, y_test in self.test_loader:
                x_test = x_test.to(self.device)
                y_test = y_test.to(self.device)
                pred_test = self.model(x_test)
                loss_sum += criterion(pred_test, y_test).item()
                metric_sum += self.metric(pred_test, y_test)
        return loss_sum, metric_sum

    def fit(self):
        """
        Trains the model for `self.epochs` epochs and validates it after each
        epoch; one dictionary of averaged losses and accuracies per epoch is
        appended to `self.history`.
        """
        self.prepare_data()
        self.model.to(self.device)
        criterion = nn.BCELoss()
        optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)
        self.history = []
        # Number of batches per pass; never 0 so the averages cannot divide by zero.
        trainSteps = max(len(self.train_loader), 1)
        testSteps = max(len(self.test_loader), 1)
        trainSamples = max(len(self.train_loader.dataset), 1)
        testSamples = max(len(self.test_loader.dataset), 1)
        for epoch in range(0, self.epochs):
            start = time.time()
            self.on_epoch_start(epoch)
            # Validation leaves the model in eval mode, so switch back each epoch.
            self.model.train()
            train_loss, train_acc = 0.0, 0.0
            for x, y in self.train_loader:
                x = x.to(self.device)
                y = y.to(self.device)
                y_pred = self.model(x)
                loss = criterion(y_pred, y)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                # .item() keeps a plain number instead of a tensor tied to the graph.
                train_loss += loss.item()
                train_acc += self.metric(y_pred, y)
            # Validate once per epoch, after all training batches.
            val_loss, val_acc = self._evaluate(criterion)
            his = {
                'train_loss': train_loss / trainSteps,
                'train_accuracy': train_acc / trainSamples,
                'val_loss': val_loss / testSteps,
                'val_accuracy': val_acc / testSamples,
            }
            self.history.append(his)
            self.on_epoch_end(his)
            print(f'Epoch {epoch+1} done in {time.time() - start:.2f} seconds')
            print("-"*10)

    def on_epoch_end(self, logs):
        """
        Prints the losses and accuracies stored in `logs`.
        """
        print(f'train_loss: {logs["train_loss"]:.2f}, train_accuracy: {logs["train_accuracy"]:.2f}')
        print(f'val_loss: {logs["val_loss"]:.2f}, val_accuracy: {logs["val_accuracy"]:.2f}')

    def get_history(self):
        """
        Returns the list of per-epoch logs.
        """
        return self.history

    def get_model(self):
        """
        Returns the model.
        """
        return self.model

In [None]:
# Fetch only the corpora the text preprocessing reads (the stop-word list and
# WordNet) instead of the entire NLTK collection.
# NOTE(review): 'omw-1.4' is included because some NLTK releases need it to
# load WordNet - confirm for the installed version.
for corpus in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(corpus)
print("Start training process.")
start = time.time()
execute = Executing()
execute.fit()
end = time.time()
print(f'Time taken for training: {end-start}.')