In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.stem import WordNetLemmatizer
from copy import deepcopy
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import AUC
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.metrics import classification_report
from wordcloud import WordCloud, STOPWORDS
nltk.download('stopwords', quiet=True)
from PIL import Image
import torch
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

: 

In [None]:
# Load the labelled tweets; later cells read the 'label' and 'tweet' columns.
train_df = pd.read_csv('train.csv')
train_df

: 

In [None]:
# Check value counts for positive and negative tweets
train_df['label'].value_counts()

: 

In [None]:
#create a dictionary to store the words and their corresponding vectors
#each word maps to 50 numbers when loaded from glove.6B.50d

words = dict()

def add_to_dict(d, filename):
    """Load space-separated word vectors from ``filename`` into ``d``.

    Each usable line has the form ``word v1 v2 ... vn``: the word becomes the
    key and the numbers become a float NumPy array.

    Args:
        d: dict updated in place (word -> vector).
        filename: path to a UTF-8 text file in GloVe text format.

    Lines that are blank, carry no values, or contain values that cannot be
    parsed as floats are skipped.
    """
    with open(filename, 'r', encoding='utf8') as f:
        # Iterate the file lazily; readlines() would hold every line in memory at once.
        for line in f:
            parts = line.rstrip('\n').split(' ')

            # Skip blank lines and lines with a word but no vector.
            if len(parts) < 2 or not parts[0]:
                continue

            try:
                d[parts[0]] = np.array(parts[1:], dtype=float)
            except ValueError:
                # Malformed numeric field: ignore this line only.
                continue

: 

In [None]:
add_to_dict(words, 'glove.6B.50d.txt')

# Preview a few entries only; echoing the whole dict would write every loaded
# vector into the cell output.
{word: words[word] for word in list(words)[:3]}

: 

In [None]:
# Number of words loaded into the embedding dictionary

len(words)

: 

In [None]:
# Fetch the WordNet data used by the WordNetLemmatizer created in a later cell.
nltk.download('wordnet')

: 

In [None]:
# Tokenizer that keeps runs of word characters (\w+), so symbols such as '@'
# and punctuation are dropped.
tokenizer = nltk.RegexpTokenizer(r"\w+")

# Sample: tokenize a tweet
tokenizer.tokenize('@testing this is a sample tweet')

: 

In [None]:
#function that reduces each word of a message to its lemma and keeps known words only
lemmatizer = WordNetLemmatizer()

def message_to_token_list(s):
    """Tokenize ``s``, lowercase and lemmatize each token, and keep only the
    tokens that are keys of the global ``words`` dictionary."""
    normalized = (
        lemmatizer.lemmatize(raw_token.lower())
        for raw_token in tokenizer.tokenize(s)
    )
    return [token for token in normalized if token in words]

: 

In [None]:
# Sanity check: inspect the lemmatized, filtered tokens for a sample tweet
message_to_token_list('@testing these are tweets for sampling purposes')

: 

In [None]:
#function that replaces each word of a message with its word embedding
def message_to_word_vectors(message, word_dict=words):
    """Return the embeddings of the usable tokens in ``message`` as one float array.

    Tokens come from ``message_to_token_list``; any token missing from
    ``word_dict`` is skipped. One row per kept token, in message order.
    """
    tokens = message_to_token_list(message)
    embeddings = [word_dict[tok] for tok in tokens if tok in word_dict]
    return np.array(embeddings, dtype=float)

: 

In [None]:
# The result should have shape (number of usable words in the tweet, embedding size).
# With glove.6B.50d the embedding size is expected to be 50.

message_to_word_vectors('@testing these are tweets for sampling purposes').shape

: 

In [None]:
#shuffle the dataframe and split it into train (70%), validation (15%) and test (15%) sets
# NOTE: train_df is overwritten with its own 70% slice, so re-running this cell
# alone splits the already reduced frame again; re-run the loading cell first.

shuffled_df = train_df.sample(frac=1, random_state=1).reset_index(drop=True)

n_rows = len(shuffled_df)
split_index_1 = int(n_rows * 0.7)
split_index_2 = int(n_rows * 0.85)

train_df = shuffled_df.iloc[:split_index_1]
val_df = shuffled_df.iloc[split_index_1:split_index_2]
test_df = shuffled_df.iloc[split_index_2:]

len(train_df), len(val_df), len(test_df)

: 

In [None]:
def df_to_X_y(dataframe):
    """Convert a tweet dataframe into (list of embedding sequences, label array).

    Reads the 'label' column as integers and turns every entry of the 'tweet'
    column into an array of word vectors. A tweet with no usable tokens is
    represented by a single all-zero row of width 50.
    """
    labels = dataframe['label'].to_numpy().astype(int)

    sequences = []
    for tweet in dataframe['tweet']:
        vector_seq = message_to_word_vectors(tweet)
        # Fall back to one zero vector when nothing in the tweet was usable.
        sequences.append(vector_seq if vector_seq.shape[0] != 0 else np.zeros(shape=(1,50)))

    return sequences, labels

: 

In [None]:
# Embed the training tweets; X_train is a list with one (tokens, 50) array per tweet.
X_train, y_train = df_to_X_y(train_df)

# Number of tweets and the token count of the first tweet
print(len(X_train), len(X_train[0]))

: 

In [None]:
#distribution of sequence lengths (number of usable tokens per tweet)

sequence_lengths = [len(sequence) for sequence in X_train]

plt.hist(sequence_lengths)

: 

In [None]:
# The 'max' row gives the largest number of useful tokens in a single training
# message, which informs the padding length used below.

pd.Series(sequence_lengths).describe()

: 

In [None]:
#zero padding

def pad_X(X, desired_sequence_length=57):
    """Pad (or truncate) every sequence to ``desired_sequence_length`` rows.

    Args:
        X: iterable of 2-D arrays shaped (tokens, embedding_dim).
        desired_sequence_length: number of rows each output sequence has.

    Returns:
        Float array of shape (len(X), desired_sequence_length, embedding_dim).
        Shorter sequences get trailing zero rows; longer ones keep only their
        first ``desired_sequence_length`` rows. The input is not modified.
    """
    padded = []

    for x in X:
        # Truncate first so the pad size below can never be negative.
        x = np.asarray(x, dtype=float)[:desired_sequence_length]

        # Take the pad width from the data instead of assuming 50 dimensions.
        pad = np.zeros(shape=(desired_sequence_length - x.shape[0], x.shape[1]))

        padded.append(np.concatenate([x, pad]))

    return np.array(padded).astype(float)

: 

In [None]:
# Pad the training sequences and show the resulting array shape:
# (number of tweets, padded sequence length, embedding size = 50)

X_train = pad_X(X_train)
X_train.shape

: 

In [None]:
# One label per training tweet
y_train.shape

: 

In [None]:
# Embed and pad the validation tweets the same way as the training set.
X_val, y_val = df_to_X_y(val_df)
X_val = pad_X(X_val)

X_val.shape, y_val.shape

: 

In [None]:
# Embed and pad the test tweets the same way as the training set.
X_test, y_test = df_to_X_y(test_df)
X_test = pad_X(X_test)

X_test.shape, y_test.shape

: 

In [None]:
# Class frequencies of the training split (Series method; the top-level
# pd.value_counts function is deprecated).
frequencies = train_df['label'].value_counts()

frequencies

: 

In [None]:
# 0 = positive or neutral
# 1 = negative
# Inverse-frequency weight per class: total samples / samples in that class.
total_count = frequencies.sum()
weights = {label: total_count / frequencies[label] for label in (0, 1)}

weights

: 

In [None]:
# All tensors and the model are placed on this device.
device = 'cpu'

# Training labels as a torch tensor (torch.Tensor(...) uses the default float dtype).
y = torch.Tensor(y_train).to(device)
y

: 

In [None]:
# Min-max scale the padded training array with its global minimum and maximum,
# then move it to the target device as a torch tensor.
lowest, highest = X_train.min(), X_train.max()
x = torch.Tensor((X_train - lowest) / (highest - lowest)).to(device)
x

: 

In [None]:
# Shape of the scaled training tensor
x.shape

: 

In [None]:
# Shape of the label tensor
y.shape

: 

In [None]:
class NeuralNetwork(nn.Module):
    """Fully connected network that uses a sigmoid after every linear layer.

    Layout: in_dim -> 50 -> 10 -> four (10 -> 10) stages -> out_dim. The
    output layer is also passed through a sigmoid, so outputs lie in (0, 1).
    """

    def __init__(self, in_dim, out_dim):
        super().__init__()
        self.in_dim, self.out_dim = in_dim, out_dim

        self.hidden_layer_1 = nn.Linear(self.in_dim, 50) # input to first hidden layer
        self.hidden_layer_2 = nn.Linear(50, 10)

        # Four Linear(10, 10) + Sigmoid stages, registered as sub-modules 0..7.
        stages = []
        for _ in range(4):
            stages += [nn.Linear(10, 10), nn.Sigmoid()]
        self.multiple_layers = nn.Sequential(*stages)

        self.output_layer = nn.Linear(10, self.out_dim)
        self.activation = nn.Sigmoid()

    def forward(self, x):
        """Apply the layers in order; the linear layers act on the last axis of ``x``."""
        hidden = self.activation(self.hidden_layer_1(x))
        hidden = self.activation(self.hidden_layer_2(hidden))
        hidden = self.multiple_layers(hidden)
        return self.activation(self.output_layer(hidden))

: 

In [None]:
model = NeuralNetwork(50,3).to(device)

# Test structure of model
# Call the module itself rather than .forward() so registered hooks run, and
# disable autograd: this is only a smoke test over the whole training tensor.
with torch.no_grad():
    predictions = model(x)

# NOTE(review): nn.Linear acts on the last axis, so for x shaped
# (tweets, tokens, 50) this yields (tweets, tokens, 3), i.e. one 3-vector per
# token, while y holds one label per tweet. Confirm this is the intended design.
predictions

: 

In [None]:
learning_rate = 0.00001
# `reduce` is a deprecated MSELoss argument; reduction='mean' is the explicit
# form of what reduce=None resolved to.
loss_fn = nn.MSELoss(reduction='mean')
batch_size = 3
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

: 

In [None]:
class MyCustomDataset(Dataset):
    """Pairs a feature container with a label container, index by index."""

    def __init__(self, x, y):
        self.x, self.y = x, y

    def __len__(self):
        # The dataset size is taken from the features only.
        return len(self.x)

    def __getitem__(self, index):
        # Each item must be returned as an (_x, _y) pair.
        features = self.x[index]
        label = self.y[index]
        return features, label

: 

In [None]:
def train_fn(loader, model, optimizer, loss_fn):
    """Run one training epoch and return the mean batch loss.

    Args:
        loader: iterable yielding (data, targets) batches.
        model: callable network whose parameters ``optimizer`` updates.
        optimizer: optimizer exposing ``zero_grad()`` and ``step()``.
        loss_fn: callable(predictions, targets) returning a scalar tensor.

    Returns:
        float: average of the per-batch losses, or NaN if ``loader`` yields
        no batches.
    """
    loop = tqdm(loader)

    total_loss = 0.0
    count = 0

    # Loop per batch
    for data, targets in loop:
        predictions = model(data)

        # When the model emits more axes than the targets have, average the
        # extra axes so each target is compared with exactly one score.
        # Otherwise the loss would silently broadcast mismatched shapes.
        if predictions.dim() > targets.dim():
            predictions = predictions.flatten(start_dim=targets.dim()).mean(dim=-1)

        loss = loss_fn(predictions, targets)

        optimizer.zero_grad()
        # Back-propagate; without this step() has no gradients to apply and
        # the parameters never change.
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        loop.set_postfix(loss=batch_loss)

        total_loss += batch_loss
        count += 1

    if count == 0:
        # Nothing was processed; avoid dividing by zero.
        return float('nan')

    return total_loss / count

: 

In [None]:
# Output of the untrained model from the structure test above
# (train_fn keeps its own local `predictions`).
predictions

: 

In [None]:
custom_dataset = MyCustomDataset(x=x, y=y)

# Batches are served in stored order (shuffle=False) and the trailing partial
# batch is kept (drop_last=False).
train_loader = DataLoader(custom_dataset, batch_size=batch_size, shuffle=False, drop_last=False)

epochs = 10
losses = []

for epoch in range(epochs):
    print("Epoch: {}".format(epoch))

    ave_loss = train_fn(train_loader, model, optimizer, loss_fn)
    losses.append(ave_loss)

    print("Ave Loss: {}".format(ave_loss))

    # Overwrite the checkpoint after every epoch, so the file always holds the
    # most recent weights.
    state = { 'state_dict': model.state_dict() }
    torch.save(state, "model.pth")

: 

In [None]:
# The network is a PyTorch module saved by the training loop as
# {'state_dict': ...} in "model.pth", so rebuild it and restore the weights
# (Keras load_model / .predict do not apply here).
best_model = NeuralNetwork(50, 3).to(device)
checkpoint = torch.load("model.pth", map_location=device)
best_model.load_state_dict(checkpoint['state_dict'])
best_model.eval()

# Scale the test tweets with the training minimum/maximum, as was done for x.
x_test = (X_test - X_train.min()) / (X_train.max() - X_train.min())

with torch.no_grad():
    test_scores = best_model(torch.Tensor(x_test).to(device))

# NOTE(review): the network returns one vector per token; averaging everything
# except the batch axis gives one score per tweet so it can be compared with
# y_test. Confirm this pooling matches the intended model design.
test_scores = test_scores.reshape(len(test_scores), -1).mean(dim=1)
test_predictions = (test_scores.cpu().numpy() > 0.5).astype(int)

print(classification_report(y_test, test_predictions))

: 

In [None]:
# Rows labelled 0 (positive or neutral, per the class-weight cell)
positive_tweet = train_df[train_df['label'] == 0]
positive_tweet

: 

In [None]:
# Rows labelled 1 (negative, per the class-weight cell)
negative_tweet = train_df[train_df['label'] == 1]
negative_tweet

: 

In [None]:
# Function to create Wordcloud
def wordcloud_generator(text,path):
    """Build a word cloud from ``text`` and save it as an image at ``path``.

    Args:
        text: a single string, or an iterable of strings (for example the
            values of a dataframe column); iterables are joined with spaces.
        path: output image path; missing parent directories are created.
    """
    from pathlib import Path  # local import: only this function needs it

    print("Generating Word Cloud...")

    # Join the items explicitly. str() on a large NumPy array returns its
    # abbreviated repr (first/last few items and "..."), which would build the
    # cloud from only a handful of tweets.
    corpus = text if isinstance(text, str) else " ".join(str(item) for item in text)

    stopwords = set(STOPWORDS)
    wc = WordCloud(background_color="black", max_words=3000, stopwords=stopwords, random_state=42, width=900, height=500, repeat=True)
    wc.generate(corpus)

    print(f"Saving Word Cloud. File Name: {path}")
    # Make sure the target folder exists before writing the image.
    Path(path).parent.mkdir(parents=True, exist_ok=True)
    wc.to_file(path)
    print(f"Word Cloud Created and Saved to Local Disk!")

: 

Generate the word clouds and save them to local disk

In [None]:
# One word cloud per subset: all tweets, positive tweets, negative tweets.
# wordcloud_generator only writes image files and draws nothing with
# matplotlib, so no figure is opened here.
for tweets_subset, output_path in (
    (train_df, "wordcloud/all_tweets.png"),
    (positive_tweet, "wordcloud/positive_tweets.png"),
    (negative_tweet, "wordcloud/negative_tweets.png"),
):
    wordcloud_generator(tweets_subset['tweet'].values, output_path)

: 

In [None]:
#Add new column for the sentiment
# Vectorised label -> name lookup; any label other than 0 or 1 becomes 'None'.
sentiment = train_df['label'].map({0: 'Positive', 1: 'Negative'}).fillna('None')

# assign() returns a new frame instead of writing a column into train_df,
# which is a slice of the shuffled frame (avoids SettingWithCopyWarning).
train_df = train_df.assign(sentiment=sentiment)

train_df

: 

In [None]:
data = train_df['sentiment'].value_counts()
# One explode offset per slice, so the chart also works when a third category
# ('None') is present.
ax = data.plot(kind='pie', autopct='%1.1f%%', explode=[0.05] * len(data), legend=True, title='Positive vs Negative Sentiment', ylabel='')
ax.legend(bbox_to_anchor=(1,1.02), loc='upper left')
plt.show()


: 