In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Read the Data

In [None]:
train_set = pd.read_csv("../input/imdb-dataset-sentiment-analysis-in-csv-format/Train.csv")

# Take the same number of rows from each sentiment class so that positive and
# negative reviews are equally represented (at most the first 15000 of each).
# NOTE(review): the names below pair "positive" with label 0 and "negative"
# with label 1 - confirm this matches the dataset's label encoding.
top_data_df_positive = train_set.loc[train_set['label'].eq(0)].iloc[:15000]
top_data_df_negative = train_set.loc[train_set['label'].eq(1)].iloc[:15000]

# Stack the two per-class samples into the working frame.
train = pd.concat([top_data_df_positive, top_data_df_negative])


# Tokenization

In [None]:

from gensim.utils import simple_preprocess

# Tokenize every review with gensim's simple_preprocess (deacc=True) and store
# the resulting token list next to the raw text.
train['tokenized_text'] = train['text'].map(
    lambda line: simple_preprocess(line, deacc=True)
)

# Stemming
This part stems the words. Stemming removes the most common morphological and inflectional endings from English words (for example, "running" is reduced to "run").

In [None]:
from gensim.parsing.porter import PorterStemmer

porter_stemmer = PorterStemmer()

# Stem each token of each review; one list of stems per row.
train['stemmed_tokens'] = train['tokenized_text'].map(
    lambda tokens: [porter_stemmer.stem(word) for word in tokens]
)
train


# Train - Test Split 
Here the dataset is separated into training and test sets: 70% of the rows are used for training and the remaining 30% are used for testing.

In [None]:
from sklearn.model_selection import train_test_split
# Train Test Split Function
def split_train_test(train, test_size=0.3, shuffle_state=True):
    """Split the stemmed tokens and labels of `train` into train/test parts.

    Args:
        train: DataFrame with a 'stemmed_tokens' column and a 'label' column.
        test_size: Fraction of rows assigned to the test partition.
        shuffle_state: Whether the rows are shuffled before splitting.

    Returns:
        Tuple (X_train, X_test, Y_train, Y_test) of DataFrames. Each one has
        had `reset_index()` applied, so rows are numbered from 0 and the
        previous index is kept as a regular column.
    """
    features = train[['stemmed_tokens']]
    labels = train['label']
    # Fixed random_state keeps the split reproducible between runs.
    X_train, X_test, Y_train, Y_test = train_test_split(
        features,
        labels,
        shuffle=shuffle_state,
        test_size=test_size,
        random_state=15,
    )

    # Report the class balance of both partitions.
    for title, part in (("Value counts for Train sentiments", Y_train),
                        ("Value counts for Test sentiments", Y_test)):
        print(title)
        print(part.value_counts())
    print(type(X_train))
    print(type(Y_train))

    # Renumber rows so features and labels can be matched by position later.
    X_train = X_train.reset_index()
    X_test = X_test.reset_index()
    Y_train = Y_train.to_frame().reset_index()
    Y_test = Y_test.to_frame().reset_index()
    print(X_train.head())
    return X_train, X_test, Y_train, Y_test

# Run the split on the preprocessed data.
X_train, X_test, Y_train, Y_test = split_train_test(train)

# Word2Vec Model Creation
This part defines and calls the function that creates word vectors with the Word2Vec method.


In [None]:
from gensim.models import Word2Vec
# Dimensionality of each word embedding vector (passed to Word2Vec as `size`).
size = 500
# Context window: maximum distance between the selected word and the
# surrounding words taken into account (passed to Word2Vec as `window`).
window = 3
# Minimum number of occurrences a word needs in order to get an entry in the
# word-vector vocabulary (passed to Word2Vec as `min_count`).
min_count = 1
# Number of workers used for training (passed to Word2Vec as `workers`).
workers = 3
# Training algorithm: 0 for CBOW, 1 for skip-gram.
sg = 0
# Directory where the trained models, loss log and plot are written.
OUTPUT_FOLDER = '/kaggle/working/'
# Function to train word2vec model
def make_word2vec_model(train, padding, sg, min_count, size, workers, window):
    """Train a Word2Vec model on the stemmed tokens and save it to disk.

    Args:
        train: Object whose 'stemmed_tokens' entry holds one token list per
            document.
        padding: When true, an extra one-word sentence ['pad'] is added to the
            corpus so that a 'pad' token exists in the vocabulary, and the
            model is saved under the '_PAD' file name.
        sg: 0 for CBOW, 1 for skip-gram.
        min_count: Minimum word frequency forwarded to Word2Vec.
        size: Embedding dimensionality forwarded to Word2Vec (`size` is the
            gensim 3.x keyword name).
        workers: Worker count forwarded to Word2Vec.
        window: Context window forwarded to Word2Vec.

    Returns:
        Tuple (w2v_model, word2vec_file): the trained model and the path it
        was saved to.
    """
    # Build the corpus unconditionally; previously it was only created inside
    # the `if padding:` branch, so padding=False raised UnboundLocalError.
    sentences = list(pd.Series(train['stemmed_tokens']).values)
    if padding:
        # NOTE: 'pad' occurs only once in the corpus, so it can only enter the
        # vocabulary when min_count is 1 or lower.
        sentences.append(['pad'])
        word2vec_file = OUTPUT_FOLDER + '2ata' + '_PAD.model'
    else:
        word2vec_file = OUTPUT_FOLDER + '2ata' + '.model'

    w2v_model = Word2Vec(sentences, min_count = min_count, size = size, workers = workers, window = window, sg = sg)

    w2v_model.save(word2vec_file)
    return w2v_model, word2vec_file

# Train Word2vec model
w2vmodel, word2vec_file = make_word2vec_model(train, padding=True, sg=sg, min_count=min_count, size=size, workers=workers, window=window)

# Padding 
This part pads every index sequence with the index of the "pad" token so that all sequences have the same length.

In [None]:
# Length (in tokens) of the longest document; every sequence is padded to it.
max_sen_len = train.stemmed_tokens.map(len).max()

# Vocabulary index of the dedicated 'pad' token.
padding_idx = w2vmodel.wv.vocab['pad'].index


def make_word2vec_vector_cnn(sentence):
    """Convert a token sequence into a padded tensor of vocabulary indices.

    Args:
        sentence: Iterable of (stemmed) tokens.

    Returns:
        A long tensor of shape (1, max_sen_len) on `device`. Positions past
        the end of the sentence hold `padding_idx`; tokens beyond
        `max_sen_len` are dropped.
    """
    vocab = w2vmodel.wv.vocab
    padded_X = [padding_idx for _ in range(max_sen_len)]
    # zip() with the range stops at max_sen_len, so a sentence longer than the
    # longest one seen when max_sen_len was computed is truncated instead of
    # raising IndexError.
    for i, word in zip(range(max_sen_len), sentence):
        # Out-of-vocabulary words are mapped to index 0.
        # NOTE(review): index 0 is presumably a regular vocabulary entry, not
        # a dedicated "unknown" token - confirm this is intended.
        padded_X[i] = vocab[word].index if word in vocab else 0
    return torch.tensor(padded_X, dtype=torch.long, device=device).view(1, -1)

# CNN Classifier Model Creation

In [None]:
# Width of one word embedding. It is used as the kernel width of every
# convolution below, so it should equal the `size` the Word2Vec model was
# trained with.
EMBEDDING_SIZE = 500
# Number of output channels (filters) created for each convolution window size.
NUM_FILTERS = 10
import gensim
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torch
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device available for running: " + str(device))

# Reference: signature of the Conv2d layer used by the classifier below.
#torch.nn.Conv2d(in_channels: int, out_channels: int, kernel_size: Union[T, Tuple[T, T]], 
#stride: Union[T, Tuple[T, T]] = 1, padding: Union[T, Tuple[T, T]] = 0, 
#dilation: Union[T, Tuple[T, T]] = 1, groups: int = 1, bias: bool = True, padding_mode: str = 'zeros')

class CnnTextClassifier(nn.Module):
    """Convolutional text classifier on top of pretrained Word2Vec embeddings.

    The embedding table is loaded from the Word2Vec model saved at
    OUTPUT_FOLDER + '2ata_PAD.model'. One Conv2d layer is created per window
    size; each is max-pooled over time, the pooled features are concatenated
    and passed through a linear layer followed by softmax.
    """

    def __init__(self, vocab_size, num_classes, window_sizes=(1,2,3,5)):
        """Build the embedding, convolution and output layers.

        Args:
            vocab_size: Accepted for interface compatibility; not used, the
                vocabulary is defined by the loaded embedding matrix.
            num_classes: Number of output classes.
            window_sizes: Heights (in tokens) of the convolution kernels.
        """
        super().__init__()

        # Pretrained embeddings come from the saved Word2Vec model.
        saved_model = gensim.models.KeyedVectors.load(OUTPUT_FOLDER + '2ata_PAD.model')
        keyed_vectors = saved_model.wv
        pad_index = saved_model.wv.vocab['pad'].index
        pretrained = torch.FloatTensor(keyed_vectors.vectors)
        self.embedding = nn.Embedding.from_pretrained(pretrained, padding_idx=pad_index)

        # One convolution per window size; each kernel spans the full
        # embedding width and the time axis is padded by window_size - 1.
        conv_layers = []
        for window_size in window_sizes:
            conv_layers.append(
                nn.Conv2d(1, NUM_FILTERS, [window_size, EMBEDDING_SIZE], padding=(window_size - 1, 0))
            )
        # ModuleList registers the layers so their parameters are tracked.
        self.convs = nn.ModuleList(conv_layers)

        self.fc = nn.Linear(NUM_FILTERS * len(window_sizes), num_classes)

    def forward(self, x):
        """Return class probabilities for a batch of index sequences.

        Args:
            x: Long tensor of vocabulary indices, shape [B, T].

        Returns:
            Tensor of shape [B, num_classes] holding softmax probabilities
            (not raw logits).
        """
        # [B, T] -> [B, T, E] -> [B, 1, T, E] (single input channel).
        embedded = torch.unsqueeze(self.embedding(x), 1)

        # Convolution + tanh + max-over-time pooling for each window size.
        pooled = []
        for conv in self.convs:
            feature_map = torch.squeeze(torch.tanh(conv(embedded)), -1)
            pooled.append(F.max_pool1d(feature_map, feature_map.size(2)))
        stacked = torch.cat(pooled, 2)

        # Flatten the pooled features and apply the output layer.
        flat = stacked.view(stacked.size(0), -1)
        logits = self.fc(flat)

        return F.softmax(logits, dim = 1)

In [None]:
def make_target(label):
    """Wrap a binary class label in a one-element long tensor on `device`.

    Args:
        label: Class label; must compare equal to 0 or 1.

    Returns:
        Tensor of shape (1,) and dtype torch.long holding the label.

    Raises:
        ValueError: If `label` is neither 0 nor 1. Previously the function
            silently returned None in that case, which only failed later
            inside the loss computation.
    """
    if label not in (0, 1):
        raise ValueError(f"Unsupported label {label!r}; expected 0 or 1")
    return torch.tensor([int(label)], dtype=torch.long, device=device)

# Model Train

In [None]:
NUM_CLASSES = 2
VOCAB_SIZE = len(w2vmodel.wv.vocab)
print(VOCAB_SIZE)
cnn_model = CnnTextClassifier(vocab_size=VOCAB_SIZE, num_classes=NUM_CLASSES)
cnn_model.to(device)

# CnnTextClassifier.forward returns softmax probabilities. nn.CrossEntropyLoss
# expects raw logits and applies log-softmax internally, so passing it
# probabilities applies softmax twice and flattens the gradients. The correct
# cross-entropy for probability outputs is the negative log-likelihood of
# their logarithm.
_nll_loss = nn.NLLLoss()


def loss_function(probs, target):
    """Cross-entropy between predicted class probabilities and target indices.

    Args:
        probs: Tensor of shape [B, num_classes] with class probabilities.
        target: Long tensor of shape [B] with the true class indices.

    Returns:
        Scalar loss tensor.
    """
    # Clamp away exact zeros so log() cannot produce -inf.
    return _nll_loss(torch.log(torch.clamp(probs, min=1e-12)), target)


optimizer = optim.Adam(cnn_model.parameters(), lr=0.0001)
num_epochs = 10

In [None]:
# File that receives one "epoch,mean loss" line per epoch.
loss_file_name = OUTPUT_FOLDER + '1cnn_class_big_loss_with_padding.csv'

cnn_model.train()
# The context manager guarantees the loss log is flushed and closed even when
# training is interrupted by an exception.
with open(loss_file_name, 'w') as f:
    # Header is kept exactly as 'iter, loss' (with the space): the evaluation
    # code reads the column named ' loss'.
    f.write('iter, loss')
    f.write('\n')

    for epoch in range(num_epochs):
        print("Epoch" + str(epoch + 1))
        train_loss = 0
        # One optimizer step per training sample (batch size 1).
        for index, row in X_train.iterrows():
            # Clearing the accumulated gradients
            cnn_model.zero_grad()

            # Build the padded sequence of vocabulary indices for the stemmed tokens
            bow_vec = make_word2vec_vector_cnn(row['stemmed_tokens'])

            # Forward pass to get output
            probs = cnn_model(bow_vec)

            # Get the target label (X_train and Y_train share the reset index)
            target = make_target(Y_train['label'][index])

            # Calculate the loss for this sample
            loss = loss_function(probs, target)
            train_loss += loss.item()

            # Getting gradients w.r.t. parameters
            loss.backward()

            # Updating parameters
            optimizer.step()

        # Mean loss per sample over the epoch.
        mean_loss = train_loss / len(X_train)
        print(f'train_loss : {mean_loss}')
        print("Epoch ran :"+ str(epoch+1))

        f.write(str((epoch+1)) + "," + str(mean_loss))
        f.write('\n')

torch.save(cnn_model, OUTPUT_FOLDER + 'cnn_big_model_500_with_padding.pth')

# Show the model output for the last training sample as a sanity check.
print("Input vector")
print("Probs")
print(probs)
print(torch.argmax(probs, dim=1).cpu().numpy()[0])

# Model Test

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

bow_cnn_predictions = []
original_lables_cnn_bow = []

# Switch to evaluation mode and disable gradient tracking for inference.
cnn_model.eval()
with torch.no_grad():
    for index, row in X_test.iterrows():
        bow_vec = make_word2vec_vector_cnn(row['stemmed_tokens'])
        probs = cnn_model(bow_vec)
        # Predicted class = index of the largest probability.
        _, predicted = torch.max(probs, 1)

        bow_cnn_predictions.append(predicted.cpu().numpy()[0])
        # The true label is read straight from the frame; wrapping it in a
        # device tensor only to convert it back is unnecessary.
        original_lables_cnn_bow.append(int(Y_test['label'][index]))

print(confusion_matrix(original_lables_cnn_bow, bow_cnn_predictions))
print(classification_report(original_lables_cnn_bow,bow_cnn_predictions))

# Plot the per-epoch training loss that the training loop logged to CSV.
loss_file_name = OUTPUT_FOLDER + '1cnn_class_big_loss_with_padding.csv'
loss_df = pd.read_csv(loss_file_name)
print(loss_df.columns)
# The column is named ' loss' (leading space) because the CSV header is
# written as 'iter, loss'.
# NOTE(review): "30_epochs" in the names below does not match num_epochs = 10
# set earlier; the names are kept so the output path stays the same.
plt_500_padding_30_epochs = loss_df[' loss'].plot()
fig = plt_500_padding_30_epochs.get_figure()
fig.savefig(OUTPUT_FOLDER + '1loss_plt_500_padding_30_epochs.pdf')