In [9]:
import numpy as np
import pandas as pd
import seaborn as sns
import bz2
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
train_file = bz2.BZ2File('train.ft.txt.bz2')
test_file = bz2.BZ2File('test.ft.txt.bz2')

In [11]:
def load_extract(file):
    texts, labels = [], []
    for line in file:
        x = line.decode('utf-8')  # decode binary to string
        labels.append(int(x[9]) - 1)  # extract labels
        texts.append(x[10:].strip())  # extract texts
    print('Done !')
    return np.array(labels), texts

In [12]:
train_labels, train_texts = load_extract(train_file)
test_labels, test_texts = load_extract(test_file)

Done !
Done !


In [13]:
import pandas as pd
import bz2

# Open the bz2 file
with bz2.BZ2File('train.ft.txt.bz2', 'r') as f:
    # Read the lines from the file
    lines = f.readlines()

# Convert lines to strings
lines = [line.decode('utf-8') for line in lines]

# Split each line into label and text
data = [line.split(' ', 1) for line in lines]

# Create a DataFrame
df = pd.DataFrame(data, columns=['label', 'text'])

# Display the DataFrame
print(df.head())


        label                                               text
0  __label__2  Stuning even for the non-gamer: This sound tra...
1  __label__2  The best soundtrack ever to anything.: I'm rea...
2  __label__2  Amazing!: This soundtrack is my favorite music...
3  __label__2  Excellent Soundtrack: I truly like this soundt...
4  __label__2  Remember, Pull Your Jaw Off The Floor After He...


In [14]:
# Display the DataFrame
print(df.tail())

              label                                               text
3599995  __label__1  Don't do it!!: The high chair looks great when...
3599996  __label__1  Looks nice, low functionality: I have used thi...
3599997  __label__1  compact, but hard to clean: We have a small ho...
3599998  __label__1  what is it saying?: not sure what this book is...
3599999  __label__2  Makes My Blood Run Red-White-And-Blue: I agree...


In [15]:
def clean_texts(texts):
    stwords = stopwords.words('english')
    temp_texts = []
    for i in range(len(texts)):
        text = re.sub('\d','0',texts[i]) #replace every digit with 0
        if 'www.' in text or 'http:' in text or 'https:' in text or '.com' in text: # remove links and urls
            text = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", " ", text)

        text = re.sub('[^a-zA-Z]', ' ', text) #anything which is not a character replace with whitespace char
        text = text.lower()
        text = text.split()
        text = [word for word in text if not word in stwords] # remove stopwords
        text = ' '.join(text)
        temp_texts.append(text)
    print('--100%--Done !')
    return temp_texts

In [16]:
print('Processing Training data')
train_texts = clean_texts(train_texts)
print('\nProcessing Test data')
test_texts = clean_texts(test_texts)

Processing Training data
--100%--Done !

Processing Test data
--100%--Done !


In [17]:
train_texts[0]

'stuning even non gamer sound track beautiful paints senery mind well would recomend even people hate vid game music played game chrono cross games ever played best music backs away crude keyboarding takes fresher step grate guitars soulful orchestras would impress anyone cares listen'

In [3]:
import numpy as np
import pandas as pd
import bz2
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense, Dropout

# Function to load and extract labels and texts from the file
def load_extract(file, max_samples=None):
    texts, labels = [], []
    total_samples = 0
    for line in file:
        x = line.decode('utf-8')  # decode binary to string
        labels.append(int(x[9]) - 1)  # extract labels
        texts.append(x[10:].strip())  # extract texts
        total_samples += 1
        # Break loop if maximum number of samples is reached
        if max_samples is not None and total_samples >= max_samples:
            break
    print('Done !')
    return np.array(labels), texts

# Function to clean texts
def clean_texts(texts):
    stwords = stopwords.words('english')
    temp_texts = []
    total_samples = len(texts)
    for i, text in enumerate(texts):
        text = re.sub('\d','0',text) #replace every digit with 0
        if 'www.' in text or 'http:' in text or 'https:' in text or '.com' in text: # remove links and urls
            text = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", " ", text)

        text = re.sub('[^a-zA-Z]', ' ', text) #anything which is not a character replace with whitespace char
        text = text.lower()
        text = text.split()
        text = [word for word in text if not word in stwords] # remove stopwords
        text = ' '.join(text)
        temp_texts.append(text)
        # Print progress every 10000 samples
        if (i + 1) % 10000 == 0 or (i + 1) == total_samples:
            print(f"--{((i + 1) / total_samples) * 100:.2f}%--Done !")
    return temp_texts

# Open the bz2 files and load data
max_train_samples = 10000  # Set maximum number of train samples
max_test_samples = 5000  # Set maximum number of test samples
with bz2.BZ2File('train.ft.txt.bz2', 'r') as train_file, bz2.BZ2File('test.ft.txt.bz2', 'r') as test_file:
    train_labels, train_texts = load_extract(train_file, max_samples=max_train_samples)
    test_labels, test_texts = load_extract(test_file, max_samples=max_test_samples)

# Cleaning the texts
train_texts_cleaned = clean_texts(train_texts)
test_texts_cleaned = clean_texts(test_texts)



Done !
Done !
--100.00%--Done !
--100.00%--Done !


In [4]:
# Preprocessing
max_words = 10000  # Max number of words to keep
maxlen = 100  # Max length of sequences

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_texts_cleaned)
X_train = pad_sequences(tokenizer.texts_to_sequences(train_texts_cleaned), maxlen=maxlen)
X_test = pad_sequences(tokenizer.texts_to_sequences(test_texts_cleaned), maxlen=maxlen)

y_train = np.array(train_labels)
y_test = np.array(test_labels)

# Model architecture
embedding_dim = 100
filters = 128
kernel_size = 5

model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(Conv1D(filters, kernel_size, activation='relu'))
model.add(MaxPooling1D())
model.add(Conv1D(filters, kernel_size, activation='relu'))
model.add(MaxPooling1D())
model.add(Conv1D(filters, kernel_size, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Training
batch_size = 32
epochs = 5
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_test, y_test))

# Evaluation
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", accuracy)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Accuracy: 0.8155999779701233
