In [3]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import tensorflow as tf
import seaborn as sns
import numpy as np
import re
import nltk
nltk.download('punkt_tab')
nltk.download('stopwords') 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from tensorflow.keras.regularizers import l2
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
stop_words = set(stopwords.words('english'))
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.models import Sequential 
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from joblib import load, dump

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Влад\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Влад\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def check_metrics(history, figure_name):
      train=history.history[f'{figure_name}']
      val = history.history[f'val_{figure_name}']

      epochs = range(1, len(train)+1)
      plt.plot(epochs, train, 'b', label = f'Training {figure_name}')
      plt.plot(epochs, val, 'b', label = f'Validation {figure_name}', color = 'green')
      plt.title(f'Training and Validation {figure_name}')
      plt.xlabel('Epochs')
      plt.ylabel(f'{figure_name}')
      plt.legend()
      plt.show()

In [4]:
test=tf.keras.utils.text_dataset_from_directory(r"D:\aclImdb\test")
train=tf.keras.utils.text_dataset_from_directory(r"D:\aclImdb\train")

Found 25000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [5]:
def extract_rating(filename):
  parts = filename.split('_')
  rating = int(parts[1].split('.')[0])
  return rating

def create_dataframe(folder_path):
  data = []
  for i, subfolder in enumerate(os.listdir(folder_path)):
    subfolder_path = os.path.join(folder_path, subfolder)
    if os.path.isdir(subfolder_path):
      for filename in os.listdir(subfolder_path):
        if filename.endswith(".txt"):
          filepath = os.path.join(subfolder_path, filename)
          with open(filepath, 'r', encoding='utf-8') as file:
            text = file.read()
          rating = extract_rating(filename)
          data.append({'text': text, 'rating': rating, 'label': i})  
  return pd.DataFrame(data)

folder_path = r"D:\aclImdb\test"
folder_path_2=r"D:\aclImdb\train"
test_df = create_dataframe(folder_path)
train_df=create_dataframe(folder_path_2)

print(test_df)

                                                    text  rating  label
0      Once again Mr. Costner has dragged out a movie...       2      0
1      This is an example of why the majority of acti...       4      0
2      First of all I hate those moronic rappers, who...       1      0
3      Not even the Beatles could write songs everyon...       3      0
4      Brass pictures (movies is not a fitting word f...       3      0
...                                                  ...     ...    ...
24995  I was extraordinarily impressed by this film. ...       8      1
24996  Although I'm not a golf fan, I attended a snea...      10      1
24997  From the start of "The Edge Of Love", the view...       8      1
24998  This movie, with all its complexity and subtle...      10      1
24999  I've seen this story before but my kids haven'...       7      1

[25000 rows x 3 columns]


In [6]:
train_df = train_df.sample(frac=1)
test_df = test_df.sample(frac = 1)
train_df

Unnamed: 0,text,rating,label
13947,"Oh, those sneaky Italians. It's not the first ...",8,1
22827,I finally got myself set up on mail order DVD ...,8,1
6261,"with a title like this, you know not to expect...",3,0
4151,"Take ""Rambo,"" mix in some ""Miami Vice,"" slice ...",1,0
13797,Culled from the real life exploits of Chuck Co...,8,1
...,...,...,...
6196,"Some amusing humor, some that falls flat, some...",4,0
7386,"SPOILERS All too often, Hollywood's Shakespear...",1,0
10571,"She has been catapulted from 13 to 30, with ma...",4,0
6609,Mad Magazine may have a lot of crazy people wo...,2,0


In [7]:
def data_processing(text):
      text = text.lower()
      text = re.sub('<br />', '', text)
      text = re.sub(r"https\S+www\S+https\S+", '', text, flags = re.MULTILINE)
      text = re.sub(r'\@w+|\#', '', text)
      text = re.sub(r'^[\w\s]', '', text)
      text_tokens = word_tokenize(text)
      filtered_text = [w for w in text_tokens if not w in stop_words]
      return " ".join(filtered_text)

In [8]:
train_df.text = train_df['text'].apply(data_processing)
test_df.text = test_df['text'].apply(data_processing)
train_df['text']

13947    h , sneaky italians . 's first time based movi...
22827    finally got set mail order dvd rental could fi...
6261     ith title like , know expect great horror movi...
4151     ake `` rambo , '' mix `` miami vice , '' slice...
13797    ulled real life exploits chuck connors steve b...
                               ...                        
6196     ome amusing humor , falls flat , decent acting...
7386     poilers often , hollywood 's shakespeare adapt...
10571    catapulted 13 30 , magic dust involved , court...
6609     ad magazine may lot crazy people working ... o...
22413    movie `` holly '' story young girl sold poor f...
Name: text, Length: 25000, dtype: object

In [9]:
duplicated_count = train_df.duplicated().sum()
duplicated_count

98

In [10]:
duplicated_count = test_df.duplicated().sum()
duplicated_count

197

In [11]:
train_df = train_df.drop_duplicates('text')
test_df = test_df.drop_duplicates('text')

In [12]:
stemmer = PorterStemmer()
def stemming(data):
      text = [stemmer.stem(word) for word in data]
      return data


In [13]:
train_df.text = train_df['text'].apply(lambda x: stemming(x))
#test_df.text = test_df['text'].apply(lambda x: stemming(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.text = train_df['text'].apply(lambda x: stemming(x))


In [14]:
X_train = train_df['text']
X_test = test_df['text']
Y_train = train_df['label']
Y_test = test_df['label']
Z_train = train_df['rating']
Z_test = test_df['rating']

In [49]:
vect = TfidfVectorizer()
X_train = vect.fit_transform(train_df['text'])
X_test = vect.transform(test_df['text'])

In [50]:
X_train = X_train[:25000]
X_test =X_test[:25000]
Y_train = Y_train[:25000]
Y_test = Y_test[:25000]
Z_train = Z_train[:25000]
Z_test = Z_test[:25000]

In [51]:
X_train = X_train.toarray()
X_test = X_test.toarray()

In [57]:
optimazer = Adam(learning_rate=0.001)

In [58]:
model = Sequential()
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='linear'))
model.compile(optimizer=optimazer,  loss='mse',  metrics=['mse'])
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=0.0001)
history = model.fit(x = X_train, y = Z_train, epochs = 15, batch_size=16, validation_data= (X_test, Z_test ))
check_metrics(history, 'mse')
check_metrics(history, 'loss')

Epoch 1/15
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 89ms/step - loss: 14.9511 - mse: 14.9511 - val_loss: 4.9908 - val_mse: 4.9908
Epoch 2/15
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 64ms/step - loss: 3.8466 - mse: 3.8466 - val_loss: 5.2319 - val_mse: 5.2319
Epoch 3/15
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 62ms/step - loss: 2.6704 - mse: 2.6704 - val_loss: 5.3450 - val_mse: 5.3450
Epoch 4/15
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 61ms/step - loss: 2.1421 - mse: 2.1421 - val_loss: 5.2137 - val_mse: 5.2137
Epoch 5/15
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 60ms/step - loss: 1.7735 - mse: 1.7735 - val_loss: 5.2863 - val_mse: 5.2863
Epoch 6/15
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 61ms/step - loss: 1.5426 - mse: 1.5426 - val_loss: 5.4452 - val_mse: 5.4452
Epoch 7/15
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s

In [None]:
dump(model, 'model.joblib')
dump(vect, 'tfidf_vectorizer.joblib')

In [65]:
model2 = Sequential()
model2.add(Dense(16, activation='relu'))
model2.add(Dense(8, activation='relu'))
model2.add(Dense(1, activation='sigmoid'))

model2.compile(optimizer='rmsprop',  loss='binary_crossentropy',  metrics=['accuracy'])
history2 = model2.fit(x = X_train, y = Y_train, epochs = 10, batch_size= 128, validation_data= (X_test, Y_test ))
check_metrics(history2, 'accuracy')
check_metrics(history2, 'loss')


Epoch 1/10
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 98ms/step - accuracy: 0.6676 - loss: 0.6716 - val_accuracy: 0.8320 - val_loss: 0.5817
Epoch 2/10
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step - accuracy: 0.8913 - loss: 0.5233 - val_accuracy: 0.8607 - val_loss: 0.4438
Epoch 3/10
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - accuracy: 0.9179 - loss: 0.3606 - val_accuracy: 0.8791 - val_loss: 0.3482
Epoch 4/10
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - accuracy: 0.9395 - loss: 0.2448 - val_accuracy: 0.8776 - val_loss: 0.3078
Epoch 5/10
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - accuracy: 0.9559 - loss: 0.1703 - val_accuracy: 0.8784 - val_loss: 0.2885
Epoch 6/10
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - accuracy: 0.9722 - loss: 0.1174 - val_accuracy: 0.8777 - val_loss: 0.2912
Epoch 7/10
[1m98/98[0m [32m━━━