<a href="https://colab.research.google.com/github/ahmedthami/Ai/blob/main/TextClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive so files stored there are reachable from the Colab
# runtime (the recorded cell output reports the mount point /content/gdrive).
from google.colab import drive
drive.mount('/content/gdrive')
import os
import random
import numpy as np
# Debug aid (disabled): list the dataset directory to confirm it is visible.
# print(os.listdir(os.path.join(os.getcwd() , 'gdrive/MyDrive/aclImdb')))


def _read_labeled_reviews(split_path):
    """Read every ``.txt`` review below ``split_path/pos`` and ``split_path/neg``.

    # Arguments
        split_path: string, directory containing the `pos` and `neg` folders.

    # Returns
        A tuple `(texts, labels)` of two parallel lists. Positive reviews come
        first, then negative ones, each group in sorted file-name order.
        Labels are 1 for `pos` and 0 for `neg`.
    """
    texts = []
    labels = []
    for category in ['pos', 'neg']:
        category_path = os.path.join(split_path, category)
        label = 0 if category == 'neg' else 1
        for fname in sorted(os.listdir(category_path)):
            if fname.endswith('.txt'):
                # Explicit encoding: without it the decoding depends on the
                # machine's locale and non-ASCII reviews may fail or be garbled.
                with open(os.path.join(category_path, fname),
                          encoding='utf-8') as f:
                    texts.append(f.read())
                labels.append(label)
    return texts, labels


def load_imdb_sentiment_analysis_dataset(data_path= os.path.join(os.getcwd(), 'gdrive/MyDrive/aclImdb'), seed=123):
    """Loads the IMDb movie reviews sentiment analysis dataset.

    # Arguments
        data_path: string, path to the data directory. It must contain
            `train/pos`, `train/neg`, `test/pos` and `test/neg` folders.
            The default is computed once, when the function is defined,
            relative to the working directory at that moment.
        seed: int, seed for randomizer.

    # Returns
        A tuple `((train_texts, train_labels), (test_texts, test_labels))`.
        Texts are lists of strings; labels are numpy arrays.
        Only the training split is shuffled; the test split keeps its
        on-disk order (all positives, then all negatives).
        Number of categories: 2 (0 - negative, 1 - positive)
        The complete dataset, per its documentation, has 25000 training
        and 25000 test samples.

    # Raises
        FileNotFoundError: if one of the expected folders is missing.

    # References
        Mass et al., http://www.aclweb.org/anthology/P11-1015

        Download and uncompress archive from:
        http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
    """
    train_texts, train_labels = _read_labeled_reviews(
        os.path.join(data_path, 'train'))
    test_texts, test_labels = _read_labeled_reviews(
        os.path.join(data_path, 'test'))

    # Shuffle the training data and labels. Re-seeding before each shuffle
    # makes both lists (which have equal length) receive the same
    # permutation, so every text stays aligned with its label.
    random.seed(seed)
    random.shuffle(train_texts)
    random.seed(seed)
    random.shuffle(train_labels)

    return ((train_texts, np.array(train_labels)),
            (test_texts, np.array(test_labels)))


# Load the dataset once with the default path and seed, then unpack it into
# the notebook-level training and test variables used by later cells.
data = load_imdb_sentiment_analysis_dataset()
(x_train, y_train), (x_test, y_test) = data


Mounted at /content/gdrive


In [3]:
import matplotlib.pyplot as plt

def get_number_of_words(sample_text):
  """Return the median number of words per sample.

  Each sample is split on whitespace and the resulting word counts are
  passed to `np.median`.

  # Arguments
      sample_text: iterable of strings.

  # Returns
      The median word count, as returned by `np.median`.
  """
  word_counts = []
  for sample in sample_text:
    word_counts.append(len(sample.split()))
  return np.median(word_counts)


def rep_graph(sample_text):
  """Plot the distribution of sample lengths as a 50-bin histogram.

  The length of a sample is its number of characters (`len(s)`), so the
  x axis carries the sample length and the y axis the number of samples
  falling into each bin.

  # Arguments
      sample_text: iterable of strings.
  """
  plt.hist([len(s) for s in sample_text], 50)
  # The histogram puts the measured value on x and the count on y; the
  # labels below follow that orientation.
  plt.xlabel("Length of a sample (characters)")
  plt.ylabel("Number of samples")
  plt.title("Sample length distribution")
  plt.show()


# Sanity check: number of test samples that were loaded.
# NOTE(review): the recorded output for this cell is 0, i.e. no test files
# were found when it ran - verify the dataset path before training.
print(len(x_test))


0


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif


# Bag-of-words approach: settings read by vectorize_texts.

# Range of n-gram sizes handed to the vectorizer as `ngram_range`.
NGRAM_RANGE = (1,2)
# Upper bound on the number of features kept by the SelectKBest step.
TOP_K = 20000
# Tokenisation unit intended for the vectorizer's analyzer setting.
TOKEN_MODE = 'word'
# Handed to the vectorizer as `min_df`.
MIN_DOC_FREQ = 2

def vectorize_texts(train_data, train_labels, validation_data):
  """Turn raw texts into TF-IDF n-gram features and keep the best ones.

  The vocabulary and the feature selection are both fitted on the training
  data only; the validation data is merely transformed with them.

  # Arguments
      train_data: list of training text strings.
      train_labels: training labels, used to score the features.
      validation_data: list of validation text strings.

  # Returns
      A tuple `(x_train, x_val)` of float32 feature matrices with at most
      TOP_K columns. They are whatever matrix type the scikit-learn
      transformers return (presumably SciPy sparse - confirm before
      passing them to code that requires dense arrays).
  """
  kwargs = {
      'analyzer': TOKEN_MODE,       # was misspelled, so TfidfVectorizer rejected it
      'ngram_range': NGRAM_RANGE,
      'min_df': MIN_DOC_FREQ,
      # TF-IDF weights are fractional; the vectorizer expects a float dtype.
      'dtype': np.float32,
      # The parameter that controls undecodable bytes is `decode_error`.
      'decode_error': 'replace',
      'strip_accents': 'unicode',
  }

  vectorizer = TfidfVectorizer(**kwargs)

  # Learn the vocabulary and IDF weights, and return the document-term matrix.
  x_train = vectorizer.fit_transform(train_data)
  # Reuse the fitted vocabulary for the validation texts.
  x_validation = vectorizer.transform(validation_data)

  # Keep the top-K features. `score_func` must be the callable itself, and
  # k may not exceed the number of available features.
  selector = SelectKBest(score_func=f_classif, k=min(TOP_K, x_train.shape[1]))
  selector.fit(x_train, train_labels)
  x_train = selector.transform(x_train).astype('float32')
  x_val = selector.transform(x_validation).astype('float32')

  return x_train, x_val



In [16]:
from tensorflow.python.keras.engine.sequential import Sequential
#defining the model

from tensorflow.keras import models
from tensorflow.keras.activations import softmax, relu
from tensorflow.keras.layers import Dense, Dropout

def get_last_layers_params(num_classes):
  """Choose the size and activation of the model's output layer.

  # Arguments
      num_classes: int, number of target classes.

  # Returns
      A tuple `(units, activation)`: `(num_classes, 'softmax')` when there
      are more than two classes, otherwise `(1, 'sigmoid')`.
  """
  if num_classes > 2:
    return num_classes, 'softmax'
  return 1, 'sigmoid'

def MyModel(layers, all_units , classes_num, input_shap , dropout_rate):
  """Build a multi-layer perceptron for text classification.

  The network is: input dropout, then `layers - 1` blocks of
  (Dense with ReLU + Dropout), then one output Dense layer whose size and
  activation come from `get_last_layers_params`.

  # Arguments
      layers: int, total number of Dense layers including the output layer.
      all_units: int, number of units in each hidden Dense layer.
      classes_num: int, number of target classes.
      input_shap: tuple, shape of one input sample.
      dropout_rate: float, rate used by every Dropout layer.

  # Returns
      An uncompiled Sequential model.
  """
  last_units , act_function = get_last_layers_params(classes_num)
  # Use the public `tensorflow.keras` Sequential so the container and the
  # layers added to it come from the same Keras implementation; the private
  # `tensorflow.python.keras` class is not guaranteed to accept them.
  model = models.Sequential()
  model.add(Dropout(rate = dropout_rate, input_shape = input_shap))
  for _ in range(layers-1):
    model.add(Dense(units = all_units , activation= 'relu', kernel_initializer= "he_uniform"))
    model.add(Dropout(rate = dropout_rate))
  model.add(Dense(last_units, activation= act_function))
  return model

In [None]:
def training_model(data, input_shape, epochs = 12, batch_size = 200, num_classes = 2, all_units = 64, dropout_rate = 0.2, layers = 2, learning_rate= 1e-3):
  """Vectorize the texts, then build, compile and train the MLP.

  # Arguments
      data: tuple `((train_texts, train_labels), (val_texts, val_labels))`.
      input_shape: tuple, shape of one vectorized sample; it must match the
          number of features produced by `vectorize_texts`.
      epochs: int, number of training epochs.
      batch_size: int, number of samples per batch.
      num_classes: int, number of target classes.
      all_units: int, units in each hidden Dense layer.
      dropout_rate: float, rate of the Dropout layers.
      layers: int, number of Dense layers including the output layer.
      learning_rate: float, learning rate of the Adam optimizer.

  # Returns
      The history object returned by `model.fit`.
  """
  # Imported locally because no cell binds the bare name `tensorflow`;
  # the optimizer lives in `keras.optimizers` and the class is `Adam`.
  from tensorflow.keras.optimizers import Adam

  (x_train, train_label), (x_validation, val_labels) = data
  train_data , val_data = vectorize_texts(x_train, train_label, x_validation)

  model = MyModel(layers, all_units, num_classes, input_shape, dropout_rate)

  # Keep the loss consistent with the output layer chosen by
  # get_last_layers_params: softmax for more than two classes, else sigmoid.
  if num_classes > 2:
    loss = 'sparse_categorical_crossentropy'
  else:
    loss = 'binary_crossentropy'

  opt = Adam(learning_rate = learning_rate)
  model.compile(optimizer= opt,loss= loss, metrics = ['accuracy'])
  history = model.fit(train_data, train_label,batch_size = batch_size,validation_data = (val_data, val_labels) ,epochs = epochs, verbose = 0)

  # Hand the training curves back to the caller instead of discarding them.
  return history


  

