<a href="https://colab.research.google.com/github/abdulmoiz99/Keras-Text-Classification/blob/main/Keras_Text_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import pandas as pd

filepath_dict = { 'yelp': '/content/sample_data/data/yelp_labelled.txt',
                  'amazon': '/content/sample_data/data/amazon_cells_labelled.txt',
                  'imdb': '/content/sample_data/data/imdb_labelled.txt'}

df_list = []

for source, filepath in filepath_dict.items():
  df = pd.read_csv(filepath, names = ['sentence', 'label'], sep ='\t')
  df['source'] = source
  df_list.append(df)

df = pd.concat(df_list)
print(df.iloc[0])  

sentence    Wow... Loved this place.
label                              1
source                          yelp
Name: 0, dtype: object


In [10]:
from sklearn.model_selection import train_test_split

df_yelp = df[df['source'] == 'yelp']

sentences = df_yelp['sentence'].values
y = df_yelp['label'].values

sentences_train, sentences_test, y_train, y_test = train_test_split(
    sentences,y,test_size = 0.25, random_state = 1000
)

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer.fit(sentences_train)

X_train = vectorizer.transform(sentences_train)
X_test = vectorizer.transform(sentences_test)


In [12]:
X_train

<750x1714 sparse matrix of type '<class 'numpy.int64'>'
	with 7368 stored elements in Compressed Sparse Row format>

In [13]:
X_test

<250x1714 sparse matrix of type '<class 'numpy.int64'>'
	with 2069 stored elements in Compressed Sparse Row format>

In [15]:
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()
classifier.fit(X_train, y_train)

score = classifier.score(X_test,y_test)

print("Accuracy: ",score)

Accuracy:  0.796


In [18]:
for source in df['source'].unique():
  df_source = df[df['source'] == source]
  sentences = df_source['sentence'].values
  y = df_source['label'].values

  sentences_train, sentences_test, y_train, y_test = train_test_split(
      sentences, y, test_size = 0.25, random_state = 10000
  )

  vectorizer = CountVectorizer()
  vectorizer.fit(sentences_train)
  vectorizer.fit(sentences_test)
  X_train = vectorizer.transform(sentences_train)
  X_test = vectorizer.transform(sentences_test)


  classifier = LogisticRegression()
  classifier.fit(X_train, y_train)
  score = classifier.score(X_test,y_test)
  print('Accuracy for {} data: {:,.4f}'.format(source,score))

Accuracy for yelp data: 0.8000
Accuracy for amazon data: 0.8040
Accuracy for imdb data: 0.7701


In [19]:
!pip install keras



In [22]:
from keras.models import Sequential
from keras import layers

input_dim = X_train.shape[1]

model = Sequential()
model.add(layers.Dense(10, input_dim=input_dim, activation ='relu'))
model.add(layers.Dense(1,activation='sigmoid'))


In [23]:
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 10)                11990     
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
                                                                 
Total params: 12,001
Trainable params: 12,001
Non-trainable params: 0
_________________________________________________________________


In [30]:
history = model.fit(X_train, y_train,
                    epochs = 20,
                    verbose = False,
                    validation_data=(X_test,y_test),
                    batch_size = 10 
                    )

In [29]:
from keras.backend import clear_session
clear_session()

In [32]:
loss, accuracy = model.evaluate(X_train,y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))

loss, accuracy = model.evaluate(X_test,y_test, verbose=False)
print("Testing Accuracy: {:.4f}".format(accuracy))

Training Accuracy: 1.0000
Testing Accuracy: 0.7487


TypeError: ignored

In [33]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

def plot_history(history):
    acc = history.history['acc']
    val_acc = history.history['val_acc']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    x = range(1, len(acc) + 1)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()

What Is a Word Embedding?

In [46]:
cities = ['London', 'Berlin', 'Berlin', 'New York', 'Canada']

cities

['London', 'Berlin', 'Berlin', 'New York', 'Canada']

In [47]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
city_labels = encoder.fit_transform(cities)

city_labels

array([2, 0, 0, 3, 1])

In [48]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse = False)
city_labels =city_labels.reshape((5,1))

encoder.fit_transform(city_labels)

array([[0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.]])

In [50]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words = 5000)
tokenizer.fit_on_texts(sentences_train)


X_train = tokenizer.texts_to_sequences(sentences_train)
X_test = tokenizer.texts_to_sequences(sentences_test)


vocab_size = len(tokenizer.word_index) + 1

print(sentences_train[2])
print(X_train[2])



The acting is terrible, and the writing is worse.  
[1, 35, 4, 136, 2, 1, 158, 4, 159]
