In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.utils import plot_model
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt

# Spam mail detection

In [2]:
# Load the raw dataset; later cells read its 'Category' and 'Message' columns.
df = pd.read_csv('spam.csv')
# Swap missing cells for empty strings so no NaN reaches the text vectorizer.
data = df.where(pd.notnull(df), '')
# Encode the labels (spam -> 1, ham -> 0) in the Category column only.
# A frame-wide replace would also turn any message whose entire text is
# 'spam' or 'ham' into an integer, which the text vectorizer cannot process.
# Labels outside the mapping become NaN, so the later int64 cast fails loudly
# instead of letting an unexpected label through.
data = data.assign(Category=data['Category'].map({'spam': 1, 'ham': 0}))

In [3]:
# Features are the message column; targets are the Category labels.
x, y = data['Message'], data['Category']
# Hold out 40% of the rows for testing, with a fixed random_state for the split.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.4,
    random_state=42,
)


In [4]:

# Cast both label vectors to int64 before they are handed to the classifier.
ytrain, ytest = (labels.astype('int64') for labels in (ytrain, ytest))

In [5]:
# Two-step text classifier: a TfidfVectorizer ('tfidf') feeding an SVC ('clf'),
# both with their default settings.
tclf = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', SVC()),
    ]
)

In [6]:
# Fit the whole pipeline (vectorizer, then classifier) on the training split.
tclf.fit(X=xtrain, y=ytrain)

In [7]:
# Accuracy on the same messages the pipeline was fitted on; the held-out
# score is computed separately from xtest/ytest.
predtrain = tclf.predict(xtrain)
accuracy_score(y_true=ytrain, y_pred=predtrain)

0.9982052049057732

In [8]:
# Score the fitted pipeline on the held-out messages.
prediction = tclf.predict(xtest)
# accuracy_score is declared as (y_true, y_pred): ground truth goes first.
# Accuracy is symmetric, so the value is unchanged, but keeping the documented
# order avoids a silent error if this line is reused with an asymmetric metric.
accuracy_score(ytest, prediction)

0.9860924181247196

In [9]:
# Wrap one example message in a Series and classify it with the fitted pipeline.
data2 = pd.Series(
    "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
)
inp = tclf.predict(data2)[0]
# Label 0 is ham; any other prediction is reported as spam.
print('Ham' if inp == 0 else 'Spam')

Ham


# IMDb review sentiment analysis

In [10]:
# Load the IMDB reviews with num_words=10000, then unpack each split into
# its sequences and labels.
imdb_train_split, imdb_test_split = imdb.load_data(num_words=10000)
ximdbtrain, yimdbtrain = imdb_train_split
ximdbtest, yimdbtest = imdb_test_split

In [11]:
# Length every review is padded or truncated to.
max_sequence_length = 500
# Apply the same padding/truncation to the training and test sequences.
ximdbtrain, ximdbtest = (
    pad_sequences(reviews, maxlen=max_sequence_length)
    for reviews in (ximdbtrain, ximdbtest)
)

In [12]:
embedding_dim = 32
lstm_units = 100
# Embedding -> LSTM -> one sigmoid unit for the binary sentiment label.
# input_dim=10000 matches the num_words=10000 used when loading the data.
imdbmodel = Sequential([
    Embedding(
        input_dim=10000,
        output_dim=embedding_dim,
        input_length=max_sequence_length,
    ),
    LSTM(units=lstm_units),
    Dense(units=1, activation='sigmoid'),
])
imdbmodel.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [13]:
epochs = 3
batch_size = 64
# NOTE(review): validation_data is the test split, the same arrays passed to
# evaluate() afterwards, so the per-epoch validation metrics are not an
# independent check. Consider a separate validation set.
imdbmodel.fit(
    x=ximdbtrain,
    y=yimdbtrain,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(ximdbtest, yimdbtest),
)


Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x234ddda1e50>

In [14]:
# Evaluate on the test split; the result is unpacked as (loss, accuracy),
# matching the single 'accuracy' metric the model was compiled with.
test_scores = imdbmodel.evaluate(x=ximdbtest, y=yimdbtest, verbose=2)
loss, accuracy = test_scores
print(f"Test accuracy: {accuracy:.4f}")


782/782 - 56s - loss: 0.3601 - accuracy: 0.8676 - 56s/epoch - 72ms/step
Test accuracy: 0.8676
