In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Madhusowmya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [4]:
dataset = pd.read_csv("training.1600000.processed.noemoticon.csv" , encoding= 'ISO-8859-1')

In [5]:
dataset.head()

Unnamed: 0,polarity of tweet,id of the tweet,date of the tweet,query,user,text of the tweet
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [6]:
col_names = ['target' , 'id' , 'date' , 'flag' , 'user' , 'text']
dataset.columns = col_names

In [7]:
dataset.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [8]:
dataset.shape

(1048572, 6)

In [9]:
#checking for missing values
dataset.isnull().sum()

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

In [10]:
# Distribution of tweets
dataset['target'].value_counts()

target
0    799996
4    248576
Name: count, dtype: int64

In [11]:
# Converting 0 to -ve and 4 to +ve
dataset['target'] = dataset['target'].map({0:0 , 4:1})

In [12]:
dataset['target'].value_counts()

target
0    799996
1    248576
Name: count, dtype: int64

In [13]:
# Stemming

stremmer = PorterStemmer()

def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content) # removing not a-z and A-Z
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [stremmer.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [14]:
dataset['text'] = dataset['text'].apply(stemming)

In [15]:
dataset.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,upset updat facebook text might cri result sch...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,kenichan dive mani time ball manag save rest g...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,whole bodi feel itchi like fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,nationwideclass behav mad see
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,kwesidei whole crew


In [16]:
x = dataset['text']
y = dataset['target']

In [17]:
# splitting the dataset
x_train , x_test , y_train , y_test = train_test_split(x , y , test_size = 0.2 , random_state = 0)

In [18]:
# convert textual data to numerical data
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

In [19]:
print(x_train)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 6198292 stored elements and shape (838857, 328138)>
  Coords	Values
  (0, 38891)	0.400775143648158
  (0, 14592)	0.36261851892418073
  (0, 310440)	0.23944674945165806
  (0, 213360)	0.33741977246344723
  (0, 106715)	0.1976367684924913
  (0, 78032)	0.39706301209607425
  (0, 238401)	0.5830788261117192
  (1, 310440)	0.16815634344198596
  (1, 234536)	0.48130448708128454
  (1, 317696)	0.20514010986264616
  (1, 271849)	0.24304190125482247
  (1, 94523)	0.43038800147556905
  (1, 122944)	0.4035858443173664
  (1, 198441)	0.2870371557279857
  (1, 8855)	0.2240883171081635
  (1, 68179)	0.2864104053098128
  (1, 224365)	0.27598082539483954
  (2, 308903)	0.3260091681570522
  (2, 215352)	0.4044438546293875
  (2, 277744)	0.3822383090103777
  (2, 35126)	0.39464818697445025
  (2, 121997)	0.19090014442691997
  (2, 291621)	0.2875143207107189
  (2, 55892)	0.20543830173559866
  (2, 277759)	0.5166987795580361
  :	:
  (838853, 37072)	0.3073996032889281

In [20]:
# Training the model
model = LogisticRegression()
model.fit(x_train , y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
# Testing the model
y_pred = model.predict(x_test)
print(accuracy_score(y_test , y_pred))

0.8331831294852537


In [22]:
# Function to predict the sentiment
def predict_sentiment(text):
    text = re.sub('[^a-zA-Z]',' ',text) # removing not a-z and A-Z
    text = text.lower()
    text = text.split() 
    text = [stremmer.stem(word) for word in text if not word in stopwords.words('english')]
    text = ' '.join(text)
    text = [text]
    text = vectorizer.transform(text)   
    sentiment = model.predict(text)
    if sentiment == 0:
        return "Negative"
    else:
        return "Positive"

In [23]:
# Testing the model
print(predict_sentiment("I hate you"))
print(predict_sentiment("I love you"))

Negative
Positive


In [24]:
# Save the model
import pickle
pickle.dump(model , open('model.pkl' , 'wb'))

In [25]:
pickle.dump(vectorizer , open('vectorizer.pkl' , 'wb'))

In [26]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Deep learning model
dl_model = Sequential([
    Dense(512, activation='relu', input_shape=(x_train.shape[1],)),
    Dropout(0.5),
    Dense(256, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

dl_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
early_stop = EarlyStopping(monitor='val_loss', patience=3)
history = dl_model.fit(x_train, y_train, epochs=10, batch_size=512,
                       validation_split=0.2, callbacks=[early_stop])

# Evaluate the model
loss, accuracy = dl_model.evaluate(x_test, y_test)
print(f"DL Model Accuracy: {accuracy:.4f}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m1311/1311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1704s[0m 1s/step - accuracy: 0.8068 - loss: 0.4228 - val_accuracy: 0.8354 - val_loss: 0.3719
Epoch 2/10
[1m1311/1311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1702s[0m 1s/step - accuracy: 0.8733 - loss: 0.3056 - val_accuracy: 0.8314 - val_loss: 0.3852
Epoch 3/10
[1m1311/1311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1755s[0m 1s/step - accuracy: 0.9185 - loss: 0.2140 - val_accuracy: 0.8182 - val_loss: 0.4554
Epoch 4/10
[1m1311/1311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1807s[0m 1s/step - accuracy: 0.9476 - loss: 0.1435 - val_accuracy: 0.8067 - val_loss: 0.5356
[1m6554/6554[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m268s[0m 41ms/step - accuracy: 0.8207 - loss: 0.5068
DL Model Accuracy: 0.8217


In [27]:
def predict_sentiment_dl(text):
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower().split()
    text = [stremmer.stem(word) for word in text if word not in stopwords.words('english')]
    text = [' '.join(text)]
    text_vectorized = vectorizer.transform(text)
    prediction = dl_model.predict(text_vectorized)
    return "Positive" if prediction >= 0.5 else "Negative"

print(predict_sentiment_dl("I love this project"))
print(predict_sentiment_dl("I hate waiting in lines"))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 145ms/step
Negative
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step
Negative


In [34]:
from keras.models import Sequential
from keras.layers import Dense

# Example model (you'll already have yours)
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(100,)))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy')

# Save the model
model.save("dl_model.h5")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [37]:
# Define a deeper quantum circuit
@qml.qnode(dev)
def quantum_circuit(params, x):
    # Encode classical data into quantum states
    for i in range(n_qubits):
        qml.RY(x[i], wires=i)
    
    # Apply multiple layers of parameterized gates and entanglement
    for layer in range(len(params) // (2 * n_qubits)):
        for i in range(n_qubits):
            qml.RY(params[layer * 2 * n_qubits + i], wires=i)
            qml.RZ(params[layer * 2 * n_qubits + i + n_qubits], wires=i)
        for i in range(n_qubits - 1):
            qml.CNOT(wires=[i, i + 1])
    
    # Measure expectation value
    return [qml.expval(qml.PauliZ(i)) for i in range(n_qubits)]

# Update the cost function
def cost(params, X, Y):
    loss = 0
    for x, y in zip(X, Y):
        predictions = quantum_circuit(params, x)
        loss += np.sum((predictions - y) ** 2)
    return loss / len(X)

# Increase the number of layers
num_layers = 3
params = np.random.rand(num_layers * 2 * n_qubits, requires_grad=True)

# Optimize the parameters
opt = qml.AdamOptimizer(stepsize=0.01)
for i in range(200):  # Increase the number of iterations
    params = opt.step(lambda p: cost(p, X_train, Y_train), params)
    if i % 10 == 0:
        print(f"Step {i}: Cost = {cost(params, X_train, Y_train)}")

print("Optimized parameters:", params)




Step 0: Cost = 1.5414867524193339
Step 10: Cost = 1.5414867524193339
Step 20: Cost = 1.5414867524193339
Step 30: Cost = 1.5414867524193339
Step 40: Cost = 1.5414867524193339
Step 50: Cost = 1.5414867524193339
Step 60: Cost = 1.5414867524193339
Step 70: Cost = 1.5414867524193339
Step 80: Cost = 1.5414867524193339
Step 90: Cost = 1.5414867524193339
Step 100: Cost = 1.5414867524193339
Step 110: Cost = 1.5414867524193339
Step 120: Cost = 1.5414867524193339
Step 130: Cost = 1.5414867524193339
Step 140: Cost = 1.5414867524193339
Step 150: Cost = 1.5414867524193339
Step 160: Cost = 1.5414867524193339
Step 170: Cost = 1.5414867524193339
Step 180: Cost = 1.5414867524193339
Step 190: Cost = 1.5414867524193339
Optimized parameters: [0.72609133 0.97585208 0.51630035 0.32295647 0.79518619 0.27083225
 0.43897142 0.07845638 0.02535074 0.96264841 0.83598012 0.69597421]


In [7]:
# --- Imports ---
import pennylane as qml
from pennylane import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# --- Quantum Device Setup ---
n_qubits = 4
dev = qml.device("default.qubit", wires=n_qubits)

# --- Quantum Circuit ---
@qml.qnode(dev, interface="tf")
def quantum_circuit(inputs, weights):
    for i in range(n_qubits):
        qml.RY(inputs[i], wires=i)
    for i in range(n_qubits):
        qml.RY(weights[i], wires=i)
        qml.RZ(weights[i + n_qubits], wires=i)
    for i in range(n_qubits - 1):
        qml.CNOT(wires=[i, i + 1])
    return [qml.expval(qml.PauliZ(i)) for i in range(n_qubits)]

# --- Custom Quantum Layer ---
class QuantumLayer(tf.keras.layers.Layer):
    def __init__(self, n_qubits):
        super().__init__()
        self.n_qubits = n_qubits
        self.weight_shape = (2 * n_qubits,)
        self._weights = self.add_weight(
            shape=self.weight_shape,
            initializer="random_normal",
            trainable=True,
            dtype=tf.float32
        )

    def call(self, inputs):
        return tf.map_fn(
            lambda x: tf.stack(quantum_circuit(x, self._weights)), 
            tf.cast(inputs, dtype=tf.float64),  # Ensure inputs are cast to float64
            dtype=tf.float64  # Ensure the output dtype matches the quantum circuit
        )

# --- Define the Model ---
model = Sequential([
    tf.keras.Input(shape=(n_qubits,)),
    QuantumLayer(n_qubits),
    Dense(64, activation="relu"),
    Dropout(0.3),
    Dense(32, activation="relu"),
    Dense(1, activation="sigmoid")
])

model.compile(optimizer=Adam(learning_rate=0.001),
              loss="binary_crossentropy", 
              metrics=["accuracy"])

# --- Dummy Data for Demo ---
X_train_dummy = np.random.rand(100, n_qubits).astype(np.float64)
y_train_dummy = np.random.randint(0, 2, size=(100, 1)).astype(np.float64)

# --- Train the Model ---
early_stop = EarlyStopping(monitor="val_loss", patience=3)
history = model.fit(X_train_dummy, y_train_dummy,
                    epochs=10, batch_size=16,
                    validation_split=0.2,
                    callbacks=[early_stop])


Epoch 1/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 116ms/step - accuracy: 0.4969 - loss: 0.6899 - val_accuracy: 0.3500 - val_loss: 0.6948
Epoch 2/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.5451 - loss: 0.6896 - val_accuracy: 0.6000 - val_loss: 0.6915
Epoch 3/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.5851 - loss: 0.6856 - val_accuracy: 0.4500 - val_loss: 0.6955
Epoch 4/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.4411 - loss: 0.6930 - val_accuracy: 0.4000 - val_loss: 0.6982
Epoch 5/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.5750 - loss: 0.6961 - val_accuracy: 0.4000 - val_loss: 0.6999
