In [86]:
pip install tensorflow



In [87]:
import numpy as np
import pandas as pd
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec

In [88]:
data1 = pd.read_csv("Reddit_Data.csv")
data2 = pd.read_csv("Twitter_Data.csv")

In [89]:
data1.head(5)

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [90]:
data2.head(5)

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [91]:
data1.rename(columns={'clean_comment': 'clean_text'}, inplace=True)

In [92]:
# Concatenate the two DataFrames along the rows (axis=0)
data = pd.concat([data1, data2], axis=0)


In [93]:
data

Unnamed: 0,clean_text,category
0,family mormon have never tried explain them t...,1.0
1,buddhism has very much lot compatible with chr...,1.0
2,seriously don say thing first all they won get...,-1.0
3,what you have learned yours and only yours wha...,0.0
4,for your own benefit you may want read living ...,1.0
...,...,...
162975,why these 456 crores paid neerav modi not reco...,-1.0
162976,dear rss terrorist payal gawar what about modi...,-1.0
162977,did you cover her interaction forum where she ...,0.0
162978,there big project came into india modi dream p...,0.0


In [94]:
data.isna().sum()

clean_text    104
category        7
dtype: int64

In [95]:
data = data.dropna()

In [96]:
data.isna().sum()

clean_text    0
category      0
dtype: int64

# Data Cleaning

In [97]:
# Convert text to lowercase for consistency
data['clean_text'] = data['clean_text'].str.lower()

# Remove special characters and punctuation using translate function and translate it to none
data['clean_text'] = data['clean_text'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

# Tokenization and removing stopwords
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return filtered_tokens
# Make changing to original dataframe
data.loc[:, 'tokenized_text'] = data['clean_text'].apply(preprocess_text)

#print(data[['clean_text', 'tokenized_text', 'category']])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['clean_text'] = data['clean_text'].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['clean_text'] = data['clean_text'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

# Feature Extraction

In [98]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000)

# Fit and transform the tokenized text
tfidf_matrix = tfidf_vectorizer.fit_transform(data['tokenized_text'].apply(lambda x: ' '.join(x)))

# Convert the TF-IDF matrix to a DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Reset index of the data DataFrame
data.reset_index(drop=True, inplace=True)

# Concatenate the TF-IDF matrix with the original Data
data_tfidf = pd.concat([data, tfidf_df], axis=1)

#print(data_tfidf)



# Model Selection and Evaluation

In [99]:
# Convert tokenized_text back to sentences
data['sentences'] = data['tokenized_text'].apply(lambda tokens: ' '.join(tokens))
model_w2v = Word2Vec(sentences=data['tokenized_text'], vector_size=100, window=5, min_count=1, sg=0)  #using CBOW algo

# Create a word index
word_index = {word: index + 1 for index, word in enumerate(model_w2v.wv.index_to_key)}

# Convert words to corresponding indices
data['indices'] = data['tokenized_text'].apply(lambda tokens: [word_index[word] for word in tokens if word in word_index])

# Pad sequences to a fixed length
max_seq_length = 30

# Generate padded sequences
padded_sequences = pad_sequences(data['indices'], maxlen=max_seq_length, padding='post')

# Create a DataFrame with padded sequences
padded_indices_df = pd.DataFrame(padded_sequences, columns=[f"index_{i}" for i in range(max_seq_length)])

# Concatenate the padded_indices_df with the original DataFrame
data_concatenated = pd.concat([data, padded_indices_df], axis=1)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(data_concatenated.iloc[:, -max_seq_length:], data_concatenated['category'], test_size=0.2, random_state=42)

# Build and train the LSTM model
embedding_dim = 100
vocab_size = len(word_index) + 1
# Train a neural network with three layers
model_lstm = Sequential()
model_lstm.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_seq_length))
model_lstm.add(LSTM(100))
model_lstm.add(Dense(1, activation='sigmoid'))
model_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model_lstm.fit(X_train, y_train, epochs=3 , batch_size=60, validation_split=0.2)

# Evaluate the model
loss, accuracy = model_lstm.evaluate(X_test, y_test)
print(f"LSTM Model Metrics:")
print(f"Loss: {loss:.4f}")
print(f"Accuracy: {accuracy:.4f}")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['sentences'] = data['tokenized_text'].apply(lambda tokens: ' '.join(tokens))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['indices'] = data['tokenized_text'].apply(lambda tokens: [word_index[word] for word in tokens if word in word_index])


Epoch 1/3
Epoch 2/3
Epoch 3/3
LSTM Model Metrics:
Loss: -11.1574
Accuracy: 0.4055


In [100]:
model_lstm.save('sentiment_analysis_model.h5')
# A file is created which is used in API. I didn't upload them because of their larger size

In [101]:
model_w2v.save('word2vec_model.model')
# A file is created which is used in API

In [102]:
import json
# Save the word_index to a JSON file
with open('word_index.json', 'w') as json_file:
    json.dump(word_index, json_file)
    # A file is created which is used in API

In [103]:
print("Shape of loaded Word2Vec model:", model_w2v.wv.vectors.shape)
print("Number of words in loaded word index:", len(word_index))


Shape of loaded Word2Vec model: (134294, 100)
Number of words in loaded word index: 134294


In [104]:
print(model_w2v)

Word2Vec<vocab=134294, vector_size=100, alpha=0.025>
