In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sentiment-analysis-dataset/training.1600000.processed.noemoticon.csv
/kaggle/input/sentiment-analysis-dataset/train.csv
/kaggle/input/sentiment-analysis-dataset/testdata.manual.2009.06.14.csv
/kaggle/input/sentiment-analysis-dataset/test.csv


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from mpl_toolkits.mplot3d import Axes3D
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
from textblob import TextBlob

# Deep Convolutional Neural Network for Sentiment Analysis

In [7]:
# load text
# load text
filename = '/kaggle/input/sentiment-analysis-dataset/train.csv'
with open(filename, 'rt', encoding='ISO-8859-1') as file:
    text = file.read()

In [24]:
import string
from nltk.corpus import stopwords
import nltk

# Assuming NLTK stopwords are already downloaded
nltk.download('stopwords')

def clean_doc(doc):
    # split into tokens by white space
    tokens = doc.split()
    # remove punctuation from each token
    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 1]
    return tokens

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


For csv file

In [26]:
import csv
from collections import Counter

# Function to add words from a document to the vocabulary
def add_doc_to_vocab(text, vocab):
    words = text.split()
    vocab.update(words)

# Function to process CSV files
def process_csv(filename, vocab, is_train):
    with open(filename, 'r', encoding='ISO-8859-1') as file:
        reader = csv.reader(file)
        # Skip header if present
        next(reader, None)
        for row in reader:
            text = row[1]  # Assuming the text is in the second column, adjust as necessary
            if is_train and row[0] == 'cv9':
                continue
            if not is_train and row[0] != 'cv9':
                continue
            add_doc_to_vocab(text, vocab)

# Define vocab
vocab = Counter()
# Add all docs to vocab
process_csv('/kaggle/input/sentiment-analysis-dataset/train.csv', vocab, is_train=True)
# Print the size of the vocab
print("Size of the vocabulary:", len(vocab))
# Print the top words in the vocab
print("Top 50 words in the vocabulary:", vocab.most_common(50))

Size of the vocabulary: 52270
Top 50 words in the vocabulary: [('to', 9809), ('I', 8802), ('the', 8388), ('a', 6501), ('my', 4932), ('and', 4677), ('i', 4263), ('you', 3808), ('is', 3670), ('for', 3575), ('in', 3554), ('of', 3091), ('it', 3024), ('on', 2623), ('have', 2377), ('that', 2150), ('me', 2140), ('so', 2134), ('with', 2011), ('be', 1923), ('but', 1918), ('at', 1741), ('was', 1731), ('just', 1719), ('I`m', 1696), ('not', 1516), ('get', 1373), ('all', 1351), ('this', 1290), ('are', 1278), ('out', 1265), ('like', 1258), ('day', 1165), ('-', 1147), ('up', 1147), ('go', 1101), ('your', 1084), ('good', 1032), ('got', 961), ('from', 958), ('do', 934), ('going', 911), ('no', 904), ('now', 903), ('love', 880), ('work', 837), ('****', 796), ('will', 794), ('about', 785), ('one', 775)]


In [27]:
# keep tokens with a min occurrence
min_occurrence = 2
tokens = [k for k, c in vocab.items() if c >= min_occurrence]
print(len(tokens))

14860


> Train Embedding Layer

In [31]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding
from tensorflow.keras.layers import Conv1D, MaxPooling1D

In [32]:
# Function to process CSV files
def process_csv(filename, vocab, is_train):
    df = pd.read_csv(filename)
    documents = []
    for index, row in df.iterrows():
        text = row['text']  # Assuming the text is in a column named 'text', adjust as necessary
        if is_train and row['label'] == 'cv9':
            continue
        if not is_train and row['label'] != 'cv9':
            continue
        documents.append(text)
        add_doc_to_vocab(text, vocab)
    return documents

In [38]:
def process_csv(filename, vocab, is_train):
    try:
        df = pd.read_csv(filename, encoding='ISO-8859-1')  # Try a different encoding
    except UnicodeDecodeError:
        df = pd.read_csv(filename, encoding='latin1')  # Try another encoding if the first one fails
    documents = []
    for index, row in df.iterrows():
        text = row['text']  # Assuming the text is in a column named 'text', adjust as necessary
        textID = row['textID']  # Assuming the textID is in a column named 'textID'
        # Check if text is not NaN
        if pd.notnull(text):
            if is_train and textID == 'cv9':
                continue
            if not is_train and textID != 'cv9':
                continue
            documents.append(text)
            add_doc_to_vocab(text, vocab)
    return documents

In [39]:
# load all training reviews
positive_docs = process_csv('/kaggle/input/sentiment-analysis-dataset/train.csv', vocab, True)

In [42]:
import pandas as pd

# Read the CSV file into df with a different encoding
df = pd.read_csv('/kaggle/input/sentiment-analysis-dataset/train.csv', encoding='ISO-8859-1')

# Now you can use df
ytrain = np.array([0 if label == 'negative' else 1 for label in df['textID']])

In [43]:
# create the tokenizer
tokenizer = Tokenizer()
# fit the tokenizer on the documents
tokenizer.fit_on_texts(positive_docs)

In [44]:
# sequence encode
encoded_docs_train = tokenizer.texts_to_sequences(positive_docs)

In [45]:
# pad sequences
max_length = max([len(s.split()) for s in positive_docs])
Xtrain = pad_sequences(encoded_docs_train, maxlen=max_length, padding='post')

In [48]:
# load all test reviews
negative_docs = process_csv('/kaggle/input/sentiment-analysis-dataset/train.csv', vocab, False)

In [49]:
import pandas as pd

# Read the CSV file into df with a different encoding
df = pd.read_csv('/kaggle/input/sentiment-analysis-dataset/train.csv', encoding='ISO-8859-1')

# Now you can use df
ytrain = np.array([0 if label == 'negative' else 1 for label in df['textID']])

In [50]:
# sequence encode
encoded_docs_test = tokenizer.texts_to_sequences(negative_docs)
# pad sequences
Xtest = pad_sequences(encoded_docs_test, maxlen=max_length, padding='post')

In [51]:
# define vocabulary size (largest integer value)
vocab_size = len(tokenizer.word_index) + 1

In [62]:
import pandas as pd
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Join tokens back into a string
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [66]:
import pandas as pd
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Define preprocess_text function
def preprocess_text(text):
    # Convert text to lowercase if it's a string
    if isinstance(text, str):
        text = text.lower()
        # Remove special characters and digits
        text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text

# Read the CSV file
df = pd.read_csv('/kaggle/input/sentiment-analysis-dataset/train.csv', encoding='ISO-8859-1')

# Apply preprocess_text function to each element of 'text' column
X_train = df['text'].apply(preprocess_text)
y_train = df['textID']  # Assuming 'textID' is the column name for the target labels

vocab_size = 10000  
max_length = 100    

In [74]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Assuming 'data' is the DataFrame loaded from the CSV
le = LabelEncoder()
df['target'] = le.fit_transform(df['text'])

# Convert text column to strings
df['text'] = df['text'].astype(str)

# Convert text to sequences and pad sequences
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(df['text'])
sequences = tokenizer.texts_to_sequences(df['text'])
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

# Define the model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=8))  # Removed input_length parameter
model.add(Flatten())
model.add(Dense(units=1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(padded_sequences, df['target'], epochs=10, batch_size=32, validation_split=0.2)

# Summarize the model
print(model.summary())

Epoch 1/10
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 1.8864e-05 - loss: -51905144.0000 - val_accuracy: 0.0000e+00 - val_loss: -517450784.0000
Epoch 2/10
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 5.0901e-05 - loss: -827775872.0000 - val_accuracy: 0.0000e+00 - val_loss: -1999462912.0000
Epoch 3/10
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 4.9116e-05 - loss: -2514158592.0000 - val_accuracy: 0.0000e+00 - val_loss: -4243735296.0000
Epoch 4/10
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 1.3868e-04 - loss: -4947546112.0000 - val_accuracy: 0.0000e+00 - val_loss: -7128005632.0000
Epoch 5/10
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 1.1330e-05 - loss: -7943320064.0000 - val_accuracy: 0.0000e+00 - val_loss: -10585164800.0000
Epoch 6/10
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━

None


In [75]:
print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)

Shape of X_train: (27481,)
Shape of y_train: (27481,)


In [83]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Read the CSV file
df = pd.read_csv('/kaggle/input/sentiment-analysis-dataset/train.csv', encoding='ISO-8859-1')

# Drop rows with NaN values in the 'text' column
df = df.dropna(subset=['text'])

# Convert non-string values to string
df['text'] = df['text'].astype(str)

# Tokenize and pad sequences for X_train
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['text'])
X_train_seq = tokenizer.texts_to_sequences(df['text'])
X_train_padded = pad_sequences(X_train_seq, maxlen=max_length, padding='post')

In [87]:
# Assuming you have defined vocab_size and max_length
vocab_size = 10000
max_length = 100

# Assuming you have already split your data into training and testing sets
# Convert text to sequences and pad sequences for X_test
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X_test)
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_seq, maxlen=max_length, padding='post')

In [92]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Encode the target labels
y_test_encoded = label_encoder.fit_transform(y_test)

# Now, evaluate the model with encoded labels
loss, acc = model.evaluate(X_test_padded, y_test_encoded, verbose=0)
print('Test Accuracy: %.2f%%' % (acc * 100))

Test Accuracy: 37.06%


> Train word2vec Embedding

In [94]:
import pandas as pd

# Assuming you have a CSV file with 'text' and 'sentiment' columns
df_pos = pd.read_csv('/kaggle/input/sentiment-analysis-dataset/train.csv', encoding='ISO-8859-1')
df_neg = pd.read_csv('/kaggle/input/sentiment-analysis-dataset/test.csv', encoding='ISO-8859-1')

# Concatenate the positive and negative dataframes
df = pd.concat([df_pos, df_neg], ignore_index=True)

# Assuming 'text' column contains the text data and 'sentiment' column contains the labels
sentences = df['text'].tolist()
labels = df['sentiment'].tolist()

print('Total training sentences:', len(sentences))

Total training sentences: 32296


In [101]:
import pandas as pd
from gensim.models import Word2Vec
import nltk
nltk.download('punkt')

# Define preprocess_text function if not defined
def preprocess_text(text):
    # Implement your preprocessing steps here
    # For example, lowercasing and removing punctuation
    if pd.isnull(text):  # Check if the value is NaN
        return ''  # Replace NaN with an empty string
    text = text.lower()
    text = text.strip()  # Remove leading and trailing whitespaces
    # Additional preprocessing steps can be added here
    return text

# Read the CSV file containing the text data
df = pd.read_csv('/kaggle/input/sentiment-analysis-dataset/train.csv', encoding='ISO-8859-1')

# Apply preprocess_text function to each element of 'text' column
df['processed_text'] = df['text'].apply(preprocess_text)

# Tokenize text into sentences
sentences = [nltk.word_tokenize(text) for text in df['processed_text']]

# Train Word2Vec model
model = Word2Vec(sentences, vector_size=100, window=5, workers=8, min_count=1)

# Retrieve vocabulary from the model
words = list(model.wv.key_to_index.keys())
print('Vocabulary size:', len(words))

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Vocabulary size: 30708


Sentiment Analysis

In [107]:
# Sentiment Column

import pandas as pd
from textblob import TextBlob

# Read the CSV file
data = pd.read_csv('/kaggle/input/sentiment-analysis-dataset/train.csv', encoding='ISO-8859-1')

# Handle NaN values if necessary
data.dropna(subset=['sentiment'], inplace=True)

# Calculating sentiment polarity for each review
data['Sentiment'] = data['sentiment'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)

# Displaying the DataFrame with the sentiment column
print(data)

           textID                                               text  \
0      cb774db0d1                I`d have responded, if I were going   
1      549e992a42      Sooo SAD I will miss you here in San Diego!!!   
2      088c60f138                          my boss is bullying me...   
3      9642c003ef                     what interview! leave me alone   
4      358bd9e861   Sons of ****, why couldn`t they put them on t...   
...           ...                                                ...   
27476  4eac33d1c0   wish we could come see u on Denver  husband l...   
27477  4f4c4fc327   I`ve wondered about rake to.  The client has ...   
27478  f67aae2310   Yay good for both of you. Enjoy the break - y...   
27479  ed167662a5                         But it was worth it  ****.   
27480  6f7127d9d7     All this flirting going on - The ATG smiles...   

                                           selected_text sentiment  \
0                    I`d have responded, if I were going   neutra

In Sentiment column, sentiment analysis score showed that more negative statements . It mean people sad,angry and frustration in all situation recorded.

In [108]:
# Text Column

import pandas as pd
from textblob import TextBlob

# Read the CSV file
data = pd.read_csv('/kaggle/input/sentiment-analysis-dataset/train.csv', encoding='ISO-8859-1')

# Handle NaN values if necessary
data.dropna(subset=['text'], inplace=True)

# Calculating sentiment polarity for each review
data['Sentiment'] = data['text'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)

# Displaying the DataFrame with the sentiment column
print(data)

           textID                                               text  \
0      cb774db0d1                I`d have responded, if I were going   
1      549e992a42      Sooo SAD I will miss you here in San Diego!!!   
2      088c60f138                          my boss is bullying me...   
3      9642c003ef                     what interview! leave me alone   
4      358bd9e861   Sons of ****, why couldn`t they put them on t...   
...           ...                                                ...   
27476  4eac33d1c0   wish we could come see u on Denver  husband l...   
27477  4f4c4fc327   I`ve wondered about rake to.  The client has ...   
27478  f67aae2310   Yay good for both of you. Enjoy the break - y...   
27479  ed167662a5                         But it was worth it  ****.   
27480  6f7127d9d7     All this flirting going on - The ATG smiles...   

                                           selected_text sentiment  \
0                    I`d have responded, if I were going   neutra

In text column,sentiment analysis score are more positive statement like people feel statisfied and happy