In [1]:
import dask.dataframe as dd
import pandas as pd

# Path to the dataset CSV (resolved relative to the notebook's working directory).
file_path = 'training.1600000.processed.noemoticon.csv'

# Column names are supplied explicitly via `names=` when reading the file.
column_names = ["sentiment", "ids", "date", "flag", "user", "text"]

# Encodings to try, in order. 'latin1' and 'ISO-8859-1' are aliases of the same
# codec, and that codec maps every possible byte to a character, so it never
# raises UnicodeDecodeError. Anything listed after it would be unreachable,
# which is why it is kept last as the catch-all fallback.
encodings_to_try = ['utf-8', 'latin1']

for encoding in encodings_to_try:
    try:
        # blocksize=None makes Dask load the file as a single partition.
        df = dd.read_csv(file_path, encoding=encoding, names=column_names, blocksize=None)

        # Dask is lazy: head() forces an actual read, so a wrong encoding
        # surfaces here rather than in a later cell.
        print(df.head())

        # The read succeeded with this encoding; stop trying others.
        break
    except UnicodeDecodeError:
        # This encoding cannot decode the file; fall through to the next one.
        continue
else:
    # No encoding worked. Fail loudly here instead of leaving `df` unbound
    # (or bound to a frame with a bad encoding) for later cells to trip over.
    raise ValueError(
        f"Could not decode {file_path!r} with any of the encodings: {encodings_to_try}"
    )





   sentiment         ids                          date      flag  \
0          0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY   
1          0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY   
2          0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY   
3          0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   
4          0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   

              user                                               text  
0  _TheSpecialOne_  @switchfoot http://twitpic.com/2y1zl - Awww, t...  
1    scotthamilton  is upset that he can't update his Facebook by ...  
2         mattycus  @Kenichan I dived many times for the ball. Man...  
3          ElleCTF    my whole body feels itchy and like its on fire   
4           Karoli  @nationwideclass no, it's not behaving at all....  


In [2]:
# Show the last five rows as a sanity check on the end of the file.
print(df.tail(5))

         sentiment         ids                          date      flag  \
1002697          4  1960186342  Fri May 29 07:33:44 PDT 2009  NO_QUERY   
1002698          4  1960186409  Fri May 29 07:33:43 PDT 2009  NO_QUERY   
1002699          4  1960186429  Fri May 29 07:33:44 PDT 2009  NO_QUERY   
1002700          4  1960186445  Fri May 29 07:33:44 PDT 2009  NO_QUERY   
1002701          4  1960186607  Fri May 29 07:33:45 PDT 2009  NO_QUERY   

                    user                                               text  
1002697  Madelinedugganx           My GrandMa is making Dinenr with my Mum   
1002698     OffRoad_Dude  Mid-morning snack time... A bowl of cheese noo...  
1002699         Falchion  @ShaDeLa same here  say it like from the Termi...  
1002700   jonasobsessedx             @DestinyHope92 im great thaanks  wbuu?  
1002701        sugababez               cant wait til her date this weekend   


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

X = df["text"]
y = df["sentiment"]

# `df` is a lazy Dask frame, so every pass over `X` re-runs the CSV read.
# Materialize the text column once and reuse it for both tokenizer passes
# (vocabulary fitting and sequence conversion) instead of reading the file twice.
texts = list(X)

# Tokenize the text data
max_words = 10000  # Vocabulary cap passed to the tokenizer; adjust as needed
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
X_seq = tokenizer.texts_to_sequences(texts)

In [4]:
# Encode the target labels as consecutive integers starting at 0
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# The models in this notebook end in a single sigmoid unit trained with
# binary_crossentropy, which is only meaningful for 0/1 targets. Fail early
# if the data contains anything other than exactly two sentiment classes.
if len(label_encoder.classes_) != 2:
    raise ValueError(
        f"Expected exactly 2 sentiment classes, found {list(label_encoder.classes_)}"
    )

In [5]:
# Bring every token sequence to the same length so they stack into one array.
# Padding and truncation both happen at the front of the sequence (the Keras
# defaults, spelled out here for clarity).
max_sequence_length = 100  # Adjust this value as needed
X_pad = pad_sequences(
    X_seq,
    maxlen=max_sequence_length,
    padding="pre",
    truncating="pre",
)

In [6]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_pad,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)


In [9]:
# Define the CNN model: embedding -> 1D convolution -> global max pooling ->
# dense layer with dropout -> single sigmoid output for binary sentiment.
embedding_dim = 100  # Size of each word vector; adjust as needed
num_filters = 128  # Number of convolution filters; adjust as needed
kernel_size = 5  # Width of the convolution window, in tokens
dropout_rate = 0.5  # Fraction of dense activations dropped during training

model = Sequential()
# input_length is tied to max_sequence_length (the padding length) rather than
# a hard-coded 100, so the model stays in sync if the padding length is tuned.
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_sequence_length))
model.add(Conv1D(filters=num_filters, kernel_size=kernel_size, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(128, activation='relu'))
model.add(Dropout(dropout_rate))
model.add(Dense(1, activation='sigmoid'))

In [10]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Fit on the training split, holding out 20% of it for per-epoch validation.
batch_size = 64
epochs = 5  # Adjust this value as needed
model.fit(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
)

# Score the trained model on the held-out test split.
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Loss: 0.42047885060310364
Test Accuracy: 0.8445854187011719


In [11]:
# A lighter CNN than the first model: 64 filters with a 3-token window,
# a 64-unit dense layer, and no dropout.
# NOTE(review): this rebinds `model`, `loss` and `accuracy` from the previous cells.
model = Sequential([
    Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_sequence_length),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Same training configuration as before.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train, keeping the per-epoch history for later inspection.
batch_size = 64
epochs = 5
history = model.fit(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
)

# Score on the held-out test split.
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Loss: 0.4017302095890045
Test Accuracy: 0.8493824005126953


In [12]:
from tensorflow.keras.layers import MaxPooling1D, Flatten

# Second architecture: one convolution followed by local max pooling, then the
# pooled feature map is flattened and fed to the dense layers.
model_simple = Sequential([
    Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_sequence_length),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model_simple.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Reuses the batch_size / epochs values set in an earlier cell.
history_simple = model_simple.fit(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
)

# Score on the held-out test split.
loss_simple, accuracy_simple = model_simple.evaluate(X_test, y_test)
print("Test Loss (Simple Model):", loss_simple)
print("Test Accuracy (Simple Model):", accuracy_simple)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Loss (Simple Model): 0.41225582361221313
Test Accuracy (Simple Model): 0.8464603424072266


In [13]:
# Third architecture: three stacked convolutions (64 -> 128 -> 256 filters)
# with max pooling between them, reduced by global max pooling before the
# dense layers.
model_complex = Sequential([
    Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_sequence_length),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=128, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=256, kernel_size=3, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model_complex.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Reuses the batch_size / epochs values set in an earlier cell.
history_complex = model_complex.fit(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
)

# Score on the held-out test split.
loss_complex, accuracy_complex = model_complex.evaluate(X_test, y_test)
print("Test Loss (Complex Model):", loss_complex)
print("Test Accuracy (Complex Model):", accuracy_complex)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Loss (Complex Model): 0.44182559847831726
Test Accuracy (Complex Model): 0.8141726851463318
