In [None]:
import pandas as pd

# Path of the training CSV file.
TRAIN_CSV_PATH = "/content/train.csv"

# Read the training set into a DataFrame.
train_data = pd.read_csv(TRAIN_CSV_PATH)


In [None]:
# Preview the first rows to sanity-check the columns that were loaded.
train_data.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [None]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import re

# Fetch every NLTK resource the preprocessing step relies on.
for nltk_resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

# Shared helpers consumed by preprocess_text: the English stop-word set
# and a WordNet-based lemmatizer.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalise one piece of text into a cleaned, space-separated string.

    The input is coerced to ``str``, every non-word character is replaced
    by a space, and the result is lowercased. Tokens that are stop words
    or only one character long are discarded; the remaining tokens are
    lemmatized with the module-level ``lemmatizer``.

    Args:
        text: Value to clean (anything accepted by ``str()``).

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    # Strip punctuation/symbols first, then fold case
    normalized = re.sub(r'\W', ' ', str(text)).lower()

    kept_tokens = []
    for token in normalized.split():
        # Skip stop words and single-character leftovers
        if token in stop_words or len(token) <= 1:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))

    return ' '.join(kept_tokens)

# Handling missing data: drop rows where 'text' is NaN.
# The explicit .copy() makes data_cleaned an independent DataFrame, so the
# column assignment below no longer triggers pandas' SettingWithCopyWarning
# and train_data is guaranteed to stay untouched.
data_cleaned = train_data.dropna(subset=['text']).copy()

# Apply preprocessing to the 'text' column
data_cleaned['text_processed'] = data_cleaned['text'].apply(preprocess_text)

# Show the original text next to its cleaned version
data_cleaned[['text', 'text_processed']].head()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned['text_processed'] = data_cleaned['text'].apply(preprocess_text)


Unnamed: 0,text,text_processed
0,"I`d have responded, if I were going",responded going
1,Sooo SAD I will miss you here in San Diego!!!,sooo sad miss san diego
2,my boss is bullying me...,bos bullying
3,what interview! leave me alone,interview leave alone
4,"Sons of ****, why couldn`t they put them on t...",son put release already bought


In [None]:
from gensim.models import Word2Vec
import numpy as np

# Word2Vec consumes each document as a list of tokens, so split the
# cleaned strings on whitespace.
tokenized_text = list(map(str.split, data_cleaned['text_processed']))

# Train 100-dimensional embeddings on the corpus (context window of 5,
# min_count=2, four worker threads).
word2vec_model = Word2Vec(
    sentences=tokenized_text,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
)

# Define a function to convert text to an average Word2Vec vector
def document_vector(doc):
    """Return the mean Word2Vec embedding of a tokenised document.

    Args:
        doc: Iterable of tokens.

    Returns:
        The element-wise mean of the vectors of all tokens present in the
        trained ``word2vec_model`` vocabulary, or a zero vector of length
        ``word2vec_model.vector_size`` when no token is in the vocabulary.
    """
    keyed_vectors = word2vec_model.wv

    # Keep only the tokens the model has an embedding for
    known_tokens = [token for token in doc if token in keyed_vectors.key_to_index]

    if known_tokens:
        return np.mean(keyed_vectors[known_tokens], axis=0)

    # Nothing usable in this document: fall back to an all-zero vector
    return np.zeros(word2vec_model.vector_size)

# Build the feature matrix: one averaged embedding per tokenised document
word2vec_features = np.array(list(map(document_vector, tokenized_text)))

# Display the matrix dimensions as a sanity check
word2vec_features.shape


(27480, 100)

In [None]:
from sklearn.model_selection import train_test_split

# Inputs are the averaged word embeddings; the target is the 'sentiment' column
X, y = word2vec_features, data_cleaned['sentiment']

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Report the shape of every partition to confirm the split
tuple(part.shape for part in (X_train, X_val, y_train, y_val))


((21984, 100), (5496, 100), (21984,), (5496,))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, classification_report

# Target labels, row-aligned with the processed text
y = data_cleaned['sentiment']

# Split the processed text BEFORE vectorising so the TF-IDF vocabulary and
# IDF weights are learned from the training rows only. Fitting on all rows
# would leak validation data into the features and inflate the scores.
text_train, text_val, y_train, y_val = train_test_split(
    data_cleaned['text_processed'], y, test_size=0.2, random_state=42)

# TF-IDF over unigrams and bigrams, capped at 1000 features
tfidf_vectorizer_bigram = TfidfVectorizer(ngram_range=(1, 2), max_features=1000)
X_train_bigram = tfidf_vectorizer_bigram.fit_transform(text_train)
X_val_bigram = tfidf_vectorizer_bigram.transform(text_val)

# Features for every row, produced by the training-fitted vectorizer; kept
# under the original name because other cells read X_bigram.
X_bigram = tfidf_vectorizer_bigram.transform(data_cleaned['text_processed'])

# Train a logistic regression model (max_iter raised so the solver converges)
log_reg_model = LogisticRegression(max_iter=1000)
log_reg_model.fit(X_train_bigram, y_train)

# Evaluate the model on the validation set
y_val_pred = log_reg_model.predict(X_val_bigram)
accuracy_val = accuracy_score(y_val, y_val_pred)
classification_report_val = classification_report(y_val, y_val_pred)

# print() renders the report's line breaks; echoing the tuple would show a
# single escaped string full of '\n'.
print(f"Validation accuracy: {accuracy_val:.4f}")
print(classification_report_val)


(0.6766739446870451,
 '              precision    recall  f1-score   support\n\n    negative       0.70      0.55      0.62      1572\n     neutral       0.62      0.74      0.67      2236\n    positive       0.77      0.71      0.74      1688\n\n    accuracy                           0.68      5496\n   macro avg       0.69      0.67      0.67      5496\nweighted avg       0.68      0.68      0.68      5496\n')

In [None]:
from sklearn.model_selection import GridSearchCV

import pandas as pd

# Hyper-parameter combinations to evaluate: regularisation strength,
# penalty type, and the solver used to fit the model.
param_grid = dict(
    C=[0.01, 0.1, 1, 10],
    penalty=['l1', 'l2'],
    solver=['liblinear', 'saga'],
)

# 5-fold cross-validated search, scored by accuracy
grid_search = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(X_train_bigram, y_train)

# Score the best estimator found on the held-out validation set
best_model = grid_search.best_estimator_
y_val_pred_best = best_model.predict(X_val_bigram)
classification_report_val_best = classification_report(
    y_val, y_val_pred_best, output_dict=True
)

# One row per class/average, one column per metric, for readable display
classification_report_df = pd.DataFrame(classification_report_val_best).T
classification_report_df


Unnamed: 0,precision,recall,f1-score,support
negative,0.719771,0.560433,0.630186,1572.0
neutral,0.634724,0.755367,0.68981,2236.0
positive,0.772812,0.737559,0.754774,1688.0
accuracy,0.694141,0.694141,0.694141,0.694141
macro avg,0.709102,0.684453,0.69159,5496.0
weighted avg,0.701461,0.694141,0.692709,5496.0


In [None]:
from sklearn.preprocessing import LabelEncoder

# Map the string sentiment labels to integer class ids
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(data_cleaned['sentiment'])

# Redo the 80/20 split (same seed) so the targets are the integer labels
(X_train_bigram, X_val_bigram,
 y_train_encoded, y_val_encoded) = train_test_split(
    X_bigram, y_encoded, random_state=42, test_size=0.2)

# Display each class name alongside the integer it was encoded to
encoded_classes = label_encoder.classes_
encoded_classes, label_encoder.transform(encoded_classes)


(array(['negative', 'neutral', 'positive'], dtype=object), array([0, 1, 2]))

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

def create_model(input_dim, num_classes=3, dropout_rate=0.5):
    """Build an uncompiled feed-forward classifier.

    The network stacks three ReLU layers (128, 64 and 32 units), each
    followed by dropout, and ends in a softmax output layer.

    Args:
        input_dim: Number of input features per sample.
        num_classes: Number of softmax output units. Defaults to 3, the
            value that was previously hard-coded.
        dropout_rate: Rate used by every Dropout layer. Defaults to 0.5,
            the value that was previously hard-coded.

    Returns:
        An uncompiled ``Sequential`` model.
    """
    model = Sequential([
        # Declare the input shape with an explicit Input object rather than
        # the legacy `input_dim=` argument on the first Dense layer.
        tf.keras.Input(shape=(input_dim,)),
        Dense(128, activation='relu'),
        Dropout(dropout_rate),
        Dense(64, activation='relu'),
        Dropout(dropout_rate),
        Dense(32, activation='relu'),
        Dropout(dropout_rate),
        Dense(num_classes, activation='softmax'),  # one unit per class
    ])
    return model


In [None]:
def compile_model(model):
    """Compile ``model`` for multi-class training and hand it back.

    Args:
        model: Object exposing a Keras-style ``compile`` method.

    Returns:
        The same ``model`` object, compiled with the Adam optimizer,
        categorical cross-entropy loss and accuracy as the tracked metric.
    """
    compile_settings = {
        'optimizer': 'adam',
        'loss': 'categorical_crossentropy',
        'metrics': ['accuracy'],
    }
    model.compile(**compile_settings)
    return model


In [None]:
from sklearn.model_selection import KFold
from tensorflow.keras.utils import to_categorical

kfold = KFold(n_splits=5, shuffle=True, random_state=42)
X_dense = X_bigram.toarray()  # Convert TF-IDF features to a dense array

# One-hot encode the integer labels: the model is compiled with
# categorical_crossentropy, which expects one column per class.
# Defining it here lets the cell run on a fresh kernel.
y_one_hot = to_categorical(y_encoded)

# enumerate() supplies the 1-based fold number, so no counter has to be
# initialised or incremented by hand.
for fold_no, (train, test) in enumerate(kfold.split(X_dense, y_one_hot), start=1):
    # Fresh, untrained model for every fold
    model = create_model(input_dim=X_dense.shape[1])
    model = compile_model(model)

    print('Training fold', fold_no)
    history = model.fit(X_dense[train], y_one_hot[train],
                        epochs=50,
                        validation_data=(X_dense[test], y_one_hot[test]),
                        verbose=0)

    # Final score on this fold's held-out rows
    scores = model.evaluate(X_dense[test], y_one_hot[test], verbose=0)
    print(f'Score for fold {fold_no}: {model.metrics_names[1]} of {scores[1]*100}%')


Training fold 1
Score for fold 1: accuracy of 66.3755476474762%
Training fold 2
Score for fold 2: accuracy of 66.59388542175293%
Training fold 3
Score for fold 3: accuracy of 66.64847135543823%
Training fold 4
Score for fold 4: accuracy of 66.81222915649414%
Training fold 5
Score for fold 5: accuracy of 66.08442664146423%


In [None]:
# Print the dimensions of the dense feature matrix and the one-hot labels
for caption, array in (("Feature shape:", X_dense), ("Label shape:", y_one_hot)):
    print(caption, array.shape)


Feature shape: (27480, 1000)
Label shape: (27480, 3)


In [None]:
import matplotlib.pyplot as plt

# Per-fold accuracy curves (one list of per-epoch values per fold)
val_accuracies = []
train_accuracies = []

# Uses kfold, X_dense and y_one_hot from the cross-validation setup.
# enumerate() restarts fold numbering at 1 for this run; a manually
# incremented counter would carry over whatever value was left in the kernel.
for fold_no, (train, test) in enumerate(kfold.split(X_dense, y_one_hot), start=1):
    # Fresh, untrained model for every fold
    model = create_model(input_dim=X_dense.shape[1])
    model = compile_model(model)

    print('Training fold', fold_no)
    history = model.fit(X_dense[train], y_one_hot[train],
                        epochs=50,
                        validation_data=(X_dense[test], y_one_hot[test]),
                        verbose=0)

    # Keep the per-epoch curves so they can be averaged across folds
    train_accuracies.append(history.history['accuracy'])
    val_accuracies.append(history.history['val_accuracy'])

    scores = model.evaluate(X_dense[test], y_one_hot[test], verbose=0)
    print(f'Score for fold {fold_no}: {model.metrics_names[1]} of {scores[1]*100}%')


In [None]:
# Average the per-epoch accuracy across folds (axis 0 indexes the folds)
average_train_accuracy = np.mean(train_accuracies, axis=0)
average_val_accuracy = np.mean(val_accuracies, axis=0)

# Derive the x-axis from the recorded history instead of hard-coding the
# epoch count, so the plot stays valid when the training epochs change.
epochs = range(1, len(average_train_accuracy) + 1)

# Plot both averaged curves on one set of axes
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(epochs, average_train_accuracy, 'bo-', label='Average Training Accuracy')
ax.plot(epochs, average_val_accuracy, 'ro-', label='Average Validation Accuracy')
ax.set_title('Training and Validation Accuracy over Epochs')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()
