In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [2]:

# Load datasets
train_df = pd.read_csv(r"C:\Users\ANU\Desktop\My portfolio\KAGGLE\Sentiment analysis on Moview Review\train.tsv\train.tsv", sep='\t')
test_df = pd.read_csv(r"C:\Users\ANU\Desktop\My portfolio\KAGGLE\Sentiment analysis on Moview Review\test.tsv\test.tsv", sep='\t')

# Preprocess text
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.lower() not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

train_df['processed_phrase'] = train_df['Phrase'].apply(preprocess_text)
test_df['processed_phrase'] = test_df['Phrase'].apply(preprocess_text)

# Encode sentiment labels
label_encoder = LabelEncoder()
train_df['sentiment_label'] = label_encoder.fit_transform(train_df['Sentiment'])

# Convert numerical labels back to string labels
label_mapping = {index: label for index, label in enumerate(label_encoder.classes_)}

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_df['processed_phrase'], train_df['sentiment_label'], test_size=0.2, random_state=42)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ANU\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ANU\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ANU\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)
X_test_tfidf = tfidf_vectorizer.transform(test_df['processed_phrase'])

In [4]:
from sklearn.linear_model import LogisticRegression

# Initialize and train the model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

In [5]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on the validation set
y_val_pred = model.predict(X_val_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation Accuracy: {accuracy}')

# Classification report
target_names = [label_mapping[i] for i in range(len(label_mapping))]
print(classification_report(y_val, y_val_pred, target_names=target_names))

Validation Accuracy: 0.620242214532872


TypeError: object of type 'numpy.int64' has no len()

In [7]:
# Predict sentiment for the test set
test_predictions = model.predict(X_test_tfidf)
test_predictions_labels = label_encoder.inverse_transform(test_predictions)

# Prepare the submission
submission_df = pd.DataFrame({
    'PhraseId': test_df['PhraseId'],
    'Sentiment': test_predictions
})

# Save the submission to a CSV file
submission_df.to_csv(r"C:\Users\ANU\Desktop\My portfolio\KAGGLE\Sentiment analysis on Moview Review\submission.csv", index=False)