In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    # Join each bare file name onto the directory that contains it before printing.
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


# Load the training and test tables; both paths are relative to the working directory.
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

# Replace missing review text with an empty string in both frames, so that every
# entry of 'reviewText' is a string before it is vectorized further down.
for frame in (train_data, test_data):
    frame['reviewText'] = frame['reviewText'].fillna('')

# Train/validation split: hold out 20% of the rows, seeded for repeatability.
review_texts = train_data['reviewText']
sentiment_labels = train_data['sentiment']
X_train, X_val, y_train, y_val = train_test_split(
    review_texts,
    sentiment_labels,
    test_size=0.2,
    random_state=42,
)

# Feature extraction: TF-IDF over unigrams and bigrams, capped at 10,000
# features, with sublinear term-frequency scaling.
vectorizer = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 2),
    sublinear_tf=True,
    use_idf=True,
)
# Fit on the training split only; the validation split is transformed with the
# already-fitted vectorizer.
X_train_vectorized = vectorizer.fit_transform(X_train)
X_val_vectorized = vectorizer.transform(X_val)

# Model Training (Random Forest Classifier)
# n_jobs=-1 lets scikit-learn build (and later query) the 100 trees on all
# available CPU cores instead of one. The per-tree seeds are still derived from
# random_state, so the fitted forest is the same as in the single-threaded run.
classifier = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
classifier.fit(X_train_vectorized, y_train)

# Model Evaluation: accuracy of the predictions on the held-out validation split
y_pred_val = classifier.predict(X_val_vectorized)
accuracy_val = accuracy_score(y_val, y_pred_val)
print("Validation Accuracy (Random Forest):", accuracy_val)

# Predict on the test set, reusing the vectorizer and classifier fitted above
# (the vectorizer is only applied here, not refitted).
X_test_vectorized = vectorizer.transform(test_data['reviewText'])
y_pred_test = classifier.predict(X_test_vectorized)

# Assemble the submission table and write it without the DataFrame index.
# NOTE(review): the 'id' column is filled from test_data['movieid'] -- confirm
# this is the identifier the submission format expects.
submission_columns = {'id': test_data['movieid'], 'sentiment': y_pred_test}
submission_df = pd.DataFrame(submission_columns)
submission_df.to_csv("submission.csv", index=False)


Validation Accuracy (Random Forest): 0.7525190464487589
