In [3]:
import pandas as pd

# Load the dataset
file_path = 'test.csv'
data = pd.read_csv(file_path)

# Display basic information about the dataset
data.info()
data.head()
data.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3534 entries, 0 to 3533
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     3534 non-null   object
 1   sentiment  3534 non-null   object
dtypes: object(2)
memory usage: 55.3+ KB


Unnamed: 0,review,sentiment
count,3534,3534
unique,3534,3
top,http://twitpic.com/4woj2 - omgssh ang cute n...,neutral
freq,1,1430


In [4]:
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Text preprocessing function
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    # Remove special characters and numbers
    text = re.sub(r"[^a-z\s]", '', text)
    # Remove extra spaces
    text = re.sub(r"\s+", ' ', text).strip()
    return text

# Apply preprocessing to the 'review' column
data['cleaned_review'] = data['review'].apply(preprocess_text)

# Encode sentiment labels
label_encoder = LabelEncoder()
data['sentiment_encoded'] = label_encoder.fit_transform(data['sentiment'])

# Split the data into training and testing sets
X = data['cleaned_review']
y = data['sentiment_encoded']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Display a few preprocessed examples
data[['review', 'cleaned_review', 'sentiment', 'sentiment_encoded']].head()


Unnamed: 0,review,cleaned_review,sentiment,sentiment_encoded
0,Last session of the day http://twitpic.com/67ezh,last session of the day,neutral,1
1,Shanghai is also really exciting (precisely -...,shanghai is also really exciting precisely sky...,positive,2
2,"Recession hit Veronique Branquinho, she has to...",recession hit veronique branquinho she has to ...,negative,0
3,happy bday!,happy bday,positive,2
4,http://twitpic.com/4w75p - I like it!!,i like it,positive,2


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Vectorize the text using TF-IDF
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Dictionary to store model results
model_results = {}

# Logistic Regression
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train_tfidf, y_train)
log_reg_preds = log_reg.predict(X_test_tfidf)
model_results['Logistic Regression'] = accuracy_score(y_test, log_reg_preds)

# Support Vector Machine (SVM)
svm = SVC(kernel='linear', random_state=42)
svm.fit(X_train_tfidf, y_train)
svm_preds = svm.predict(X_test_tfidf)
model_results['SVM'] = accuracy_score(y_test, svm_preds)

# Random Forest Classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_tfidf, y_train)
rf_preds = rf.predict(X_test_tfidf)
model_results['Random Forest'] = accuracy_score(y_test, rf_preds)

# Display results
model_results, classification_report(y_test, rf_preds, target_names=label_encoder.classes_)


({'Logistic Regression': 0.6478076379066479,
  'SVM': 0.6492220650636492,
  'Random Forest': 0.6534653465346535},
 '              precision    recall  f1-score   support\n\n    negative       0.75      0.45      0.56       200\n     neutral       0.56      0.81      0.66       286\n    positive       0.81      0.63      0.71       221\n\n    accuracy                           0.65       707\n   macro avg       0.71      0.63      0.65       707\nweighted avg       0.69      0.65      0.65       707\n')