In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

# Ensure necessary NLTK resources are available.
# 'punkt' serves older NLTK releases; newer releases make word_tokenize load
# the 'punkt_tab' resource instead, so fetch both to avoid a LookupError at
# tokenization time.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load the dataset with the correct encoding.
# header=None: the first row of the file is treated as data, and the column
# names are supplied explicitly via `names`.
# NOTE(review): the recorded cell output shows a warning raised on the
# read_csv line - presumably a pandas DtypeWarning (mixed column types);
# confirm and, if so, consider passing dtype=... or low_memory=False.
file_path = '/content/training.csv'  # Path to your file in Colab
df = pd.read_csv(file_path, encoding='ISO-8859-1', header=None, names=['Polarity', 'ID', 'Date', 'Query', 'User', 'Text'])

# Preprocess the text

# Built once: calling stopwords.words('english') for every token returns a new
# list each time and turns each membership test into a linear scan. A
# frozenset gives constant-time lookups. Requires the 'stopwords' corpus to
# have been downloaded before this line runs.
_STOP_WORDS = frozenset(stopwords.words('english'))

# Patterns compiled once instead of being looked up on every call.
_URL_RE = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)
_MENTION_RE = re.compile(r'@\w+')
_HASHTAG_RE = re.compile(r'#\w+')
_PUNCT_RE = re.compile(r'[^\w\s]')


def preprocess_text(text):
    """Return a cleaned, space-joined token string for one piece of text.

    The text is lower-cased; URLs, @mentions, #hashtags and punctuation are
    removed; the remainder is tokenized with NLTK's ``word_tokenize`` and
    English stopwords are dropped.

    Args:
        text: The raw text. A missing value (``None``/NaN) is treated as an
            empty string and any other non-string scalar is converted with
            ``str()`` so that a stray non-text cell does not abort a whole
            ``Series.apply`` run.

    Returns:
        The remaining tokens joined by single spaces (possibly ``''``).
    """
    if not isinstance(text, str):
        text = '' if pd.isna(text) else str(text)

    text = text.lower()  # Convert to lower case
    text = _URL_RE.sub('', text)  # Remove URLs
    text = _MENTION_RE.sub('', text)  # Remove mentions
    text = _HASHTAG_RE.sub('', text)  # Remove hashtags (the whole tag, not just '#')
    text = _PUNCT_RE.sub('', text)  # Remove punctuation
    tokens = word_tokenize(text)  # Tokenize
    return ' '.join(token for token in tokens if token not in _STOP_WORDS)

df['Cleaned_Text'] = df['Text'].apply(preprocess_text)

# Prepare the input texts and the target variable.
# X holds the cleaned text (not a TF-IDF matrix): vectorization is deferred
# until after the split so that the test set cannot influence it.
X = df['Cleaned_Text']
y = df['Polarity']

# Train-test split on the raw text, before any fitting takes place
text_train, text_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature extraction.
# Fit the vocabulary and IDF weights on the training texts only; fitting on
# the full corpus would leak test-set statistics into the features and
# inflate the evaluation. The test texts are only transformed.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Fit a multinomial Naive Bayes classifier on the training features
# (fit() returns the estimator itself, so construction and fitting chain).
model = MultinomialNB().fit(X_train, y_train)

# Label the held-out test features with the fitted model
y_pred = model.predict(X_test)

# Evaluate the model.
# Derive the label set from the data instead of hard-coding three class
# names: classification_report raises ValueError when the number of
# target_names differs from the number of classes actually present
# (e.g. a dataset that contains no neutral examples).
# NOTE(review): the name mapping assumes polarity is coded 0 = negative,
# 2 = neutral, 4 = positive - confirm against the dataset. Unknown codes fall
# back to their string form.
label_names = {0: 'Negative', 2: 'Neutral', 4: 'Positive'}
labels = sorted(set(y_test) | set(y_pred))
target_names = [label_names.get(label, str(label)) for label in labels]

# Pass the same explicit label order to both reports so rows/columns line up.
print(confusion_matrix(y_test, y_pred, labels=labels))
print(classification_report(y_test, y_pred, labels=labels, target_names=target_names))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  df = pd.read_csv(file_path, encoding='ISO-8859-1', header=None, names=['Polarity', 'ID', 'Date', 'Query', 'User', 'Text'])
