In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Load the dataset
# NOTE: the rest of this script expects 'text' and 'target' columns in the CSV.
df = pd.read_csv("train.csv")

# Explore the dataset
print(df.head())
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() would only add a stray "None" line to the output.
df.info()
print(df['target'].value_counts())


# Data Preprocessing
def clean_text(texts):
    """Remove punctuation/special characters and lowercase each document.

    Args:
        texts: A pandas Series or any iterable of strings.

    Returns:
        A pandas Series of cleaned strings. When a Series is passed in, its
        index is preserved so the result can be assigned back to a column.
    """
    series = pd.Series(texts, dtype="object")
    # regex=True is required: since pandas 2.0, Series.str.replace treats the
    # pattern as a literal string by default, which would leave punctuation
    # in place and silently skip this cleaning step.
    return series.str.replace(r'[^\w\s]', '', regex=True).str.lower()


df['text'] = clean_text(df['text'])

# Split the dataset into training and testing sets
# 80/20 split; random_state is fixed so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['target'], test_size=0.2, random_state=42)

# Feature Engineering
# Convert text data to TF-IDF vectors
# The vocabulary is fitted on the training split only (capped at 5000 terms);
# the test split is merely transformed, so no test information leaks into it.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Model Building
# Train a Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)

# Predictions
y_pred = nb_classifier.predict(X_test_tfidf)

# Model Evaluation
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Confusion Matrix:\n{confusion}")
print(f"Classification Report:\n{classification_rep}")

# You can further fine-tune your model and try other algorithms for better results.

# For predictions on a new tweet:
new_tweet = ["A wildfire has broken out in the forest near the town."]
# Apply the same cleaning that was applied to the training text, so the new
# tweet is tokenized consistently with the fitted vocabulary.
new_tweet_tfidf = tfidf_vectorizer.transform(clean_text(new_tweet))
prediction = nb_classifier.predict(new_tweet_tfidf)
print(f"Prediction for new tweet: {prediction[0]}")
