In [None]:
import re
import string
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import make_pipeline
from xgboost import XGBClassifier
warnings.filterwarnings('ignore')


In [None]:
# Load the raw training data; the file is decoded as ISO-8859-1.
df = pd.read_csv(
    "train1.csv",
    encoding="ISO-8859-1",
)

In [None]:
# Column dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Missing-value count per column, before any cleaning.
df.isnull().sum()

In [None]:
# Summary statistics of the raw frame.
df.describe()

In [None]:
# Build the modelling frame without mutating the loaded frame in place: every
# step returns a new object, so nothing is ever assigned into a filtered slice
# and re-running the cell is safe.
df = (
    df
    # Drop completely empty rows
    .dropna(how='all')
    # Drop rows with missing values in required columns
    .dropna(subset=['text', 'label'])
)

# Ensure labels are either 0 or 1. This runs *before* de-duplication so that a
# row with an invalid label cannot shadow (and thereby discard) a later valid
# row that carries the same text.
df = df[df['label'].isin([0, 1])]

# Remove duplicates (the first occurrence of each text is kept)
df = df.drop_duplicates(subset='text')

# Keep only the modelling columns and convert label to integer
columns = ['text', 'label']
df = df[columns].astype({'label': int})

In [None]:
# Re-check missing values per column after the cleaning step.
df.isnull().sum()

In [None]:
# Summary statistics of the cleaned frame.
df.describe()

In [None]:
# Clean Text
# NLTK provides the English stop-word list used by clean_text; download()
# fetches the 'stopwords' corpus so that stopwords.words(...) can be read.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Compiled once when the cell runs rather than on every call.
# `http\S+` already matches https links, so no separate alternative is needed.
_URL_PATTERN = re.compile(r"http\S+|www\S+")
_NON_LETTER_PATTERN = re.compile(r'[^A-Za-z\s]')
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def clean_text(text):
    """Normalise one raw document before vectorisation.

    Steps: lower-case, strip URLs, drop every character that is neither an
    ASCII letter nor whitespace, split on whitespace, remove English stop
    words, and re-join the remaining tokens with single spaces.

    Args:
        text: The raw document. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned, space-separated token string. Missing input yields ``''``
        instead of the literal tokens ``"none"`` / ``"nan"``.
    """
    # Missing values must not turn into words the model could learn from.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    text = _URL_PATTERN.sub('', text)         # Remove URLs
    text = _NON_LETTER_PATTERN.sub('', text)  # Remove punctuation and numbers

    # str.split() already collapses runs of whitespace and ignores leading and
    # trailing blanks, so no separate whitespace-normalisation pass is needed.
    # NOTE(review): punctuation is removed before the stop-word lookup, so
    # stop words that contain an apostrophe will presumably no longer match.
    stop_words = _english_stop_words()
    return ' '.join(word for word in text.split() if word not in stop_words)

# Store the normalised text next to the original and preview the first rows.
df['clean_text'] = df['text'].map(clean_text)
df.head()


In [None]:
# Train-Test Split
# The split is done on the cleaned text *before* anything is fitted, so the
# held-out rows stay unseen by the feature extractor.
y = df['label']
text_train, text_test, y_train, y_test = train_test_split(
    df['clean_text'], y, test_size=0.2, stratify=y, random_state=42
)

# Feature Extraction using TF-IDF with n-grams
# The vocabulary, the document-frequency cut-offs and the IDF weights are
# learned from the training split only; the test split is merely transformed.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_df=0.8, min_df=5, max_features=10000)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# All rows expressed in the train-fitted feature space, for cells that need
# the complete matrix.
X = vectorizer.transform(df['clean_text'])


In [None]:
# XGBoost with Hyperparameter Tuning
# Exhaustive search over 2 x 2 x 2 = 8 candidates, each scored by F1 using
# 5-fold cross-validation on the training split.
params = dict(
    n_estimators=[100, 200],
    max_depth=[3, 5],
    learning_rate=[0.1, 0.2],
)

# NOTE(review): `use_label_encoder` looks deprecated in recent XGBoost
# releases; confirm against the installed version before removing it.
grid = GridSearchCV(
    estimator=XGBClassifier(eval_metric='logloss', use_label_encoder=False),
    param_grid=params,
    cv=5,
    scoring='f1',
    n_jobs=-1,
)
grid.fit(X_train, y_train)

# Keep the best configuration found by the search as the final model.
print("Best Parameters:", grid.best_params_)
model = grid.best_estimator_


In [None]:
# Model Evaluation
y_pred = model.predict(X_test)

# Headline metrics on the held-out split, printed in a fixed order.
for metric_name, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_name, metric_fn(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()


In [None]:
# Cross-Validation for Stability
# The vectorizer and the classifier are cross-validated together as a single
# pipeline on the cleaned text, so in every fold the TF-IDF vocabulary and IDF
# weights are learned from that fold's training part only. Scoring a matrix
# that was vectorised up front would let each validation fold influence its
# own features and make the F1 estimate optimistic.
# cross_val_score fits clones of the pipeline, so the already-fitted
# `vectorizer` and `model` objects are left untouched.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1_scores = cross_val_score(
    make_pipeline(vectorizer, model), df['clean_text'], y, cv=cv, scoring='f1'
)
print("Cross-Validation F1 Scores:", f1_scores)
print("Average F1 Score:", np.mean(f1_scores))


In [None]:
import joblib

# Both objects are written to the working directory so they can be reloaded
# together later.

# Save the trained model
joblib.dump(value=model, filename='model.pkl')

# Save the fitted vectorizer (e.g., TfidfVectorizer)
joblib.dump(value=vectorizer, filename='vectorizer.pkl')
