In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
import joblib

In [3]:
# 1. Load Data
train_df = pd.read_csv(r"C:\Users\HP\Downloads\xy_train.csv")
test_df = pd.read_csv(r"C:\Users\HP\Downloads\x_test.csv")

In [5]:
print("Train columns:", train_df.columns.tolist())
print("Test columns:", test_df.columns.tolist())


Train columns: ['ID', 'text', 'label']
Test columns: ['ID', 'text']


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import pandas as pd
import re

# Load the datasets
train_df = pd.read_csv(r"C:\Users\HP\Downloads\xy_train.csv")
test_df = pd.read_csv(r"C:\Users\HP\Downloads\x_test.csv")

# Check columns to ensure correct text column name
print("Train columns:", train_df.columns.tolist())
print("Test columns:", test_df.columns.tolist())

# Preprocessing function
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def clean_text(text):
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # remove non-alphabetic
    text = text.lower().split()
    tokens = [stemmer.stem(word) for word in text if word not in stop_words]
    return ' '.join(tokens)

# Replace 'text' with actual column name if needed
train_df['clean_text'] = train_df['text'].astype(str).apply(clean_text)
test_df['clean_text'] = test_df['text'].astype(str).apply(clean_text)

# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=5000)
X_train_vec = tfidf.fit_transform(train_df['clean_text'])
X_test_vec = tfidf.transform(test_df['clean_text'])

# Labels
y_train = train_df['label']  # Replace with correct column name if different

# Train classifier
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train_vec, y_train)

# Predict
predictions = model.predict(X_test_vec)
print(predictions)


Train columns: ['ID', 'text', 'label']
Test columns: ['ID', 'text']
[1 0 1 ... 0 0 0]


In [11]:
# Check column names
print("Train columns:", train_df.columns.tolist())
print("Test columns:", test_df.columns.tolist())


Train columns: ['ID', 'text', 'label', 'clean_text']
Test columns: ['ID', 'text', 'clean_text']


In [13]:
train_df['clean_text'] = train_df['text'].astype(str).apply(clean_text)
test_df['clean_text'] = test_df['text'].astype(str).apply(clean_text)


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# TF-IDF
tfidf = TfidfVectorizer(max_features=5000)
X_train_vec = tfidf.fit_transform(train_df['clean_text'])
X_test_vec = tfidf.transform(test_df['clean_text'])

# Labels
y_train = train_df['label']

# Model
model = LogisticRegression()
model.fit(X_train_vec, y_train)

# Predictions
y_pred = model.predict(X_test_vec)

# Output sample (since test_df has no labels)
print("Predictions on test data:", y_pred[:10])


Predictions on test data: [1 0 1 1 0 1 0 1 0 0]


In [17]:
# 2. Preprocessing
def clean_text(text):
    import re
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    tokens = text.split()
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)



In [19]:
train_df['clean_text'] = train_df['text'].astype(str).apply(clean_text)
test_df['clean_text'] = test_df['text'].astype(str).apply(clean_text)

In [21]:
# 3. Vectorization
tfidf = TfidfVectorizer(ngram_range=(1,2), max_features=5000)
X_train_vec = tfidf.fit_transform(train_df['clean_text'])
X_test_vec = tfidf.transform(test_df['clean_text'])

In [23]:
# 4. Model Training
y_train = train_df['label']
model = LogisticRegression(C=1.0, max_iter=1000)
model.fit(X_train_vec, y_train)

In [25]:
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

# Get predicted probabilities for all classes
y_pred_proba = model.predict_proba(X_train_vec)

# Compute multiclass ROC-AUC score
print("ROC-AUC Score:", roc_auc_score(y_train, y_pred_proba, multi_class='ovr'))  # or 'ovo'

# Classification metrics
print("Classification Report:\n", classification_report(y_train, model.predict(X_train_vec)))
print("Confusion Matrix:\n", confusion_matrix(y_train, model.predict(X_train_vec)))


ROC-AUC Score: 0.9360992631433159
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.84      0.85     25798
           1       0.82      0.83      0.82     22015
           2       0.88      0.04      0.07       187

    accuracy                           0.83     48000
   macro avg       0.85      0.57      0.58     48000
weighted avg       0.83      0.83      0.83     48000

Confusion Matrix:
 [[21729  4068     1]
 [ 3696 18319     0]
 [  134    46     7]]


In [27]:
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

# Get probability matrix (n_samples x n_classes)
y_pred_proba = model.predict_proba(X_train_vec)

# Compute multiclass ROC-AUC score using One-vs-Rest strategy
print("ROC-AUC Score:", roc_auc_score(y_train, y_pred_proba, multi_class='ovr'))

# Print classification metrics
print("Classification Report:\n", classification_report(y_train, model.predict(X_train_vec)))
print("Confusion Matrix:\n", confusion_matrix(y_train, model.predict(X_train_vec)))


ROC-AUC Score: 0.9360992631433159
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.84      0.85     25798
           1       0.82      0.83      0.82     22015
           2       0.88      0.04      0.07       187

    accuracy                           0.83     48000
   macro avg       0.85      0.57      0.58     48000
weighted avg       0.83      0.83      0.83     48000

Confusion Matrix:
 [[21729  4068     1]
 [ 3696 18319     0]
 [  134    46     7]]


In [29]:
print(y_train.value_counts())



label
0    25798
1    22015
2      187
Name: count, dtype: int64


In [31]:
print("ROC-AUC Score:", roc_auc_score(y_train, model.predict_proba(X_train_vec), multi_class='ovr', average='macro'))


ROC-AUC Score: 0.9360992631433159


In [33]:
print(y_train.unique())  # Check unique values in the label column


[1 0 2]


In [35]:
y_train = (y_train == 1).astype(int)  # Convert to binary (0 vs 1)


In [37]:
# 5. Evaluation on Training Data
y_pred_proba = model.predict_proba(X_train_vec)[:, 1]
print("ROC-AUC Score:", roc_auc_score(y_train, y_pred_proba))
print("Classification Report:\n", classification_report(y_train, model.predict(X_train_vec)))
print("Confusion Matrix:\n", confusion_matrix(y_train, model.predict(X_train_vec)))

ROC-AUC Score: 0.9194044153864864
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.84      0.85     25985
           1       0.82      0.83      0.82     22015
           2       0.00      0.00      0.00         0

    accuracy                           0.84     48000
   macro avg       0.56      0.56      0.56     48000
weighted avg       0.84      0.84      0.84     48000

Confusion Matrix:
 [[21863  4114     8]
 [ 3696 18319     0]
 [    0     0     0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [39]:
# 6. Save Model & Vectorizer
joblib.dump(model, 'fake_news_model.pkl')
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']

In [41]:
# 7. Final Predictions on Test Data
test_pred_proba = model.predict_proba(X_test_vec)[:, 1]
submission = pd.DataFrame({
    'ID': test_df['ID'],
    'label': test_pred_proba
})
submission.to_csv('final_predictions.csv', index=False)
