In [12]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import plotly.express as px 
import nltk
import string
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from openai import OpenAI
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [4]:
import os

# Root folder of the BBC news corpus: one sub-directory per category, one text
# file per article. Set the BBC_DATA_DIR environment variable to point at a
# different copy; the original local path is kept as the fallback.
base_path = os.environ.get(
    "BBC_DATA_DIR", r"C:\Users\Ayman\Downloads\archive (90)\bbc"
)

# (category, article text) pairs.
data = []

# os.listdir() returns entries in arbitrary order, so both levels are sorted to
# keep the row order (and therefore any later seeded split) reproducible.
for category in sorted(os.listdir(base_path)):
    category_path = os.path.join(base_path, category)
    if not os.path.isdir(category_path):
        continue  # skip loose files sitting next to the category folders
    for file_name in sorted(os.listdir(category_path)):
        file_path = os.path.join(category_path, file_name)
        if not os.path.isfile(file_path):
            continue  # a nested directory would make open() raise
        # latin1 maps every byte to a character, so decoding cannot fail.
        with open(file_path, 'r', encoding='latin1') as f:
            content = f.read()
        data.append((category, content))

In [5]:
# Build the article table with its column names in one step. Naming the
# columns in the constructor also yields a valid empty frame when no files
# were loaded, where assigning `df.columns` afterwards raises a
# length-mismatch error.
df = pd.DataFrame(data, columns=['category', 'Text'])

In [6]:
# Put the article text first and its label second, then display the frame.
df = df.loc[:, ['Text', 'category']]
df

Unnamed: 0,Text,category
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business
...,...,...
2220,BT program to beat dialler scams\n\nBT is intr...,tech
2221,Spam e-mails tempt net shoppers\n\nComputer us...,tech
2222,Be careful how you code\n\nA new European dire...,tech
2223,US cyber security chief resigns\n\nThe man mak...,tech


In [19]:
# Export only the raw text and its label. `index=False` keeps the positional
# row index out of the file (otherwise it comes back as an "Unnamed: 0" column
# when the CSV is read again), and the explicit column list keeps the export
# the same even if this cell is re-run after derived columns were added to df.
df.to_csv('news_classification.csv', columns=['Text', 'category'], index=False)

In [7]:
# Shared resources used by clean_text(): a WordNet lemmatizer instance and the
# English stop-word list held as a set for membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def clean_text(text):
    """Normalise one article into a space-separated string of lemmas.

    Steps, in order: lower-case the text, delete every character listed in
    ``string.punctuation``, tokenize with ``nltk.word_tokenize``, drop tokens
    found in the module-level ``stop_words`` set, and lemmatize the remaining
    tokens with the module-level ``lemmatizer``.

    Args:
        text: The raw article text.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    # NOTE(review): punctuation is removed before the stop-word check, so any
    # entry of `stop_words` that itself contains punctuation (e.g. an
    # apostrophe) can no longer match a token — confirm this is intended.
    without_punctuation = text.lower().translate(
        str.maketrans("", "", string.punctuation)
    )
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(without_punctuation)
        if token not in stop_words
    ]
    return " ".join(lemmas)

# Add the normalised version of every article as a new column.
df["clean_text"] = df["Text"].map(clean_text)


In [22]:
df

Unnamed: 0,Text,category,clean_text
0,Ad sales boost Time Warner profit\n\nQuarterly...,business,ad sale boost time warner profit quarterly pro...
1,Dollar gains on Greenspan speech\n\nThe dollar...,business,dollar gain greenspan speech dollar hit highes...
2,Yukos unit buyer faces loan claim\n\nThe owner...,business,yukos unit buyer face loan claim owner embattl...
3,High fuel prices hit BA's profits\n\nBritish A...,business,high fuel price hit ba profit british airway b...
4,Pernod takeover talk lifts Domecq\n\nShares in...,business,pernod takeover talk lift domecq share uk drin...
...,...,...,...
2220,BT program to beat dialler scams\n\nBT is intr...,tech,bt program beat dialler scam bt introducing tw...
2221,Spam e-mails tempt net shoppers\n\nComputer us...,tech,spam email tempt net shopper computer user acr...
2222,Be careful how you code\n\nA new European dire...,tech,careful code new european directive could put ...
2223,US cyber security chief resigns\n\nThe man mak...,tech,u cyber security chief resigns man making sure...


In [8]:
# Learn a TF-IDF vocabulary from the articles and encode each one as a row.
vectorizer = TfidfVectorizer()

# NOTE(review): the features come from the raw "Text" column, not from the
# "clean_text" column built earlier, and the vectorizer is fitted on every row
# before any train/test split is made — confirm both are intentional.
tfidf_matrix = vectorizer.fit_transform(df["Text"])

# Keep one dense vector per article in the frame (one array per row).
df["embedding"] = [row_vector for row_vector in tfidf_matrix.toarray()]

In [9]:
df.head()

Unnamed: 0,Text,category,clean_text,embedding
0,Ad sales boost Time Warner profit\n\nQuarterly...,business,ad sale boost time warner profit quarterly pro...,"[0.0, 0.020867638057080935, 0.0, 0.0, 0.0, 0.0..."
1,Dollar gains on Greenspan speech\n\nThe dollar...,business,dollar gain greenspan speech dollar hit highes...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,Yukos unit buyer faces loan claim\n\nThe owner...,business,yukos unit buyer face loan claim owner embattl...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,High fuel prices hit BA's profits\n\nBritish A...,business,high fuel price hit ba profit british airway b...,"[0.0, 0.018988980429721043, 0.0, 0.0, 0.0, 0.0..."
4,Pernod takeover talk lifts Domecq\n\nShares in...,business,pernod takeover talk lift domecq share uk drin...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [None]:


def log_model(df, test_size, random_state=1, max_iter=1000):
    """Train and evaluate a logistic-regression classifier on stored vectors.

    Args:
        df: Frame with an ``embedding`` column (expected to hold one numeric
            vector per row) and a ``category`` column holding the labels.
        test_size: Passed to ``train_test_split`` as the held-out share.
        random_state: Seed for the shuffled split. Defaults to 1, the value
            that was previously hard-coded.
        max_iter: Iteration cap given to ``LogisticRegression``. Defaults to
            1000, the value that was previously hard-coded.

    Returns:
        The fitted ``LogisticRegression`` model. Accuracy, the classification
        report and the confusion matrix for the held-out rows are printed.
    """
    # Stack the per-row vectors into a single feature array.
    X = np.array(df['embedding'].to_list())
    y = df['category']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, shuffle=True, random_state=random_state, test_size=test_size
    )

    model = LogisticRegression(max_iter=max_iter)
    model.fit(X_train, y_train)

    # Evaluate on the held-out rows only.
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {acc:.2f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    return model

In [34]:
from sklearn.naive_bayes import MultinomialNB

def nb_model(df, test_size, random_state=1, alpha=1.0):
    """Train and evaluate a multinomial naive Bayes classifier on stored vectors.

    Args:
        df: Frame with an ``embedding`` column (expected to hold one numeric
            vector per row) and a ``category`` column holding the labels.
        test_size: Passed to ``train_test_split`` as the held-out share.
        random_state: Seed for the shuffled split. Defaults to 1, the value
            that was previously hard-coded.
        alpha: Smoothing parameter given to ``MultinomialNB``. Defaults to
            1.0, which is the library default the original call relied on.

    Returns:
        The fitted ``MultinomialNB`` model. Accuracy, the classification
        report and the confusion matrix for the held-out rows are printed.
    """
    # Stack the per-row vectors into a single feature array.
    X = np.array(df['embedding'].to_list())
    y = df['category']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, shuffle=True, random_state=random_state, test_size=test_size
    )
    model = MultinomialNB(alpha=alpha)
    model.fit(X_train, y_train)

    # Evaluate on the held-out rows only.
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {acc:.2f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    return model

In [33]:
# Fit the logistic-regression baseline, holding out 20% of the rows.
model_Log = log_model(df, test_size=0.2)

Accuracy: 0.97

Classification Report:
               precision    recall  f1-score   support

     business       0.93      0.97      0.95        86
entertainment       0.99      0.96      0.98        84
     politics       0.98      0.97      0.97        92
        sport       0.99      0.99      0.99        99
         tech       0.98      0.98      0.98        84

     accuracy                           0.97       445
    macro avg       0.97      0.97      0.97       445
 weighted avg       0.97      0.97      0.97       445


Confusion Matrix:
[[83  0  1  1  1]
 [ 2 81  1  0  0]
 [ 2  0 89  0  1]
 [ 1  0  0 98  0]
 [ 1  1  0  0 82]]


In [35]:
# Fit the naive Bayes baseline, holding out 20% of the rows.
model_nb = nb_model(df, test_size=0.2)

Accuracy: 0.93

Classification Report:
               precision    recall  f1-score   support

     business       0.87      0.97      0.92        86
entertainment       1.00      0.76      0.86        84
     politics       0.94      0.97      0.95        92
        sport       0.93      0.99      0.96        99
         tech       0.95      0.98      0.96        84

     accuracy                           0.93       445
    macro avg       0.94      0.93      0.93       445
 weighted avg       0.94      0.93      0.93       445


Confusion Matrix:
[[83  0  1  1  1]
 [ 9 64  5  4  2]
 [ 1  0 89  1  1]
 [ 1  0  0 98  0]
 [ 1  0  0  1 82]]


In [36]:
import pickle
# Logistic regression scored higher than naive Bayes above, so that is the
# model written to disk.
model_path = "model_Log.pkl"
with open(model_path, "wb") as model_file:
    pickle.dump(model_Log, model_file)

print("✅ Model saved as model_Log.pkl")


✅ Model saved as model_Log.pkl


In [37]:
import pickle
# Write the fitted TF-IDF vectorizer to disk next to the saved classifier.
with open("vec.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

# The confirmation names the file that was actually written (vec.pkl); it
# previously repeated the classifier's file name.
print("✅ vectorizer saved as vec.pkl")


✅ vectorizer saved as model_Log.pkl


In [10]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

# Load the pretrained DistilBERT tokenizer and a sequence-classification model
# sized to the number of distinct news categories.
checkpoint_name = "distilbert-base-uncased"
category_count = len(df['category'].unique())

tokenizer = DistilBertTokenizerFast.from_pretrained(checkpoint_name)
model = DistilBertForSequenceClassification.from_pretrained(
    checkpoint_name,
    num_labels=category_count,
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Hold out 20% of the articles for validation.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['Text'],  # the text column goes here
    df['category'],
    random_state=42,
    test_size=0.2,
)

# Tokenize both splits with the same options: truncate to at most 128 tokens
# and pad the sequences.
encode_options = {"truncation": True, "padding": True, "max_length": 128}
train_encodings = tokenizer(train_texts.tolist(), **encode_options)
val_encodings = tokenizer(val_texts.tolist(), **encode_options)


In [14]:
from sklearn.preprocessing import LabelEncoder
import torch

# Map category names to integer ids. The encoder is fitted on the training
# labels only and then reused for the validation labels.
# NOTE: train_labels / val_labels are overwritten with tensors here, so this
# cell is meant to run once, right after the split.
label_encoder = LabelEncoder()
label_encoder.fit(train_labels)
train_labels = torch.tensor(label_encoder.transform(train_labels))
val_labels = torch.tensor(label_encoder.transform(val_labels))


In [15]:
class TextDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with one label per example."""

    def __init__(self, encodings, labels):
        """Store the encodings mapping and the indexable labels as given."""
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors plus its ``labels`` entry."""
        sample = {}
        for name, values in self.encodings.items():
            # Each encoding field is indexed per example and wrapped as a tensor.
            sample[name] = torch.tensor(values[idx])
        # The label is taken from self.labels unchanged and stored last.
        sample["labels"] = self.labels[idx]
        return sample

# Pair each split's token encodings with its integer labels.
val_dataset = TextDataset(val_encodings, val_labels)
train_dataset = TextDataset(train_encodings, train_labels)


In [16]:
from transformers import Trainer, TrainingArguments

# Fine-tuning configuration: three epochs, batches of 16 for both training and
# evaluation, and an evaluation pass at the end of every epoch.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
)

# The Trainer gets the model, both datasets and the tokenizer.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Last expression of the cell, so the returned training summary is displayed.
trainer.train()




  0%|          | 0/336 [00:00<?, ?it/s]

  0%|          | 0/28 [00:00<?, ?it/s]

{'eval_loss': 0.14901427924633026, 'eval_runtime': 28.1293, 'eval_samples_per_second': 15.82, 'eval_steps_per_second': 0.995, 'epoch': 1.0}


  0%|          | 0/28 [00:00<?, ?it/s]

{'eval_loss': 0.0944007933139801, 'eval_runtime': 30.464, 'eval_samples_per_second': 14.607, 'eval_steps_per_second': 0.919, 'epoch': 2.0}


  0%|          | 0/28 [00:00<?, ?it/s]

{'eval_loss': 0.07018481194972992, 'eval_runtime': 30.4005, 'eval_samples_per_second': 14.638, 'eval_steps_per_second': 0.921, 'epoch': 3.0}
{'train_runtime': 1533.7796, 'train_samples_per_second': 3.482, 'train_steps_per_second': 0.219, 'train_loss': 0.27876204536074684, 'epoch': 3.0}


TrainOutput(global_step=336, training_loss=0.27876204536074684, metrics={'train_runtime': 1533.7796, 'train_samples_per_second': 3.482, 'train_steps_per_second': 0.219, 'total_flos': 176853438489600.0, 'train_loss': 0.27876204536074684, 'epoch': 3.0})

In [17]:
from sklearn.metrics import classification_report

# Score the fine-tuned model on the validation split: take the highest-scoring
# class for every row and compare it with the encoded true labels.
predictions = trainer.predict(val_dataset)
y_true = val_labels.numpy()
y_pred = predictions.predictions.argmax(axis=-1)

report = classification_report(y_true, y_pred, target_names=label_encoder.classes_)
print(report)


  0%|          | 0/28 [00:00<?, ?it/s]

               precision    recall  f1-score   support

     business       0.96      0.96      0.96       115
entertainment       1.00      1.00      1.00        72
     politics       0.96      0.99      0.97        76
        sport       1.00      0.99      1.00       102
         tech       0.97      0.97      0.97        80

     accuracy                           0.98       445
    macro avg       0.98      0.98      0.98       445
 weighted avg       0.98      0.98      0.98       445



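# Logistic Regression vs. DistilBERT – Classification Report Comparison

> **Note:** In the tables below, "Before Tuning" refers to the TF-IDF + logistic regression model and "After Tuning" refers to the fine-tuned DistilBERT model. The two models were evaluated on different test splits (`random_state=1` vs. `random_state=42`), so the comparison is indicative rather than exact.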

## 1. Overall Accuracy
| Model                | Accuracy |
|----------------------|----------|
| Before Tuning        | 0.97     |
| After Tuning         | **0.98** |

---

## 2. Macro Average
| Metric    | Before Tuning | After Tuning |
|-----------|---------------|--------------|
| Precision | 0.97          | **0.98**     |
| Recall    | 0.97          | **0.98**     |
| F1-score  | 0.97          | **0.98**     |

---

## 3. Per-Class Performance

| Class         | Precision (Before) | Recall (Before) | F1-score (Before) | Precision (After) | Recall (After) | F1-score (After) |
|---------------|--------------------|-----------------|-------------------|-------------------|----------------|------------------|
| **Business**        | 0.93               | 0.97            | 0.95              | **0.96**          | **0.96**       | **0.96**         |
| **Entertainment**   | 0.99               | 0.96            | 0.98              | **1.00**          | **1.00**       | **1.00**         |
| **Politics**        | 0.98               | 0.97            | 0.97              | 0.96              | **0.99**       | 0.97             |
| **Sport**           | 0.99               | 0.99            | 0.99              | **1.00**          | 0.99           | **1.00**         |
| **Tech**            | 0.98               | 0.98            | 0.98              | 0.97              | 0.97           | 0.97             |

---

## 4. Key Observations
- **Accuracy improved by 1 percentage point** (from 97% to 98%).
- **Entertainment** reached perfect scores (1.00 precision, recall, and F1) after tuning.
- **Business** improved in precision (0.93 → 0.96) but recall slightly decreased.
- **Politics** saw a small drop in precision (0.98 → 0.96) but a notable recall improvement (0.97 → 0.99).
- **Sport** achieved perfect precision after tuning.
- **Tech** slightly decreased across all three metrics but remained high.

---

## 5. Conclusion
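Compared with the logistic regression baseline, the fine-tuned DistilBERT model (labelled "After Tuning" above):
- Reduced minor misclassifications.
- Balanced precision and recall for most classes.
- Gave a small accuracy boost (97% → 98%); stability across runs was not measured, and the two scores come from different test splits.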


In [19]:
# Visual separator: a rule of 100 dashes.
print("-" * 100)

----------------------------------------------------------------------------------------------------


### Potential Bias in Training Data
- Historical or societal biases present in the source texts can be learned and amplified by the model.
- Imbalanced representation of certain categories, groups, or topics may cause skewed predictions.
- Language style, dialect, or cultural context differences may reduce fairness and accuracy for underrepresented groups.

### Risks of Misclassification
- Incorrect predictions could lead to misinformation or poor decision-making in downstream applications.
- Mislabeling sensitive content might cause harm to individuals or communities (e.g., false positives in harmful content detection).
- Over-reliance on automated outputs without human review can propagate errors at scale.

### Mitigation Strategies
- Curate and balance datasets to ensure fair representation across categories and demographics.
- Regularly audit model outputs for bias and accuracy, especially on sensitive or high-impact use cases.
- Implement human-in-the-loop review for critical predictions and enable feedback loops for model improvement.
