In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
import pandas as pd

# Load the combined dataset. read_csv already returns a DataFrame, so the
# former `pd.DataFrame(df)` call (whose result was discarded) is not needed.
df = pd.read_csv(r'/content/combined_csv_file.csv')
# Preview the first rows as the cell's output.
df.head()

In [None]:
# Number of rows in the loaded frame.
number_of_rows = len(df)
print(number_of_rows)

# EDA
Let's explore the data.

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
df.info()


In [None]:
# Count of missing values in each column.
print(df.isna().sum())


In [None]:
# Number of rows for each distinct value of the 'sentiment' column.
print(df['sentiment'].value_counts())


In [None]:
# Character count of each reasoning text. Missing values are replaced with an
# empty string first; otherwise astype(str) turns NaN into the literal "nan"
# and rows without any text are reported with a length of 3.
df['text length'] = df['sentiment_reasoning'].fillna('').astype(str).apply(len)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')
%matplotlib inline

In [None]:
# One histogram of 'text length' per sentiment value, laid out in columns.
g = sns.FacetGrid(data=df, col="sentiment")
g.map(plt.hist, "text length")

In [None]:
# Bar chart of row counts per sentiment. seaborn deprecates passing `palette`
# without `hue`; mapping hue to the same column keeps the per-bar colours and
# legend=False drops the redundant legend.
sns.countplot(x='sentiment', hue='sentiment', data=df, palette='rainbow', legend=False)

In [None]:
# Remove rows labelled 'mixed'. The explicit copy makes the result an
# independent frame, so the column assignments made on `df` further down do not
# operate on a slice of the original data (SettingWithCopyWarning).
mixed_value = 'mixed'
df = df[df['sentiment'] != mixed_value].copy()

In [None]:

# Drop every row that has a missing value in any column. Rebinding the name
# (rather than inplace=True) avoids mutating a frame that was produced by
# filtering another one.
df = df.dropna()


In [None]:
# Bar chart of row counts per sentiment after filtering. seaborn deprecates
# passing `palette` without `hue`; mapping hue to the same column keeps the
# per-bar colours and legend=False drops the redundant legend.
sns.countplot(x='sentiment', hue='sentiment', data=df, palette='rainbow', legend=False)

In [None]:
import re

def clean_text(text):
    """Normalise a piece of text for modelling.

    Steps, in order: lowercase, remove URLs, remove every character that is
    not an ASCII letter, digit or whitespace, then collapse whitespace runs to
    a single space and trim the ends.

    Whitespace is collapsed last because removing URLs and symbols can itself
    leave double or trailing spaces behind.

    Args:
        text: The value to clean. Anything that is not a ``str`` (for example
            NaN or None) is treated as missing.

    Returns:
        The cleaned string, or ``""`` for non-string input.
    """
    # Non-string values (e.g. NaN) carry no text to clean.
    if not isinstance(text, str):
        return ""

    text = text.lower()  # lowercase everything
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # remove links
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # remove non-alphanumeric symbols
    text = re.sub(r'\s+', ' ', text).strip()  # collapse and trim extra whitespace
    return text

# Run clean_text over every reasoning text and keep the result in a new column.
df['cleaned_summary'] = df['sentiment_reasoning'].map(clean_text)

In [None]:
# Show the first rows of the frame, now including the 'cleaned_summary' column.
df.head()

In [None]:
# prompt: stacked bar for the text length based on sentiment label

import matplotlib.pyplot as plt

# Rows per sentiment class. count() tallies the non-null 'text length' values,
# so each bar is a number of texts, not a length statistic.
sentiment_counts = df.groupby('sentiment')['text length'].count().reset_index()

plt.figure(figsize=(10, 6))
# Draw on the figure created above. Without `ax`, DataFrame.plot opens its own
# figure, which left the 10x6 canvas empty and its figsize unused.
sentiment_counts.plot(kind='bar', x='sentiment', y='text length', ax=plt.gca())
plt.title('Text Length Distribution by Sentiment')
plt.xlabel('Sentiment')
plt.ylabel('Number of Texts')
plt.show()


In [None]:
# @title text length

from matplotlib import pyplot as plt
# 20-bin histogram of the text lengths.
df['text length'].plot.hist(bins=20, title='text length')
# Hide the top and right borders of the current axes.
plt.gca().spines[['top', 'right']].set_visible(False)

In [None]:

# Split the data into training and test sets
from sklearn.model_selection import train_test_split

# 80/20 split with a fixed seed so the partition is reproducible.
# NOTE(review): in the cells visible here the training code uses all of `df`,
# not these splits — confirm whether that is intended.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['cleaned_summary'],
    df['sentiment'],
    random_state=42,
    test_size=0.2,
)

print("عدد العينات في التدريب:", train_texts.shape[0])
print("عدد العينات في الاختبار:", test_texts.shape[0])

In [None]:
!pip install datasets

In [None]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from datasets import Dataset as HFDataset
from tqdm import tqdm

In [None]:
# 🔹 2. Build the model input by joining the title with the cleaned summary
df["text"] = df['source_title'] + " " + df["cleaned_summary"]

# 🔹 3. Convert the sentiment names to integer labels
label_map = {"neutral": 0, "positive": 1, "negative": 2}
df["label"] = df["sentiment"].map(label_map)

# Series.map yields NaN for any value that is not a key of label_map. Stop here
# with a readable message instead of letting NaN labels reach the loss function.
if df["label"].isna().any():
    raise ValueError(
        "Unmapped sentiment values: "
        + repr(sorted(df.loc[df["label"].isna(), "sentiment"].astype(str).unique()))
    )
df["label"] = df["label"].astype("int64")

# 🔹 4. Load the FinBERT tokenizer
model_name = "yiyanghkust/finbert-tone"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# 🔹 5. Convert the texts to tokens (truncated to at most 512 tokens, padded)

encodings = tokenizer(df["text"].tolist(), truncation=True, padding=True, max_length=512)



In [None]:
# 🔹 6. إنشاء Dataset من بياناتك الخاصة
class NewsDataset(Dataset):
    """Dataset pairing tokenizer encodings with their labels.

    Each item is a ``(features, label)`` tuple where ``features`` maps every
    encoding key to a tensor built from that key's entry at the requested
    index, and ``label`` is a tensor built from the label at the same index.
    """

    def __init__(self, encodings, labels):
        # Keep references only; tensors are created lazily per item.
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of labels defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        features = {}
        for name, column in self.encodings.items():
            features[name] = torch.tensor(column[idx])
        target = torch.tensor(self.labels[idx])
        return features, target

# Wrap the tokenized texts and their integer labels in a NewsDataset.
train_dataset = NewsDataset(encodings=encodings, labels=df["label"].to_list())

In [None]:

# 🔹 7. Set up the DataLoader (shuffled mini-batches of 8 items)
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=8)


In [None]:
# 🔹 8. Load the pretrained FinBERT model with 3 labels
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

In [None]:
# 🔹 9. Select the device (GPU if available, otherwise CPU) and move the model to it
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

In [None]:
# 🔹 10. Optimizer and loss function
# torch.optim.AdamW is used instead of the AdamW class from transformers,
# which is deprecated in favour of the PyTorch implementation. eps and
# weight_decay are passed explicitly to keep what are believed to be the
# deprecated class's defaults rather than PyTorch's — TODO confirm against the
# installed transformers version.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)
loss_fn = nn.CrossEntropyLoss()

# 🔹 11. Train the model 🚀
epochs = 3  # number of training epochs
model.train()

for epoch in range(epochs):
    # The description is the same for every batch of an epoch, so it is set
    # once on the progress bar instead of on each iteration.
    loop = tqdm(train_loader, leave=True, desc=f"Epoch {epoch+1}")
    total_loss = 0

    for batch in loop:
        # Each batch is a (features dict, labels) pair; move both to the device.
        inputs, labels = batch
        inputs = {key: val.to(device) for key, val in inputs.items()}
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = loss_fn(outputs.logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        loop.set_postfix(loss=loss.item())

    # Mean loss over the batches of this epoch.
    print(f"Epoch {epoch+1} Loss: {total_loss / len(train_loader)}")


In [None]:

# 🔹 12. Save the trained model
# The model and then the tokenizer are written to the same directory.
for _artifact in (model, tokenizer):
    _artifact.save_pretrained("finbert_finetuned")
print("✅ النموذج تم حفظه بنجاح!")


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np

# 🔹 1. Load the trained model
model_path = "finbert_finetuned"  # make sure this is the path the model was saved to
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# 🔹 2. Set up the device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # switch to evaluation mode

# 🔹 3. New headlines used to try out the model
new_headlines = [
    "Tesla shares rise after record-breaking quarterly earnings",
    "Apple stock falls due to global supply chain issues",
    "Microsoft announces major acquisition in AI sector"
]

# 🔹 4. Convert the headlines to tokens
encodings = tokenizer(new_headlines, truncation=True, padding=True, max_length=512, return_tensors="pt").to(device)

# 🔹 5. Run the model and get the predictions
with torch.no_grad():
    outputs = model(**encodings)
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)  # turn logits into probabilities

# 🔹 6. Turn the predicted class ids into readable labels
# A separate name is used for the id -> name table so it does not overwrite the
# name -> id `label_map` used when encoding the training labels; re-running that
# cell after this one would otherwise map every sentiment to NaN.
id2label = {0: "Neutral", 1: "Positive", 2: "Negative"}
predicted_labels = [id2label[class_id] for class_id in predictions.argmax(dim=-1).tolist()]

# 🔹 7. Print the results
for headline, sentiment in zip(new_headlines, predicted_labels):
    print(f"📰 الخبر: {headline}\n📊 التصنيف: {sentiment}\n")
