In [1]:
import pandas as pd
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
import nltk

In [2]:
# Load data from CSV file with specified encoding
df = pd.read_csv(r'D:\anas\business_guide_details_all_pages2.csv', encoding='utf-8-sig')  # Replace with the correct encoding

In [3]:
# Combine title and content for better context
df['Combined_Text'] = df['Title'] + ' ' + df['Content']


In [4]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
        df['Combined_Text'], df['Class'], test_size=0.2, random_state=42
    )

In [5]:
nltk.download('stopwords')  # Download Arabic stop words (one-time download)
stop_words = nltk.corpus.stopwords.words('arabic')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Tilika\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000, stop_words=stop_words)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)



In [7]:
# Train a Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train)

In [8]:
# Evaluate the model
y_pred = model.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
# Calculate confusion matrix and F1 score for the test dataset
conf_matrix = confusion_matrix(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the confusion matrix and F1 score for the test dataset
print("Confusion Matrix:")
print(conf_matrix)
print("\nF1 Score:", f1)



Accuracy: 0.5217391304347826
Confusion Matrix:
[[ 0  0  4  0  0]
 [ 0  0  2  0  0]
 [ 0  0 12  0  0]
 [ 0  0  1  0  0]
 [ 0  0  4  0  0]]

F1 Score: 0.35776397515527947


In [9]:
# Train a Random Forest Classifier model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_vec, y_train)



In [11]:
# Evaluate the model
y_pred = rf_model.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
# Calculate confusion matrix and F1 score for the test dataset
conf_matrix = confusion_matrix(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the confusion matrix and F1 score for the test dataset
print("Confusion Matrix:")
print(conf_matrix)
print("\nF1 Score:", f1)

print(y_test)


Accuracy: 0.5652173913043478
Confusion Matrix:
[[ 0  0  4  0  0]
 [ 0  0  2  0  0]
 [ 0  0 12  0  0]
 [ 0  0  1  0  0]
 [ 0  0  3  0  1]]

F1 Score: 0.43785166240409207
78     خدمات لوجستية (تعبئة وتغليف، نقل وتخزين وتوصيل)
10                                  التسويق الإلكتروني
4                                           حلول تقنية
84                                  التسويق الإلكتروني
64                                    تنظيمية وقانونية
68                                          حلول تقنية
30                                          حلول تقنية
45                                    تنظيمية وقانونية
96                                          حلول تقنية
11                                          حلول تقنية
79     خدمات لوجستية (تعبئة وتغليف، نقل وتخزين وتوصيل)
80     خدمات لوجستية (تعبئة وتغليف، نقل وتخزين وتوصيل)
0                                           حلول تقنية
81                                        حلول تمويلية
18                                          حلول تقنية
70    