Data Preprocessing


In [None]:
#Data Cleaning



import pandas as pd
import numpy as np

# Load the raw data. For other file formats, swap read_csv for read_excel,
# read_json, etc.
data = pd.read_csv("file path")
df = pd.DataFrame(data)

# --- Missing values: three independent alternatives, each derived from df ---

# (a) Drop the rows that have missing values.
df_drop = df.dropna()

# (b) Fill missing values with the column mean (numeric columns only).
column_means = df.mean(numeric_only=True)
df_fill_mean = df.fillna(column_means)

# (c) Fill missing values with the most frequent value of each column;
#     the first row of df.mode() is used as the fill value.
column_modes = df.mode().iloc[0]
df_fill_mode = df.fillna(column_modes)

# --- Outliers: keep only rows inside the 1.5 * IQR fences of one column ---
outlier_values = df['coulmn name']  # placeholder label: put the real column name here
Q1 = outlier_values.quantile(0.25)
Q3 = outlier_values.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5*IQR
upper_fence = Q3 + 1.5*IQR
# between() includes both bounds by default, i.e. lower <= value <= upper.
df_outliers = df[outlier_values.between(lower_fence, upper_fence)]

# --- Duplicates: remove repeated rows ---
df_unique = df.drop_duplicates()

# Each df_* frame above is a separate cleaned result; df itself is not modified here.

Data Transformation

In [None]:


from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder

# Select the numeric columns once; both scalers below work on the same data.
numeric_df = df.select_dtypes(include=[np.number])

# Normalization (min-max scaling of every numeric column).
# The original row index is passed through so the scaled frame stays aligned
# with df even when df does not have a default 0..n-1 index.
scaler = MinMaxScaler()
df_normalized = pd.DataFrame(
    scaler.fit_transform(numeric_df),
    columns=numeric_df.columns,
    index=numeric_df.index,
)

# Standardization (StandardScaler applied to the same numeric columns).
scaler = StandardScaler()
df_standardized = pd.DataFrame(
    scaler.fit_transform(numeric_df),
    columns=numeric_df.columns,
    index=numeric_df.index,
)

# Convert text (categorical) data to numeric values

# Label Encoding: adds one integer-coded column to df.
label_encoder = LabelEncoder()
df['encoded_column'] = label_encoder.fit_transform(df['categorical_column'])

# One-Hot Encoding: one indicator column per category.
onehot_encoder = OneHotEncoder(sparse_output=False)
encoded_features = onehot_encoder.fit_transform(df[['categorical_column']])
df_onehot = pd.DataFrame(
    encoded_features,
    columns=onehot_encoder.get_feature_names_out(['categorical_column']),
    # concat(axis=1) aligns on the index; without df's own index the new
    # columns would be matched against 0..n-1 and could introduce NaN rows.
    index=df.index,
)
df = pd.concat([df, df_onehot], axis=1).drop('categorical_column', axis=1)

# Split the data into features and target.
# NOTE(review): X is built from df, so df_normalized / df_standardized are
# not part of it, while both 'encoded_column' and the one-hot columns are.
X = df.drop('target_column', axis=1)
y = df['target_column']


# The data is now ready for modeling



Data Visualization

In [None]:
#data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# NOTE(review): the box and scatter plots below need 'categorical_column' to
# still be a column of df — confirm it was not dropped by an earlier encoding step.

# Histogram: distribution of a single numeric column, with a KDE overlay.
plt.figure(figsize=(10,6))
sns.histplot(df['numerical_column'], bins=30, kde=True)
plt.title('Distribution of Numerical Column')
plt.xlabel('Numerical Column')
plt.ylabel('Frequency')
plt.show()

# Box Plot: numeric column split by category.
plt.figure(figsize=(10,6))
sns.boxplot(x='categorical_column', y='numerical_column', data=df)
plt.title('Box Plot of Numerical Column by Categorical Column')
plt.xlabel('Categorical Column')
plt.ylabel('Numerical Column')
plt.show()

# Scatter Plot: two numeric columns, colored by category.
plt.figure(figsize=(10,6))
sns.scatterplot(x='numerical_column1', y='numerical_column2', hue='categorical_column', data=df)
plt.title('Scatter Plot of Numerical Column1 vs Numerical Column2')
plt.xlabel('Numerical Column1')
plt.ylabel('Numerical Column2')
plt.show()

# Correlation Heatmap
plt.figure(figsize=(12,8))
# Correlate numeric columns only: since pandas 2.0, DataFrame.corr() no longer
# skips non-numeric columns on its own and raises when it meets one.
correlation_matrix = df.select_dtypes(include="number").corr()
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()



Now we have excellent data and we can model it.

If the task is classification

In [None]:
#classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

#  importing  classification models

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# Split the data into a training set and a test set (20% held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# (report title, estimator class, constructor keyword arguments), in the order
# the models are trained and reported.
classifier_specs = [
    ("Random Forest Classifier", RandomForestClassifier, {}),
    ("LogisticRegression", LogisticRegression, {}),
    ("SVC", SVC, {}),
    ("KNeighborsClassifier", KNeighborsClassifier, {}),
    ("DecisionTreeClassifier", DecisionTreeClassifier, {}),
    ("AdaBoostClassifier", AdaBoostClassifier, {}),
    ("GradientBoostingClassifier", GradientBoostingClassifier, {}),
    ("XGBClassifier", XGBClassifier, {}),
    ("CatBoostClassifier", CatBoostClassifier, {"verbose": 0}),
]

# Fit each classifier on the training split, predict the test split, then print
# its title, confusion matrix and classification report.
# `model` and `y_pred` are left holding the last entry once the loop ends.
for report_title, estimator_cls, estimator_kwargs in classifier_specs:
    model = estimator_cls(**estimator_kwargs)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(report_title)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))






If the task is regression

In [None]:

#regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
# importing regression models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

# Split the data into a training set and a test set (20% held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# (report title, estimator class, constructor keyword arguments), in the order
# the models are trained and reported.
regressor_specs = [
    ("Linear Regression", LinearRegression, {}),
    ("Random Forest Regressor", RandomForestRegressor, {}),
    ("Gradient Boosting Regressor", GradientBoostingRegressor, {}),
    ("AdaBoost Regressor", AdaBoostRegressor, {}),
    ("Decision Tree Regressor", DecisionTreeRegressor, {}),
    ("KNeighbors Regressor", KNeighborsRegressor, {}),
    ("Support Vector Regressor", SVR, {}),
    ("XGB Regressor", XGBRegressor, {}),
    ("CatBoost Regressor", CatBoostRegressor, {"verbose": 0}),
]

# Fit each regressor on the training split, predict the test split, then print
# its title, mean squared error and R^2 score.
# `model` and `y_pred` are left holding the last entry once the loop ends.
for report_title, estimator_cls, estimator_kwargs in regressor_specs:
    model = estimator_cls(**estimator_kwargs)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(report_title)
    print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
    print("R^2 Score:", r2_score(y_test, y_pred))






In [None]:
#MODEL SAVING 
import joblib
# Write the current `model` object to model.pkl in the working directory.
joblib.dump(value=model, filename="model.pkl")
# Read it back from the same file into a separate variable.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
loaded_model = joblib.load(filename="model.pkl")


When the task is unsupervised

Dimensionality Reduction

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# PCA: project X onto its first two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

plt.figure(figsize=(8,6))
plt.scatter(X_pca[:,0], X_pca[:,1], c=y, cmap="viridis")
plt.title("PCA - 2D Projection")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()

# t-SNE (for visualization only)
# The iteration count is left at the library default of 1000. Its keyword was
# renamed from n_iter to max_iter in scikit-learn 1.5, so passing either name
# explicitly fails on some versions; relying on the default works on all of them.
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
X_tsne = tsne.fit_transform(X)

plt.figure(figsize=(8,6))
plt.scatter(X_tsne[:,0], X_tsne[:,1], c=y, cmap="plasma")
plt.title("t-SNE Visualization")
plt.xlabel("Dim1")
plt.ylabel("Dim2")
plt.show()


Clustering

In [None]:
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score

# --- K-Means: 3 clusters, fixed seed; scored with the silhouette coefficient ---
kmeans = KMeans(n_clusters=3, random_state=42)
clusters = kmeans.fit_predict(X)
kmeans_silhouette = silhouette_score(X, clusters)
print("KMeans Silhouette Score:", kmeans_silhouette)

# --- DBSCAN: only the distinct labels are reported ---
dbscan = DBSCAN(eps=0.5, min_samples=5)
clusters = dbscan.fit_predict(X)
dbscan_labels = set(clusters)  # -1 = noise
print("DBSCAN Labels:", dbscan_labels)

# --- Agglomerative clustering: 3 clusters; scored with the silhouette coefficient ---
agg = AgglomerativeClustering(n_clusters=3)
clusters = agg.fit_predict(X)
agg_silhouette = silhouette_score(X, clusters)
print("AgglomerativeClustering Silhouette Score:", agg_silhouette)
